kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -13,17 +13,20 @@
13
13
  //! - Citations and references
14
14
  //! - Supplementary material information
15
15
 
16
+ mod elements;
17
+ mod metadata;
18
+ mod parser;
19
+
16
20
  use crate::Result;
17
21
  use crate::core::config::ExtractionConfig;
18
- use crate::extraction::cells_to_markdown;
19
22
  use crate::plugins::{DocumentExtractor, Plugin};
20
- use crate::types::{ExtractionResult, Metadata, Table};
23
+ use crate::types::{ExtractionResult, Metadata};
21
24
  use async_trait::async_trait;
22
- use quick_xml::Reader;
23
- use quick_xml::events::Event;
24
25
  #[cfg(feature = "tokio-runtime")]
25
26
  use std::path::Path;
26
27
 
28
+ use elements::extract_jats_all_in_one;
29
+
27
30
  /// JATS document extractor.
28
31
  ///
29
32
  /// Supports JATS (Journal Article Tag Suite) XML documents in various versions,
@@ -42,414 +45,6 @@ impl JatsExtractor {
42
45
  }
43
46
  }
44
47
 
45
- /// Extract text content from a JATS element and its children.
46
- fn extract_text_content(reader: &mut Reader<&[u8]>) -> Result<String> {
47
- let mut text = String::new();
48
- let mut depth = 0;
49
-
50
- loop {
51
- match reader.read_event() {
52
- Ok(Event::Start(_)) => {
53
- depth += 1;
54
- }
55
- Ok(Event::End(_)) => {
56
- if depth == 0 {
57
- break;
58
- }
59
- depth -= 1;
60
- if !text.is_empty() && !text.ends_with('\n') {
61
- text.push(' ');
62
- }
63
- }
64
- Ok(Event::Text(t)) => {
65
- let decoded = String::from_utf8_lossy(t.as_ref()).to_string();
66
- if !decoded.trim().is_empty() {
67
- text.push_str(&decoded);
68
- text.push(' ');
69
- }
70
- }
71
- Ok(Event::CData(t)) => {
72
- let decoded = std::str::from_utf8(t.as_ref()).unwrap_or("").to_string();
73
- if !decoded.trim().is_empty() {
74
- text.push_str(&decoded);
75
- text.push('\n');
76
- }
77
- }
78
- Ok(Event::Eof) => break,
79
- Err(e) => {
80
- return Err(crate::error::KreuzbergError::parsing(format!(
81
- "XML parsing error: {}",
82
- e
83
- )));
84
- }
85
- _ => {}
86
- }
87
- }
88
-
89
- Ok(text.trim().to_string())
90
- }
91
-
92
- /// Structure to hold extracted JATS metadata.
93
- #[derive(Debug, Clone, Default)]
94
- struct JatsMetadataExtracted {
95
- title: String,
96
- subtitle: Option<String>,
97
- authors: Vec<String>,
98
- affiliations: Vec<String>,
99
- doi: Option<String>,
100
- pii: Option<String>,
101
- keywords: Vec<String>,
102
- publication_date: Option<String>,
103
- volume: Option<String>,
104
- issue: Option<String>,
105
- pages: Option<String>,
106
- journal_title: Option<String>,
107
- article_type: Option<String>,
108
- abstract_text: Option<String>,
109
- corresponding_author: Option<String>,
110
- }
111
-
112
- /// Extract all content in a single optimized pass.
113
- /// Combines metadata extraction, content parsing, and table extraction into one pass.
114
- fn extract_jats_all_in_one(content: &str) -> Result<(JatsMetadataExtracted, String, String, Vec<Table>)> {
115
- let mut reader = Reader::from_str(content);
116
- let mut metadata = JatsMetadataExtracted::default();
117
- let mut body_content = String::new();
118
- let mut title = String::new();
119
-
120
- let mut in_article_meta = false;
121
- let mut in_article_title = false;
122
- let mut in_subtitle = false;
123
- let mut in_contrib = false;
124
- let mut in_name = false;
125
- let mut in_aff = false;
126
- let mut in_abstract = false;
127
- let mut in_kwd_group = false;
128
- let mut in_kwd = false;
129
- let mut current_author = String::new();
130
- let mut current_aff = String::new();
131
- let mut abstract_content = String::new();
132
-
133
- let mut in_body = false;
134
- let mut in_section = false;
135
- let mut in_para = false;
136
-
137
- let mut in_table = false;
138
- let mut in_thead = false;
139
- let mut in_tbody = false;
140
- let mut in_row = false;
141
- let mut current_table: Vec<Vec<String>> = Vec::new();
142
- let mut current_row: Vec<String> = Vec::new();
143
- let mut tables = Vec::new();
144
- let mut table_index = 0;
145
-
146
- loop {
147
- match reader.read_event() {
148
- Ok(Event::Start(e)) => {
149
- let tag = String::from_utf8_lossy(e.name().as_ref()).to_string();
150
-
151
- match tag.as_str() {
152
- "article" => {
153
- for attr in e.attributes() {
154
- if let Ok(attr) = attr
155
- && String::from_utf8_lossy(attr.key.as_ref()) == "article-type"
156
- {
157
- metadata.article_type = Some(String::from_utf8_lossy(attr.value.as_ref()).to_string());
158
- }
159
- }
160
- }
161
- "article-meta" => {
162
- in_article_meta = true;
163
- }
164
- "article-title" if in_article_meta => {
165
- in_article_title = true;
166
- }
167
- "subtitle" if in_article_meta => {
168
- in_subtitle = true;
169
- }
170
- "contrib" if in_article_meta => {
171
- in_contrib = true;
172
- current_author.clear();
173
- }
174
- "name" if in_contrib => {
175
- in_name = true;
176
- }
177
- "aff" if in_article_meta => {
178
- in_aff = true;
179
- current_aff.clear();
180
- }
181
- "article-id" if in_article_meta => {
182
- let mut id_type = String::new();
183
- for attr in e.attributes() {
184
- if let Ok(attr) = attr
185
- && String::from_utf8_lossy(attr.key.as_ref()) == "pub-id-type"
186
- {
187
- id_type = String::from_utf8_lossy(attr.value.as_ref()).to_string();
188
- }
189
- }
190
-
191
- let id_text = extract_text_content(&mut reader)?;
192
- match id_type.as_str() {
193
- "doi" => metadata.doi = Some(id_text),
194
- "pii" => metadata.pii = Some(id_text),
195
- _ => {}
196
- }
197
- continue;
198
- }
199
- "volume" if in_article_meta => {
200
- let vol_text = extract_text_content(&mut reader)?;
201
- metadata.volume = Some(vol_text);
202
- continue;
203
- }
204
- "issue" if in_article_meta => {
205
- let issue_text = extract_text_content(&mut reader)?;
206
- metadata.issue = Some(issue_text);
207
- continue;
208
- }
209
- "fpage" | "lpage" if in_article_meta => {
210
- let page_text = extract_text_content(&mut reader)?;
211
- if let Some(pages) = &mut metadata.pages {
212
- pages.push('-');
213
- pages.push_str(&page_text);
214
- } else {
215
- metadata.pages = Some(page_text);
216
- }
217
- continue;
218
- }
219
- "pub-date" if in_article_meta => {
220
- let date_text = extract_text_content(&mut reader)?;
221
- if metadata.publication_date.is_none() {
222
- metadata.publication_date = Some(date_text);
223
- }
224
- continue;
225
- }
226
- "journal-title" if in_article_meta => {
227
- let journal_text = extract_text_content(&mut reader)?;
228
- if metadata.journal_title.is_none() {
229
- metadata.journal_title = Some(journal_text);
230
- }
231
- continue;
232
- }
233
- "abstract" if in_article_meta => {
234
- in_abstract = true;
235
- abstract_content.clear();
236
- }
237
- "kwd-group" if in_article_meta => {
238
- in_kwd_group = true;
239
- }
240
- "kwd" if in_kwd_group => {
241
- in_kwd = true;
242
- }
243
- "corresp" if in_article_meta => {
244
- let corresp_text = extract_text_content(&mut reader)?;
245
- metadata.corresponding_author = Some(corresp_text);
246
- continue;
247
- }
248
- "body" => {
249
- in_body = true;
250
- }
251
- "sec" if in_body => {
252
- in_section = true;
253
- }
254
- "title" if (in_section || in_body) && !in_article_title => {
255
- let section_title = extract_text_content(&mut reader)?;
256
- if !section_title.is_empty() {
257
- body_content.push_str("## ");
258
- body_content.push_str(&section_title);
259
- body_content.push_str("\n\n");
260
- }
261
- continue;
262
- }
263
- "p" if in_body || in_section => {
264
- in_para = true;
265
- }
266
- "table" => {
267
- in_table = true;
268
- current_table.clear();
269
- }
270
- "thead" if in_table => {
271
- in_thead = true;
272
- }
273
- "tbody" if in_table => {
274
- in_tbody = true;
275
- }
276
- "tr" if (in_thead || in_tbody) && in_table => {
277
- in_row = true;
278
- current_row.clear();
279
- }
280
- "td" | "th" if in_row => {
281
- let mut cell_text = String::new();
282
- let mut cell_depth = 0;
283
-
284
- loop {
285
- match reader.read_event() {
286
- Ok(Event::Start(_)) => {
287
- cell_depth += 1;
288
- }
289
- Ok(Event::End(e)) => {
290
- let tag = String::from_utf8_lossy(e.name().as_ref()).to_string();
291
- if (tag == "td" || tag == "th") && cell_depth == 0 {
292
- break;
293
- }
294
- if cell_depth > 0 {
295
- cell_depth -= 1;
296
- }
297
- }
298
- Ok(Event::Text(t)) => {
299
- let decoded = String::from_utf8_lossy(t.as_ref()).to_string();
300
- if !decoded.trim().is_empty() {
301
- if !cell_text.is_empty() {
302
- cell_text.push(' ');
303
- }
304
- cell_text.push_str(decoded.trim());
305
- }
306
- }
307
- Ok(Event::Eof) => break,
308
- Err(e) => {
309
- return Err(crate::error::KreuzbergError::parsing(format!(
310
- "XML parsing error: {}",
311
- e
312
- )));
313
- }
314
- _ => {}
315
- }
316
- }
317
-
318
- current_row.push(cell_text);
319
- }
320
- _ => {}
321
- }
322
- }
323
- Ok(Event::End(e)) => {
324
- let tag = String::from_utf8_lossy(e.name().as_ref()).to_string();
325
-
326
- match tag.as_str() {
327
- "article-meta" => {
328
- in_article_meta = false;
329
- }
330
- "article-title" if in_article_title => {
331
- in_article_title = false;
332
- }
333
- "subtitle" if in_subtitle => {
334
- in_subtitle = false;
335
- }
336
- "contrib" if in_contrib => {
337
- if !current_author.is_empty() {
338
- metadata.authors.push(current_author.clone());
339
- }
340
- in_contrib = false;
341
- current_author.clear();
342
- }
343
- "name" if in_name => {
344
- in_name = false;
345
- }
346
- "aff" if in_aff => {
347
- if !current_aff.is_empty() {
348
- metadata.affiliations.push(current_aff.clone());
349
- }
350
- in_aff = false;
351
- current_aff.clear();
352
- }
353
- "abstract" if in_abstract => {
354
- in_abstract = false;
355
- metadata.abstract_text = Some(abstract_content.trim().to_string());
356
- }
357
- "kwd-group" if in_kwd_group => {
358
- in_kwd_group = false;
359
- }
360
- "kwd" if in_kwd => {
361
- in_kwd = false;
362
- }
363
- "body" => {
364
- in_body = false;
365
- }
366
- "sec" if in_section => {
367
- in_section = false;
368
- }
369
- "p" if in_para => {
370
- in_para = false;
371
- }
372
- "table" if in_table => {
373
- if !current_table.is_empty() {
374
- let markdown = cells_to_markdown(&current_table);
375
- tables.push(Table {
376
- cells: current_table.clone(),
377
- markdown,
378
- page_number: table_index + 1,
379
- });
380
- table_index += 1;
381
- current_table.clear();
382
- }
383
- in_table = false;
384
- }
385
- "thead" if in_thead => {
386
- in_thead = false;
387
- }
388
- "tbody" if in_tbody => {
389
- in_tbody = false;
390
- }
391
- "tr" if in_row => {
392
- if !current_row.is_empty() {
393
- current_table.push(current_row.clone());
394
- current_row.clear();
395
- }
396
- in_row = false;
397
- }
398
- _ => {}
399
- }
400
- }
401
- Ok(Event::Text(t)) => {
402
- let decoded = String::from_utf8_lossy(t.as_ref()).to_string();
403
- let trimmed = decoded.trim();
404
-
405
- if !trimmed.is_empty() {
406
- if in_article_title && metadata.title.is_empty() {
407
- metadata.title.push_str(trimmed);
408
- } else if in_subtitle && metadata.subtitle.is_none() {
409
- metadata.subtitle = Some(trimmed.to_string());
410
- } else if in_name {
411
- if !current_author.is_empty() {
412
- current_author.push(' ');
413
- }
414
- current_author.push_str(trimmed);
415
- } else if in_aff {
416
- if !current_aff.is_empty() {
417
- current_aff.push(' ');
418
- }
419
- current_aff.push_str(trimmed);
420
- } else if in_abstract {
421
- if !abstract_content.is_empty() {
422
- abstract_content.push(' ');
423
- }
424
- abstract_content.push_str(trimmed);
425
- } else if in_kwd {
426
- metadata.keywords.push(trimmed.to_string());
427
- } else if in_para && in_body {
428
- body_content.push_str(trimmed);
429
- body_content.push_str("\n\n");
430
- }
431
- }
432
- }
433
- Ok(Event::Eof) => break,
434
- Err(e) => {
435
- return Err(crate::error::KreuzbergError::parsing(format!(
436
- "XML parsing error: {}",
437
- e
438
- )));
439
- }
440
- _ => {}
441
- }
442
- }
443
-
444
- let mut final_output = body_content;
445
- if !metadata.title.is_empty() {
446
- final_output = format!("# {}\n\n{}", metadata.title, final_output);
447
- title = metadata.title.clone();
448
- }
449
-
450
- Ok((metadata, final_output.trim().to_string(), title, tables))
451
- }
452
-
453
48
  impl Plugin for JatsExtractor {
454
49
  fn name(&self) -> &str {
455
50
  "jats-extractor"
@@ -571,6 +166,8 @@ impl DocumentExtractor for JatsExtractor {
571
166
  chunks: None,
572
167
  images: None,
573
168
  pages: None,
169
+ djot_content: None,
170
+ elements: None,
574
171
  })
575
172
  }
576
173
 
@@ -602,6 +199,7 @@ impl DocumentExtractor for JatsExtractor {
602
199
  #[cfg(test)]
603
200
  mod tests {
604
201
  use super::*;
202
+ use elements::extract_jats_all_in_one;
605
203
 
606
204
  #[test]
607
205
  fn test_jats_extractor_plugin_interface() {
@@ -0,0 +1,52 @@
1
+ //! XML parsing and document structure traversal for JATS documents.
2
+
3
+ use crate::Result;
4
+ use quick_xml::Reader;
5
+ use quick_xml::events::Event;
6
+
7
+ /// Extract text content from a JATS element and its children.
8
+ pub(super) fn extract_text_content(reader: &mut Reader<&[u8]>) -> Result<String> {
9
+ let mut text = String::new();
10
+ let mut depth = 0;
11
+
12
+ loop {
13
+ match reader.read_event() {
14
+ Ok(Event::Start(_)) => {
15
+ depth += 1;
16
+ }
17
+ Ok(Event::End(_)) => {
18
+ if depth == 0 {
19
+ break;
20
+ }
21
+ depth -= 1;
22
+ if !text.is_empty() && !text.ends_with('\n') {
23
+ text.push(' ');
24
+ }
25
+ }
26
+ Ok(Event::Text(t)) => {
27
+ let decoded = String::from_utf8_lossy(t.as_ref()).to_string();
28
+ if !decoded.trim().is_empty() {
29
+ text.push_str(&decoded);
30
+ text.push(' ');
31
+ }
32
+ }
33
+ Ok(Event::CData(t)) => {
34
+ let decoded = std::str::from_utf8(t.as_ref()).unwrap_or("").to_string();
35
+ if !decoded.trim().is_empty() {
36
+ text.push_str(&decoded);
37
+ text.push('\n');
38
+ }
39
+ }
40
+ Ok(Event::Eof) => break,
41
+ Err(e) => {
42
+ return Err(crate::error::KreuzbergError::parsing(format!(
43
+ "XML parsing error: {}",
44
+ e
45
+ )));
46
+ }
47
+ _ => {}
48
+ }
49
+ }
50
+
51
+ Ok(text.trim().to_string())
52
+ }
@@ -341,6 +341,8 @@ impl DocumentExtractor for JupyterExtractor {
341
341
  detected_languages: None,
342
342
  chunks: None,
343
343
  images: None,
344
+ djot_content: None,
345
+ elements: None,
344
346
  })
345
347
  }
346
348
 
@@ -0,0 +1,93 @@
1
+ //! LaTeX command processing.
2
+ //!
3
+ //! This module handles inline LaTeX commands like formatting (\textbf, \emph, etc.),
4
+ //! math mode ($...$), and other inline elements.
5
+
6
+ use super::utilities::read_braced_from_chars;
7
+
8
+ /// Processes a line of LaTeX, handling commands and inline math.
9
+ ///
10
+ /// Recursively processes nested commands and preserves math mode content.
11
+ pub fn process_line(line: &str) -> String {
12
+ let mut result = String::new();
13
+ let mut chars = line.chars().peekable();
14
+
15
+ while let Some(ch) = chars.next() {
16
+ if ch == '\\' {
17
+ let mut cmd = String::new();
18
+ while let Some(&c) = chars.peek() {
19
+ if c.is_alphabetic() {
20
+ cmd.push(chars.next().unwrap());
21
+ } else {
22
+ break;
23
+ }
24
+ }
25
+
26
+ process_command(&cmd, &mut chars, &mut result);
27
+ } else if ch == '$' {
28
+ // Handle inline math
29
+ result.push(ch);
30
+ while let Some(&c) = chars.peek() {
31
+ result.push(chars.next().unwrap());
32
+ if c == '$' {
33
+ break;
34
+ }
35
+ }
36
+ } else {
37
+ result.push(ch);
38
+ }
39
+ }
40
+
41
+ result
42
+ }
43
+
44
+ /// Processes a single LaTeX command.
45
+ ///
46
+ /// Handles formatting commands (\textbf, \emph, etc.) and extracts their content.
47
+ fn process_command(cmd: &str, chars: &mut std::iter::Peekable<std::str::Chars>, result: &mut String) {
48
+ match cmd {
49
+ "textbf" => {
50
+ if let Some(content) = read_braced_from_chars(chars) {
51
+ let processed = process_line(&content);
52
+ result.push_str(&processed);
53
+ }
54
+ }
55
+ "textit" | "emph" => {
56
+ if let Some(content) = read_braced_from_chars(chars) {
57
+ let processed = process_line(&content);
58
+ result.push_str(&processed);
59
+ }
60
+ }
61
+ "texttt" => {
62
+ if let Some(content) = read_braced_from_chars(chars) {
63
+ result.push_str(&content);
64
+ }
65
+ }
66
+ "underline" => {
67
+ if let Some(content) = read_braced_from_chars(chars) {
68
+ let processed = process_line(&content);
69
+ result.push_str(&processed);
70
+ }
71
+ }
72
+ "font" => {
73
+ // Skip font commands
74
+ while let Some(&c) = chars.peek() {
75
+ if c == '\\' {
76
+ break;
77
+ }
78
+ chars.next();
79
+ }
80
+ }
81
+ "usepackage" => {
82
+ // Skip package declarations
83
+ read_braced_from_chars(chars);
84
+ }
85
+ _ => {
86
+ // For unknown commands, try to extract and process content
87
+ if let Some(content) = read_braced_from_chars(chars) {
88
+ let processed = process_line(&content);
89
+ result.push_str(&processed);
90
+ }
91
+ }
92
+ }
93
+ }