kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -13,17 +13,20 @@
13
13
  //! - Citations and references
14
14
  //! - Supplementary material information
15
15
 
16
+ mod elements;
17
+ mod metadata;
18
+ mod parser;
19
+
16
20
  use crate::Result;
17
21
  use crate::core::config::ExtractionConfig;
18
- use crate::extraction::cells_to_markdown;
19
22
  use crate::plugins::{DocumentExtractor, Plugin};
20
- use crate::types::{ExtractionResult, Metadata, Table};
23
+ use crate::types::{ExtractionResult, Metadata};
21
24
  use async_trait::async_trait;
22
- use quick_xml::Reader;
23
- use quick_xml::events::Event;
24
25
  #[cfg(feature = "tokio-runtime")]
25
26
  use std::path::Path;
26
27
 
28
+ use elements::extract_jats_all_in_one;
29
+
27
30
  /// JATS document extractor.
28
31
  ///
29
32
  /// Supports JATS (Journal Article Tag Suite) XML documents in various versions,
@@ -42,414 +45,6 @@ impl JatsExtractor {
42
45
  }
43
46
  }
44
47
 
45
- /// Extract text content from a JATS element and its children.
46
- fn extract_text_content(reader: &mut Reader<&[u8]>) -> Result<String> {
47
- let mut text = String::new();
48
- let mut depth = 0;
49
-
50
- loop {
51
- match reader.read_event() {
52
- Ok(Event::Start(_)) => {
53
- depth += 1;
54
- }
55
- Ok(Event::End(_)) => {
56
- if depth == 0 {
57
- break;
58
- }
59
- depth -= 1;
60
- if !text.is_empty() && !text.ends_with('\n') {
61
- text.push(' ');
62
- }
63
- }
64
- Ok(Event::Text(t)) => {
65
- let decoded = String::from_utf8_lossy(t.as_ref()).to_string();
66
- if !decoded.trim().is_empty() {
67
- text.push_str(&decoded);
68
- text.push(' ');
69
- }
70
- }
71
- Ok(Event::CData(t)) => {
72
- let decoded = std::str::from_utf8(t.as_ref()).unwrap_or("").to_string();
73
- if !decoded.trim().is_empty() {
74
- text.push_str(&decoded);
75
- text.push('\n');
76
- }
77
- }
78
- Ok(Event::Eof) => break,
79
- Err(e) => {
80
- return Err(crate::error::KreuzbergError::parsing(format!(
81
- "XML parsing error: {}",
82
- e
83
- )));
84
- }
85
- _ => {}
86
- }
87
- }
88
-
89
- Ok(text.trim().to_string())
90
- }
91
-
92
- /// Structure to hold extracted JATS metadata.
93
- #[derive(Debug, Clone, Default)]
94
- struct JatsMetadataExtracted {
95
- title: String,
96
- subtitle: Option<String>,
97
- authors: Vec<String>,
98
- affiliations: Vec<String>,
99
- doi: Option<String>,
100
- pii: Option<String>,
101
- keywords: Vec<String>,
102
- publication_date: Option<String>,
103
- volume: Option<String>,
104
- issue: Option<String>,
105
- pages: Option<String>,
106
- journal_title: Option<String>,
107
- article_type: Option<String>,
108
- abstract_text: Option<String>,
109
- corresponding_author: Option<String>,
110
- }
111
-
112
- /// Extract all content in a single optimized pass.
113
- /// Combines metadata extraction, content parsing, and table extraction into one pass.
114
- fn extract_jats_all_in_one(content: &str) -> Result<(JatsMetadataExtracted, String, String, Vec<Table>)> {
115
- let mut reader = Reader::from_str(content);
116
- let mut metadata = JatsMetadataExtracted::default();
117
- let mut body_content = String::new();
118
- let mut title = String::new();
119
-
120
- let mut in_article_meta = false;
121
- let mut in_article_title = false;
122
- let mut in_subtitle = false;
123
- let mut in_contrib = false;
124
- let mut in_name = false;
125
- let mut in_aff = false;
126
- let mut in_abstract = false;
127
- let mut in_kwd_group = false;
128
- let mut in_kwd = false;
129
- let mut current_author = String::new();
130
- let mut current_aff = String::new();
131
- let mut abstract_content = String::new();
132
-
133
- let mut in_body = false;
134
- let mut in_section = false;
135
- let mut in_para = false;
136
-
137
- let mut in_table = false;
138
- let mut in_thead = false;
139
- let mut in_tbody = false;
140
- let mut in_row = false;
141
- let mut current_table: Vec<Vec<String>> = Vec::new();
142
- let mut current_row: Vec<String> = Vec::new();
143
- let mut tables = Vec::new();
144
- let mut table_index = 0;
145
-
146
- loop {
147
- match reader.read_event() {
148
- Ok(Event::Start(e)) => {
149
- let tag = String::from_utf8_lossy(e.name().as_ref()).to_string();
150
-
151
- match tag.as_str() {
152
- "article" => {
153
- for attr in e.attributes() {
154
- if let Ok(attr) = attr
155
- && String::from_utf8_lossy(attr.key.as_ref()) == "article-type"
156
- {
157
- metadata.article_type = Some(String::from_utf8_lossy(attr.value.as_ref()).to_string());
158
- }
159
- }
160
- }
161
- "article-meta" => {
162
- in_article_meta = true;
163
- }
164
- "article-title" if in_article_meta => {
165
- in_article_title = true;
166
- }
167
- "subtitle" if in_article_meta => {
168
- in_subtitle = true;
169
- }
170
- "contrib" if in_article_meta => {
171
- in_contrib = true;
172
- current_author.clear();
173
- }
174
- "name" if in_contrib => {
175
- in_name = true;
176
- }
177
- "aff" if in_article_meta => {
178
- in_aff = true;
179
- current_aff.clear();
180
- }
181
- "article-id" if in_article_meta => {
182
- let mut id_type = String::new();
183
- for attr in e.attributes() {
184
- if let Ok(attr) = attr
185
- && String::from_utf8_lossy(attr.key.as_ref()) == "pub-id-type"
186
- {
187
- id_type = String::from_utf8_lossy(attr.value.as_ref()).to_string();
188
- }
189
- }
190
-
191
- let id_text = extract_text_content(&mut reader)?;
192
- match id_type.as_str() {
193
- "doi" => metadata.doi = Some(id_text),
194
- "pii" => metadata.pii = Some(id_text),
195
- _ => {}
196
- }
197
- continue;
198
- }
199
- "volume" if in_article_meta => {
200
- let vol_text = extract_text_content(&mut reader)?;
201
- metadata.volume = Some(vol_text);
202
- continue;
203
- }
204
- "issue" if in_article_meta => {
205
- let issue_text = extract_text_content(&mut reader)?;
206
- metadata.issue = Some(issue_text);
207
- continue;
208
- }
209
- "fpage" | "lpage" if in_article_meta => {
210
- let page_text = extract_text_content(&mut reader)?;
211
- if let Some(pages) = &mut metadata.pages {
212
- pages.push('-');
213
- pages.push_str(&page_text);
214
- } else {
215
- metadata.pages = Some(page_text);
216
- }
217
- continue;
218
- }
219
- "pub-date" if in_article_meta => {
220
- let date_text = extract_text_content(&mut reader)?;
221
- if metadata.publication_date.is_none() {
222
- metadata.publication_date = Some(date_text);
223
- }
224
- continue;
225
- }
226
- "journal-title" if in_article_meta => {
227
- let journal_text = extract_text_content(&mut reader)?;
228
- if metadata.journal_title.is_none() {
229
- metadata.journal_title = Some(journal_text);
230
- }
231
- continue;
232
- }
233
- "abstract" if in_article_meta => {
234
- in_abstract = true;
235
- abstract_content.clear();
236
- }
237
- "kwd-group" if in_article_meta => {
238
- in_kwd_group = true;
239
- }
240
- "kwd" if in_kwd_group => {
241
- in_kwd = true;
242
- }
243
- "corresp" if in_article_meta => {
244
- let corresp_text = extract_text_content(&mut reader)?;
245
- metadata.corresponding_author = Some(corresp_text);
246
- continue;
247
- }
248
- "body" => {
249
- in_body = true;
250
- }
251
- "sec" if in_body => {
252
- in_section = true;
253
- }
254
- "title" if (in_section || in_body) && !in_article_title => {
255
- let section_title = extract_text_content(&mut reader)?;
256
- if !section_title.is_empty() {
257
- body_content.push_str("## ");
258
- body_content.push_str(&section_title);
259
- body_content.push_str("\n\n");
260
- }
261
- continue;
262
- }
263
- "p" if in_body || in_section => {
264
- in_para = true;
265
- }
266
- "table" => {
267
- in_table = true;
268
- current_table.clear();
269
- }
270
- "thead" if in_table => {
271
- in_thead = true;
272
- }
273
- "tbody" if in_table => {
274
- in_tbody = true;
275
- }
276
- "tr" if (in_thead || in_tbody) && in_table => {
277
- in_row = true;
278
- current_row.clear();
279
- }
280
- "td" | "th" if in_row => {
281
- let mut cell_text = String::new();
282
- let mut cell_depth = 0;
283
-
284
- loop {
285
- match reader.read_event() {
286
- Ok(Event::Start(_)) => {
287
- cell_depth += 1;
288
- }
289
- Ok(Event::End(e)) => {
290
- let tag = String::from_utf8_lossy(e.name().as_ref()).to_string();
291
- if (tag == "td" || tag == "th") && cell_depth == 0 {
292
- break;
293
- }
294
- if cell_depth > 0 {
295
- cell_depth -= 1;
296
- }
297
- }
298
- Ok(Event::Text(t)) => {
299
- let decoded = String::from_utf8_lossy(t.as_ref()).to_string();
300
- if !decoded.trim().is_empty() {
301
- if !cell_text.is_empty() {
302
- cell_text.push(' ');
303
- }
304
- cell_text.push_str(decoded.trim());
305
- }
306
- }
307
- Ok(Event::Eof) => break,
308
- Err(e) => {
309
- return Err(crate::error::KreuzbergError::parsing(format!(
310
- "XML parsing error: {}",
311
- e
312
- )));
313
- }
314
- _ => {}
315
- }
316
- }
317
-
318
- current_row.push(cell_text);
319
- }
320
- _ => {}
321
- }
322
- }
323
- Ok(Event::End(e)) => {
324
- let tag = String::from_utf8_lossy(e.name().as_ref()).to_string();
325
-
326
- match tag.as_str() {
327
- "article-meta" => {
328
- in_article_meta = false;
329
- }
330
- "article-title" if in_article_title => {
331
- in_article_title = false;
332
- }
333
- "subtitle" if in_subtitle => {
334
- in_subtitle = false;
335
- }
336
- "contrib" if in_contrib => {
337
- if !current_author.is_empty() {
338
- metadata.authors.push(current_author.clone());
339
- }
340
- in_contrib = false;
341
- current_author.clear();
342
- }
343
- "name" if in_name => {
344
- in_name = false;
345
- }
346
- "aff" if in_aff => {
347
- if !current_aff.is_empty() {
348
- metadata.affiliations.push(current_aff.clone());
349
- }
350
- in_aff = false;
351
- current_aff.clear();
352
- }
353
- "abstract" if in_abstract => {
354
- in_abstract = false;
355
- metadata.abstract_text = Some(abstract_content.trim().to_string());
356
- }
357
- "kwd-group" if in_kwd_group => {
358
- in_kwd_group = false;
359
- }
360
- "kwd" if in_kwd => {
361
- in_kwd = false;
362
- }
363
- "body" => {
364
- in_body = false;
365
- }
366
- "sec" if in_section => {
367
- in_section = false;
368
- }
369
- "p" if in_para => {
370
- in_para = false;
371
- }
372
- "table" if in_table => {
373
- if !current_table.is_empty() {
374
- let markdown = cells_to_markdown(&current_table);
375
- tables.push(Table {
376
- cells: current_table.clone(),
377
- markdown,
378
- page_number: table_index + 1,
379
- });
380
- table_index += 1;
381
- current_table.clear();
382
- }
383
- in_table = false;
384
- }
385
- "thead" if in_thead => {
386
- in_thead = false;
387
- }
388
- "tbody" if in_tbody => {
389
- in_tbody = false;
390
- }
391
- "tr" if in_row => {
392
- if !current_row.is_empty() {
393
- current_table.push(current_row.clone());
394
- current_row.clear();
395
- }
396
- in_row = false;
397
- }
398
- _ => {}
399
- }
400
- }
401
- Ok(Event::Text(t)) => {
402
- let decoded = String::from_utf8_lossy(t.as_ref()).to_string();
403
- let trimmed = decoded.trim();
404
-
405
- if !trimmed.is_empty() {
406
- if in_article_title && metadata.title.is_empty() {
407
- metadata.title.push_str(trimmed);
408
- } else if in_subtitle && metadata.subtitle.is_none() {
409
- metadata.subtitle = Some(trimmed.to_string());
410
- } else if in_name {
411
- if !current_author.is_empty() {
412
- current_author.push(' ');
413
- }
414
- current_author.push_str(trimmed);
415
- } else if in_aff {
416
- if !current_aff.is_empty() {
417
- current_aff.push(' ');
418
- }
419
- current_aff.push_str(trimmed);
420
- } else if in_abstract {
421
- if !abstract_content.is_empty() {
422
- abstract_content.push(' ');
423
- }
424
- abstract_content.push_str(trimmed);
425
- } else if in_kwd {
426
- metadata.keywords.push(trimmed.to_string());
427
- } else if in_para && in_body {
428
- body_content.push_str(trimmed);
429
- body_content.push_str("\n\n");
430
- }
431
- }
432
- }
433
- Ok(Event::Eof) => break,
434
- Err(e) => {
435
- return Err(crate::error::KreuzbergError::parsing(format!(
436
- "XML parsing error: {}",
437
- e
438
- )));
439
- }
440
- _ => {}
441
- }
442
- }
443
-
444
- let mut final_output = body_content;
445
- if !metadata.title.is_empty() {
446
- final_output = format!("# {}\n\n{}", metadata.title, final_output);
447
- title = metadata.title.clone();
448
- }
449
-
450
- Ok((metadata, final_output.trim().to_string(), title, tables))
451
- }
452
-
453
48
  impl Plugin for JatsExtractor {
454
49
  fn name(&self) -> &str {
455
50
  "jats-extractor"
@@ -571,6 +166,8 @@ impl DocumentExtractor for JatsExtractor {
571
166
  chunks: None,
572
167
  images: None,
573
168
  pages: None,
169
+ djot_content: None,
170
+ elements: None,
574
171
  })
575
172
  }
576
173
 
@@ -602,6 +199,7 @@ impl DocumentExtractor for JatsExtractor {
602
199
  #[cfg(test)]
603
200
  mod tests {
604
201
  use super::*;
202
+ use elements::extract_jats_all_in_one;
605
203
 
606
204
  #[test]
607
205
  fn test_jats_extractor_plugin_interface() {
@@ -0,0 +1,52 @@
1
+ //! XML parsing and document structure traversal for JATS documents.
2
+
3
+ use crate::Result;
4
+ use quick_xml::Reader;
5
+ use quick_xml::events::Event;
6
+
7
+ /// Extract text content from a JATS element and its children.
8
+ pub(super) fn extract_text_content(reader: &mut Reader<&[u8]>) -> Result<String> {
9
+ let mut text = String::new();
10
+ let mut depth = 0;
11
+
12
+ loop {
13
+ match reader.read_event() {
14
+ Ok(Event::Start(_)) => {
15
+ depth += 1;
16
+ }
17
+ Ok(Event::End(_)) => {
18
+ if depth == 0 {
19
+ break;
20
+ }
21
+ depth -= 1;
22
+ if !text.is_empty() && !text.ends_with('\n') {
23
+ text.push(' ');
24
+ }
25
+ }
26
+ Ok(Event::Text(t)) => {
27
+ let decoded = String::from_utf8_lossy(t.as_ref()).to_string();
28
+ if !decoded.trim().is_empty() {
29
+ text.push_str(&decoded);
30
+ text.push(' ');
31
+ }
32
+ }
33
+ Ok(Event::CData(t)) => {
34
+ let decoded = std::str::from_utf8(t.as_ref()).unwrap_or("").to_string();
35
+ if !decoded.trim().is_empty() {
36
+ text.push_str(&decoded);
37
+ text.push('\n');
38
+ }
39
+ }
40
+ Ok(Event::Eof) => break,
41
+ Err(e) => {
42
+ return Err(crate::error::KreuzbergError::parsing(format!(
43
+ "XML parsing error: {}",
44
+ e
45
+ )));
46
+ }
47
+ _ => {}
48
+ }
49
+ }
50
+
51
+ Ok(text.trim().to_string())
52
+ }
@@ -341,6 +341,8 @@ impl DocumentExtractor for JupyterExtractor {
341
341
  detected_languages: None,
342
342
  chunks: None,
343
343
  images: None,
344
+ djot_content: None,
345
+ elements: None,
344
346
  })
345
347
  }
346
348
 
@@ -0,0 +1,93 @@
1
+ //! LaTeX command processing.
2
+ //!
3
+ //! This module handles inline LaTeX commands like formatting (\textbf, \emph, etc.),
4
+ //! math mode ($...$), and other inline elements.
5
+
6
+ use super::utilities::read_braced_from_chars;
7
+
8
+ /// Processes a line of LaTeX, handling commands and inline math.
9
+ ///
10
+ /// Recursively processes nested commands and preserves math mode content.
11
+ pub fn process_line(line: &str) -> String {
12
+ let mut result = String::new();
13
+ let mut chars = line.chars().peekable();
14
+
15
+ while let Some(ch) = chars.next() {
16
+ if ch == '\\' {
17
+ let mut cmd = String::new();
18
+ while let Some(&c) = chars.peek() {
19
+ if c.is_alphabetic() {
20
+ cmd.push(chars.next().unwrap());
21
+ } else {
22
+ break;
23
+ }
24
+ }
25
+
26
+ process_command(&cmd, &mut chars, &mut result);
27
+ } else if ch == '$' {
28
+ // Handle inline math
29
+ result.push(ch);
30
+ while let Some(&c) = chars.peek() {
31
+ result.push(chars.next().unwrap());
32
+ if c == '$' {
33
+ break;
34
+ }
35
+ }
36
+ } else {
37
+ result.push(ch);
38
+ }
39
+ }
40
+
41
+ result
42
+ }
43
+
44
+ /// Processes a single LaTeX command.
45
+ ///
46
+ /// Handles formatting commands (\textbf, \emph, etc.) and extracts their content.
47
+ fn process_command(cmd: &str, chars: &mut std::iter::Peekable<std::str::Chars>, result: &mut String) {
48
+ match cmd {
49
+ "textbf" => {
50
+ if let Some(content) = read_braced_from_chars(chars) {
51
+ let processed = process_line(&content);
52
+ result.push_str(&processed);
53
+ }
54
+ }
55
+ "textit" | "emph" => {
56
+ if let Some(content) = read_braced_from_chars(chars) {
57
+ let processed = process_line(&content);
58
+ result.push_str(&processed);
59
+ }
60
+ }
61
+ "texttt" => {
62
+ if let Some(content) = read_braced_from_chars(chars) {
63
+ result.push_str(&content);
64
+ }
65
+ }
66
+ "underline" => {
67
+ if let Some(content) = read_braced_from_chars(chars) {
68
+ let processed = process_line(&content);
69
+ result.push_str(&processed);
70
+ }
71
+ }
72
+ "font" => {
73
+ // Skip font commands
74
+ while let Some(&c) = chars.peek() {
75
+ if c == '\\' {
76
+ break;
77
+ }
78
+ chars.next();
79
+ }
80
+ }
81
+ "usepackage" => {
82
+ // Skip package declarations
83
+ read_braced_from_chars(chars);
84
+ }
85
+ _ => {
86
+ // For unknown commands, try to extract and process content
87
+ if let Some(content) = read_braced_from_chars(chars) {
88
+ let processed = process_line(&content);
89
+ result.push_str(&processed);
90
+ }
91
+ }
92
+ }
93
+ }