kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,102 @@
1
+ //! Interned string type and trait implementations.
2
+ //!
3
+ //! This module provides the `InternedString` type which wraps an Arc<String>
4
+ //! to enable string deduplication and pointer-based comparisons.
5
+
6
+ use std::sync::Arc;
7
+
8
+ /// A reference to an interned string stored in an Arc.
9
+ ///
10
+ /// This wraps an Arc<String> and provides convenient access to the string content.
11
+ /// Multiple calls with the same string content will share the same Arc, reducing memory usage.
12
+ #[derive(Clone)]
13
+ pub struct InternedString(pub(super) Arc<String>);
14
+
15
+ impl InternedString {
16
+ /// Get the string content.
17
+ pub fn as_str(&self) -> &str {
18
+ self.0.as_str()
19
+ }
20
+ }
21
+
22
+ impl AsRef<str> for InternedString {
23
+ fn as_ref(&self) -> &str {
24
+ self.as_str()
25
+ }
26
+ }
27
+
28
+ impl std::fmt::Display for InternedString {
29
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
30
+ write!(f, "{}", self.as_str())
31
+ }
32
+ }
33
+
34
+ impl std::fmt::Debug for InternedString {
35
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
36
+ f.debug_tuple("InternedString").field(&self.as_str()).finish()
37
+ }
38
+ }
39
+
40
+ impl PartialEq for InternedString {
41
+ fn eq(&self, other: &Self) -> bool {
42
+ Arc::ptr_eq(&self.0, &other.0) || self.as_str() == other.as_str()
43
+ }
44
+ }
45
+
46
+ impl Eq for InternedString {}
47
+
48
+ impl std::hash::Hash for InternedString {
49
+ fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
50
+ self.as_str().hash(state);
51
+ }
52
+ }
53
+
54
+ impl std::ops::Deref for InternedString {
55
+ type Target = str;
56
+
57
+ fn deref(&self) -> &Self::Target {
58
+ self.as_str()
59
+ }
60
+ }
61
+
62
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a handle backed by a brand-new allocation.
    fn fresh(text: &str) -> InternedString {
        InternedString(Arc::new(text.to_string()))
    }

    #[test]
    fn test_interned_string_display() {
        let rendered = fresh("text/html").to_string();
        assert_eq!(rendered, "text/html");
    }

    #[test]
    fn test_interned_string_deref() {
        let handle = fresh("application/json");

        // All three access paths must expose the same text.
        assert_eq!(&*handle, "application/json");
        assert_eq!(handle.as_ref(), "application/json");
        assert_eq!(handle.as_str(), "application/json");
    }

    #[test]
    fn test_interned_string_hash() {
        use std::collections::HashSet;

        let original = fresh("application/pdf");
        let alias = InternedString(Arc::clone(&original.0));

        // Both handles hash and compare equal, so the set keeps only one.
        let unique: HashSet<InternedString> = [original, alias].into_iter().collect();

        assert_eq!(unique.len(), 1);
    }

    #[test]
    fn test_interned_string_clone() {
        let original = fresh("text/html");
        let copy = original.clone();

        assert_eq!(original, copy);
        // Cloning must share the allocation rather than copy the text.
        assert!(Arc::ptr_eq(&original.0, &copy.0));
    }
}
@@ -0,0 +1,119 @@
1
+ //! String pool for language codes with pre-interning of common ISO 639 codes.
2
+ //!
3
+ //! This module provides a thread-safe string pool specifically for language codes,
4
+ //! with lazy initialization of common language codes on first access.
5
+
6
+ use super::interned::InternedString;
7
+ use once_cell::sync::Lazy;
8
+ use std::sync::Arc;
9
+ use std::sync::atomic::{AtomicBool, Ordering};
10
+
11
+ /// String pool for language codes.
12
+ ///
13
+ /// Lazily initializes with common ISO 639 language codes.
14
+ /// Pre-interning is deferred until first access to reduce startup memory usage.
15
+ pub(super) struct LanguageStringPool {
16
+ pool: dashmap::DashMap<String, Arc<String>>,
17
+ initialized: AtomicBool,
18
+ }
19
+
20
+ impl LanguageStringPool {
21
+ /// Create a new language string pool.
22
+ /// Pre-interning is deferred until first `get_or_intern()` call.
23
+ pub(super) fn new() -> Self {
24
+ LanguageStringPool {
25
+ pool: dashmap::DashMap::new(),
26
+ initialized: AtomicBool::new(false),
27
+ }
28
+ }
29
+
30
+ /// Ensure all known language codes are pre-interned (one-time initialization).
31
+ #[inline]
32
+ fn ensure_initialized(&self) {
33
+ if self.initialized.load(Ordering::Acquire) {
34
+ return;
35
+ }
36
+
37
+ let lang_codes = vec![
38
+ "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh", "ar", "hi", "th", "tr", "pl", "nl", "sv", "no",
39
+ "da", "fi", "cs", "hu", "ro", "el", "he", "fa", "ur", "vi", "id", "ms", "bn", "pa", "te", "mr", "ta", "gu",
40
+ "kn", "ml", "or", "uk", "bg", "sr", "hr", "sl", "sk", "et", "lv", "lt", "sq", "mk", "ka", "hy", "eo",
41
+ "ast", "ca", "eu", "gl", "cy", "gd", "ga",
42
+ ];
43
+
44
+ for code in lang_codes {
45
+ self.pool.insert(code.to_string(), Arc::new(code.to_string()));
46
+ }
47
+
48
+ let _ = self
49
+ .initialized
50
+ .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
51
+ }
52
+
53
+ /// Get or intern a language code string.
54
+ /// Ensures pre-interned language codes are initialized on first call.
55
+ pub(super) fn get_or_intern(&self, lang_code: &str) -> Arc<String> {
56
+ self.ensure_initialized();
57
+
58
+ if let Some(entry) = self.pool.get(lang_code) {
59
+ Arc::clone(&*entry)
60
+ } else {
61
+ let arc_string = Arc::new(lang_code.to_string());
62
+ self.pool.insert(lang_code.to_string(), Arc::clone(&arc_string));
63
+ arc_string
64
+ }
65
+ }
66
+ }
67
+
68
+ /// Global language code string pool.
69
+ pub(super) static LANGUAGE_POOL: Lazy<LanguageStringPool> = Lazy::new(LanguageStringPool::new);
70
+
71
+ /// Get or intern a language code string.
72
+ ///
73
+ /// Returns an `InternedString` that is guaranteed to be deduplicated with any other
74
+ /// intern call for the same language code.
75
+ ///
76
+ /// # Arguments
77
+ ///
78
+ /// * `lang_code` - The language code to intern (e.g., "en", "es", "fr")
79
+ ///
80
+ /// # Returns
81
+ ///
82
+ /// An `InternedString` pointing to the deduplicated string
83
+ ///
84
+ /// # Example
85
+ ///
86
+ /// ```rust,ignore
87
+ /// let en1 = intern_language_code("en");
88
+ /// let en2 = intern_language_code("en");
89
+ /// assert_eq!(en1, en2); // Same pointer
90
+ /// ```
91
+ pub fn intern_language_code(lang_code: &str) -> InternedString {
92
+ InternedString(LANGUAGE_POOL.get_or_intern(lang_code))
93
+ }
94
+
95
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_language_code_deduplication() {
        let first = intern_language_code("en");
        let second = intern_language_code("en");

        // Same text, and backed by the very same allocation.
        assert_eq!(first, second);
        assert!(Arc::ptr_eq(&first.0, &second.0));
    }

    #[test]
    fn test_preinterned_language_codes() {
        // Each code must round-trip through the pool unchanged.
        for code in ["en", "es", "fr"] {
            let interned = intern_language_code(code);
            assert_eq!(interned.as_str(), code);
        }
    }
}
@@ -0,0 +1,235 @@
1
+ //! String pool for MIME types with pre-interning of common types.
2
+ //!
3
+ //! This module provides a thread-safe string pool specifically for MIME types,
4
+ //! with lazy initialization of common MIME types on first access.
5
+
6
+ use super::interned::InternedString;
7
+ use once_cell::sync::Lazy;
8
+ use std::sync::Arc;
9
+ use std::sync::atomic::{AtomicBool, Ordering};
10
+
11
+ /// String pool for MIME types.
12
+ ///
13
+ /// Lazily initializes with all known MIME types from `kreuzberg::core::mime`.
14
+ /// Pre-interning is deferred until first access to reduce startup memory usage.
15
+ pub(super) struct MimeStringPool {
16
+ pool: dashmap::DashMap<String, Arc<String>>,
17
+ initialized: AtomicBool,
18
+ }
19
+
20
+ impl MimeStringPool {
21
+ /// Create a new MIME string pool.
22
+ /// Pre-interning is deferred until first `get_or_intern()` call.
23
+ pub(super) fn new() -> Self {
24
+ MimeStringPool {
25
+ pool: dashmap::DashMap::new(),
26
+ initialized: AtomicBool::new(false),
27
+ }
28
+ }
29
+
30
+ /// Ensure all known MIME types are pre-interned (one-time initialization).
31
+ #[inline]
32
+ fn ensure_initialized(&self) {
33
+ if self.initialized.load(Ordering::Acquire) {
34
+ return;
35
+ }
36
+
37
+ let mime_types = vec![
38
+ "text/html",
39
+ "text/markdown",
40
+ "text/x-markdown",
41
+ "text/plain",
42
+ "application/pdf",
43
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
44
+ "application/msword",
45
+ "application/vnd.ms-powerpoint",
46
+ "message/rfc822",
47
+ "application/vnd.ms-outlook",
48
+ "application/json",
49
+ "text/json",
50
+ "application/x-yaml",
51
+ "text/yaml",
52
+ "text/x-yaml",
53
+ "application/yaml",
54
+ "application/toml",
55
+ "text/toml",
56
+ "application/xml",
57
+ "text/xml",
58
+ "image/svg+xml",
59
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
60
+ "application/vnd.ms-excel",
61
+ "application/vnd.ms-excel.sheet.macroEnabled.12",
62
+ "application/vnd.ms-excel.sheet.binary.macroEnabled.12",
63
+ "application/vnd.ms-excel.addin.macroEnabled.12",
64
+ "application/vnd.ms-excel.template.macroEnabled.12",
65
+ "application/vnd.oasis.opendocument.spreadsheet",
66
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
67
+ "application/vnd.oasis.opendocument.text",
68
+ "image/bmp",
69
+ "image/gif",
70
+ "image/jp2",
71
+ "image/jpeg",
72
+ "image/jpm",
73
+ "image/jpx",
74
+ "image/mj2",
75
+ "image/pjpeg",
76
+ "image/png",
77
+ "image/tiff",
78
+ "image/webp",
79
+ "image/x-bmp",
80
+ "image/x-ms-bmp",
81
+ "image/x-portable-anymap",
82
+ "image/x-portable-bitmap",
83
+ "image/x-portable-graymap",
84
+ "image/x-portable-pixmap",
85
+ "image/x-tiff",
86
+ "application/csl+json",
87
+ "application/docbook+xml",
88
+ "application/epub+zip",
89
+ "application/rtf",
90
+ "application/x-biblatex",
91
+ "application/x-bibtex",
92
+ "application/x-endnote+xml",
93
+ "application/x-fictionbook+xml",
94
+ "application/x-ipynb+json",
95
+ "application/x-jats+xml",
96
+ "application/x-latex",
97
+ "application/xml+opml",
98
+ "application/x-opml+xml",
99
+ "application/x-research-info-systems",
100
+ "application/x-typst",
101
+ "text/csv",
102
+ "text/tab-separated-values",
103
+ "text/troff",
104
+ "text/x-commonmark",
105
+ "text/x-dokuwiki",
106
+ "text/x-gfm",
107
+ "text/x-markdown-extra",
108
+ "text/x-mdoc",
109
+ "text/x-multimarkdown",
110
+ "text/x-opml",
111
+ "text/x-org",
112
+ "text/x-pod",
113
+ "text/x-rst",
114
+ "application/zip",
115
+ "application/x-zip-compressed",
116
+ "application/x-tar",
117
+ "application/tar",
118
+ "application/x-gtar",
119
+ "application/x-ustar",
120
+ "application/gzip",
121
+ "application/x-7z-compressed",
122
+ ];
123
+
124
+ for mime_type in mime_types {
125
+ self.pool.insert(mime_type.to_string(), Arc::new(mime_type.to_string()));
126
+ }
127
+
128
+ let _ = self
129
+ .initialized
130
+ .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
131
+ }
132
+
133
+ /// Get or intern a MIME type string.
134
+ /// Ensures pre-interned MIME types are initialized on first call.
135
+ pub(super) fn get_or_intern(&self, mime_type: &str) -> Arc<String> {
136
+ self.ensure_initialized();
137
+
138
+ if let Some(entry) = self.pool.get(mime_type) {
139
+ Arc::clone(&*entry)
140
+ } else {
141
+ let arc_string = Arc::new(mime_type.to_string());
142
+ self.pool.insert(mime_type.to_string(), Arc::clone(&arc_string));
143
+ arc_string
144
+ }
145
+ }
146
+ }
147
+
148
+ /// Global MIME type string pool.
149
+ pub(super) static MIME_POOL: Lazy<MimeStringPool> = Lazy::new(MimeStringPool::new);
150
+
151
+ /// Get or intern a MIME type string.
152
+ ///
153
+ /// Returns an `InternedString` that is guaranteed to be deduplicated with any other
154
+ /// intern call for the same MIME type. This reduces memory usage and allows
155
+ /// fast pointer-based comparisons.
156
+ ///
157
+ /// # Arguments
158
+ ///
159
+ /// * `mime_type` - The MIME type string to intern
160
+ ///
161
+ /// # Returns
162
+ ///
163
+ /// An `InternedString` pointing to the deduplicated string
164
+ ///
165
+ /// # Example
166
+ ///
167
+ /// ```rust,ignore
168
+ /// let pdf1 = intern_mime_type("application/pdf");
169
+ /// let pdf2 = intern_mime_type("application/pdf");
170
+ /// assert_eq!(pdf1, pdf2); // Same pointer
171
+ /// ```
172
+ pub fn intern_mime_type(mime_type: &str) -> InternedString {
173
+ InternedString(MIME_POOL.get_or_intern(mime_type))
174
+ }
175
+
176
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_mime_type_deduplication() {
        let first = intern_mime_type("application/pdf");
        let second = intern_mime_type("application/pdf");

        // Same text, and backed by the very same allocation.
        assert_eq!(first, second);
        assert!(Arc::ptr_eq(&first.0, &second.0));
    }

    #[test]
    fn test_preinterned_mime_types() {
        // Each MIME type must round-trip through the pool unchanged.
        for mime in ["application/pdf", "text/html", "application/json"] {
            let interned = intern_mime_type(mime);
            assert_eq!(interned.as_str(), mime);
        }
    }

    #[test]
    #[ignore = "Flaky test - concurrent interning may not always share the same Arc"]
    fn test_concurrent_interning() {
        use std::thread;

        let mime = "application/pdf";

        // Intern the same MIME type from ten threads at once.
        let workers: Vec<_> = (0..10)
            .map(|_| thread::spawn(move || intern_mime_type(mime)))
            .collect();

        // Each worker hands its handle back through its join result.
        let interned_strings: Vec<InternedString> = workers
            .into_iter()
            .map(|worker| worker.join().unwrap())
            .collect();

        assert_eq!(interned_strings.len(), 10);

        let first_arc = &interned_strings[0].0;
        for interned in &interned_strings {
            assert!(
                Arc::ptr_eq(&interned.0, first_arc),
                "All interned strings should share the same Arc"
            );
        }
    }
}
@@ -0,0 +1,41 @@
1
+ //! String interning/pooling for frequently used strings.
2
+ //!
3
+ //! This module provides thread-safe string interning to reduce memory allocations
4
+ //! for strings that appear repeatedly across documents (MIME types, language codes, format field names).
5
+ //!
6
+ //! # Performance
7
+ //!
8
+ //! String interning provides 0.1-0.3% improvement by:
9
+ //! - Deduplicating repeated strings (e.g., "application/pdf" appears 1000s of times)
10
+ //! - Reducing allocation overhead for commonly used strings
11
+ //! - Enabling pointer comparisons instead of string comparisons
12
+ //!
13
+ //! # Thread Safety
14
+ //!
15
+ //! The intern pools use a `DashMap`, a sharded concurrent hash map, so multiple threads
16
+ //! can insert and look up strings simultaneously without taking a single global lock.
17
+ //!
18
+ //! # Example
19
+ //!
20
+ //! ```rust,ignore
21
+ //! use kreuzberg::utils::string_pool::intern_mime_type;
22
+ //!
23
+ //! let mime1 = intern_mime_type("application/pdf");
24
+ //! let mime2 = intern_mime_type("application/pdf");
25
+ //! // Both mime1 and mime2 point to the same interned string
26
+ //! assert_eq!(mime1, mime2);
27
+ //! ```
28
+
29
+ mod buffer_pool;
30
+ mod interned;
31
+ mod language_pool;
32
+ mod mime_pool;
33
+
34
+ // Re-export public types and functions
35
+ pub use buffer_pool::{PoolConfig, PooledString, STRING_BUFFER_POOL, StringBufferPool, acquire_string_buffer};
36
+ pub use interned::InternedString;
37
+ pub use language_pool::intern_language_code;
38
+ pub use mime_pool::intern_mime_type;
39
+
40
+ #[cfg(feature = "pool-metrics")]
41
+ pub use buffer_pool::StringBufferPoolMetrics;