kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,117 @@
1
+ //! Regex patterns for quality detection
2
+ //!
3
+ //! This module contains all regex patterns used for detecting OCR artifacts,
4
+ //! script content, navigation elements, and text structure.
5
+
6
+ use once_cell::sync::Lazy;
7
+ use regex::Regex;
8
+
9
+ // ============================================================================
10
+ // OCR Artifact Patterns
11
+ // ============================================================================
12
+
13
+ /// Detects scattered characters with excessive spacing (e.g., "a b c")
14
+ pub(crate) static SCATTERED_CHARS_PATTERN: Lazy<Regex> = Lazy::new(|| {
15
+ Regex::new(r"\b[a-zA-Z]\s{2,}[a-zA-Z]\s{2,}[a-zA-Z]\b")
16
+ .expect("Scattered chars regex pattern is valid and should compile")
17
+ });
18
+
19
+ /// Detects repeated punctuation marks (3 or more dots or underscores)
20
+ pub(crate) static REPEATED_PUNCT_PATTERN: Lazy<Regex> = Lazy::new(|| {
21
+ Regex::new(r"[.]{3,}|[_]{3,}").expect("Repeated punctuation regex pattern is valid and should compile")
22
+ });
23
+
24
+ /// Detects repeated dashes (3 or more)
25
+ pub(crate) static DASH_PATTERN: Lazy<Regex> =
26
+ Lazy::new(|| Regex::new(r"[-]{3,}").expect("Dash pattern regex is valid and should compile"));
27
+
28
+ /// Detects isolated punctuation surrounded by spaces
29
+ pub(crate) static ISOLATED_PUNCT_PATTERN: Lazy<Regex> =
30
+ Lazy::new(|| Regex::new(r"\s[.,;:!?]\s").expect("Isolated punctuation regex pattern is valid and should compile"));
31
+
32
+ /// Detects malformed words with mixed letters and numbers
33
+ pub(crate) static MALFORMED_WORDS_PATTERN: Lazy<Regex> = Lazy::new(|| {
34
+ Regex::new(r"\b[a-zA-Z]+[0-9]+[a-zA-Z]+[a-zA-Z0-9]*\b")
35
+ .expect("Malformed words regex pattern is valid and should compile")
36
+ });
37
+
38
+ /// Detects excessive whitespace (3 or more spaces)
39
+ pub(crate) static EXCESSIVE_WHITESPACE_PATTERN: Lazy<Regex> =
40
+ Lazy::new(|| Regex::new(r"\s{3,}").expect("Excessive whitespace regex pattern is valid and should compile"));
41
+
42
+ // ============================================================================
43
+ // Script and Code Patterns
44
+ // ============================================================================
45
+
46
+ /// Detects JavaScript function declarations
47
+ pub(crate) static JS_FUNCTION_PATTERN: Lazy<Regex> = Lazy::new(|| {
48
+ Regex::new(r"(?i)function\s+\w+\s*\([^)]*\)\s*\{[^}]*\}")
49
+ .expect("JavaScript function regex pattern is valid and should compile")
50
+ });
51
+
52
+ /// Detects CSS rules
53
+ pub(crate) static CSS_RULES_PATTERN: Lazy<Regex> = Lazy::new(|| {
54
+ Regex::new(r"(?i)\.[a-zA-Z][\w-]*\s*\{[^}]*\}").expect("CSS rules regex pattern is valid and should compile")
55
+ });
56
+
57
+ /// Detects HTML script tags
58
+ pub(crate) static SCRIPT_TAG_PATTERN: Lazy<Regex> = Lazy::new(|| {
59
+ Regex::new(r"(?is)<script[^>]*>.*?</script>").expect("Script tag regex pattern is valid and should compile")
60
+ });
61
+
62
+ /// Detects HTML style tags
63
+ pub(crate) static STYLE_TAG_PATTERN: Lazy<Regex> = Lazy::new(|| {
64
+ Regex::new(r"(?is)<style[^>]*>.*?</style>").expect("Style tag regex pattern is valid and should compile")
65
+ });
66
+
67
+ // ============================================================================
68
+ // Navigation Element Patterns
69
+ // ============================================================================
70
+
71
+ /// Detects common navigation words and phrases
72
+ pub(crate) static NAV_WORDS_PATTERN: Lazy<Regex> = Lazy::new(|| {
73
+ Regex::new(r"(?i)\b(?:Skip to main content|Back to top|Main navigation|Site navigation)\b")
74
+ .expect("Navigation words regex pattern is valid and should compile")
75
+ });
76
+
77
+ /// Detects breadcrumb navigation patterns
78
+ pub(crate) static BREADCRUMB_PATTERN: Lazy<Regex> = Lazy::new(|| {
79
+ Regex::new(r"(?:Home\s*[>»]\s*|[>»]\s*){2,}").expect("Breadcrumb regex pattern is valid and should compile")
80
+ });
81
+
82
+ /// Detects pagination text
83
+ pub(crate) static PAGINATION_PATTERN: Lazy<Regex> = Lazy::new(|| {
84
+ Regex::new(r"(?i)\b(?:Page \d+ of \d+|First page|Last page|Previous page|Next page|^\d+ of \d+$)\b")
85
+ .expect("Pagination regex pattern is valid and should compile")
86
+ });
87
+
88
+ // ============================================================================
89
+ // Text Structure Patterns
90
+ // ============================================================================
91
+
92
+ /// Detects sentence boundaries
93
+ pub(crate) static SENTENCE_DETECT: Lazy<Regex> =
94
+ Lazy::new(|| Regex::new(r"[.!?]\s+[A-Z]").expect("Sentence detection regex pattern is valid and should compile"));
95
+
96
+ /// Detects punctuation marks
97
+ pub(crate) static PUNCTUATION_DETECT: Lazy<Regex> =
98
+ Lazy::new(|| Regex::new(r"[.!?]").expect("Punctuation detection regex pattern is valid and should compile"));
99
+
100
+ // ============================================================================
101
+ // Whitespace Normalization Patterns
102
+ // ============================================================================
103
+
104
+ /// Normalizes various types of whitespace characters
105
+ pub(crate) static WHITESPACE_NORMALIZE: Lazy<Regex> = Lazy::new(|| {
106
+ Regex::new(r"[ \t\f\v\r\xa0\u{2000}-\u{200b}\u{2028}\u{2029}\u{3000}]+")
107
+ .expect("Whitespace normalization regex pattern is valid and should compile")
108
+ });
109
+
110
+ /// Normalizes multiple consecutive newlines
111
+ pub(crate) static NEWLINE_NORMALIZE: Lazy<Regex> = Lazy::new(|| {
112
+ Regex::new(r"\n\s*\n\s*\n+").expect("Newline normalization regex pattern is valid and should compile")
113
+ });
114
+
115
+ /// Cleans up newline sequences
116
+ pub(crate) static NEWLINE_CLEANUP: Lazy<Regex> =
117
+ Lazy::new(|| Regex::new(r"\n+").expect("Newline cleanup regex pattern is valid and should compile"));
@@ -0,0 +1,178 @@
1
+ //! Quality scoring functions
2
+ //!
3
+ //! This module provides functions to calculate quality scores and penalties
4
+ //! based on various text characteristics.
5
+
6
+ use super::patterns::*;
7
+ use ahash::AHashMap;
8
+ use regex::Regex;
9
+
10
+ // ============================================================================
11
+ // Scoring Constants and Weights
12
+ // ============================================================================
13
+
14
/// Factor applied to the OCR-artifact penalty before it is subtracted from the score.
pub(crate) const OCR_PENALTY_WEIGHT: f64 = 0.3;

/// Factor applied to the script/code penalty before it is subtracted from the score.
pub(crate) const SCRIPT_PENALTY_WEIGHT: f64 = 0.2;

/// Factor applied to the navigation penalty before it is subtracted from the score.
pub(crate) const NAV_PENALTY_WEIGHT: f64 = 0.1;

/// Factor applied to the structure bonus before it is added to the score.
pub(crate) const STRUCTURE_BONUS_WEIGHT: f64 = 0.2;

/// Factor applied to the metadata bonus before it is added to the score.
pub(crate) const METADATA_BONUS_WEIGHT: f64 = 0.1;

/// Texts shorter than this many bytes receive a fixed low score in
/// `calculate_quality_score`.
pub(crate) const MIN_TEXT_LENGTH: usize = 10;

/// Texts longer than this many bytes additionally receive the script and
/// navigation penalties in `calculate_quality_score`.
pub(crate) const LARGE_TEXT_LENGTH: usize = 1000;
22
+
23
+ // ============================================================================
24
+ // Helper Functions
25
+ // ============================================================================
26
+
27
+ /// Sums the total length of all regex matches in the text
28
+ #[inline]
29
+ pub(crate) fn sum_match_lengths(text: &str, pattern: &Regex) -> usize {
30
+ pattern.find_iter(text).map(|m| m.len()).sum()
31
+ }
32
+
33
+ // ============================================================================
34
+ // Penalty Calculation Functions
35
+ // ============================================================================
36
+
37
+ /// Calculate penalty based on OCR artifacts in the text
38
+ #[inline]
39
+ pub(crate) fn calculate_ocr_penalty(text: &str, total_chars: f64) -> f64 {
40
+ if total_chars == 0.0 {
41
+ return 0.0;
42
+ }
43
+
44
+ if !text.contains(" ") && !text.contains("...") {
45
+ return 0.0;
46
+ }
47
+
48
+ let artifact_chars = sum_match_lengths(text, &SCATTERED_CHARS_PATTERN)
49
+ + sum_match_lengths(text, &REPEATED_PUNCT_PATTERN)
50
+ + count_non_table_dash_artifacts(text)
51
+ + sum_match_lengths(text, &ISOLATED_PUNCT_PATTERN)
52
+ + sum_match_lengths(text, &MALFORMED_WORDS_PATTERN)
53
+ + sum_match_lengths(text, &EXCESSIVE_WHITESPACE_PATTERN);
54
+
55
+ (artifact_chars as f64 / total_chars).min(1.0)
56
+ }
57
+
58
+ /// Count dash artifacts while preserving table separators
59
+ #[inline]
60
+ pub(crate) fn count_non_table_dash_artifacts(text: &str) -> usize {
61
+ let mut artifact_count = 0;
62
+
63
+ for line in text.lines() {
64
+ let trimmed = line.trim();
65
+ let is_table_separator = trimmed.starts_with('|')
66
+ && trimmed.ends_with('|')
67
+ && trimmed
68
+ .chars()
69
+ .all(|c| c == '|' || c == '-' || c.is_whitespace() || c == ':');
70
+
71
+ if !is_table_separator {
72
+ for m in DASH_PATTERN.find_iter(line) {
73
+ artifact_count += m.len();
74
+ }
75
+ }
76
+ }
77
+
78
+ artifact_count
79
+ }
80
+
81
+ /// Calculate penalty based on embedded scripts and code
82
+ #[inline]
83
+ pub(crate) fn calculate_script_penalty(text: &str, total_chars: f64) -> f64 {
84
+ if total_chars == 0.0 {
85
+ return 0.0;
86
+ }
87
+
88
+ if !text.contains("function") && !text.contains("<script") && !text.contains("<style") {
89
+ return 0.0;
90
+ }
91
+
92
+ let script_chars = sum_match_lengths(text, &JS_FUNCTION_PATTERN)
93
+ + sum_match_lengths(text, &CSS_RULES_PATTERN)
94
+ + sum_match_lengths(text, &SCRIPT_TAG_PATTERN)
95
+ + sum_match_lengths(text, &STYLE_TAG_PATTERN);
96
+
97
+ (script_chars as f64 / total_chars).min(1.0)
98
+ }
99
+
100
+ /// Calculate penalty based on navigation elements
101
+ #[inline]
102
+ pub(crate) fn calculate_navigation_penalty(text: &str, total_chars: f64) -> f64 {
103
+ if total_chars == 0.0 {
104
+ return 0.0;
105
+ }
106
+
107
+ let nav_chars = sum_match_lengths(text, &NAV_WORDS_PATTERN)
108
+ + sum_match_lengths(text, &BREADCRUMB_PATTERN)
109
+ + sum_match_lengths(text, &PAGINATION_PATTERN);
110
+
111
+ (nav_chars as f64 / total_chars).min(1.0)
112
+ }
113
+
114
+ // ============================================================================
115
+ // Bonus Calculation Functions
116
+ // ============================================================================
117
+
118
+ /// Calculate bonus based on document metadata quality
119
+ #[inline]
120
+ pub(crate) fn calculate_metadata_bonus(metadata: &AHashMap<String, String>) -> f64 {
121
+ const IMPORTANT_FIELDS: &[&str] = &["title", "author", "subject", "description", "keywords"];
122
+
123
+ let present_fields = IMPORTANT_FIELDS
124
+ .iter()
125
+ .filter(|&&field| metadata.contains_key(field))
126
+ .count();
127
+
128
+ present_fields as f64 / IMPORTANT_FIELDS.len() as f64
129
+ }
130
+
131
+ /// Compute a heuristic score (0.0–1.0) describing how clean the extracted text is.
132
+ ///
133
+ /// The scoring pipeline rewards well-structured prose while penalising OCR artefacts,
134
+ /// embedded scripts, and navigation chrome. Supplying document metadata allows the
135
+ /// function to include contextual bonuses.
136
+ ///
137
+ /// ```rust
138
+ /// use ahash::AHashMap;
139
+ /// use kreuzberg::utils::quality::calculate_quality_score;
140
+ ///
141
+ /// let text = "Executive Summary\n===================\nKreuzberg extracts documents quickly.";
142
+ /// let score = calculate_quality_score(text, None);
143
+ /// assert!(score > 0.7);
144
+ /// ```
145
+ pub fn calculate_quality_score(text: &str, metadata: Option<&AHashMap<String, String>>) -> f64 {
146
+ if text.is_empty() || text.trim().is_empty() {
147
+ return 0.0;
148
+ }
149
+
150
+ let total_chars = text.len() as f64;
151
+
152
+ if text.len() < MIN_TEXT_LENGTH {
153
+ return 0.1;
154
+ }
155
+
156
+ let mut score = 1.0;
157
+
158
+ if text.len() > LARGE_TEXT_LENGTH {
159
+ let ocr_penalty = calculate_ocr_penalty(text, total_chars);
160
+ let script_penalty = calculate_script_penalty(text, total_chars);
161
+ let nav_penalty = calculate_navigation_penalty(text, total_chars);
162
+ let structure_bonus = super::heuristics::calculate_structure_bonus(text);
163
+
164
+ score -= ocr_penalty * OCR_PENALTY_WEIGHT;
165
+ score -= script_penalty * SCRIPT_PENALTY_WEIGHT;
166
+ score -= nav_penalty * NAV_PENALTY_WEIGHT;
167
+ score += structure_bonus * STRUCTURE_BONUS_WEIGHT;
168
+ } else {
169
+ score -= calculate_ocr_penalty(text, total_chars) * OCR_PENALTY_WEIGHT;
170
+ score += super::heuristics::calculate_structure_bonus(text) * STRUCTURE_BONUS_WEIGHT;
171
+ }
172
+
173
+ if let Some(metadata) = metadata {
174
+ score += calculate_metadata_bonus(metadata) * METADATA_BONUS_WEIGHT;
175
+ }
176
+
177
+ score.clamp(0.0, 1.0)
178
+ }
@@ -0,0 +1,325 @@
1
+ //! Thread-safe reusable string buffer pool for reducing allocations.
2
+ //!
3
+ //! This module provides a pool of reusable String buffers that can be acquired,
4
+ //! used, and automatically returned to the pool when dropped.
5
+
6
+ use once_cell::sync::Lazy;
7
+ use std::collections::VecDeque;
8
+ use std::sync::Arc;
9
+
10
+ #[cfg(feature = "pool-metrics")]
11
+ use std::sync::atomic::AtomicUsize;
12
+
13
+ #[cfg(feature = "pool-metrics")]
14
+ use std::sync::atomic::Ordering;
15
+
16
/// Configuration for the string buffer pool.
///
/// All limits are expressed in bytes of `String` capacity, except
/// `max_buffers_per_size`, which counts buffers.
#[derive(Debug, Clone)]
pub struct PoolConfig {
    /// Maximum number of idle buffers kept in each size bucket; buffers released
    /// into a bucket that is already at this limit are dropped instead.
    pub max_buffers_per_size: usize,
    /// Capacity requested when the pool has to allocate a fresh buffer. It also
    /// determines which bucket `acquire` checks first.
    pub initial_capacity: usize,
    /// Released buffers whose capacity exceeds this value are discarded rather
    /// than returned to the pool.
    pub max_capacity_before_discard: usize,
}
25
+
26
+ impl Default for PoolConfig {
27
+ fn default() -> Self {
28
+ Self {
29
+ max_buffers_per_size: 4,
30
+ initial_capacity: 4096,
31
+ max_capacity_before_discard: 65536,
32
+ }
33
+ }
34
+ }
35
+
36
+ /// Thread-safe reusable string buffer pool.
37
+ ///
38
+ /// This pool allows allocation and reuse of String buffers to reduce memory allocations
39
+ /// during document extraction. Buffers are returned to the pool with cleared contents
40
+ /// but preserved capacity, ready for reuse.
41
+ ///
42
+ /// # Thread Safety
43
+ ///
44
+ /// The pool uses DashMap for lock-free concurrent access. Multiple threads can
45
+ /// acquire and release buffers simultaneously.
46
+ ///
47
+ /// # Usage
48
+ ///
49
+ /// ```rust,ignore
50
+ /// use kreuzberg::utils::string_pool::STRING_BUFFER_POOL;
51
+ ///
52
+ /// // Acquire a buffer from the pool
53
+ /// let mut buffer = STRING_BUFFER_POOL.acquire();
54
+ /// buffer.push_str("some content");
55
+ /// // Automatically returned to pool when dropped
56
+ /// drop(buffer);
57
+ /// ```
58
+ pub struct StringBufferPool {
59
+ pool: dashmap::DashMap<usize, VecDeque<String>>,
60
+ config: PoolConfig,
61
+ #[cfg(feature = "pool-metrics")]
62
+ acquire_count: AtomicUsize,
63
+ #[cfg(feature = "pool-metrics")]
64
+ reuse_count: AtomicUsize,
65
+ }
66
+
67
+ impl StringBufferPool {
68
+ /// Create a new string buffer pool with given configuration.
69
+ pub fn new(config: PoolConfig) -> Self {
70
+ StringBufferPool {
71
+ pool: dashmap::DashMap::new(),
72
+ config,
73
+ #[cfg(feature = "pool-metrics")]
74
+ acquire_count: AtomicUsize::new(0),
75
+ #[cfg(feature = "pool-metrics")]
76
+ reuse_count: AtomicUsize::new(0),
77
+ }
78
+ }
79
+
80
+ /// Find the appropriate bucket size for a given capacity.
81
+ fn find_bucket(&self, capacity: usize) -> usize {
82
+ if capacity <= 1024 {
83
+ 1024
84
+ } else if capacity <= 4096 {
85
+ 4096
86
+ } else if capacity <= 16384 {
87
+ 16384
88
+ } else if capacity <= 65536 {
89
+ 65536
90
+ } else {
91
+ 262144
92
+ }
93
+ }
94
+
95
+ /// Try to acquire a buffer from a specific bucket, returning it if found.
96
+ fn try_acquire_from_bucket(&self, bucket: usize) -> Option<String> {
97
+ if let Some(mut entry) = self.pool.get_mut(&bucket) {
98
+ entry.pop_front()
99
+ } else {
100
+ None
101
+ }
102
+ }
103
+
104
+ /// Acquire a string buffer from the pool, or allocate a new one if pool is exhausted.
105
+ ///
106
+ /// The returned buffer is automatically returned to the pool when dropped.
107
+ /// Must be called with the pool wrapped in Arc.
108
+ pub fn acquire(self: Arc<Self>) -> PooledString {
109
+ #[cfg(feature = "pool-metrics")]
110
+ self.acquire_count.fetch_add(1, Ordering::Relaxed);
111
+
112
+ let default_bucket = self.config.initial_capacity;
113
+ if let Some(buffer) = self.try_acquire_from_bucket(default_bucket) {
114
+ #[cfg(feature = "pool-metrics")]
115
+ self.reuse_count.fetch_add(1, Ordering::Relaxed);
116
+ return PooledString { buffer, pool: self };
117
+ }
118
+
119
+ for &bucket in &[1024, 16384, 65536] {
120
+ if let Some(buffer) = self.try_acquire_from_bucket(bucket) {
121
+ #[cfg(feature = "pool-metrics")]
122
+ self.reuse_count.fetch_add(1, Ordering::Relaxed);
123
+ return PooledString { buffer, pool: self };
124
+ }
125
+ }
126
+
127
+ PooledString {
128
+ buffer: String::with_capacity(self.config.initial_capacity),
129
+ pool: self,
130
+ }
131
+ }
132
+
133
+ /// Return a buffer to the pool for reuse.
134
+ pub fn release(&self, mut buffer: String) {
135
+ if buffer.capacity() > self.config.max_capacity_before_discard {
136
+ return;
137
+ }
138
+
139
+ let bucket = self.find_bucket(buffer.capacity());
140
+ buffer.clear();
141
+
142
+ if let Some(mut queue) = self.pool.get_mut(&bucket) {
143
+ if queue.len() < self.config.max_buffers_per_size {
144
+ queue.push_back(buffer);
145
+ }
146
+ } else {
147
+ let mut queue = VecDeque::with_capacity(self.config.max_buffers_per_size);
148
+ queue.push_back(buffer);
149
+ self.pool.insert(bucket, queue);
150
+ }
151
+ }
152
+
153
+ /// Get the current pool size across all buckets.
154
+ #[allow(dead_code)]
155
+ pub fn size(&self) -> usize {
156
+ self.pool.iter().map(|entry| entry.value().len()).sum()
157
+ }
158
+
159
+ /// Get buffer reuse metrics (only available with `pool-metrics` feature).
160
+ #[cfg(feature = "pool-metrics")]
161
+ pub fn metrics(&self) -> StringBufferPoolMetrics {
162
+ let acquire = self.acquire_count.load(Ordering::Relaxed);
163
+ let reuse = self.reuse_count.load(Ordering::Relaxed);
164
+ let hit_rate = if acquire == 0 {
165
+ 0.0
166
+ } else {
167
+ (reuse as f64 / acquire as f64) * 100.0
168
+ };
169
+
170
+ StringBufferPoolMetrics {
171
+ total_acquires: acquire,
172
+ total_reuses: reuse,
173
+ hit_rate,
174
+ }
175
+ }
176
+ }
177
+
178
/// Snapshot of reuse statistics for a `StringBufferPool`.
///
/// Only compiled when the `pool-metrics` feature is enabled.
#[cfg(feature = "pool-metrics")]
#[derive(Debug, Clone, Copy)]
pub struct StringBufferPoolMetrics {
    /// How many times a buffer was requested from the pool.
    pub total_acquires: usize,
    /// How many of those requests were satisfied by an idle pooled buffer.
    pub total_reuses: usize,
    /// Share of requests served from the pool, as a percentage (0.0-100.0).
    pub hit_rate: f64,
}
189
+
190
+ /// RAII wrapper for a pooled string buffer.
191
+ ///
192
+ /// Automatically returns the buffer to the pool when dropped.
193
+ pub struct PooledString {
194
+ buffer: String,
195
+ pool: Arc<StringBufferPool>,
196
+ }
197
+
198
+ impl PooledString {
199
+ /// Get mutable access to the underlying string buffer.
200
+ pub fn buffer_mut(&mut self) -> &mut String {
201
+ &mut self.buffer
202
+ }
203
+
204
+ /// Get immutable access to the underlying string buffer.
205
+ pub fn as_str(&self) -> &str {
206
+ self.buffer.as_str()
207
+ }
208
+ }
209
+
210
+ impl std::ops::Deref for PooledString {
211
+ type Target = String;
212
+
213
+ fn deref(&self) -> &Self::Target {
214
+ &self.buffer
215
+ }
216
+ }
217
+
218
+ impl std::ops::DerefMut for PooledString {
219
+ fn deref_mut(&mut self) -> &mut Self::Target {
220
+ &mut self.buffer
221
+ }
222
+ }
223
+
224
+ impl Drop for PooledString {
225
+ fn drop(&mut self) {
226
+ let buffer = std::mem::take(&mut self.buffer);
227
+ self.pool.release(buffer);
228
+ }
229
+ }
230
+
231
+ impl std::fmt::Display for PooledString {
232
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
233
+ write!(f, "{}", self.buffer)
234
+ }
235
+ }
236
+
237
+ impl std::fmt::Debug for PooledString {
238
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
239
+ f.debug_tuple("PooledString").field(&self.buffer).finish()
240
+ }
241
+ }
242
+
243
+ /// Global string buffer pool for temporary allocations during extraction.
244
+ pub static STRING_BUFFER_POOL: Lazy<Arc<StringBufferPool>> =
245
+ Lazy::new(|| Arc::new(StringBufferPool::new(PoolConfig::default())));
246
+
247
+ /// Acquire a string buffer from the global pool.
248
+ ///
249
+ /// The returned buffer is automatically returned to the pool when dropped.
250
+ ///
251
+ /// # Example
252
+ ///
253
+ /// ```rust,ignore
254
+ /// let mut buffer = acquire_string_buffer();
255
+ /// buffer.push_str("content");
256
+ /// // Automatically returned to pool when buffer goes out of scope
257
+ /// ```
258
+ pub fn acquire_string_buffer() -> PooledString {
259
+ Arc::clone(&*STRING_BUFFER_POOL).acquire()
260
+ }
261
+
262
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an isolated pool with the default configuration so tests do not
    /// interfere with each other through the global pool.
    fn default_pool() -> Arc<StringBufferPool> {
        Arc::new(StringBufferPool::new(PoolConfig::default()))
    }

    /// A released buffer comes back empty but with its capacity preserved.
    #[test]
    fn test_buffer_pool_acquire_and_release() {
        let pool = default_pool();

        let mut first = Arc::clone(&pool).acquire();
        first.push_str("test content");
        let expected_capacity = first.capacity();
        drop(first);

        let second = Arc::clone(&pool).acquire();
        assert_eq!(second.capacity(), expected_capacity);
        assert!(second.is_empty());
    }

    /// Acquiring and releasing repeatedly recycles one buffer instead of growing
    /// the pool.
    #[test]
    fn test_buffer_pool_size() {
        let pool = default_pool();
        assert_eq!(pool.size(), 0);

        for _ in 0..2 {
            let buffer = Arc::clone(&pool).acquire();
            drop(buffer);
            assert_eq!(pool.size(), 1);
        }
    }

    /// The global pool hands out buffers with at least the default capacity.
    #[test]
    fn test_buffer_pool_global() {
        drop(acquire_string_buffer());

        let buffer2 = acquire_string_buffer();
        assert!(buffer2.capacity() >= 4096);
    }

    /// Read access works through `Deref` and through `as_str`.
    #[test]
    fn test_pooled_string_deref() {
        let mut buffer = acquire_string_buffer();
        buffer.push_str("hello");

        assert_eq!(&*buffer, "hello");
        assert_eq!(buffer.as_str(), "hello");
        assert!(!buffer.is_empty());
    }

    /// Write access works through `DerefMut` and through `buffer_mut`.
    #[test]
    fn test_pooled_string_deref_mut() {
        let mut buffer = acquire_string_buffer();
        buffer.push_str("test");
        buffer.buffer_mut().push_str(" more");

        assert_eq!(buffer.as_str(), "test more");
    }
}