kreuzberg 4.0.7 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +24 -16
  3. data/README.md +4 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +26 -353
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
@@ -1,762 +0,0 @@
1
- //! String interning/pooling for frequently used strings.
2
- //!
3
- //! This module provides thread-safe string interning to reduce memory allocations
4
- //! for strings that appear repeatedly across documents (MIME types, language codes, format field names).
5
- //!
6
- //! # Performance
7
- //!
8
- //! String interning provides 0.1-0.3% improvement by:
9
- //! - Deduplicating repeated strings (e.g., "application/pdf" appears 1000s of times)
10
- //! - Reducing allocation overhead for commonly used strings
11
- //! - Enabling pointer comparisons instead of string comparisons
12
- //!
13
- //! # Thread Safety
14
- //!
15
- //! The intern pool uses a `DashMap` for lock-free concurrent access. Multiple threads
16
- //! can insert and lookup strings simultaneously without contention.
17
- //!
18
- //! # Example
19
- //!
20
- //! ```rust,ignore
21
- //! use kreuzberg::utils::string_pool::intern_mime_type;
22
- //!
23
- //! let mime1 = intern_mime_type("application/pdf");
24
- //! let mime2 = intern_mime_type("application/pdf");
25
- //! // Both mime1 and mime2 point to the same interned string
26
- //! assert_eq!(mime1, mime2);
27
- //! ```
28
-
29
- use once_cell::sync::Lazy;
30
- use std::collections::VecDeque;
31
- use std::sync::Arc;
32
- use std::sync::atomic::{AtomicBool, Ordering};
33
-
34
- #[cfg(feature = "pool-metrics")]
35
- use std::sync::atomic::AtomicUsize;
36
-
37
- /// A reference to an interned string stored in an Arc.
38
- ///
39
- /// This wraps an Arc<String> and provides convenient access to the string content.
40
- /// Multiple calls with the same string content will share the same Arc, reducing memory usage.
41
- #[derive(Clone)]
42
- pub struct InternedString(Arc<String>);
43
-
44
- impl InternedString {
45
- /// Get the string content.
46
- pub fn as_str(&self) -> &str {
47
- self.0.as_str()
48
- }
49
- }
50
-
51
- impl AsRef<str> for InternedString {
52
- fn as_ref(&self) -> &str {
53
- self.as_str()
54
- }
55
- }
56
-
57
- impl std::fmt::Display for InternedString {
58
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
59
- write!(f, "{}", self.as_str())
60
- }
61
- }
62
-
63
- impl std::fmt::Debug for InternedString {
64
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
65
- f.debug_tuple("InternedString").field(&self.as_str()).finish()
66
- }
67
- }
68
-
69
- impl PartialEq for InternedString {
70
- fn eq(&self, other: &Self) -> bool {
71
- Arc::ptr_eq(&self.0, &other.0) || self.as_str() == other.as_str()
72
- }
73
- }
74
-
75
- impl Eq for InternedString {}
76
-
77
- impl std::hash::Hash for InternedString {
78
- fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
79
- self.as_str().hash(state);
80
- }
81
- }
82
-
83
- impl std::ops::Deref for InternedString {
84
- type Target = str;
85
-
86
- fn deref(&self) -> &Self::Target {
87
- self.as_str()
88
- }
89
- }
90
-
91
- /// String pool for MIME types.
92
- ///
93
- /// Lazily initializes with all known MIME types from `kreuzberg::core::mime`.
94
- /// Pre-interning is deferred until first access to reduce startup memory usage.
95
- struct MimeStringPool {
96
- pool: dashmap::DashMap<String, Arc<String>>,
97
- initialized: AtomicBool,
98
- }
99
-
100
- impl MimeStringPool {
101
- /// Create a new MIME string pool.
102
- /// Pre-interning is deferred until first `get_or_intern()` call.
103
- fn new() -> Self {
104
- MimeStringPool {
105
- pool: dashmap::DashMap::new(),
106
- initialized: AtomicBool::new(false),
107
- }
108
- }
109
-
110
- /// Ensure all known MIME types are pre-interned (one-time initialization).
111
- #[inline]
112
- fn ensure_initialized(&self) {
113
- if self.initialized.load(Ordering::Acquire) {
114
- return;
115
- }
116
-
117
- let mime_types = vec![
118
- "text/html",
119
- "text/markdown",
120
- "text/x-markdown",
121
- "text/plain",
122
- "application/pdf",
123
- "application/vnd.openxmlformats-officedocument.presentationml.presentation",
124
- "application/msword",
125
- "application/vnd.ms-powerpoint",
126
- "message/rfc822",
127
- "application/vnd.ms-outlook",
128
- "application/json",
129
- "text/json",
130
- "application/x-yaml",
131
- "text/yaml",
132
- "text/x-yaml",
133
- "application/yaml",
134
- "application/toml",
135
- "text/toml",
136
- "application/xml",
137
- "text/xml",
138
- "image/svg+xml",
139
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
140
- "application/vnd.ms-excel",
141
- "application/vnd.ms-excel.sheet.macroEnabled.12",
142
- "application/vnd.ms-excel.sheet.binary.macroEnabled.12",
143
- "application/vnd.ms-excel.addin.macroEnabled.12",
144
- "application/vnd.ms-excel.template.macroEnabled.12",
145
- "application/vnd.oasis.opendocument.spreadsheet",
146
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
147
- "application/vnd.oasis.opendocument.text",
148
- "image/bmp",
149
- "image/gif",
150
- "image/jp2",
151
- "image/jpeg",
152
- "image/jpm",
153
- "image/jpx",
154
- "image/mj2",
155
- "image/pjpeg",
156
- "image/png",
157
- "image/tiff",
158
- "image/webp",
159
- "image/x-bmp",
160
- "image/x-ms-bmp",
161
- "image/x-portable-anymap",
162
- "image/x-portable-bitmap",
163
- "image/x-portable-graymap",
164
- "image/x-portable-pixmap",
165
- "image/x-tiff",
166
- "application/csl+json",
167
- "application/docbook+xml",
168
- "application/epub+zip",
169
- "application/rtf",
170
- "application/x-biblatex",
171
- "application/x-bibtex",
172
- "application/x-endnote+xml",
173
- "application/x-fictionbook+xml",
174
- "application/x-ipynb+json",
175
- "application/x-jats+xml",
176
- "application/x-latex",
177
- "application/xml+opml",
178
- "application/x-opml+xml",
179
- "application/x-research-info-systems",
180
- "application/x-typst",
181
- "text/csv",
182
- "text/tab-separated-values",
183
- "text/troff",
184
- "text/x-commonmark",
185
- "text/x-dokuwiki",
186
- "text/x-gfm",
187
- "text/x-markdown-extra",
188
- "text/x-mdoc",
189
- "text/x-multimarkdown",
190
- "text/x-opml",
191
- "text/x-org",
192
- "text/x-pod",
193
- "text/x-rst",
194
- "application/zip",
195
- "application/x-zip-compressed",
196
- "application/x-tar",
197
- "application/tar",
198
- "application/x-gtar",
199
- "application/x-ustar",
200
- "application/gzip",
201
- "application/x-7z-compressed",
202
- ];
203
-
204
- for mime_type in mime_types {
205
- self.pool.insert(mime_type.to_string(), Arc::new(mime_type.to_string()));
206
- }
207
-
208
- let _ = self
209
- .initialized
210
- .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
211
- }
212
-
213
- /// Get or intern a MIME type string.
214
- /// Ensures pre-interned MIME types are initialized on first call.
215
- fn get_or_intern(&self, mime_type: &str) -> Arc<String> {
216
- self.ensure_initialized();
217
-
218
- if let Some(entry) = self.pool.get(mime_type) {
219
- Arc::clone(&*entry)
220
- } else {
221
- let arc_string = Arc::new(mime_type.to_string());
222
- self.pool.insert(mime_type.to_string(), Arc::clone(&arc_string));
223
- arc_string
224
- }
225
- }
226
- }
227
-
228
- /// String pool for language codes.
229
- ///
230
- /// Lazily initializes with common ISO 639 language codes.
231
- /// Pre-interning is deferred until first access to reduce startup memory usage.
232
- struct LanguageStringPool {
233
- pool: dashmap::DashMap<String, Arc<String>>,
234
- initialized: AtomicBool,
235
- }
236
-
237
- impl LanguageStringPool {
238
- /// Create a new language string pool.
239
- /// Pre-interning is deferred until first `get_or_intern()` call.
240
- fn new() -> Self {
241
- LanguageStringPool {
242
- pool: dashmap::DashMap::new(),
243
- initialized: AtomicBool::new(false),
244
- }
245
- }
246
-
247
- /// Ensure all known language codes are pre-interned (one-time initialization).
248
- #[inline]
249
- fn ensure_initialized(&self) {
250
- if self.initialized.load(Ordering::Acquire) {
251
- return;
252
- }
253
-
254
- let lang_codes = vec![
255
- "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh", "ar", "hi", "th", "tr", "pl", "nl", "sv", "no",
256
- "da", "fi", "cs", "hu", "ro", "el", "he", "fa", "ur", "vi", "id", "ms", "bn", "pa", "te", "mr", "ta", "gu",
257
- "kn", "ml", "or", "uk", "bg", "sr", "hr", "sl", "sk", "et", "lv", "lt", "sq", "mk", "ka", "hy", "eo",
258
- "ast", "ca", "eu", "gl", "cy", "gd", "ga",
259
- ];
260
-
261
- for code in lang_codes {
262
- self.pool.insert(code.to_string(), Arc::new(code.to_string()));
263
- }
264
-
265
- let _ = self
266
- .initialized
267
- .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
268
- }
269
-
270
- /// Get or intern a language code string.
271
- /// Ensures pre-interned language codes are initialized on first call.
272
- fn get_or_intern(&self, lang_code: &str) -> Arc<String> {
273
- self.ensure_initialized();
274
-
275
- if let Some(entry) = self.pool.get(lang_code) {
276
- Arc::clone(&*entry)
277
- } else {
278
- let arc_string = Arc::new(lang_code.to_string());
279
- self.pool.insert(lang_code.to_string(), Arc::clone(&arc_string));
280
- arc_string
281
- }
282
- }
283
- }
284
-
285
- /// Configuration for the string buffer pool.
286
- pub struct PoolConfig {
287
- /// Maximum buffers per size bucket
288
- pub max_buffers_per_size: usize,
289
- /// Initial capacity for new buffers
290
- pub initial_capacity: usize,
291
- /// Maximum capacity before discarding
292
- pub max_capacity_before_discard: usize,
293
- }
294
-
295
- impl Default for PoolConfig {
296
- fn default() -> Self {
297
- Self {
298
- max_buffers_per_size: 4,
299
- initial_capacity: 4096,
300
- max_capacity_before_discard: 65536,
301
- }
302
- }
303
- }
304
-
305
- /// Thread-safe reusable string buffer pool.
306
- ///
307
- /// This pool allows allocation and reuse of String buffers to reduce memory allocations
308
- /// during document extraction. Buffers are returned to the pool with cleared contents
309
- /// but preserved capacity, ready for reuse.
310
- ///
311
- /// # Thread Safety
312
- ///
313
- /// The pool uses DashMap for lock-free concurrent access. Multiple threads can
314
- /// acquire and release buffers simultaneously.
315
- ///
316
- /// # Usage
317
- ///
318
- /// ```rust,ignore
319
- /// use kreuzberg::utils::string_pool::STRING_BUFFER_POOL;
320
- ///
321
- /// // Acquire a buffer from the pool
322
- /// let mut buffer = STRING_BUFFER_POOL.acquire();
323
- /// buffer.push_str("some content");
324
- /// // Automatically returned to pool when dropped
325
- /// drop(buffer);
326
- /// ```
327
- pub struct StringBufferPool {
328
- pool: dashmap::DashMap<usize, VecDeque<String>>,
329
- config: PoolConfig,
330
- #[cfg(feature = "pool-metrics")]
331
- acquire_count: AtomicUsize,
332
- #[cfg(feature = "pool-metrics")]
333
- reuse_count: AtomicUsize,
334
- }
335
-
336
- impl StringBufferPool {
337
- /// Create a new string buffer pool with given configuration.
338
- pub fn new(config: PoolConfig) -> Self {
339
- StringBufferPool {
340
- pool: dashmap::DashMap::new(),
341
- config,
342
- #[cfg(feature = "pool-metrics")]
343
- acquire_count: AtomicUsize::new(0),
344
- #[cfg(feature = "pool-metrics")]
345
- reuse_count: AtomicUsize::new(0),
346
- }
347
- }
348
-
349
- /// Find the appropriate bucket size for a given capacity.
350
- fn find_bucket(&self, capacity: usize) -> usize {
351
- if capacity <= 1024 {
352
- 1024
353
- } else if capacity <= 4096 {
354
- 4096
355
- } else if capacity <= 16384 {
356
- 16384
357
- } else if capacity <= 65536 {
358
- 65536
359
- } else {
360
- 262144
361
- }
362
- }
363
-
364
- /// Try to acquire a buffer from a specific bucket, returning it if found.
365
- fn try_acquire_from_bucket(&self, bucket: usize) -> Option<String> {
366
- if let Some(mut entry) = self.pool.get_mut(&bucket) {
367
- entry.pop_front()
368
- } else {
369
- None
370
- }
371
- }
372
-
373
- /// Acquire a string buffer from the pool, or allocate a new one if pool is exhausted.
374
- ///
375
- /// The returned buffer is automatically returned to the pool when dropped.
376
- /// Must be called with the pool wrapped in Arc.
377
- pub fn acquire(self: Arc<Self>) -> PooledString {
378
- #[cfg(feature = "pool-metrics")]
379
- self.acquire_count.fetch_add(1, Ordering::Relaxed);
380
-
381
- let default_bucket = self.config.initial_capacity;
382
- if let Some(buffer) = self.try_acquire_from_bucket(default_bucket) {
383
- #[cfg(feature = "pool-metrics")]
384
- self.reuse_count.fetch_add(1, Ordering::Relaxed);
385
- return PooledString { buffer, pool: self };
386
- }
387
-
388
- for &bucket in &[1024, 16384, 65536] {
389
- if let Some(buffer) = self.try_acquire_from_bucket(bucket) {
390
- #[cfg(feature = "pool-metrics")]
391
- self.reuse_count.fetch_add(1, Ordering::Relaxed);
392
- return PooledString { buffer, pool: self };
393
- }
394
- }
395
-
396
- PooledString {
397
- buffer: String::with_capacity(self.config.initial_capacity),
398
- pool: self,
399
- }
400
- }
401
-
402
- /// Return a buffer to the pool for reuse.
403
- pub fn release(&self, mut buffer: String) {
404
- if buffer.capacity() > self.config.max_capacity_before_discard {
405
- return;
406
- }
407
-
408
- let bucket = self.find_bucket(buffer.capacity());
409
- buffer.clear();
410
-
411
- if let Some(mut queue) = self.pool.get_mut(&bucket) {
412
- if queue.len() < self.config.max_buffers_per_size {
413
- queue.push_back(buffer);
414
- }
415
- } else {
416
- let mut queue = VecDeque::with_capacity(self.config.max_buffers_per_size);
417
- queue.push_back(buffer);
418
- self.pool.insert(bucket, queue);
419
- }
420
- }
421
-
422
- /// Get the current pool size across all buckets.
423
- #[allow(dead_code)]
424
- pub fn size(&self) -> usize {
425
- self.pool.iter().map(|entry| entry.value().len()).sum()
426
- }
427
-
428
- /// Get buffer reuse metrics (only available with `pool-metrics` feature).
429
- #[cfg(feature = "pool-metrics")]
430
- pub fn metrics(&self) -> StringBufferPoolMetrics {
431
- let acquire = self.acquire_count.load(Ordering::Relaxed);
432
- let reuse = self.reuse_count.load(Ordering::Relaxed);
433
- let hit_rate = if acquire == 0 {
434
- 0.0
435
- } else {
436
- (reuse as f64 / acquire as f64) * 100.0
437
- };
438
-
439
- StringBufferPoolMetrics {
440
- total_acquires: acquire,
441
- total_reuses: reuse,
442
- hit_rate,
443
- }
444
- }
445
- }
446
-
447
/// Snapshot of the acquire/reuse counters of a `StringBufferPool`.
///
/// Only compiled when the `pool-metrics` feature is enabled.
#[cfg(feature = "pool-metrics")]
#[derive(Debug, Clone, Copy)]
pub struct StringBufferPoolMetrics {
    /// How many times a buffer was requested.
    pub total_acquires: usize,
    /// How many of those requests were served by a pooled buffer.
    pub total_reuses: usize,
    /// Share of requests served from the pool, as a percentage (0.0-100.0).
    pub hit_rate: f64,
}
458
-
459
- /// RAII wrapper for a pooled string buffer.
460
- ///
461
- /// Automatically returns the buffer to the pool when dropped.
462
- pub struct PooledString {
463
- buffer: String,
464
- pool: Arc<StringBufferPool>,
465
- }
466
-
467
- impl PooledString {
468
- /// Get mutable access to the underlying string buffer.
469
- pub fn buffer_mut(&mut self) -> &mut String {
470
- &mut self.buffer
471
- }
472
-
473
- /// Get immutable access to the underlying string buffer.
474
- pub fn as_str(&self) -> &str {
475
- self.buffer.as_str()
476
- }
477
- }
478
-
479
- impl std::ops::Deref for PooledString {
480
- type Target = String;
481
-
482
- fn deref(&self) -> &Self::Target {
483
- &self.buffer
484
- }
485
- }
486
-
487
- impl std::ops::DerefMut for PooledString {
488
- fn deref_mut(&mut self) -> &mut Self::Target {
489
- &mut self.buffer
490
- }
491
- }
492
-
493
- impl Drop for PooledString {
494
- fn drop(&mut self) {
495
- let buffer = std::mem::take(&mut self.buffer);
496
- self.pool.release(buffer);
497
- }
498
- }
499
-
500
- impl std::fmt::Display for PooledString {
501
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
502
- write!(f, "{}", self.buffer)
503
- }
504
- }
505
-
506
- impl std::fmt::Debug for PooledString {
507
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
508
- f.debug_tuple("PooledString").field(&self.buffer).finish()
509
- }
510
- }
511
-
512
- /// Global MIME type string pool.
513
- static MIME_POOL: Lazy<MimeStringPool> = Lazy::new(MimeStringPool::new);
514
-
515
- /// Global language code string pool.
516
- static LANGUAGE_POOL: Lazy<LanguageStringPool> = Lazy::new(LanguageStringPool::new);
517
-
518
- /// Global string buffer pool for temporary allocations during extraction.
519
- pub static STRING_BUFFER_POOL: Lazy<Arc<StringBufferPool>> =
520
- Lazy::new(|| Arc::new(StringBufferPool::new(PoolConfig::default())));
521
-
522
- /// Get or intern a MIME type string.
523
- ///
524
- /// Returns an `InternedString` that is guaranteed to be deduplicated with any other
525
- /// intern call for the same MIME type. This reduces memory usage and allows
526
- /// fast pointer-based comparisons.
527
- ///
528
- /// # Arguments
529
- ///
530
- /// * `mime_type` - The MIME type string to intern
531
- ///
532
- /// # Returns
533
- ///
534
- /// An `InternedString` pointing to the deduplicated string
535
- ///
536
- /// # Example
537
- ///
538
- /// ```rust,ignore
539
- /// let pdf1 = intern_mime_type("application/pdf");
540
- /// let pdf2 = intern_mime_type("application/pdf");
541
- /// assert_eq!(pdf1, pdf2); // Same pointer
542
- /// ```
543
- pub fn intern_mime_type(mime_type: &str) -> InternedString {
544
- InternedString(MIME_POOL.get_or_intern(mime_type))
545
- }
546
-
547
- /// Get or intern a language code string.
548
- ///
549
- /// Returns an `InternedString` that is guaranteed to be deduplicated with any other
550
- /// intern call for the same language code.
551
- ///
552
- /// # Arguments
553
- ///
554
- /// * `lang_code` - The language code to intern (e.g., "en", "es", "fr")
555
- ///
556
- /// # Returns
557
- ///
558
- /// An `InternedString` pointing to the deduplicated string
559
- ///
560
- /// # Example
561
- ///
562
- /// ```rust,ignore
563
- /// let en1 = intern_language_code("en");
564
- /// let en2 = intern_language_code("en");
565
- /// assert_eq!(en1, en2); // Same pointer
566
- /// ```
567
- pub fn intern_language_code(lang_code: &str) -> InternedString {
568
- InternedString(LANGUAGE_POOL.get_or_intern(lang_code))
569
- }
570
-
571
- /// Acquire a string buffer from the global pool.
572
- ///
573
- /// The returned buffer is automatically returned to the pool when dropped.
574
- ///
575
- /// # Example
576
- ///
577
- /// ```rust,ignore
578
- /// let mut buffer = acquire_string_buffer();
579
- /// buffer.push_str("content");
580
- /// // Automatically returned to pool when buffer goes out of scope
581
- /// ```
582
- pub fn acquire_string_buffer() -> PooledString {
583
- Arc::clone(&*STRING_BUFFER_POOL).acquire()
584
- }
585
-
586
#[cfg(test)]
mod tests {
    use super::*;

    /// Interning one MIME type twice gives equal handles backed by the same `Arc`.
    #[test]
    fn test_mime_type_deduplication() {
        let first = intern_mime_type("application/pdf");
        let second = intern_mime_type("application/pdf");

        assert_eq!(first, second);
        assert!(Arc::ptr_eq(&first.0, &second.0));
    }

    /// Interning one language code twice gives equal handles backed by the same `Arc`.
    #[test]
    fn test_language_code_deduplication() {
        let first = intern_language_code("en");
        let second = intern_language_code("en");

        assert_eq!(first, second);
        assert!(Arc::ptr_eq(&first.0, &second.0));
    }

    /// `Display` prints the interned text unchanged.
    #[test]
    fn test_interned_string_display() {
        let html = intern_mime_type("text/html");
        assert_eq!(html.to_string(), "text/html");
    }

    /// `Deref`, `AsRef` and `as_str` all expose the same text.
    #[test]
    fn test_interned_string_deref() {
        let json = intern_mime_type("application/json");
        assert_eq!(&*json, "application/json");
        assert_eq!(json.as_ref(), "application/json");
        assert_eq!(json.as_str(), "application/json");
    }

    /// Common MIME types round-trip through the pool.
    #[test]
    fn test_preinterned_mime_types() {
        for &expected in &["application/pdf", "text/html", "application/json"] {
            assert_eq!(intern_mime_type(expected).as_str(), expected);
        }
    }

    /// Common language codes round-trip through the pool.
    #[test]
    fn test_preinterned_language_codes() {
        for &expected in &["en", "es", "fr"] {
            assert_eq!(intern_language_code(expected).as_str(), expected);
        }
    }

    /// Ten threads interning the same MIME type should all end up with one `Arc`.
    #[test]
    #[ignore = "Flaky test - concurrent interning may not always share the same Arc"]
    fn test_concurrent_interning() {
        use std::thread;

        let mime = "application/pdf";

        let workers: Vec<_> = (0..10)
            .map(|_| thread::spawn(move || intern_mime_type(mime)))
            .collect();

        let interned_strings: Vec<_> = workers
            .into_iter()
            .map(|worker| worker.join().unwrap())
            .collect();
        assert_eq!(interned_strings.len(), 10);

        let first_arc = &interned_strings[0].0;
        for interned in &interned_strings {
            assert!(
                Arc::ptr_eq(&interned.0, first_arc),
                "All interned strings should share the same Arc"
            );
        }
    }

    /// Two handles for the same text collapse to one `HashSet` entry.
    #[test]
    fn test_interned_string_hash() {
        use std::collections::HashSet;

        let first = intern_mime_type("application/pdf");
        let second = intern_mime_type("application/pdf");

        let unique: HashSet<_> = vec![first, second].into_iter().collect();

        assert_eq!(unique.len(), 1);
    }

    /// Cloning a handle shares the allocation instead of copying the text.
    #[test]
    fn test_interned_string_clone() {
        let original = intern_mime_type("text/html");
        let copy = original.clone();

        assert_eq!(original, copy);
        assert!(Arc::ptr_eq(&original.0, &copy.0));
    }

    /// A released buffer comes back empty and with the capacity it had before.
    #[test]
    fn test_buffer_pool_acquire_and_release() {
        let pool = Arc::new(StringBufferPool::new(PoolConfig::default()));

        let mut first = Arc::clone(&pool).acquire();
        first.push_str("test content");
        let capacity = first.capacity();
        drop(first);

        let second = Arc::clone(&pool).acquire();
        assert_eq!(second.capacity(), capacity);
        assert!(second.is_empty());
    }

    /// Acquire/release cycles reuse the single idle buffer rather than growing the pool.
    #[test]
    fn test_buffer_pool_size() {
        let pool = Arc::new(StringBufferPool::new(PoolConfig::default()));
        assert_eq!(pool.size(), 0);

        drop(Arc::clone(&pool).acquire());
        assert_eq!(pool.size(), 1);

        drop(Arc::clone(&pool).acquire());
        assert_eq!(pool.size(), 1);
    }

    /// Buffers from the global pool have at least 4096 bytes of capacity.
    #[test]
    fn test_buffer_pool_global() {
        drop(acquire_string_buffer());

        let reused = acquire_string_buffer();
        assert!(reused.capacity() >= 4096);
    }

    /// Reads through `Deref` and `as_str` see what was written.
    #[test]
    fn test_pooled_string_deref() {
        let mut pooled = acquire_string_buffer();
        pooled.push_str("hello");

        assert_eq!(&*pooled, "hello");
        assert_eq!(pooled.as_str(), "hello");
        assert!(!pooled.is_empty());
    }

    /// Writes through `DerefMut` and `buffer_mut` land in the same string.
    #[test]
    fn test_pooled_string_deref_mut() {
        let mut pooled = acquire_string_buffer();
        pooled.push_str("test");

        pooled.buffer_mut().push_str(" more");
        assert_eq!(pooled.as_str(), "test more");
    }
}