kreuzberg 4.0.7 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +24 -16
  3. data/README.md +4 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +26 -353
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,265 @@
1
+ //! OCR processor implementation using Tesseract.
2
+ //!
3
+ //! This module has been split into focused submodules for better organization:
4
+ //! - `validation` - Image and configuration validation
5
+ //! - `config` - Configuration hashing and Tesseract variables
6
+ //! - `execution` - Core OCR execution logic
7
+
8
+ mod config;
9
+ mod execution;
10
+ mod validation;
11
+
12
+ use crate::ocr::cache::OcrCache;
13
+ use crate::ocr::error::OcrError;
14
+ use crate::ocr::types::{BatchItemResult, TesseractConfig};
15
+ use crate::types::OcrExtractionResult;
16
+
17
+ pub struct OcrProcessor {
18
+ cache: OcrCache,
19
+ }
20
+
21
+ impl OcrProcessor {
22
+ pub fn new(cache_dir: Option<std::path::PathBuf>) -> Result<Self, OcrError> {
23
+ let cache = OcrCache::new(cache_dir)?;
24
+ Ok(Self { cache })
25
+ }
26
+
27
+ #[cfg_attr(feature = "otel", tracing::instrument(
28
+ skip(self, image_bytes),
29
+ fields(
30
+ ocr.backend = "tesseract",
31
+ ocr.language = %config.language,
32
+ image.size_bytes = image_bytes.len(),
33
+ )
34
+ ))]
35
+ pub fn process_image(&self, image_bytes: &[u8], config: &TesseractConfig) -> Result<OcrExtractionResult, OcrError> {
36
+ execution::process_image_with_cache(image_bytes, config, &self.cache, None)
37
+ }
38
+
39
+ /// Process an image with OCR and respect the output format from ExtractionConfig.
40
+ ///
41
+ /// This variant allows specifying an output format (Plain, Markdown, Djot) which
42
+ /// affects how the OCR result's mime_type is set when markdown output is requested.
43
+ #[cfg_attr(feature = "otel", tracing::instrument(
44
+ skip(self, image_bytes),
45
+ fields(
46
+ ocr.backend = "tesseract",
47
+ ocr.language = %config.language,
48
+ image.size_bytes = image_bytes.len(),
49
+ )
50
+ ))]
51
+ pub fn process_image_with_format(
52
+ &self,
53
+ image_bytes: &[u8],
54
+ config: &TesseractConfig,
55
+ output_format: crate::core::config::OutputFormat,
56
+ ) -> Result<OcrExtractionResult, OcrError> {
57
+ execution::process_image_with_cache(image_bytes, config, &self.cache, Some(output_format))
58
+ }
59
+
60
+ pub fn clear_cache(&self) -> Result<(), OcrError> {
61
+ self.cache.clear()
62
+ }
63
+
64
+ pub fn get_cache_stats(&self) -> Result<super::cache::OcrCacheStats, OcrError> {
65
+ self.cache.get_stats()
66
+ }
67
+
68
+ pub fn process_file(&self, file_path: &str, config: &TesseractConfig) -> Result<OcrExtractionResult, OcrError> {
69
+ execution::process_file_with_cache(file_path, config, &self.cache, None)
70
+ }
71
+
72
+ /// Process a file with OCR and respect the output format from ExtractionConfig.
73
+ ///
74
+ /// This variant allows specifying an output format (Plain, Markdown, Djot) which
75
+ /// affects how the OCR result's mime_type is set when markdown output is requested.
76
+ pub fn process_file_with_format(
77
+ &self,
78
+ file_path: &str,
79
+ config: &TesseractConfig,
80
+ output_format: crate::core::config::OutputFormat,
81
+ ) -> Result<OcrExtractionResult, OcrError> {
82
+ execution::process_file_with_cache(file_path, config, &self.cache, Some(output_format))
83
+ }
84
+
85
+ /// Process multiple image files in parallel using Rayon.
86
+ ///
87
+ /// This method processes OCR operations in parallel across CPU cores for improved throughput.
88
+ /// Results are returned in the same order as the input file paths.
89
+ pub fn process_files_batch(&self, file_paths: Vec<String>, config: &TesseractConfig) -> Vec<BatchItemResult> {
90
+ execution::process_files_batch(file_paths, config, &self.cache)
91
+ }
92
+ }
93
+
94
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Baseline configuration: text output, table detection off, caching off.
    fn create_test_config() -> TesseractConfig {
        TesseractConfig {
            use_cache: false,
            enable_table_detection: false,
            output_format: "text".to_string(),
            ..Default::default()
        }
    }

    /// Builds a processor backed by a fresh temporary cache directory.
    ///
    /// The `TempDir` guard is returned alongside the processor so the
    /// directory stays alive for the duration of the test.
    fn temp_processor() -> (tempfile::TempDir, OcrProcessor) {
        let cache_dir = tempdir().unwrap();
        let processor = OcrProcessor::new(Some(cache_dir.path().to_path_buf())).unwrap();
        (cache_dir, processor)
    }

    #[test]
    fn test_processor_creation() {
        let cache_dir = tempdir().unwrap();
        let created = OcrProcessor::new(Some(cache_dir.path().to_path_buf()));
        assert!(created.is_ok());
    }

    #[test]
    fn test_processor_creation_default_cache_dir() {
        assert!(OcrProcessor::new(None).is_ok());
    }

    #[test]
    fn test_cache_operations() {
        let (_cache_dir, processor) = temp_processor();

        assert!(processor.clear_cache().is_ok());
        assert!(processor.get_cache_stats().is_ok());
    }

    #[test]
    fn test_process_file_nonexistent() {
        let (_cache_dir, processor) = temp_processor();
        let config = create_test_config();

        let outcome = processor.process_file("/nonexistent/file.png", &config);
        assert!(outcome.is_err());

        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("Failed to read file"));
    }

    #[test]
    fn test_process_files_batch_empty() {
        let (_cache_dir, processor) = temp_processor();
        let config = create_test_config();

        let results = processor.process_files_batch(Vec::new(), &config);
        assert_eq!(results.len(), 0);
    }

    #[test]
    fn test_process_image_invalid_image_data() {
        let (_cache_dir, processor) = temp_processor();
        let config = create_test_config();

        // Five arbitrary bytes that do not form a decodable image.
        let garbage: Vec<u8> = vec![0, 1, 2, 3, 4];
        assert!(processor.process_image(&garbage, &config).is_err());
    }

    #[test]
    fn test_process_files_batch_single_file() {
        let (_cache_dir, processor) = temp_processor();
        let config = create_test_config();

        let results = processor.process_files_batch(vec!["/nonexistent.png".to_string()], &config);
        assert_eq!(results.len(), 1);

        let only = &results[0];
        assert!(!only.success);
        assert!(only.error.is_some());
        assert!(only.result.is_none());
    }

    #[test]
    fn test_process_files_batch_multiple_files() {
        let (_cache_dir, processor) = temp_processor();
        let config = create_test_config();

        let missing: Vec<String> = ["/nonexistent1.png", "/nonexistent2.png", "/nonexistent3.png"]
            .iter()
            .map(|path| path.to_string())
            .collect();

        let results = processor.process_files_batch(missing, &config);
        assert_eq!(results.len(), 3);

        // Every missing file must be reported as a failure carrying an error and no result.
        for item in results.iter() {
            assert!(!item.success);
            assert!(item.error.is_some());
            assert!(item.result.is_none());
        }
    }

    #[test]
    fn test_batch_item_result_structure() {
        use std::collections::HashMap;

        let extraction = OcrExtractionResult {
            content: "test".to_string(),
            mime_type: "text/plain".to_string(),
            metadata: HashMap::new(),
            tables: vec![],
        };
        let succeeded = BatchItemResult {
            file_path: "test.png".to_string(),
            success: true,
            result: Some(extraction),
            error: None,
        };

        assert!(succeeded.success);
        assert!(succeeded.result.is_some());
        assert!(succeeded.error.is_none());

        let failed = BatchItemResult {
            file_path: "error.png".to_string(),
            success: false,
            result: None,
            error: Some("Test error".to_string()),
        };

        assert!(!failed.success);
        assert!(failed.result.is_none());
        assert!(failed.error.is_some());
    }

    #[test]
    fn test_process_files_batch_preserves_order() {
        let (_cache_dir, processor) = temp_processor();
        let config = create_test_config();

        let expected_order = ["file1.png", "file2.png", "file3.png"];
        let inputs: Vec<String> = expected_order.iter().map(|path| path.to_string()).collect();

        let results = processor.process_files_batch(inputs, &config);

        assert_eq!(results.len(), 3);
        for (item, expected_path) in results.iter().zip(expected_order.iter()) {
            assert_eq!(item.file_path, *expected_path);
        }
    }

    #[test]
    fn test_process_image_with_cache_disabled() {
        let (_cache_dir, processor) = temp_processor();

        // Explicitly restate that caching is off for this scenario.
        let mut config = create_test_config();
        config.use_cache = false;

        let garbage: Vec<u8> = vec![0, 1, 2, 3];
        assert!(processor.process_image(&garbage, &config).is_err());
    }
}
@@ -0,0 +1,145 @@
1
+ //! Image and configuration validation logic.
2
+ //!
3
+ //! This module handles validation of images, language files, and Tesseract configuration
4
+ //! before OCR processing begins.
5
+
6
+ use crate::ocr::error::OcrError;
7
+ use std::env;
8
+ use std::path::Path;
9
+
10
+ /// Validate language configuration and check for traineddata files.
11
+ ///
12
+ /// This function validates that:
13
+ /// 1. Language string is not empty
14
+ /// 2. Traineddata files exist for all specified languages
15
+ ///
16
+ /// # Arguments
17
+ ///
18
+ /// * `language` - Language code(s) to validate (can be "eng" or "eng+fra" etc.)
19
+ /// * `tessdata_path` - Path to tessdata directory
20
+ ///
21
+ /// # Returns
22
+ ///
23
+ /// `Ok(())` if validation passes, otherwise returns an error
24
+ pub(super) fn validate_language_and_traineddata(language: &str, tessdata_path: &str) -> Result<(), OcrError> {
25
+ // Validate language before initializing to prevent segfault ~keep
26
+ if language.trim().is_empty() {
27
+ return Err(OcrError::TesseractInitializationFailed(
28
+ "Language cannot be empty. Please specify a valid language code (e.g., 'eng')".to_string(),
29
+ ));
30
+ }
31
+
32
+ // Validate language file exists before initializing to prevent segfault ~keep
33
+ if !tessdata_path.is_empty() {
34
+ let languages: Vec<&str> = language.split('+').collect();
35
+ for lang in languages {
36
+ let lang = lang.trim();
37
+ if lang.is_empty() {
38
+ continue;
39
+ }
40
+ let traineddata_path = Path::new(tessdata_path).join(format!("{}.traineddata", lang));
41
+ if !traineddata_path.exists() {
42
+ return Err(OcrError::TesseractInitializationFailed(format!(
43
+ "Language '{}' not found. Traineddata file does not exist: {}",
44
+ lang,
45
+ traineddata_path.display()
46
+ )));
47
+ }
48
+ }
49
+ }
50
+
51
+ Ok(())
52
+ }
53
+
54
+ /// Resolve tessdata path from environment or fallback locations.
55
+ ///
56
+ /// Checks TESSDATA_PREFIX environment variable first, then tries common
57
+ /// installation paths for macOS, Linux, and Windows.
58
+ ///
59
+ /// # Returns
60
+ ///
61
+ /// Path to tessdata directory if found, otherwise empty string
62
+ pub(super) fn resolve_tessdata_path() -> String {
63
+ let tessdata_env = env::var("TESSDATA_PREFIX").ok();
64
+ let fallback_paths = [
65
+ "/opt/homebrew/share/tessdata",
66
+ "/opt/homebrew/opt/tesseract/share/tessdata",
67
+ "/usr/local/opt/tesseract/share/tessdata",
68
+ "/usr/share/tesseract-ocr/5/tessdata",
69
+ "/usr/share/tesseract-ocr/4/tessdata",
70
+ "/usr/share/tessdata",
71
+ "/usr/local/share/tessdata",
72
+ r#"C:\Program Files\Tesseract-OCR\tessdata"#,
73
+ r#"C:\ProgramData\Tesseract-OCR\tessdata"#,
74
+ ];
75
+
76
+ tessdata_env
77
+ .or_else(|| {
78
+ fallback_paths
79
+ .iter()
80
+ .find(|p| Path::new(p).exists())
81
+ .map(|p| (*p).to_string())
82
+ })
83
+ .unwrap_or_default()
84
+ }
85
+
86
+ /// Strip control characters from text, preserving whitespace.
87
+ ///
88
+ /// Removes control characters (0x00-0x1F, 0x7F) except for newlines, carriage returns, and tabs.
89
+ ///
90
+ /// # Arguments
91
+ ///
92
+ /// * `text` - Text to clean
93
+ ///
94
+ /// # Returns
95
+ ///
96
+ /// Cleaned text with control characters removed
97
+ pub(super) fn strip_control_characters(text: &str) -> String {
98
+ if text
99
+ .chars()
100
+ .any(|c| matches!(c, '\u{0000}'..='\u{001F}' | '\u{007F}') && c != '\n' && c != '\r' && c != '\t')
101
+ {
102
+ text.chars()
103
+ .filter(|c| !matches!(c, '\u{0000}'..='\u{001F}' | '\u{007F}') || matches!(c, '\n' | '\r' | '\t'))
104
+ .collect()
105
+ } else {
106
+ text.to_string()
107
+ }
108
+ }
109
+
110
#[cfg(test)]
mod tests {
    use super::*;

    /// NUL and SOH are removed; newline, carriage return, and tab survive.
    #[test]
    fn test_strip_control_characters() {
        assert_eq!(strip_control_characters("Hello\x00World\x01Test"), "HelloWorldTest");

        let whitespace_only_controls = "Hello\nWorld\rTest\t!";
        assert_eq!(
            strip_control_characters(whitespace_only_controls),
            "Hello\nWorld\rTest\t!"
        );
    }

    /// Input made entirely of control characters collapses to an empty string.
    #[test]
    fn test_strip_control_characters_all_control() {
        let cleaned = strip_control_characters("\x00\x01\x02\x03");
        assert_eq!(cleaned, "");
    }

    /// Text without control characters comes back unchanged.
    #[test]
    fn test_strip_control_characters_no_control() {
        let plain = "Hello World Test";
        assert_eq!(strip_control_characters(plain), "Hello World Test");
    }

    /// The DEL character (0x7F) is removed as well.
    #[test]
    fn test_strip_control_characters_delete_char() {
        let cleaned = strip_control_characters("Hello\x7FWorld");
        assert_eq!(cleaned, "HelloWorld");
    }
}
@@ -177,20 +177,24 @@ impl OcrBackend for TesseractBackend {
177
177
  async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> Result<ExtractionResult> {
178
178
  let tess_config = self.config_to_tesseract(config);
179
179
  let tess_config_clone = tess_config.clone();
180
+ let output_format = config.output_format;
180
181
 
181
182
  let processor = Arc::clone(&self.processor);
182
183
  let image_bytes = image_bytes.to_vec();
183
184
 
184
- let ocr_result = tokio::task::spawn_blocking(move || processor.process_image(&image_bytes, &tess_config_clone))
185
- .await
186
- .map_err(|e| crate::KreuzbergError::Plugin {
187
- message: format!("Tesseract task panicked: {}", e),
188
- plugin_name: "tesseract".to_string(),
189
- })?
190
- .map_err(|e| crate::KreuzbergError::Ocr {
191
- message: format!("Tesseract OCR failed: {}", e),
192
- source: Some(Box::new(e)),
193
- })?;
185
+ let ocr_result = tokio::task::spawn_blocking(move || match output_format {
186
+ Some(fmt) => processor.process_image_with_format(&image_bytes, &tess_config_clone, fmt),
187
+ None => processor.process_image(&image_bytes, &tess_config_clone),
188
+ })
189
+ .await
190
+ .map_err(|e| crate::KreuzbergError::Plugin {
191
+ message: format!("Tesseract task panicked: {}", e),
192
+ plugin_name: "tesseract".to_string(),
193
+ })?
194
+ .map_err(|e| crate::KreuzbergError::Ocr {
195
+ message: format!("Tesseract OCR failed: {}", e),
196
+ source: Some(Box::new(e)),
197
+ })?;
194
198
 
195
199
  let metadata = crate::types::Metadata {
196
200
  format: Some(crate::types::FormatMetadata::Ocr(crate::types::OcrMetadata {
@@ -225,26 +229,32 @@ impl OcrBackend for TesseractBackend {
225
229
  detected_languages: None,
226
230
  chunks: None,
227
231
  images: None,
232
+ elements: None,
233
+ djot_content: None,
228
234
  })
229
235
  }
230
236
 
231
237
  async fn process_file(&self, path: &Path, config: &OcrConfig) -> Result<ExtractionResult> {
232
238
  let tess_config = self.config_to_tesseract(config);
233
239
  let tess_config_clone = tess_config.clone();
240
+ let output_format = config.output_format;
234
241
 
235
242
  let processor = Arc::clone(&self.processor);
236
243
  let path_str = path.to_string_lossy().to_string();
237
244
 
238
- let ocr_result = tokio::task::spawn_blocking(move || processor.process_file(&path_str, &tess_config_clone))
239
- .await
240
- .map_err(|e| crate::KreuzbergError::Plugin {
241
- message: format!("Tesseract task panicked: {}", e),
242
- plugin_name: "tesseract".to_string(),
243
- })?
244
- .map_err(|e| crate::KreuzbergError::Ocr {
245
- message: format!("Tesseract OCR failed: {}", e),
246
- source: Some(Box::new(e)),
247
- })?;
245
+ let ocr_result = tokio::task::spawn_blocking(move || match output_format {
246
+ Some(fmt) => processor.process_file_with_format(&path_str, &tess_config_clone, fmt),
247
+ None => processor.process_file(&path_str, &tess_config_clone),
248
+ })
249
+ .await
250
+ .map_err(|e| crate::KreuzbergError::Plugin {
251
+ message: format!("Tesseract task panicked: {}", e),
252
+ plugin_name: "tesseract".to_string(),
253
+ })?
254
+ .map_err(|e| crate::KreuzbergError::Ocr {
255
+ message: format!("Tesseract OCR failed: {}", e),
256
+ source: Some(Box::new(e)),
257
+ })?;
248
258
 
249
259
  let metadata = crate::types::Metadata {
250
260
  format: Some(crate::types::FormatMetadata::Ocr(crate::types::OcrMetadata {
@@ -279,6 +289,8 @@ impl OcrBackend for TesseractBackend {
279
289
  detected_languages: None,
280
290
  chunks: None,
281
291
  images: None,
292
+ elements: None,
293
+ djot_content: None,
282
294
  })
283
295
  }
284
296
 
@@ -326,10 +338,11 @@ mod tests {
326
338
  #[test]
327
339
  fn test_tesseract_backend_supports_language() {
328
340
  let backend = TesseractBackend::new().unwrap();
341
+ // English should always be available
329
342
  assert!(backend.supports_language("eng"));
330
- assert!(backend.supports_language("deu"));
331
- assert!(backend.supports_language("fra"));
343
+ // Invalid language codes should return false
332
344
  assert!(!backend.supports_language("xyz"));
345
+ assert!(!backend.supports_language("invalid"));
333
346
  }
334
347
 
335
348
  #[test]
@@ -342,9 +355,10 @@ mod tests {
342
355
  fn test_tesseract_backend_supported_languages() {
343
356
  let backend = TesseractBackend::new().unwrap();
344
357
  let languages = backend.supported_languages();
358
+ // English should always be available
345
359
  assert!(languages.contains(&"eng".to_string()));
346
- assert!(languages.contains(&"deu".to_string()));
347
- assert!(languages.len() > 30);
360
+ // Should have at least English
361
+ assert!(!languages.is_empty());
348
362
  }
349
363
 
350
364
  #[test]
@@ -354,6 +368,7 @@ mod tests {
354
368
  backend: "tesseract".to_string(),
355
369
  language: "deu".to_string(),
356
370
  tesseract_config: None,
371
+ output_format: None,
357
372
  };
358
373
 
359
374
  let tess_config = backend.config_to_tesseract(&ocr_config);
@@ -375,6 +390,7 @@ mod tests {
375
390
  backend: "tesseract".to_string(),
376
391
  language: "eng".to_string(),
377
392
  tesseract_config: Some(custom_tess_config),
393
+ output_format: None,
378
394
  };
379
395
 
380
396
  let tess_config = backend.config_to_tesseract(&ocr_config);
@@ -418,6 +434,7 @@ mod tests {
418
434
  backend: "tesseract".to_string(),
419
435
  language: "eng".to_string(),
420
436
  tesseract_config: Some(custom_tess_config),
437
+ output_format: None,
421
438
  };
422
439
 
423
440
  let tess_config = backend.config_to_tesseract(&ocr_config);
@@ -263,21 +263,34 @@ mod tests {
263
263
 
264
264
  #[test]
265
265
  fn test_bind_pdfium_multiple_calls() {
266
- let result1 = bind_pdfium(PdfError::TextExtractionFailed, "test 1");
267
- let result2 = bind_pdfium(PdfError::TextExtractionFailed, "test 2");
266
+ // First call - acquire lock, test success, then drop handle to release lock
267
+ {
268
+ let result1 = bind_pdfium(PdfError::TextExtractionFailed, "test 1");
269
+ assert!(result1.is_ok(), "First call should succeed");
270
+ } // result1 dropped here, releasing the lock
268
271
 
269
- assert!(result1.is_ok(), "First call should succeed");
270
- assert!(result2.is_ok(), "Second call should also succeed");
272
+ // Second call - can now acquire lock since first handle was dropped
273
+ {
274
+ let result2 = bind_pdfium(PdfError::TextExtractionFailed, "test 2");
275
+ assert!(result2.is_ok(), "Second call should also succeed");
276
+ }
271
277
  }
272
278
 
273
279
  #[test]
274
280
  fn test_bind_pdfium_returns_same_instance() {
275
- let handle1 = bind_pdfium(PdfError::TextExtractionFailed, "test 1").unwrap();
276
- let handle2 = bind_pdfium(PdfError::TextExtractionFailed, "test 2").unwrap();
281
+ // Get pointer from first handle, then drop it to release lock
282
+ let ptr1 = {
283
+ let handle1 = bind_pdfium(PdfError::TextExtractionFailed, "test 1").unwrap();
284
+ &*handle1 as *const Pdfium
285
+ }; // handle1 dropped here, releasing the lock
286
+
287
+ // Get pointer from second handle
288
+ let ptr2 = {
289
+ let handle2 = bind_pdfium(PdfError::TextExtractionFailed, "test 2").unwrap();
290
+ &*handle2 as *const Pdfium
291
+ };
277
292
 
278
293
  // Both handles should dereference to the same Pdfium instance
279
- let ptr1 = &*handle1 as *const Pdfium;
280
- let ptr2 = &*handle2 as *const Pdfium;
281
294
  assert_eq!(ptr1, ptr2, "Both handles should reference the same Pdfium instance");
282
295
  }
283
296