kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,265 @@
1
+ //! OCR processor implementation using Tesseract.
2
+ //!
3
+ //! This module has been split into focused submodules for better organization:
4
+ //! - `validation` - Image and configuration validation
5
+ //! - `config` - Configuration hashing and Tesseract variables
6
+ //! - `execution` - Core OCR execution logic
7
+
8
+ mod config;
9
+ mod execution;
10
+ mod validation;
11
+
12
+ use crate::ocr::cache::OcrCache;
13
+ use crate::ocr::error::OcrError;
14
+ use crate::ocr::types::{BatchItemResult, TesseractConfig};
15
+ use crate::types::OcrExtractionResult;
16
+
17
+ pub struct OcrProcessor {
18
+ cache: OcrCache,
19
+ }
20
+
21
+ impl OcrProcessor {
22
+ pub fn new(cache_dir: Option<std::path::PathBuf>) -> Result<Self, OcrError> {
23
+ let cache = OcrCache::new(cache_dir)?;
24
+ Ok(Self { cache })
25
+ }
26
+
27
+ #[cfg_attr(feature = "otel", tracing::instrument(
28
+ skip(self, image_bytes),
29
+ fields(
30
+ ocr.backend = "tesseract",
31
+ ocr.language = %config.language,
32
+ image.size_bytes = image_bytes.len(),
33
+ )
34
+ ))]
35
+ pub fn process_image(&self, image_bytes: &[u8], config: &TesseractConfig) -> Result<OcrExtractionResult, OcrError> {
36
+ execution::process_image_with_cache(image_bytes, config, &self.cache, None)
37
+ }
38
+
39
+ /// Process an image with OCR and respect the output format from ExtractionConfig.
40
+ ///
41
+ /// This variant allows specifying an output format (Plain, Markdown, Djot) which
42
+ /// affects how the OCR result's mime_type is set when markdown output is requested.
43
+ #[cfg_attr(feature = "otel", tracing::instrument(
44
+ skip(self, image_bytes),
45
+ fields(
46
+ ocr.backend = "tesseract",
47
+ ocr.language = %config.language,
48
+ image.size_bytes = image_bytes.len(),
49
+ )
50
+ ))]
51
+ pub fn process_image_with_format(
52
+ &self,
53
+ image_bytes: &[u8],
54
+ config: &TesseractConfig,
55
+ output_format: crate::core::config::OutputFormat,
56
+ ) -> Result<OcrExtractionResult, OcrError> {
57
+ execution::process_image_with_cache(image_bytes, config, &self.cache, Some(output_format))
58
+ }
59
+
60
+ pub fn clear_cache(&self) -> Result<(), OcrError> {
61
+ self.cache.clear()
62
+ }
63
+
64
+ pub fn get_cache_stats(&self) -> Result<super::cache::OcrCacheStats, OcrError> {
65
+ self.cache.get_stats()
66
+ }
67
+
68
+ pub fn process_file(&self, file_path: &str, config: &TesseractConfig) -> Result<OcrExtractionResult, OcrError> {
69
+ execution::process_file_with_cache(file_path, config, &self.cache, None)
70
+ }
71
+
72
+ /// Process a file with OCR and respect the output format from ExtractionConfig.
73
+ ///
74
+ /// This variant allows specifying an output format (Plain, Markdown, Djot) which
75
+ /// affects how the OCR result's mime_type is set when markdown output is requested.
76
+ pub fn process_file_with_format(
77
+ &self,
78
+ file_path: &str,
79
+ config: &TesseractConfig,
80
+ output_format: crate::core::config::OutputFormat,
81
+ ) -> Result<OcrExtractionResult, OcrError> {
82
+ execution::process_file_with_cache(file_path, config, &self.cache, Some(output_format))
83
+ }
84
+
85
+ /// Process multiple image files in parallel using Rayon.
86
+ ///
87
+ /// This method processes OCR operations in parallel across CPU cores for improved throughput.
88
+ /// Results are returned in the same order as the input file paths.
89
+ pub fn process_files_batch(&self, file_paths: Vec<String>, config: &TesseractConfig) -> Vec<BatchItemResult> {
90
+ execution::process_files_batch(file_paths, config, &self.cache)
91
+ }
92
+ }
93
+
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Baseline configuration for these tests: text output, with table detection
    /// and caching switched off.
    fn create_test_config() -> TesseractConfig {
        TesseractConfig {
            output_format: "text".to_string(),
            enable_table_detection: false,
            use_cache: false,
            ..TesseractConfig::default()
        }
    }

    /// Creates a processor whose cache lives in a fresh temporary directory.
    ///
    /// The `TempDir` guard is returned alongside the processor so the directory
    /// stays alive for as long as the caller keeps the binding.
    fn temp_processor() -> (tempfile::TempDir, OcrProcessor) {
        let temp_dir = tempdir().unwrap();
        let processor = OcrProcessor::new(Some(temp_dir.path().to_path_buf())).unwrap();
        (temp_dir, processor)
    }

    /// Construction succeeds when an explicit cache directory is supplied.
    #[test]
    fn test_processor_creation() {
        let temp_dir = tempdir().unwrap();
        let processor = OcrProcessor::new(Some(temp_dir.path().to_path_buf()));
        assert!(processor.is_ok());
    }

    /// Construction succeeds when no cache directory is supplied.
    #[test]
    fn test_processor_creation_default_cache_dir() {
        let processor = OcrProcessor::new(None);
        assert!(processor.is_ok());
    }

    /// Clearing the cache and reading its statistics both succeed on a fresh cache.
    #[test]
    fn test_cache_operations() {
        let (_temp_dir, processor) = temp_processor();

        assert!(processor.clear_cache().is_ok());
        assert!(processor.get_cache_stats().is_ok());
    }

    /// A missing file is reported as a read failure.
    #[test]
    fn test_process_file_nonexistent() {
        let (_temp_dir, processor) = temp_processor();
        let config = create_test_config();

        let result = processor.process_file("/nonexistent/file.png", &config);
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("Failed to read file"));
    }

    /// An empty batch yields an empty result list.
    #[test]
    fn test_process_files_batch_empty() {
        let (_temp_dir, processor) = temp_processor();
        let config = create_test_config();

        let results = processor.process_files_batch(vec![], &config);
        assert!(results.is_empty());
    }

    /// Bytes that are not a decodable image produce an error.
    #[test]
    fn test_process_image_invalid_image_data() {
        let (_temp_dir, processor) = temp_processor();
        let config = create_test_config();

        let invalid_data = vec![0, 1, 2, 3, 4];
        let result = processor.process_image(&invalid_data, &config);

        assert!(result.is_err());
    }

    /// A single missing file produces one failed batch item carrying an error.
    #[test]
    fn test_process_files_batch_single_file() {
        let (_temp_dir, processor) = temp_processor();
        let config = create_test_config();

        let results = processor.process_files_batch(vec!["/nonexistent.png".to_string()], &config);
        assert_eq!(results.len(), 1);
        assert!(!results[0].success);
        assert!(results[0].error.is_some());
        assert!(results[0].result.is_none());
    }

    /// Every missing file in a batch is reported as its own failed item.
    #[test]
    fn test_process_files_batch_multiple_files() {
        let (_temp_dir, processor) = temp_processor();
        let config = create_test_config();

        let file_paths = vec![
            "/nonexistent1.png".to_string(),
            "/nonexistent2.png".to_string(),
            "/nonexistent3.png".to_string(),
        ];

        let results = processor.process_files_batch(file_paths, &config);
        assert_eq!(results.len(), 3);

        for result in &results {
            assert!(!result.success);
            assert!(result.error.is_some());
            assert!(result.result.is_none());
        }
    }

    /// `BatchItemResult` can represent both a success and a failure.
    #[test]
    fn test_batch_item_result_structure() {
        use std::collections::HashMap;

        let success_result = BatchItemResult {
            file_path: "test.png".to_string(),
            success: true,
            result: Some(OcrExtractionResult {
                content: "test".to_string(),
                mime_type: "text/plain".to_string(),
                metadata: HashMap::new(),
                tables: vec![],
            }),
            error: None,
        };

        assert!(success_result.success);
        assert!(success_result.result.is_some());
        assert!(success_result.error.is_none());

        let error_result = BatchItemResult {
            file_path: "error.png".to_string(),
            success: false,
            result: None,
            error: Some("Test error".to_string()),
        };

        assert!(!error_result.success);
        assert!(error_result.result.is_none());
        assert!(error_result.error.is_some());
    }

    /// Batch results come back in the same order as the submitted paths.
    #[test]
    fn test_process_files_batch_preserves_order() {
        let (_temp_dir, processor) = temp_processor();
        let config = create_test_config();

        let file_paths = vec![
            "file1.png".to_string(),
            "file2.png".to_string(),
            "file3.png".to_string(),
        ];

        // The paths are not needed afterwards, so the vector is moved rather than cloned.
        let results = processor.process_files_batch(file_paths, &config);

        assert_eq!(results.len(), 3);
        assert_eq!(results[0].file_path, "file1.png");
        assert_eq!(results[1].file_path, "file2.png");
        assert_eq!(results[2].file_path, "file3.png");
    }

    /// Invalid image bytes still fail when caching is explicitly disabled.
    #[test]
    fn test_process_image_with_cache_disabled() {
        let (_temp_dir, processor) = temp_processor();

        // Spell out the setting under test instead of mutating the config afterwards.
        let config = TesseractConfig {
            use_cache: false,
            ..create_test_config()
        };

        let invalid_data = vec![0, 1, 2, 3];
        let result = processor.process_image(&invalid_data, &config);

        assert!(result.is_err());
    }
}
@@ -0,0 +1,145 @@
1
+ //! Image and configuration validation logic.
2
+ //!
3
+ //! This module handles validation of images, language files, and Tesseract configuration
4
+ //! before OCR processing begins.
5
+
6
+ use crate::ocr::error::OcrError;
7
+ use std::env;
8
+ use std::path::Path;
9
+
10
+ /// Validate language configuration and check for traineddata files.
11
+ ///
12
+ /// This function validates that:
13
+ /// 1. Language string is not empty
14
+ /// 2. Traineddata files exist for all specified languages
15
+ ///
16
+ /// # Arguments
17
+ ///
18
+ /// * `language` - Language code(s) to validate (can be "eng" or "eng+fra" etc.)
19
+ /// * `tessdata_path` - Path to tessdata directory
20
+ ///
21
+ /// # Returns
22
+ ///
23
+ /// `Ok(())` if validation passes, otherwise returns an error
24
+ pub(super) fn validate_language_and_traineddata(language: &str, tessdata_path: &str) -> Result<(), OcrError> {
25
+ // Validate language before initializing to prevent segfault ~keep
26
+ if language.trim().is_empty() {
27
+ return Err(OcrError::TesseractInitializationFailed(
28
+ "Language cannot be empty. Please specify a valid language code (e.g., 'eng')".to_string(),
29
+ ));
30
+ }
31
+
32
+ // Validate language file exists before initializing to prevent segfault ~keep
33
+ if !tessdata_path.is_empty() {
34
+ let languages: Vec<&str> = language.split('+').collect();
35
+ for lang in languages {
36
+ let lang = lang.trim();
37
+ if lang.is_empty() {
38
+ continue;
39
+ }
40
+ let traineddata_path = Path::new(tessdata_path).join(format!("{}.traineddata", lang));
41
+ if !traineddata_path.exists() {
42
+ return Err(OcrError::TesseractInitializationFailed(format!(
43
+ "Language '{}' not found. Traineddata file does not exist: {}",
44
+ lang,
45
+ traineddata_path.display()
46
+ )));
47
+ }
48
+ }
49
+ }
50
+
51
+ Ok(())
52
+ }
53
+
54
+ /// Resolve tessdata path from environment or fallback locations.
55
+ ///
56
+ /// Checks TESSDATA_PREFIX environment variable first, then tries common
57
+ /// installation paths for macOS, Linux, and Windows.
58
+ ///
59
+ /// # Returns
60
+ ///
61
+ /// Path to tessdata directory if found, otherwise empty string
62
+ pub(super) fn resolve_tessdata_path() -> String {
63
+ let tessdata_env = env::var("TESSDATA_PREFIX").ok();
64
+ let fallback_paths = [
65
+ "/opt/homebrew/share/tessdata",
66
+ "/opt/homebrew/opt/tesseract/share/tessdata",
67
+ "/usr/local/opt/tesseract/share/tessdata",
68
+ "/usr/share/tesseract-ocr/5/tessdata",
69
+ "/usr/share/tesseract-ocr/4/tessdata",
70
+ "/usr/share/tessdata",
71
+ "/usr/local/share/tessdata",
72
+ r#"C:\Program Files\Tesseract-OCR\tessdata"#,
73
+ r#"C:\ProgramData\Tesseract-OCR\tessdata"#,
74
+ ];
75
+
76
+ tessdata_env
77
+ .or_else(|| {
78
+ fallback_paths
79
+ .iter()
80
+ .find(|p| Path::new(p).exists())
81
+ .map(|p| (*p).to_string())
82
+ })
83
+ .unwrap_or_default()
84
+ }
85
+
86
+ /// Strip control characters from text, preserving whitespace.
87
+ ///
88
+ /// Removes control characters (0x00-0x1F, 0x7F) except for newlines, carriage returns, and tabs.
89
+ ///
90
+ /// # Arguments
91
+ ///
92
+ /// * `text` - Text to clean
93
+ ///
94
+ /// # Returns
95
+ ///
96
+ /// Cleaned text with control characters removed
97
+ pub(super) fn strip_control_characters(text: &str) -> String {
98
+ if text
99
+ .chars()
100
+ .any(|c| matches!(c, '\u{0000}'..='\u{001F}' | '\u{007F}') && c != '\n' && c != '\r' && c != '\t')
101
+ {
102
+ text.chars()
103
+ .filter(|c| !matches!(c, '\u{0000}'..='\u{001F}' | '\u{007F}') || matches!(c, '\n' | '\r' | '\t'))
104
+ .collect()
105
+ } else {
106
+ text.to_string()
107
+ }
108
+ }
109
+
#[cfg(test)]
mod tests {
    use super::*;

    /// NUL and SOH are removed, while newline, carriage return and tab survive.
    #[test]
    fn test_strip_control_characters() {
        assert_eq!(strip_control_characters("Hello\x00World\x01Test"), "HelloWorldTest");

        let whitespace_controls = "Hello\nWorld\rTest\t!";
        assert_eq!(strip_control_characters(whitespace_controls), "Hello\nWorld\rTest\t!");
    }

    /// Input made only of control characters collapses to the empty string.
    #[test]
    fn test_strip_control_characters_all_control() {
        let cleaned = strip_control_characters("\x00\x01\x02\x03");
        assert_eq!(cleaned, "");
    }

    /// Text without control characters is returned unchanged.
    #[test]
    fn test_strip_control_characters_no_control() {
        let plain = "Hello World Test";
        assert_eq!(strip_control_characters(plain), "Hello World Test");
    }

    /// DEL (0x7F) counts as a control character and is removed.
    #[test]
    fn test_strip_control_characters_delete_char() {
        let cleaned = strip_control_characters("Hello\x7FWorld");
        assert_eq!(cleaned, "HelloWorld");
    }
}
@@ -177,20 +177,24 @@ impl OcrBackend for TesseractBackend {
177
177
  async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> Result<ExtractionResult> {
178
178
  let tess_config = self.config_to_tesseract(config);
179
179
  let tess_config_clone = tess_config.clone();
180
+ let output_format = config.output_format;
180
181
 
181
182
  let processor = Arc::clone(&self.processor);
182
183
  let image_bytes = image_bytes.to_vec();
183
184
 
184
- let ocr_result = tokio::task::spawn_blocking(move || processor.process_image(&image_bytes, &tess_config_clone))
185
- .await
186
- .map_err(|e| crate::KreuzbergError::Plugin {
187
- message: format!("Tesseract task panicked: {}", e),
188
- plugin_name: "tesseract".to_string(),
189
- })?
190
- .map_err(|e| crate::KreuzbergError::Ocr {
191
- message: format!("Tesseract OCR failed: {}", e),
192
- source: Some(Box::new(e)),
193
- })?;
185
+ let ocr_result = tokio::task::spawn_blocking(move || match output_format {
186
+ Some(fmt) => processor.process_image_with_format(&image_bytes, &tess_config_clone, fmt),
187
+ None => processor.process_image(&image_bytes, &tess_config_clone),
188
+ })
189
+ .await
190
+ .map_err(|e| crate::KreuzbergError::Plugin {
191
+ message: format!("Tesseract task panicked: {}", e),
192
+ plugin_name: "tesseract".to_string(),
193
+ })?
194
+ .map_err(|e| crate::KreuzbergError::Ocr {
195
+ message: format!("Tesseract OCR failed: {}", e),
196
+ source: Some(Box::new(e)),
197
+ })?;
194
198
 
195
199
  let metadata = crate::types::Metadata {
196
200
  format: Some(crate::types::FormatMetadata::Ocr(crate::types::OcrMetadata {
@@ -225,26 +229,32 @@ impl OcrBackend for TesseractBackend {
225
229
  detected_languages: None,
226
230
  chunks: None,
227
231
  images: None,
232
+ elements: None,
233
+ djot_content: None,
228
234
  })
229
235
  }
230
236
 
231
237
  async fn process_file(&self, path: &Path, config: &OcrConfig) -> Result<ExtractionResult> {
232
238
  let tess_config = self.config_to_tesseract(config);
233
239
  let tess_config_clone = tess_config.clone();
240
+ let output_format = config.output_format;
234
241
 
235
242
  let processor = Arc::clone(&self.processor);
236
243
  let path_str = path.to_string_lossy().to_string();
237
244
 
238
- let ocr_result = tokio::task::spawn_blocking(move || processor.process_file(&path_str, &tess_config_clone))
239
- .await
240
- .map_err(|e| crate::KreuzbergError::Plugin {
241
- message: format!("Tesseract task panicked: {}", e),
242
- plugin_name: "tesseract".to_string(),
243
- })?
244
- .map_err(|e| crate::KreuzbergError::Ocr {
245
- message: format!("Tesseract OCR failed: {}", e),
246
- source: Some(Box::new(e)),
247
- })?;
245
+ let ocr_result = tokio::task::spawn_blocking(move || match output_format {
246
+ Some(fmt) => processor.process_file_with_format(&path_str, &tess_config_clone, fmt),
247
+ None => processor.process_file(&path_str, &tess_config_clone),
248
+ })
249
+ .await
250
+ .map_err(|e| crate::KreuzbergError::Plugin {
251
+ message: format!("Tesseract task panicked: {}", e),
252
+ plugin_name: "tesseract".to_string(),
253
+ })?
254
+ .map_err(|e| crate::KreuzbergError::Ocr {
255
+ message: format!("Tesseract OCR failed: {}", e),
256
+ source: Some(Box::new(e)),
257
+ })?;
248
258
 
249
259
  let metadata = crate::types::Metadata {
250
260
  format: Some(crate::types::FormatMetadata::Ocr(crate::types::OcrMetadata {
@@ -279,6 +289,8 @@ impl OcrBackend for TesseractBackend {
279
289
  detected_languages: None,
280
290
  chunks: None,
281
291
  images: None,
292
+ elements: None,
293
+ djot_content: None,
282
294
  })
283
295
  }
284
296
 
@@ -326,10 +338,11 @@ mod tests {
326
338
  #[test]
327
339
  fn test_tesseract_backend_supports_language() {
328
340
  let backend = TesseractBackend::new().unwrap();
341
+ // English should always be available
329
342
  assert!(backend.supports_language("eng"));
330
- assert!(backend.supports_language("deu"));
331
- assert!(backend.supports_language("fra"));
343
+ // Invalid language codes should return false
332
344
  assert!(!backend.supports_language("xyz"));
345
+ assert!(!backend.supports_language("invalid"));
333
346
  }
334
347
 
335
348
  #[test]
@@ -342,9 +355,10 @@ mod tests {
342
355
  fn test_tesseract_backend_supported_languages() {
343
356
  let backend = TesseractBackend::new().unwrap();
344
357
  let languages = backend.supported_languages();
358
+ // English should always be available
345
359
  assert!(languages.contains(&"eng".to_string()));
346
- assert!(languages.contains(&"deu".to_string()));
347
- assert!(languages.len() > 30);
360
+ // Should have at least English
361
+ assert!(!languages.is_empty());
348
362
  }
349
363
 
350
364
  #[test]
@@ -354,6 +368,7 @@ mod tests {
354
368
  backend: "tesseract".to_string(),
355
369
  language: "deu".to_string(),
356
370
  tesseract_config: None,
371
+ output_format: None,
357
372
  };
358
373
 
359
374
  let tess_config = backend.config_to_tesseract(&ocr_config);
@@ -375,6 +390,7 @@ mod tests {
375
390
  backend: "tesseract".to_string(),
376
391
  language: "eng".to_string(),
377
392
  tesseract_config: Some(custom_tess_config),
393
+ output_format: None,
378
394
  };
379
395
 
380
396
  let tess_config = backend.config_to_tesseract(&ocr_config);
@@ -418,6 +434,7 @@ mod tests {
418
434
  backend: "tesseract".to_string(),
419
435
  language: "eng".to_string(),
420
436
  tesseract_config: Some(custom_tess_config),
437
+ output_format: None,
421
438
  };
422
439
 
423
440
  let tess_config = backend.config_to_tesseract(&ocr_config);
@@ -263,21 +263,34 @@ mod tests {
263
263
 
264
264
  #[test]
265
265
  fn test_bind_pdfium_multiple_calls() {
266
- let result1 = bind_pdfium(PdfError::TextExtractionFailed, "test 1");
267
- let result2 = bind_pdfium(PdfError::TextExtractionFailed, "test 2");
266
+ // First call - acquire lock, test success, then drop handle to release lock
267
+ {
268
+ let result1 = bind_pdfium(PdfError::TextExtractionFailed, "test 1");
269
+ assert!(result1.is_ok(), "First call should succeed");
270
+ } // result1 dropped here, releasing the lock
268
271
 
269
- assert!(result1.is_ok(), "First call should succeed");
270
- assert!(result2.is_ok(), "Second call should also succeed");
272
+ // Second call - can now acquire lock since first handle was dropped
273
+ {
274
+ let result2 = bind_pdfium(PdfError::TextExtractionFailed, "test 2");
275
+ assert!(result2.is_ok(), "Second call should also succeed");
276
+ }
271
277
  }
272
278
 
273
279
  #[test]
274
280
  fn test_bind_pdfium_returns_same_instance() {
275
- let handle1 = bind_pdfium(PdfError::TextExtractionFailed, "test 1").unwrap();
276
- let handle2 = bind_pdfium(PdfError::TextExtractionFailed, "test 2").unwrap();
281
+ // Get pointer from first handle, then drop it to release lock
282
+ let ptr1 = {
283
+ let handle1 = bind_pdfium(PdfError::TextExtractionFailed, "test 1").unwrap();
284
+ &*handle1 as *const Pdfium
285
+ }; // handle1 dropped here, releasing the lock
286
+
287
+ // Get pointer from second handle
288
+ let ptr2 = {
289
+ let handle2 = bind_pdfium(PdfError::TextExtractionFailed, "test 2").unwrap();
290
+ &*handle2 as *const Pdfium
291
+ };
277
292
 
278
293
  // Both handles should dereference to the same Pdfium instance
279
- let ptr1 = &*handle1 as *const Pdfium;
280
- let ptr2 = &*handle2 as *const Pdfium;
281
294
  assert_eq!(ptr1, ptr2, "Both handles should reference the same Pdfium instance");
282
295
  }
283
296