kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -1,408 +0,0 @@
1
- //! Runtime extraction of bundled PDFium library.
2
- //!
3
- //! When the `bundled-pdfium` feature is enabled, the PDFium library is embedded in the binary
4
- //! using `include_bytes!` during compilation. This module handles runtime extraction to a
5
- //! temporary directory and provides the path for dynamic loading.
6
- //!
7
- //! # Thread Safety
8
- //!
9
- //! Extraction is protected by a `Mutex` to prevent race conditions during concurrent access.
10
- //! The first thread to call `extract_bundled_pdfium()` will perform the extraction while
11
- //! others wait for completion.
12
- //!
13
- //! To prevent the "file too short" race condition where one thread loads a partially-written
14
- //! file, we use atomic file operations: write to a temporary file, then atomically rename to
15
- //! the final location. This ensures other threads never observe a partial file.
16
- //!
17
- //! # How It Works
18
- //!
19
- //! 1. During build (build.rs): PDFium is copied to OUT_DIR and the build script sets
20
- //! `KREUZBERG_PDFIUM_BUNDLED_PATH` environment variable
21
- //! 2. At compile time: `include_bytes!` embeds the library binary in the executable
22
- //! 3. At runtime: `extract_bundled_pdfium()` extracts to `$TMPDIR/kreuzberg-pdfium/`
23
- //! 4. Library is reused if already present (based on file size validation)
24
- //! 5. Concurrent calls are serialized with a `Mutex` to prevent partial writes
25
- //! 6. Atomic rename (write temp file → rename) prevents "file too short" race conditions
26
- //!
27
- //! # Example
28
- //!
29
- //! ```rust,ignore
30
- //! # #[cfg(feature = "bundled-pdfium")]
31
- //! # {
32
- //! use kreuzberg::pdf::bundled::extract_bundled_pdfium;
33
- //!
34
- //! # fn example() -> kreuzberg::Result<()> {
35
- //! let lib_path = extract_bundled_pdfium()?;
36
- //! println!("Extracted to: {}", lib_path.display());
37
- //! # Ok(())
38
- //! # }
39
- //! # }
40
- //! ```
41
-
42
- use std::fs;
43
- use std::io;
44
- use std::path::{Path, PathBuf};
45
- use std::sync::Mutex;
46
-
47
- #[cfg(unix)]
48
- use std::os::unix::fs::PermissionsExt;
49
-
50
- static EXTRACTION_LOCK: Mutex<()> = Mutex::new(());
51
-
52
- /// Runtime library name and extraction directory for the bundled PDFium library.
53
- ///
54
- /// Returns tuple of (library_name, extraction_directory)
55
- fn bundled_library_info() -> (&'static str, &'static str) {
56
- if cfg!(target_os = "windows") {
57
- ("pdfium.dll", "kreuzberg-pdfium")
58
- } else if cfg!(target_os = "macos") {
59
- ("libpdfium.dylib", "kreuzberg-pdfium")
60
- } else {
61
- ("libpdfium.so", "kreuzberg-pdfium")
62
- }
63
- }
64
-
65
- /// Get the temporary directory for bundled PDFium extraction.
66
- ///
67
- /// Uses `std::env::temp_dir()` on all platforms.
68
- fn get_extraction_dir() -> io::Result<PathBuf> {
69
- let (_, subdir) = bundled_library_info();
70
- Ok(std::env::temp_dir().join(subdir))
71
- }
72
-
73
- /// Check if extracted library exists and is valid.
74
- ///
75
- /// Verifies:
76
- /// - File exists at expected path
77
- /// - File size matches embedded size (basic validation)
78
- ///
79
- /// Returns `true` if library can be safely reused, `false` if extraction is needed.
80
- fn is_extracted_library_valid(lib_path: &Path, embedded_size: usize) -> bool {
81
- if !lib_path.exists() {
82
- return false;
83
- }
84
-
85
- match fs::metadata(lib_path) {
86
- Ok(metadata) => {
87
- let file_size = metadata.len() as usize;
88
- let size_tolerance = (embedded_size as f64 * 0.01) as usize;
89
- let min_size = embedded_size.saturating_sub(size_tolerance);
90
- let max_size = embedded_size.saturating_add(size_tolerance);
91
- file_size >= min_size && file_size <= max_size
92
- }
93
- Err(_) => false,
94
- }
95
- }
96
-
97
- /// Extract bundled PDFium library to temporary directory.
98
- ///
99
- /// # Behavior
100
- ///
101
- /// - Embeds PDFium library using `include_bytes!`
102
- /// - Extracts to `$TMPDIR/kreuzberg-pdfium/` (non-WASM only)
103
- /// - Reuses extracted library if size matches
104
- /// - Sets permissions to 0755 on Unix
105
- /// - Returns path to extracted library
106
- /// - **Thread-safe**: Synchronized with a global `Mutex` to prevent concurrent writes
107
- ///
108
- /// # Concurrency
109
- ///
110
- /// This function is fully thread-safe. When multiple threads call it simultaneously,
111
- /// only the first thread performs the actual extraction while others wait. This prevents
112
- /// the "file too short" error that occurs when one thread reads a partially-written file.
113
- ///
114
- /// # WASM Handling
115
- ///
116
- /// On WASM targets (wasm32-*), this function returns an error with a helpful
117
- /// message directing users to use WASM-specific initialization. WASM PDFium
118
- /// is initialized through the runtime, not via file extraction.
119
- ///
120
- /// # Errors
121
- ///
122
- /// Returns `std::io::Error` if:
123
- /// - Cannot create extraction directory
124
- /// - Cannot write library file
125
- /// - Cannot set file permissions (Unix only)
126
- /// - Target is WASM (filesystem access not available)
127
- ///
128
- /// # Platform-Specific Library Names
129
- ///
130
- /// - Linux: `libpdfium.so`
131
- /// - macOS: `libpdfium.dylib`
132
- /// - Windows: `pdfium.dll`
133
- pub fn extract_bundled_pdfium() -> io::Result<PathBuf> {
134
- #[cfg(target_arch = "wasm32")]
135
- {
136
- return Err(io::Error::new(
137
- io::ErrorKind::Unsupported,
138
- "File extraction is not available in WASM. \
139
- PDFium for WASM must be initialized via the WebAssembly runtime. \
140
- Use a WASM-compatible environment with proper module initialization.",
141
- ));
142
- }
143
-
144
- let (lib_name, _) = bundled_library_info();
145
- let extract_dir = get_extraction_dir()?;
146
-
147
- fs::create_dir_all(&extract_dir).map_err(|e| {
148
- io::Error::new(
149
- e.kind(),
150
- format!(
151
- "Failed to create bundled pdfium extraction directory '{}': {}",
152
- extract_dir.display(),
153
- e
154
- ),
155
- )
156
- })?;
157
-
158
- let lib_path = extract_dir.join(lib_name);
159
-
160
- let bundled_lib = include_bytes!(env!("KREUZBERG_PDFIUM_BUNDLED_PATH"));
161
-
162
- if is_extracted_library_valid(&lib_path, bundled_lib.len()) {
163
- return Ok(lib_path);
164
- }
165
-
166
- let _guard = EXTRACTION_LOCK.lock().unwrap_or_else(|poisoned| poisoned.into_inner());
167
-
168
- if is_extracted_library_valid(&lib_path, bundled_lib.len()) {
169
- return Ok(lib_path);
170
- }
171
-
172
- let temp_path = lib_path.with_extension(format!("tmp.{}", std::process::id()));
173
-
174
- fs::write(&temp_path, bundled_lib).map_err(|e| {
175
- io::Error::new(
176
- e.kind(),
177
- format!(
178
- "Failed to write bundled pdfium library to temp file '{}': {}",
179
- temp_path.display(),
180
- e
181
- ),
182
- )
183
- })?;
184
-
185
- #[cfg(unix)]
186
- {
187
- let perms = fs::Permissions::from_mode(0o755);
188
- fs::set_permissions(&temp_path, perms).map_err(|e| {
189
- let _ = fs::remove_file(&temp_path);
190
- io::Error::new(
191
- e.kind(),
192
- format!(
193
- "Failed to set permissions on bundled pdfium temp file '{}': {}",
194
- temp_path.display(),
195
- e
196
- ),
197
- )
198
- })?;
199
- }
200
-
201
- fs::rename(&temp_path, &lib_path).map_err(|e| {
202
- let _ = fs::remove_file(&temp_path);
203
- io::Error::new(
204
- e.kind(),
205
- format!(
206
- "Failed to rename bundled pdfium library from '{}' to '{}': {}",
207
- temp_path.display(),
208
- lib_path.display(),
209
- e
210
- ),
211
- )
212
- })?;
213
-
214
- Ok(lib_path)
215
- }
216
-
217
- #[cfg(test)]
218
- mod tests {
219
- use super::*;
220
-
221
- #[test]
222
- fn test_bundled_library_info_windows() {
223
- if cfg!(target_os = "windows") {
224
- let (name, dir) = bundled_library_info();
225
- assert_eq!(name, "pdfium.dll");
226
- assert_eq!(dir, "kreuzberg-pdfium");
227
- }
228
- }
229
-
230
- #[test]
231
- fn test_bundled_library_info_macos() {
232
- if cfg!(target_os = "macos") {
233
- let (name, dir) = bundled_library_info();
234
- assert_eq!(name, "libpdfium.dylib");
235
- assert_eq!(dir, "kreuzberg-pdfium");
236
- }
237
- }
238
-
239
- #[test]
240
- fn test_bundled_library_info_linux() {
241
- if cfg!(target_os = "linux") {
242
- let (name, dir) = bundled_library_info();
243
- assert_eq!(name, "libpdfium.so");
244
- assert_eq!(dir, "kreuzberg-pdfium");
245
- }
246
- }
247
-
248
- #[test]
249
- fn test_get_extraction_dir() {
250
- let result = get_extraction_dir();
251
- assert!(result.is_ok());
252
- let dir = result.unwrap();
253
- assert!(dir.to_str().is_some());
254
- assert!(dir.ends_with("kreuzberg-pdfium"));
255
- }
256
-
257
- #[test]
258
- fn test_is_extracted_library_valid_missing() {
259
- let nonexistent = PathBuf::from("/tmp/nonexistent-pdfium-test");
260
- assert!(!is_extracted_library_valid(&nonexistent, 1000));
261
- }
262
-
263
- #[test]
264
- fn test_is_extracted_library_valid_size_match() {
265
- let temp_dir = std::env::temp_dir();
266
- let test_file = temp_dir.join("test-pdfium-size.dll");
267
- let test_size = 5_000_000;
268
- let test_data = vec![0u8; test_size];
269
-
270
- if fs::write(&test_file, &test_data).is_ok() {
271
- let is_valid = is_extracted_library_valid(&test_file, test_size);
272
- assert!(is_valid);
273
- let _ = fs::remove_file(&test_file);
274
- }
275
- }
276
-
277
- #[test]
278
- fn test_is_extracted_library_valid_size_tolerance() {
279
- let temp_dir = std::env::temp_dir();
280
- let test_file = temp_dir.join("test-pdfium-tolerance.dll");
281
- let original_size = 10_000_000;
282
- let tolerance = (original_size as f64 * 0.01) as usize;
283
-
284
- let actual_size = original_size - tolerance / 2;
285
- let test_data = vec![0u8; actual_size];
286
-
287
- if fs::write(&test_file, &test_data).is_ok() {
288
- let is_valid = is_extracted_library_valid(&test_file, original_size);
289
- assert!(is_valid);
290
- let _ = fs::remove_file(&test_file);
291
- }
292
- }
293
-
294
- #[test]
295
- fn test_is_extracted_library_valid_size_mismatch() {
296
- let temp_dir = std::env::temp_dir();
297
- let test_file = temp_dir.join("test-pdfium-mismatch.dll");
298
- let original_size = 10_000_000;
299
-
300
- let actual_size = (original_size as f64 * 0.85) as usize;
301
- let test_data = vec![0u8; actual_size];
302
-
303
- if fs::write(&test_file, &test_data).is_ok() {
304
- let is_valid = is_extracted_library_valid(&test_file, original_size);
305
- assert!(!is_valid);
306
- let _ = fs::remove_file(&test_file);
307
- }
308
- }
309
-
310
- #[test]
311
- #[cfg(feature = "bundled-pdfium")]
312
- fn test_extract_bundled_pdfium() {
313
- let result = extract_bundled_pdfium();
314
- assert!(result.is_ok());
315
-
316
- let lib_path = result.unwrap();
317
- assert!(
318
- lib_path.exists(),
319
- "Extracted library should exist at: {}",
320
- lib_path.display()
321
- );
322
- assert!(lib_path.file_name().is_some(), "Library path should have filename");
323
-
324
- let (expected_name, _) = bundled_library_info();
325
- assert_eq!(lib_path.file_name().unwrap(), expected_name);
326
- }
327
-
328
- #[test]
329
- #[cfg(feature = "bundled-pdfium")]
330
- fn test_extract_bundled_pdfium_reuses_existing() {
331
- let result1 = extract_bundled_pdfium();
332
- assert!(result1.is_ok());
333
- let path1 = result1.unwrap();
334
-
335
- let metadata1 = fs::metadata(&path1).expect("Should be able to read metadata");
336
- let size1 = metadata1.len();
337
-
338
- let result2 = extract_bundled_pdfium();
339
- assert!(result2.is_ok());
340
- let path2 = result2.unwrap();
341
-
342
- assert_eq!(path1, path2, "Extraction should return same path on second call");
343
-
344
- let metadata2 = fs::metadata(&path2).expect("Should be able to read metadata");
345
- let size2 = metadata2.len();
346
- assert_eq!(size1, size2, "Reused library should have same file size");
347
- }
348
-
349
- #[test]
350
- #[cfg(feature = "bundled-pdfium")]
351
- fn test_extract_bundled_pdfium_concurrent_access() {
352
- use std::thread;
353
-
354
- let handles: Vec<_> = (0..10)
355
- .map(|_| {
356
- thread::spawn(|| {
357
- let result = extract_bundled_pdfium();
358
- assert!(result.is_ok(), "Concurrent extraction should succeed");
359
- result.unwrap()
360
- })
361
- })
362
- .collect();
363
-
364
- let paths: Vec<PathBuf> = handles
365
- .into_iter()
366
- .map(|h| h.join().expect("Thread should complete"))
367
- .collect();
368
-
369
- let first_path = &paths[0];
370
- assert!(
371
- paths.iter().all(|p| p == first_path),
372
- "All concurrent extractions should return the same path"
373
- );
374
-
375
- assert!(
376
- first_path.exists(),
377
- "Extracted library should exist at: {}",
378
- first_path.display()
379
- );
380
-
381
- let metadata = fs::metadata(first_path).expect("Should be able to read metadata");
382
- let file_size = metadata.len();
383
- assert!(
384
- file_size > 1_000_000,
385
- "PDFium library should be at least 1MB, got {} bytes",
386
- file_size
387
- );
388
- }
389
-
390
- #[test]
391
- #[cfg(unix)]
392
- #[cfg(feature = "bundled-pdfium")]
393
- fn test_extract_bundled_pdfium_permissions() {
394
- let result = extract_bundled_pdfium();
395
- assert!(result.is_ok());
396
-
397
- let lib_path = result.unwrap();
398
- let metadata = fs::metadata(&lib_path).expect("Should be able to read metadata");
399
- let perms = metadata.permissions();
400
- let mode = perms.mode();
401
-
402
- assert!(
403
- mode & 0o111 != 0,
404
- "Library should have executable bit set, got mode: {:#o}",
405
- mode
406
- );
407
- }
408
- }