kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (321)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -6,102 +6,14 @@ use std::process::Command;
6
6
  use std::thread;
7
7
  use std::time::Duration;
8
8
 
9
- /// PDFium linking strategy
10
- #[derive(Debug, Clone, Copy, PartialEq, Eq)]
11
- enum PdfiumLinkStrategy {
12
- /// Download and link statically (static-pdfium feature)
13
- DownloadStatic,
14
- /// Download, link dynamically, and embed in binary (bundled-pdfium feature)
15
- Bundled,
16
- /// Use system-installed pdfium via pkg-config (system-pdfium feature)
17
- System,
18
- }
19
-
20
9
  fn main() {
21
10
  let target = env::var("TARGET").unwrap();
22
11
  let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
23
12
 
24
13
  println!("cargo::rustc-check-cfg=cfg(coverage)");
25
14
 
26
- if !cfg!(feature = "pdf") {
27
- tracing::debug!("PDF feature not enabled, skipping pdfium linking");
28
- return;
29
- }
30
-
31
- let strategy = determine_link_strategy(&target);
32
-
33
- tracing::debug!("Using PDFium linking strategy: {:?}", strategy);
34
-
35
- match strategy {
36
- PdfiumLinkStrategy::DownloadStatic => {
37
- let pdfium_dir = download_or_use_prebuilt(&target, &out_dir);
38
- link_statically(&pdfium_dir, &target);
39
- }
40
- PdfiumLinkStrategy::Bundled => {
41
- let pdfium_dir = download_or_use_prebuilt(&target, &out_dir);
42
- link_bundled(&pdfium_dir, &target, &out_dir);
43
- }
44
- PdfiumLinkStrategy::System => {
45
- link_system(&target);
46
- }
47
- }
48
-
49
- link_system_frameworks(&target);
50
- println!("cargo:rerun-if-changed=build.rs");
51
- }
52
-
53
- /// Determine which linking strategy to use based on features and target
54
- fn determine_link_strategy(target: &str) -> PdfiumLinkStrategy {
55
- if target.contains("wasm") {
56
- if let Ok(wasm_lib) = env::var("PDFIUM_WASM_LIB") {
57
- println!("cargo:rustc-link-search=native={}", wasm_lib);
58
- println!("cargo:rustc-link-lib=static=pdfium");
59
- return PdfiumLinkStrategy::DownloadStatic;
60
- }
61
- println!("cargo:warning=WASM build using bundled PDFium (set PDFIUM_WASM_LIB to link custom WASM PDFium)");
62
- return PdfiumLinkStrategy::Bundled;
63
- }
64
-
65
- let system_pdfium = cfg!(feature = "system-pdfium");
66
- let bundled_pdfium = cfg!(feature = "bundled-pdfium");
67
- let static_pdfium = cfg!(feature = "static-pdfium");
68
-
69
- let enabled_count = usize::from(system_pdfium) + usize::from(bundled_pdfium) + usize::from(static_pdfium);
70
- if enabled_count > 1 {
71
- println!(
72
- "cargo:warning=Multiple PDFium linking strategies enabled (static-pdfium={}, bundled-pdfium={}, system-pdfium={}); using bundled-pdfium for this build",
73
- static_pdfium, bundled_pdfium, system_pdfium
74
- );
75
- }
76
-
77
- if bundled_pdfium {
78
- return PdfiumLinkStrategy::Bundled;
79
- }
80
- if system_pdfium {
81
- return PdfiumLinkStrategy::System;
82
- }
83
- if static_pdfium {
84
- return PdfiumLinkStrategy::DownloadStatic;
85
- }
15
+ let (download_url, lib_name) = get_pdfium_url_and_lib(&target);
86
16
 
87
- PdfiumLinkStrategy::Bundled
88
- }
89
-
90
- /// Download PDFium or use prebuilt directory
91
- ///
92
- /// This is the main orchestrator function that:
93
- /// 1. Checks for `KREUZBERG_PDFIUM_PREBUILT` environment variable
94
- /// 2. If set and valid, uses prebuilt pdfium directory
95
- /// 3. If not set, downloads pdfium to out_dir (with caching)
96
- /// 4. Returns PathBuf to pdfium directory
97
- ///
98
- /// Reuses all existing helper functions:
99
- /// - `get_pdfium_url_and_lib()` - determines download URL for target
100
- /// - `download_and_extract_pdfium()` - downloads with retry logic
101
- /// - `runtime_library_info()` - platform-specific library names
102
- /// - `prepare_prebuilt_pdfium()` - handles prebuilt copy
103
- fn download_or_use_prebuilt(target: &str, out_dir: &Path) -> PathBuf {
104
- let (download_url, _lib_name) = get_pdfium_url_and_lib(target);
105
17
  let pdfium_dir = out_dir.join("pdfium");
106
18
 
107
19
  if let Some(prebuilt) = env::var_os("KREUZBERG_PDFIUM_PREBUILT") {
@@ -109,10 +21,6 @@ fn download_or_use_prebuilt(target: &str, out_dir: &Path) -> PathBuf {
109
21
  if prebuilt_path.exists() {
110
22
  prepare_prebuilt_pdfium(&prebuilt_path, &pdfium_dir)
111
23
  .unwrap_or_else(|err| panic!("Failed to copy Pdfium from {}: {}", prebuilt_path.display(), err));
112
- if target.contains("windows") {
113
- ensure_windows_import_library(&pdfium_dir);
114
- }
115
- return pdfium_dir;
116
24
  } else {
117
25
  panic!(
118
26
  "Environment variable KREUZBERG_PDFIUM_PREBUILT points to '{}' but the directory does not exist",
@@ -121,9 +29,8 @@ fn download_or_use_prebuilt(target: &str, out_dir: &Path) -> PathBuf {
121
29
  }
122
30
  }
123
31
 
124
- let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
125
- let lib_found = find_pdfium_library(&pdfium_dir, &runtime_lib_name, runtime_subdir).is_ok();
126
-
32
+ let (runtime_lib_name, runtime_subdir) = runtime_library_info(&target);
33
+ let runtime_lib_path = pdfium_dir.join(runtime_subdir).join(&runtime_lib_name);
127
34
  let import_lib_exists = if target.contains("windows") {
128
35
  let lib_dir = pdfium_dir.join("lib");
129
36
  lib_dir.join("pdfium.lib").exists() || lib_dir.join("pdfium.dll.lib").exists()
@@ -131,50 +38,57 @@ fn download_or_use_prebuilt(target: &str, out_dir: &Path) -> PathBuf {
131
38
  true
132
39
  };
133
40
 
134
- if !lib_found || !import_lib_exists {
41
+ if !runtime_lib_path.exists() || !import_lib_exists {
135
42
  tracing::debug!("Pdfium library not found, downloading for target: {}", target);
136
43
  tracing::debug!("Download URL: {}", download_url);
137
44
  download_and_extract_pdfium(&download_url, &pdfium_dir);
138
45
  } else {
139
- tracing::debug!("Pdfium library already cached at {}", pdfium_dir.display());
46
+ tracing::debug!("Pdfium library already present at {}", runtime_lib_path.display());
140
47
  }
141
48
 
142
49
  if target.contains("windows") {
143
- ensure_windows_import_library(&pdfium_dir);
144
- }
50
+ let lib_dir = pdfium_dir.join("lib");
51
+ let dll_lib = lib_dir.join("pdfium.dll.lib");
52
+ let expected_lib = lib_dir.join("pdfium.lib");
145
53
 
146
- pdfium_dir
147
- }
54
+ if dll_lib.exists() && !expected_lib.exists() {
55
+ tracing::debug!("Renaming cached {} to {}", dll_lib.display(), expected_lib.display());
56
+ fs::rename(&dll_lib, &expected_lib).expect("Failed to rename pdfium.dll.lib to pdfium.lib");
57
+ }
58
+ }
148
59
 
149
- fn ensure_windows_import_library(pdfium_dir: &Path) {
150
60
  let lib_dir = pdfium_dir.join("lib");
151
- let dll_lib = lib_dir.join("pdfium.dll.lib");
152
- let expected_lib = lib_dir.join("pdfium.lib");
61
+ println!("cargo:rustc-link-search=native={}", lib_dir.display());
62
+ println!("cargo:rustc-link-lib=dylib={}", lib_name);
153
63
 
154
- if dll_lib.exists() && !expected_lib.exists() {
155
- tracing::debug!(
156
- "Ensuring Windows import library at {} (source: {})",
157
- expected_lib.display(),
158
- dll_lib.display()
159
- );
160
- fs::copy(&dll_lib, &expected_lib).unwrap_or_else(|err| {
161
- panic!(
162
- "Failed to copy Windows import library from {} to {}: {}",
163
- dll_lib.display(),
164
- expected_lib.display(),
165
- err
166
- )
167
- });
64
+ if target.contains("darwin") {
65
+ println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
66
+ println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path/.");
67
+ } else if target.contains("linux") {
68
+ println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
69
+ println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN/.");
168
70
  }
71
+
72
+ copy_lib_to_package(&pdfium_dir, &target);
73
+
74
+ if target.contains("darwin") {
75
+ println!("cargo:rustc-link-lib=framework=CoreFoundation");
76
+ println!("cargo:rustc-link-lib=framework=CoreGraphics");
77
+ println!("cargo:rustc-link-lib=framework=CoreText");
78
+ println!("cargo:rustc-link-lib=framework=AppKit");
79
+ println!("cargo:rustc-link-lib=dylib=c++");
80
+ } else if target.contains("linux") {
81
+ println!("cargo:rustc-link-lib=dylib=stdc++");
82
+ println!("cargo:rustc-link-lib=dylib=m");
83
+ } else if target.contains("windows") {
84
+ println!("cargo:rustc-link-lib=dylib=gdi32");
85
+ println!("cargo:rustc-link-lib=dylib=user32");
86
+ println!("cargo:rustc-link-lib=dylib=advapi32");
87
+ }
88
+
89
+ println!("cargo:rerun-if-changed=build.rs");
169
90
  }
170
91
 
171
- /// Fetch the latest release version from a GitHub repository
172
- ///
173
- /// Uses curl to query the GitHub API and extract the tag_name from the
174
- /// latest release JSON response. Uses improved JSON parsing with fallback logic.
175
- ///
176
- /// For WASM (paulocoutinhox/pdfium-lib), falls back to known stable versions.
177
- /// For non-WASM (bblanchon/pdfium-binaries), uses a different fallback strategy.
178
92
  fn get_latest_version(repo: &str) -> String {
179
93
  let api_url = format!("https://api.github.com/repos/{}/releases/latest", repo);
180
94
 
@@ -184,61 +98,21 @@ fn get_latest_version(repo: &str) -> String {
184
98
  && output.status.success()
185
99
  {
186
100
  let json = String::from_utf8_lossy(&output.stdout);
187
-
188
- if let Some(tag) = extract_tag_from_json(&json) {
189
- return tag;
190
- }
191
- }
192
-
193
- if repo.contains("paulocoutinhox") {
194
- eprintln!(
195
- "cargo:warning=Failed to fetch latest PDFium WASM version from GitHub API, using fallback version 7442b"
196
- );
197
- "7442b".to_string()
198
- } else if repo.contains("bblanchon") {
199
- eprintln!(
200
- "cargo:warning=Failed to fetch latest PDFium binaries version from GitHub API, using fallback version 7568"
201
- );
202
- "7568".to_string()
203
- } else {
204
- eprintln!(
205
- "cargo:warning=Failed to fetch latest PDFium version from GitHub API (unknown repository: {})",
206
- repo
207
- );
208
- String::new()
209
- }
210
- }
211
-
212
- /// Extract tag_name from GitHub API JSON response
213
- ///
214
- /// Parses JSON by finding the tag_name field and extracting the value between quotes.
215
- /// Handles various JSON formatting variations.
216
- fn extract_tag_from_json(json: &str) -> Option<String> {
217
- if let Some(start) = json.find("\"tag_name\"") {
218
- let after_colon = &json[start + "\"tag_name\"".len()..];
219
-
220
- let after_colon = after_colon.trim_start();
221
- let after_colon = after_colon.strip_prefix(':')?;
222
- let after_colon = after_colon.trim_start();
223
-
224
- if let Some(opening_quote) = after_colon.find('"') {
225
- let value_start = opening_quote + 1;
226
- if let Some(closing_quote) = after_colon[value_start..].find('"') {
227
- let tag = &after_colon[value_start..value_start + closing_quote];
228
- return Some(tag.split('/').next_back().unwrap_or(tag).to_string());
101
+ if let Some(start) = json.find("\"tag_name\":") {
102
+ let after_colon = &json[start + "\"tag_name\":".len()..];
103
+ if let Some(opening_quote) = after_colon.find('"')
104
+ && let Some(closing_quote) = after_colon[opening_quote + 1..].find('"')
105
+ {
106
+ let tag_start = opening_quote + 1;
107
+ let tag = &after_colon[tag_start..tag_start + closing_quote];
108
+ return tag.split('/').next_back().unwrap_or(tag).to_string();
229
109
  }
230
110
  }
231
111
  }
232
112
 
233
- None
113
+ "7529".to_string()
234
114
  }
235
115
 
236
- /// Get the download URL and library name for the target platform
237
- ///
238
- /// Determines platform/architecture from target triple and constructs
239
- /// the appropriate GitHub release download URL. Supports:
240
- /// - WASM: paulocoutinhox/pdfium-lib
241
- /// - Other platforms: bblanchon/pdfium-binaries
242
116
  fn get_pdfium_url_and_lib(target: &str) -> (String, String) {
243
117
  if target.contains("wasm") {
244
118
  let version = env::var("PDFIUM_WASM_VERSION")
@@ -247,10 +121,11 @@ fn get_pdfium_url_and_lib(target: &str) -> (String, String) {
247
121
  .unwrap_or_else(|| get_latest_version("paulocoutinhox/pdfium-lib"));
248
122
  tracing::debug!("Using pdfium-lib version: {}", version);
249
123
 
124
+ let wasm_arch = if target.contains("wasm32") { "wasm32" } else { "wasm64" };
250
125
  return (
251
126
  format!(
252
- "https://github.com/paulocoutinhox/pdfium-lib/releases/download/{}/wasm.tgz",
253
- version
127
+ "https://github.com/paulocoutinhox/pdfium-lib/releases/download/{}/pdfium-{}.tar.gz",
128
+ version, wasm_arch
254
129
  ),
255
130
  "pdfium".to_string(),
256
131
  );
@@ -295,15 +170,6 @@ fn get_pdfium_url_and_lib(target: &str) -> (String, String) {
295
170
  (url, "pdfium".to_string())
296
171
  }
297
172
 
298
- /// Download and extract PDFium archive with retry logic
299
- ///
300
- /// Features:
301
- /// - Exponential backoff retry (configurable via env vars)
302
- /// - File type validation (gzip check)
303
- /// - Windows-specific import library handling (pdfium.dll.lib -> pdfium.lib)
304
- /// - Environment variables:
305
- /// - KREUZBERG_PDFIUM_DOWNLOAD_RETRIES: number of retries (default: 5)
306
- /// - KREUZBERG_PDFIUM_DOWNLOAD_BACKOFF_SECS: initial backoff in seconds (default: 2)
307
173
  fn download_and_extract_pdfium(url: &str, dest_dir: &Path) {
308
174
  fs::create_dir_all(dest_dir).expect("Failed to create pdfium directory");
309
175
 
@@ -367,20 +233,21 @@ fn download_and_extract_pdfium(url: &str, dest_dir: &Path) {
367
233
  thread::sleep(Duration::from_secs(delay_secs));
368
234
  }
369
235
 
370
- // Validate gzip magic bytes (0x1f 0x8b) instead of using external 'file' command
371
- // This is more portable and works correctly on Windows
372
- let is_valid_gzip = fs::read(&archive_path)
373
- .map(|bytes| bytes.len() >= 2 && bytes[0] == 0x1f && bytes[1] == 0x8b)
374
- .unwrap_or(false);
236
+ let file_type = Command::new("file")
237
+ .arg(archive_path.to_str().unwrap())
238
+ .output()
239
+ .expect("Failed to check file type");
240
+
241
+ let file_type_output = String::from_utf8_lossy(&file_type.stdout);
242
+ tracing::debug!("Downloaded file type: {}", file_type_output.trim());
375
243
 
376
- if !is_valid_gzip {
244
+ if !file_type_output.to_lowercase().contains("gzip") && !file_type_output.to_lowercase().contains("compressed") {
377
245
  fs::remove_file(&archive_path).ok();
378
246
  panic!(
379
247
  "Downloaded file is not a valid gzip archive. URL may be incorrect or version unavailable: {}",
380
248
  url
381
249
  );
382
250
  }
383
- tracing::debug!("Downloaded file validated as gzip archive");
384
251
 
385
252
  tracing::debug!("Extracting Pdfium archive...");
386
253
  let status = Command::new("tar")
@@ -414,369 +281,180 @@ fn download_and_extract_pdfium(url: &str, dest_dir: &Path) {
414
281
  tracing::debug!("Pdfium downloaded and extracted successfully");
415
282
  }
416
283
 
417
- /// Prepare prebuilt PDFium by copying to destination directory
418
- ///
419
- /// Removes existing destination if present, then recursively copies
420
- /// all files from prebuilt source to destination.
421
- fn prepare_prebuilt_pdfium(prebuilt_src: &Path, dest_dir: &Path) -> io::Result<()> {
422
- if dest_dir.exists() {
423
- fs::remove_dir_all(dest_dir)?;
284
+ fn copy_lib_to_package(pdfium_dir: &Path, target: &str) {
285
+ let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
286
+ let src_lib = pdfium_dir.join(runtime_subdir).join(&runtime_lib_name);
287
+
288
+ if !src_lib.exists() {
289
+ tracing::debug!("Source library not found: {}", src_lib.display());
290
+ return;
424
291
  }
425
- copy_dir_all(prebuilt_src, dest_dir)
426
- }
427
292
 
428
- /// Recursively copy directory tree
429
- ///
430
- /// Used by `prepare_prebuilt_pdfium()` to copy entire pdfium directory
431
- /// structure, preserving all files and subdirectories.
432
- fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
433
- fs::create_dir_all(dst)?;
434
- for entry in fs::read_dir(src)? {
435
- let entry = entry?;
436
- let file_type = entry.file_type()?;
437
- let target_path = dst.join(entry.file_name());
438
- if file_type.is_dir() {
439
- copy_dir_all(&entry.path(), &target_path)?;
440
- } else {
441
- fs::copy(entry.path(), &target_path)?;
293
+ // Fix install_name on macOS to use @rpath
294
+ if target.contains("darwin") {
295
+ fix_macos_install_name(&src_lib, &runtime_lib_name);
296
+ codesign_if_needed(target, &src_lib);
297
+ }
298
+
299
+ let crate_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
300
+ let workspace_root = crate_dir.parent().unwrap().parent().unwrap();
301
+
302
+ // Copy to target directory for CLI binary
303
+ if let Ok(profile) = env::var("PROFILE") {
304
+ let target_dir = workspace_root.join("target").join(profile);
305
+ if target_dir.exists() {
306
+ copy_lib_if_needed(
307
+ &src_lib,
308
+ &target_dir.join(&runtime_lib_name),
309
+ "CLI target directory",
310
+ target,
311
+ );
442
312
  }
443
313
  }
444
- Ok(())
445
- }
446
314
 
447
- /// Get platform-specific runtime library name and subdirectory
448
- ///
449
- /// Returns tuple of (library_name, subdirectory) for the target platform:
450
- /// - WASM: ("libpdfium.a", "release/lib")
451
- /// - Windows: ("pdfium.dll", "bin")
452
- /// - macOS: ("libpdfium.dylib", "lib")
453
- /// - Linux: ("libpdfium.so", "lib")
454
- fn runtime_library_info(target: &str) -> (String, &'static str) {
455
- if target.contains("wasm") {
456
- ("libpdfium.a".to_string(), "release/lib")
457
- } else if target.contains("windows") {
458
- ("pdfium.dll".to_string(), "bin")
459
- } else if target.contains("darwin") {
460
- ("libpdfium.dylib".to_string(), "lib")
315
+ let python_dest_dir = workspace_root.join("packages").join("python").join("kreuzberg");
316
+ if python_dest_dir.exists() {
317
+ copy_lib_if_needed(
318
+ &src_lib,
319
+ &python_dest_dir.join(&runtime_lib_name),
320
+ "Python package",
321
+ target,
322
+ );
461
323
  } else {
462
- ("libpdfium.so".to_string(), "lib")
463
- }
464
- }
465
-
466
- /// Find PDFium library in archive with flexible directory detection
467
- ///
468
- /// Attempts to locate the library at multiple possible locations:
469
- /// - {subdir}/{lib_name} (standard location)
470
- /// - {lib_name} (root of archive)
471
- /// - bin/{lib_name} (alternative location)
472
- /// - lib/{lib_name} (explicit lib directory)
473
- ///
474
- /// This handles variations in archive structure across different platform builds,
475
- /// particularly macOS ARM64 where the archive structure may differ.
476
- ///
477
- /// Returns the full path to the library if found, or an error with available files.
478
- fn find_pdfium_library(pdfium_dir: &Path, lib_name: &str, expected_subdir: &str) -> Result<PathBuf, String> {
479
- let candidates = [
480
- pdfium_dir.join(expected_subdir).join(lib_name),
481
- pdfium_dir.join(lib_name),
482
- pdfium_dir.join("bin").join(lib_name),
483
- pdfium_dir.join("lib").join(lib_name),
484
- ];
485
-
486
- for candidate in &candidates {
487
- if candidate.exists() {
488
- tracing::debug!("Found PDFium library at: {}", candidate.display());
489
- return Ok(candidate.clone());
490
- }
324
+ tracing::debug!("Python package directory not found, skipping Python library copy");
491
325
  }
492
326
 
493
- let mut error_msg = format!(
494
- "PDFium library not found at expected location: {}/{}\n\n",
495
- pdfium_dir.display(),
496
- expected_subdir
497
- );
498
- error_msg.push_str("Attempted locations:\n");
499
- for candidate in &candidates {
500
- error_msg.push_str(&format!(" - {}\n", candidate.display()));
327
+ let node_dest_dir = workspace_root.join("crates").join("kreuzberg-node");
328
+ if node_dest_dir.exists() {
329
+ copy_lib_if_needed(
330
+ &src_lib,
331
+ &node_dest_dir.join(&runtime_lib_name),
332
+ "Node.js package",
333
+ target,
334
+ );
335
+ } else {
336
+ tracing::debug!("Node.js package directory not found, skipping Node library copy");
501
337
  }
502
338
 
503
- error_msg.push_str("\nActual archive contents:\n");
504
- if let Ok(entries) = fs::read_dir(pdfium_dir) {
505
- for entry in entries.flatten() {
506
- let path = entry.path();
507
- let file_type = if path.is_dir() { "dir" } else { "file" };
508
- error_msg.push_str(&format!(" {} ({})\n", path.display(), file_type));
509
-
510
- if path.is_dir()
511
- && let Ok(sub_entries) = fs::read_dir(&path)
512
- {
513
- for sub_entry in sub_entries.flatten() {
514
- let sub_path = sub_entry.path();
515
- let sub_type = if sub_path.is_dir() { "dir" } else { "file" };
516
- error_msg.push_str(&format!(" {} ({})\n", sub_path.display(), sub_type));
517
- }
518
- }
519
- }
339
+ let ruby_dest_dir = workspace_root.join("packages").join("ruby").join("lib");
340
+ if ruby_dest_dir.exists() {
341
+ copy_lib_if_needed(&src_lib, &ruby_dest_dir.join(&runtime_lib_name), "Ruby package", target);
342
+ } else {
343
+ tracing::debug!("Ruby package directory not found, skipping Ruby library copy");
520
344
  }
521
-
522
- Err(error_msg)
523
345
  }
524
346
 
525
- /// Link PDFium dynamically (default)
526
- ///
527
- /// Sets up linker to use PDFium as a dynamic library (.dylib/.so/.dll)
528
- /// with platform-specific rpath configuration for runtime library discovery.
529
- /// Supports flexible archive structures by adding multiple possible lib directories.
530
- fn link_dynamically(pdfium_dir: &Path, target: &str) {
531
- let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
347
+ fn copy_lib_if_needed(src: &Path, dest: &Path, package_name: &str, target: &str) {
348
+ use std::fs;
532
349
 
533
- let lib_path = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
534
- Ok(path) => path.parent().unwrap_or(pdfium_dir).to_path_buf(),
535
- Err(err) => panic!("{}", err),
350
+ let should_copy = if dest.exists() {
351
+ let src_metadata = fs::metadata(src).ok();
352
+ let dest_metadata = fs::metadata(dest).ok();
353
+ match (src_metadata, dest_metadata) {
354
+ (Some(src), Some(dest)) => src.modified().ok() > dest.modified().ok(),
355
+ _ => true,
356
+ }
357
+ } else {
358
+ true
536
359
  };
537
360
 
538
- println!("cargo:rustc-link-search=native={}", lib_path.display());
539
- println!("cargo:rustc-link-lib=dylib=pdfium");
540
-
541
- let std_lib_dir = pdfium_dir.join("lib");
542
- if std_lib_dir.exists() && std_lib_dir != lib_path {
543
- println!("cargo:rustc-link-search=native={}", std_lib_dir.display());
361
+ if should_copy {
362
+ match fs::copy(src, dest) {
363
+ Ok(_) => {
364
+ tracing::debug!("Copied {} to {} ({})", src.display(), dest.display(), package_name);
365
+ codesign_if_needed(target, dest);
366
+ }
367
+ Err(e) => tracing::debug!("Failed to copy library to {}: {}", package_name, e),
368
+ }
544
369
  }
370
+ }
545
371
 
546
- let bin_dir = pdfium_dir.join("bin");
547
- if bin_dir.exists() && bin_dir != lib_path {
548
- println!("cargo:rustc-link-search=native={}", bin_dir.display());
372
+ fn codesign_if_needed(target: &str, binary: &Path) {
373
+ if !target.contains("apple-darwin") || !binary.exists() {
374
+ return;
549
375
  }
550
376
 
551
- if target.contains("darwin") {
552
- println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
553
- println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path/.");
554
- } else if target.contains("linux") {
555
- println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
556
- println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN/.");
557
- }
558
- }
377
+ let identity = env::var("KREUZBERG_CODESIGN_IDENTITY").unwrap_or_else(|_| "-".to_string());
378
+ let status = Command::new("codesign")
379
+ .arg("--force")
380
+ .arg("--timestamp=none")
381
+ .arg("--sign")
382
+ .arg(identity)
383
+ .arg(binary)
384
+ .status();
559
385
 
560
- /// Link PDFium statically (static-pdfium feature)
561
- ///
562
- /// Embeds PDFium into the binary as a static library. Adds system
563
- /// dependencies required for static linking on Linux.
564
- /// Supports flexible archive structures by finding library in multiple locations.
565
- ///
566
- /// Environment Variables:
567
- /// - `PDFIUM_STATIC_LIB_PATH`: Path to directory containing libpdfium.a (for Docker/musl builds)
568
- ///
569
- /// Note: bblanchon/pdfium-binaries only provides dynamic libraries.
570
- /// On macOS, this will fallback to dynamic linking with a warning.
571
- /// On Linux, you must provide PDFIUM_STATIC_LIB_PATH pointing to a static build.
572
- fn link_statically(pdfium_dir: &Path, target: &str) {
573
- let static_lib_name = "libpdfium.a";
574
- let lib_subdir = if target.contains("wasm") { "release/lib" } else { "lib" };
575
-
576
- if let Ok(custom_path) = env::var("PDFIUM_STATIC_LIB_PATH") {
577
- let custom_lib_dir = PathBuf::from(&custom_path);
578
-
579
- if !custom_lib_dir.exists() {
580
- panic!(
581
- "PDFIUM_STATIC_LIB_PATH points to '{}' but the directory does not exist",
582
- custom_path
583
- );
386
+ match status {
387
+ Ok(result) if result.success() => {
388
+ tracing::debug!("Codesigned {}", binary.display());
584
389
  }
585
-
586
- let custom_lib = custom_lib_dir.join(static_lib_name);
587
- if !custom_lib.exists() {
588
- panic!(
589
- "PDFIUM_STATIC_LIB_PATH points to '{}' but {} not found.\n\
590
- Expected to find: {}",
591
- custom_path,
592
- static_lib_name,
593
- custom_lib.display()
390
+ Ok(result) => {
391
+ tracing::debug!(
392
+ "codesign exited with status {} while signing {}",
393
+ result,
394
+ binary.display()
594
395
  );
595
396
  }
596
-
597
- tracing::debug!("Using custom static PDFium from: {}", custom_lib.display());
598
- println!("cargo:rustc-link-search=native={}", custom_lib_dir.display());
599
- println!("cargo:rustc-link-lib=static=pdfium");
600
-
601
- if target.contains("linux") {
602
- println!("cargo:rustc-link-lib=dylib=pthread");
603
- println!("cargo:rustc-link-lib=dylib=dl");
604
- } else if target.contains("windows") {
605
- println!("cargo:rustc-link-lib=dylib=ws2_32");
606
- println!("cargo:rustc-link-lib=dylib=userenv");
397
+ Err(err) => {
398
+ tracing::debug!("Failed to run codesign for {}: {}", binary.display(), err);
607
399
  }
608
-
609
- return;
610
400
  }
401
+ }
611
402
 
612
- let lib_path = match find_pdfium_library(pdfium_dir, static_lib_name, lib_subdir) {
613
- Ok(path) => path.parent().unwrap_or(pdfium_dir).to_path_buf(),
614
- Err(_err) => {
615
- if target.contains("darwin") {
616
- eprintln!("cargo:warning=Static PDFium library (libpdfium.a) not found for macOS.");
617
- eprintln!("cargo:warning=bblanchon/pdfium-binaries only provides dynamic libraries.");
618
- eprintln!("cargo:warning=Falling back to dynamic linking for local development.");
619
- eprintln!("cargo:warning=Production Linux builds require PDFIUM_STATIC_LIB_PATH.");
620
-
621
- link_dynamically(pdfium_dir, target);
622
- return;
623
- } else {
624
- panic!(
625
- "Static PDFium library (libpdfium.a) not found.\n\n\
626
- bblanchon/pdfium-binaries only provides dynamic libraries.\n\n\
627
- For static linking (required for Docker with musl), you must:\n\n\
628
- 1. Build static PDFium or obtain from a source that provides it\n\
629
- - See: https://github.com/ajrcarey/pdfium-render/issues/53\n\
630
- - Or use: https://github.com/paulocoutinhox/pdfium-lib (provides static builds)\n\n\
631
- 2. Set environment variable pointing to the directory containing libpdfium.a:\n\
632
- export PDFIUM_STATIC_LIB_PATH=/path/to/pdfium/lib\n\n\
633
- 3. Or use alternative features:\n\
634
- - 'pdf' (dynamic linking, requires .so at runtime)\n\
635
- - 'bundled-pdfium' (embeds dynamic library in binary)\n\
636
- - 'system-pdfium' (use system-installed pdfium)\n\n\
637
- Example Dockerfile pattern:\n\
638
- FROM alpine:latest as pdfium-builder\n\
639
- # Download/build static libpdfium.a\n\
640
- \n\
641
- FROM rust:alpine as builder\n\
642
- ENV PDFIUM_STATIC_LIB_PATH=/pdfium/lib\n\
643
- COPY --from=pdfium-builder /path/to/libpdfium.a /pdfium/lib/"
644
- );
645
- }
646
- }
647
- };
648
-
649
- println!("cargo:rustc-link-search=native={}", lib_path.display());
650
- println!("cargo:rustc-link-lib=static=pdfium");
651
-
652
- let std_lib_dir = pdfium_dir.join("lib");
653
- if std_lib_dir.exists() && std_lib_dir != lib_path {
654
- println!("cargo:rustc-link-search=native={}", std_lib_dir.display());
403
+ fn runtime_library_info(target: &str) -> (String, &'static str) {
404
+ if target.contains("windows") {
405
+ ("pdfium.dll".to_string(), "bin")
406
+ } else if target.contains("darwin") {
407
+ ("libpdfium.dylib".to_string(), "lib")
408
+ } else {
409
+ ("libpdfium.so".to_string(), "lib")
655
410
  }
411
+ }
656
412
 
657
- let bin_dir = pdfium_dir.join("bin");
658
- if bin_dir.exists() && bin_dir != lib_path {
659
- println!("cargo:rustc-link-search=native={}", bin_dir.display());
413
+ fn prepare_prebuilt_pdfium(prebuilt_src: &Path, dest_dir: &Path) -> io::Result<()> {
414
+ if dest_dir.exists() {
415
+ fs::remove_dir_all(dest_dir)?;
660
416
  }
417
+ copy_dir_all(prebuilt_src, dest_dir)
418
+ }
661
419
 
662
- if target.contains("linux") {
663
- println!("cargo:rustc-link-lib=dylib=pthread");
664
- println!("cargo:rustc-link-lib=dylib=dl");
665
- } else if target.contains("windows") {
666
- println!("cargo:rustc-link-lib=dylib=ws2_32");
667
- println!("cargo:rustc-link-lib=dylib=userenv");
420
+ fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
421
+ fs::create_dir_all(dst)?;
422
+ for entry in fs::read_dir(src)? {
423
+ let entry = entry?;
424
+ let file_type = entry.file_type()?;
425
+ let target_path = dst.join(entry.file_name());
426
+ if file_type.is_dir() {
427
+ copy_dir_all(&entry.path(), &target_path)?;
428
+ } else {
429
+ fs::copy(entry.path(), &target_path)?;
430
+ }
668
431
  }
432
+ Ok(())
669
433
  }
670
434
 
671
- /// Link PDFium bundled (bundled-pdfium feature)
672
- ///
673
- /// Links dynamically but copies library to OUT_DIR for embedding in binary.
674
- /// Each binary extracts and uses its own copy of the PDFium library.
675
- /// Supports flexible archive structures by finding library in multiple locations.
676
- ///
677
- /// For WASM targets, links statically using the bundled static library.
678
- fn link_bundled(pdfium_dir: &Path, target: &str, out_dir: &Path) {
679
- let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
680
- let src_lib = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
681
- Ok(path) => path,
682
- Err(err) => panic!("{}", err),
683
- };
684
- let bundled_lib = out_dir.join(&runtime_lib_name);
435
+ fn fix_macos_install_name(lib_path: &Path, lib_name: &str) {
436
+ use std::process::Command;
685
437
 
686
- fs::copy(&src_lib, &bundled_lib)
687
- .unwrap_or_else(|err| panic!("Failed to copy library to OUT_DIR for bundling: {}", err));
438
+ // Change install_name from ./libpdfium.dylib to @rpath/libpdfium.dylib
439
+ let new_install_name = format!("@rpath/{}", lib_name);
688
440
 
689
- let bundled_path = bundled_lib
690
- .to_str()
691
- .unwrap_or_else(|| panic!("Non-UTF8 path for bundled library: {}", bundled_lib.display()));
692
- println!("cargo:rustc-env=KREUZBERG_PDFIUM_BUNDLED_PATH={}", bundled_path);
441
+ tracing::debug!("Fixing install_name for {} to {}", lib_path.display(), new_install_name);
693
442
 
694
- if target.contains("wasm") {
695
- let lib_dir = bundled_lib
696
- .parent()
697
- .unwrap_or_else(|| panic!("Invalid bundled library path: {}", bundled_lib.display()));
698
- println!("cargo:rustc-link-search=native={}", lib_dir.display());
699
- println!("cargo:rustc-link-lib=static=pdfium");
700
- tracing::debug!("Bundled PDFium static library linked for WASM at: {}", bundled_path);
701
- } else {
702
- tracing::debug!("Bundled PDFium library at: {}", bundled_path);
703
- }
704
- }
443
+ let status = Command::new("install_name_tool")
444
+ .arg("-id")
445
+ .arg(&new_install_name)
446
+ .arg(lib_path)
447
+ .status();
705
448
 
706
- /// Link system-installed PDFium (system-pdfium feature)
707
- ///
708
- /// Attempts to find PDFium via pkg-config first, then falls back to
709
- /// environment variables (KREUZBERG_PDFIUM_SYSTEM_PATH, KREUZBERG_PDFIUM_SYSTEM_INCLUDE).
710
- fn link_system(_target: &str) {
711
- match pkg_config::Config::new().atleast_version("5.0").probe("pdfium") {
712
- Ok(library) => {
713
- tracing::debug!("Found system pdfium via pkg-config");
714
- for include_path in &library.include_paths {
715
- println!("cargo:include={}", include_path.display());
716
- }
717
- return;
718
- }
719
- Err(err) => {
720
- tracing::debug!("pkg-config probe failed: {}", err);
449
+ match status {
450
+ Ok(s) if s.success() => {
451
+ tracing::debug!("Successfully updated install_name");
721
452
  }
722
- }
723
-
724
- let lib_path = env::var("KREUZBERG_PDFIUM_SYSTEM_PATH").ok();
725
- let include_path = env::var("KREUZBERG_PDFIUM_SYSTEM_INCLUDE").ok();
726
-
727
- if let Some(lib_dir) = lib_path {
728
- let lib_dir_path = PathBuf::from(&lib_dir);
729
- if !lib_dir_path.exists() {
730
- panic!(
731
- "KREUZBERG_PDFIUM_SYSTEM_PATH points to '{}' but the directory does not exist",
732
- lib_dir
733
- );
453
+ Ok(s) => {
454
+ tracing::debug!("install_name_tool failed with status: {}", s);
734
455
  }
735
-
736
- println!("cargo:rustc-link-search=native={}", lib_dir);
737
- println!("cargo:rustc-link-lib=dylib=pdfium");
738
-
739
- if let Some(inc_dir) = include_path {
740
- println!("cargo:include={}", inc_dir);
456
+ Err(e) => {
457
+ tracing::debug!("Failed to run install_name_tool: {}", e);
741
458
  }
742
-
743
- tracing::debug!("Using system pdfium from: {}", lib_dir);
744
- return;
745
- }
746
-
747
- panic!(
748
- "system-pdfium feature enabled but pdfium not found.\n\
749
- \n\
750
- Please install pdfium system-wide or provide:\n\
751
- - KREUZBERG_PDFIUM_SYSTEM_PATH: path to directory containing libpdfium\n\
752
- - KREUZBERG_PDFIUM_SYSTEM_INCLUDE: path to pdfium headers (optional)\n\
753
- \n\
754
- Alternatively, use a different linking strategy:\n\
755
- - Default (dynamic): cargo build --features pdf\n\
756
- - Static linking: cargo build --features pdf,static-pdfium\n\
757
- - Bundled: cargo build --features pdf,bundled-pdfium"
758
- );
759
- }
760
-
761
- /// Link system frameworks and standard libraries
762
- ///
763
- /// Adds platform-specific system libraries required for PDFium linking:
764
- /// - macOS: CoreFoundation, CoreGraphics, CoreText, AppKit, libc++
765
- /// - Linux: stdc++, libm
766
- /// - Windows: gdi32, user32, advapi32
767
- fn link_system_frameworks(target: &str) {
768
- if target.contains("darwin") {
769
- println!("cargo:rustc-link-lib=framework=CoreFoundation");
770
- println!("cargo:rustc-link-lib=framework=CoreGraphics");
771
- println!("cargo:rustc-link-lib=framework=CoreText");
772
- println!("cargo:rustc-link-lib=framework=AppKit");
773
- println!("cargo:rustc-link-lib=dylib=c++");
774
- } else if target.contains("linux") {
775
- println!("cargo:rustc-link-lib=dylib=stdc++");
776
- println!("cargo:rustc-link-lib=dylib=m");
777
- } else if target.contains("windows") {
778
- println!("cargo:rustc-link-lib=dylib=gdi32");
779
- println!("cargo:rustc-link-lib=dylib=user32");
780
- println!("cargo:rustc-link-lib=dylib=advapi32");
781
459
  }
782
460
  }