kreuzberg 4.0.0.pre.rc.8 → 4.0.0.pre.rc.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (370) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -14
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -1
  5. data/.rubocop.yml +538 -538
  6. data/Gemfile +8 -8
  7. data/Gemfile.lock +4 -104
  8. data/README.md +454 -432
  9. data/Rakefile +25 -25
  10. data/Steepfile +47 -47
  11. data/examples/async_patterns.rb +341 -341
  12. data/ext/kreuzberg_rb/extconf.rb +45 -45
  13. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
  14. data/ext/kreuzberg_rb/native/Cargo.lock +6941 -6721
  15. data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
  16. data/ext/kreuzberg_rb/native/README.md +425 -425
  17. data/ext/kreuzberg_rb/native/build.rs +15 -15
  18. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  19. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  20. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  21. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  22. data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3135
  23. data/extconf.rb +28 -28
  24. data/kreuzberg.gemspec +214 -182
  25. data/lib/kreuzberg/api_proxy.rb +142 -142
  26. data/lib/kreuzberg/cache_api.rb +81 -46
  27. data/lib/kreuzberg/cli.rb +55 -55
  28. data/lib/kreuzberg/cli_proxy.rb +127 -127
  29. data/lib/kreuzberg/config.rb +724 -724
  30. data/lib/kreuzberg/error_context.rb +80 -32
  31. data/lib/kreuzberg/errors.rb +118 -118
  32. data/lib/kreuzberg/extraction_api.rb +340 -85
  33. data/lib/kreuzberg/mcp_proxy.rb +186 -186
  34. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
  35. data/lib/kreuzberg/post_processor_protocol.rb +86 -86
  36. data/lib/kreuzberg/result.rb +279 -279
  37. data/lib/kreuzberg/setup_lib_path.rb +80 -80
  38. data/lib/kreuzberg/validator_protocol.rb +89 -89
  39. data/lib/kreuzberg/version.rb +5 -5
  40. data/lib/kreuzberg.rb +109 -103
  41. data/lib/pdfium.dll +0 -0
  42. data/sig/kreuzberg/internal.rbs +184 -184
  43. data/sig/kreuzberg.rbs +546 -537
  44. data/spec/binding/cache_spec.rb +227 -227
  45. data/spec/binding/cli_proxy_spec.rb +85 -85
  46. data/spec/binding/cli_spec.rb +55 -55
  47. data/spec/binding/config_spec.rb +345 -345
  48. data/spec/binding/config_validation_spec.rb +283 -283
  49. data/spec/binding/error_handling_spec.rb +213 -213
  50. data/spec/binding/errors_spec.rb +66 -66
  51. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  52. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  53. data/spec/binding/plugins/validator_spec.rb +274 -274
  54. data/spec/fixtures/config.toml +39 -39
  55. data/spec/fixtures/config.yaml +41 -41
  56. data/spec/fixtures/invalid_config.toml +4 -4
  57. data/spec/smoke/package_spec.rb +178 -178
  58. data/spec/spec_helper.rb +42 -42
  59. data/vendor/Cargo.toml +45 -0
  60. data/vendor/kreuzberg/Cargo.toml +61 -38
  61. data/vendor/kreuzberg/README.md +230 -221
  62. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
  63. data/vendor/kreuzberg/build.rs +843 -891
  64. data/vendor/kreuzberg/src/api/error.rs +81 -81
  65. data/vendor/kreuzberg/src/api/handlers.rs +199 -199
  66. data/vendor/kreuzberg/src/api/mod.rs +79 -79
  67. data/vendor/kreuzberg/src/api/server.rs +353 -353
  68. data/vendor/kreuzberg/src/api/types.rs +170 -170
  69. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
  70. data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
  71. data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
  72. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
  73. data/vendor/kreuzberg/src/core/config.rs +1080 -1080
  74. data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
  75. data/vendor/kreuzberg/src/core/io.rs +329 -329
  76. data/vendor/kreuzberg/src/core/mime.rs +605 -605
  77. data/vendor/kreuzberg/src/core/mod.rs +47 -47
  78. data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1171
  79. data/vendor/kreuzberg/src/embeddings.rs +500 -432
  80. data/vendor/kreuzberg/src/error.rs +431 -431
  81. data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
  82. data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
  83. data/vendor/kreuzberg/src/extraction/email.rs +854 -854
  84. data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
  85. data/vendor/kreuzberg/src/extraction/html.rs +601 -569
  86. data/vendor/kreuzberg/src/extraction/image.rs +491 -491
  87. data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -562
  88. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
  89. data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
  90. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  91. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  92. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  93. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
  94. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
  95. data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
  96. data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
  97. data/vendor/kreuzberg/src/extraction/table.rs +328 -328
  98. data/vendor/kreuzberg/src/extraction/text.rs +269 -269
  99. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  100. data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
  101. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
  102. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
  103. data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
  104. data/vendor/kreuzberg/src/extractors/email.rs +157 -157
  105. data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
  106. data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
  107. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
  108. data/vendor/kreuzberg/src/extractors/html.rs +407 -407
  109. data/vendor/kreuzberg/src/extractors/image.rs +219 -219
  110. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
  112. data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
  114. data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
  115. data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
  116. data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
  117. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
  118. data/vendor/kreuzberg/src/extractors/pdf.rs +749 -673
  119. data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
  120. data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
  121. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
  122. data/vendor/kreuzberg/src/extractors/security.rs +484 -484
  123. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
  124. data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
  125. data/vendor/kreuzberg/src/extractors/text.rs +265 -265
  126. data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
  127. data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
  128. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  129. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  130. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  131. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  132. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  133. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  134. data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
  135. data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
  136. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  137. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  138. data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
  139. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
  140. data/vendor/kreuzberg/src/lib.rs +113 -113
  141. data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
  142. data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
  143. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  144. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  145. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  146. data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
  147. data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
  148. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  149. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  150. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
  151. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  152. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  153. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  154. data/vendor/kreuzberg/src/panic_context.rs +154 -154
  155. data/vendor/kreuzberg/src/pdf/bindings.rs +44 -0
  156. data/vendor/kreuzberg/src/pdf/bundled.rs +346 -328
  157. data/vendor/kreuzberg/src/pdf/error.rs +130 -130
  158. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  159. data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
  160. data/vendor/kreuzberg/src/pdf/mod.rs +68 -66
  161. data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
  162. data/vendor/kreuzberg/src/pdf/table.rs +420 -417
  163. data/vendor/kreuzberg/src/pdf/text.rs +240 -240
  164. data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
  165. data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
  166. data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
  167. data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
  168. data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
  169. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  170. data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
  171. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  172. data/vendor/kreuzberg/src/text/mod.rs +25 -25
  173. data/vendor/kreuzberg/src/text/quality.rs +697 -697
  174. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
  175. data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
  176. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  177. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  178. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
  179. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
  180. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  181. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  182. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
  183. data/vendor/kreuzberg/src/types.rs +1055 -1055
  184. data/vendor/kreuzberg/src/utils/mod.rs +17 -17
  185. data/vendor/kreuzberg/src/utils/quality.rs +959 -959
  186. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  187. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  188. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  189. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  190. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  191. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  192. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  193. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  194. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  195. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  196. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  197. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  198. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  199. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  200. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  201. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  202. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  203. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  204. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  205. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  206. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  207. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  208. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  209. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  210. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  211. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  212. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  213. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  214. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  215. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  216. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  217. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  218. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  219. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  220. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  221. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  222. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  223. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  224. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  225. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  226. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  227. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  228. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  229. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  230. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  231. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  232. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  233. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  234. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  235. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  236. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  237. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  238. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  239. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  240. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  241. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  242. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  243. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  244. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  245. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  246. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  247. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  248. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  249. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  250. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  251. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
  252. data/vendor/kreuzberg/tests/api_tests.rs +966 -966
  253. data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
  254. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
  255. data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
  256. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
  257. data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
  258. data/vendor/kreuzberg/tests/config_features.rs +612 -612
  259. data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
  260. data/vendor/kreuzberg/tests/core_integration.rs +510 -510
  261. data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
  262. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
  263. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
  264. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
  265. data/vendor/kreuzberg/tests/email_integration.rs +327 -327
  266. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
  267. data/vendor/kreuzberg/tests/error_handling.rs +402 -402
  268. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
  269. data/vendor/kreuzberg/tests/format_integration.rs +164 -161
  270. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
  271. data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
  272. data/vendor/kreuzberg/tests/image_integration.rs +255 -255
  273. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
  274. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
  275. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
  276. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  277. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  278. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
  279. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
  280. data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
  281. data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
  282. data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
  283. data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
  284. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  285. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
  286. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
  287. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
  288. data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
  289. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
  290. data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
  291. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
  292. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
  293. data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
  294. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  295. data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
  296. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
  297. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
  298. data/vendor/kreuzberg/tests/security_validation.rs +416 -416
  299. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  300. data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
  301. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
  302. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
  303. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  304. data/vendor/kreuzberg-ffi/Cargo.toml +63 -0
  305. data/vendor/kreuzberg-ffi/README.md +851 -0
  306. data/vendor/kreuzberg-ffi/build.rs +176 -0
  307. data/vendor/kreuzberg-ffi/cbindgen.toml +27 -0
  308. data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -0
  309. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  310. data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -0
  311. data/vendor/kreuzberg-ffi/src/lib.rs +3616 -0
  312. data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -0
  313. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  314. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  315. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  316. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  317. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  318. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  319. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  320. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  321. data/vendor/kreuzberg-tesseract/Cargo.toml +48 -0
  322. data/vendor/kreuzberg-tesseract/LICENSE +22 -0
  323. data/vendor/kreuzberg-tesseract/README.md +399 -0
  324. data/vendor/kreuzberg-tesseract/build.rs +1354 -0
  325. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  326. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  327. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  328. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  329. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  330. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  331. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  332. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  333. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  334. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  335. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  336. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  337. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  338. data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
  339. data/vendor/rb-sys/Cargo.lock +393 -393
  340. data/vendor/rb-sys/Cargo.toml +70 -70
  341. data/vendor/rb-sys/Cargo.toml.orig +57 -57
  342. data/vendor/rb-sys/LICENSE-APACHE +190 -190
  343. data/vendor/rb-sys/LICENSE-MIT +21 -21
  344. data/vendor/rb-sys/build/features.rs +111 -111
  345. data/vendor/rb-sys/build/main.rs +286 -286
  346. data/vendor/rb-sys/build/stable_api_config.rs +155 -155
  347. data/vendor/rb-sys/build/version.rs +50 -50
  348. data/vendor/rb-sys/readme.md +36 -36
  349. data/vendor/rb-sys/src/bindings.rs +21 -21
  350. data/vendor/rb-sys/src/hidden.rs +11 -11
  351. data/vendor/rb-sys/src/lib.rs +35 -35
  352. data/vendor/rb-sys/src/macros.rs +371 -371
  353. data/vendor/rb-sys/src/memory.rs +53 -53
  354. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
  355. data/vendor/rb-sys/src/special_consts.rs +31 -31
  356. data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
  357. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
  358. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
  359. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
  360. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
  361. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
  362. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
  363. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
  364. data/vendor/rb-sys/src/stable_api.rs +260 -260
  365. data/vendor/rb-sys/src/symbol.rs +31 -31
  366. data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
  367. data/vendor/rb-sys/src/utils.rs +89 -89
  368. data/vendor/rb-sys/src/value_type.rs +7 -7
  369. metadata +44 -81
  370. data/vendor/rb-sys/bin/release.sh +0 -21
@@ -1,891 +1,843 @@
1
- use std::env;
2
- use std::fs;
3
- use std::io;
4
- use std::path::{Path, PathBuf};
5
- use std::process::Command;
6
- use std::thread;
7
- use std::time::Duration;
8
-
9
- /// PDFium linking strategy
10
- #[derive(Debug, Clone, Copy, PartialEq, Eq)]
11
- enum PdfiumLinkStrategy {
12
- /// Download and link dynamically (default behavior)
13
- DownloadDynamic,
14
- /// Download and link statically (pdf-static feature)
15
- DownloadStatic,
16
- /// Download, link dynamically, and embed in binary (pdf-bundled feature)
17
- Bundled,
18
- /// Use system-installed pdfium via pkg-config (pdf-system feature)
19
- System,
20
- }
21
-
22
- // ============================================================================
23
- // MAIN BUILD ORCHESTRATION
24
- // ============================================================================
25
-
26
- fn main() {
27
- let target = env::var("TARGET").unwrap();
28
- let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
29
-
30
- println!("cargo::rustc-check-cfg=cfg(coverage)");
31
-
32
- // Skip pdfium linking if the pdf feature is not enabled
33
- if !cfg!(feature = "pdf") {
34
- tracing::debug!("PDF feature not enabled, skipping pdfium linking");
35
- return;
36
- }
37
-
38
- validate_feature_exclusivity();
39
- let strategy = determine_link_strategy(&target);
40
-
41
- tracing::debug!("Using PDFium linking strategy: {:?}", strategy);
42
-
43
- match strategy {
44
- PdfiumLinkStrategy::DownloadDynamic => {
45
- let pdfium_dir = download_or_use_prebuilt(&target, &out_dir);
46
- link_dynamically(&pdfium_dir, &target);
47
- copy_lib_to_package(&pdfium_dir, &target);
48
- }
49
- PdfiumLinkStrategy::DownloadStatic => {
50
- let pdfium_dir = download_or_use_prebuilt(&target, &out_dir);
51
- link_statically(&pdfium_dir, &target);
52
- // Skip copy_lib_to_package - library embedded in binary
53
- }
54
- PdfiumLinkStrategy::Bundled => {
55
- let pdfium_dir = download_or_use_prebuilt(&target, &out_dir);
56
- link_bundled(&pdfium_dir, &target, &out_dir);
57
- // Skip copy_lib_to_package - each binary extracts its own
58
- }
59
- PdfiumLinkStrategy::System => {
60
- link_system(&target);
61
- // No download or copy needed
62
- }
63
- }
64
-
65
- link_system_frameworks(&target);
66
- println!("cargo:rerun-if-changed=build.rs");
67
- }
68
-
69
- // ============================================================================
70
- // FEATURE & STRATEGY VALIDATION
71
- // ============================================================================
72
-
73
- /// Validate that only one linking strategy feature is enabled at a time
74
- fn validate_feature_exclusivity() {
75
- let strategies = [
76
- cfg!(feature = "pdf-static"),
77
- cfg!(feature = "pdf-bundled"),
78
- cfg!(feature = "pdf-system"),
79
- ];
80
- let count = strategies.iter().filter(|&&x| x).count();
81
-
82
- if count > 1 {
83
- panic!(
84
- "Only one of pdf-static, pdf-bundled, pdf-system can be enabled at once.\n\
85
- Please choose a single PDFium linking strategy."
86
- );
87
- }
88
- }
89
-
90
- /// Determine which linking strategy to use based on features and target
91
- fn determine_link_strategy(target: &str) -> PdfiumLinkStrategy {
92
- // WASM always uses static linking
93
- if target.contains("wasm") {
94
- return PdfiumLinkStrategy::DownloadStatic;
95
- }
96
-
97
- // Feature-based strategy selection (priority order)
98
- if cfg!(feature = "pdf-system") {
99
- return PdfiumLinkStrategy::System;
100
- }
101
- if cfg!(feature = "pdf-bundled") {
102
- return PdfiumLinkStrategy::Bundled;
103
- }
104
- if cfg!(feature = "pdf-static") {
105
- return PdfiumLinkStrategy::DownloadStatic;
106
- }
107
-
108
- // Default: download and link dynamically
109
- PdfiumLinkStrategy::DownloadDynamic
110
- }
111
-
112
- // ============================================================================
113
- // DOWNLOAD & PREBUILT ORCHESTRATION
114
- // ============================================================================
115
-
116
- /// Download PDFium or use prebuilt directory
117
- ///
118
- /// This is the main orchestrator function that:
119
- /// 1. Checks for `KREUZBERG_PDFIUM_PREBUILT` environment variable
120
- /// 2. If set and valid, uses prebuilt pdfium directory
121
- /// 3. If not set, downloads pdfium to out_dir (with caching)
122
- /// 4. Returns PathBuf to pdfium directory
123
- ///
124
- /// Reuses all existing helper functions:
125
- /// - `get_pdfium_url_and_lib()` - determines download URL for target
126
- /// - `download_and_extract_pdfium()` - downloads with retry logic
127
- /// - `runtime_library_info()` - platform-specific library names
128
- /// - `prepare_prebuilt_pdfium()` - handles prebuilt copy
129
- fn download_or_use_prebuilt(target: &str, out_dir: &Path) -> PathBuf {
130
- let (download_url, _lib_name) = get_pdfium_url_and_lib(target);
131
- let pdfium_dir = out_dir.join("pdfium");
132
-
133
- // Check for prebuilt pdfium directory
134
- if let Some(prebuilt) = env::var_os("KREUZBERG_PDFIUM_PREBUILT") {
135
- let prebuilt_path = PathBuf::from(prebuilt);
136
- if prebuilt_path.exists() {
137
- prepare_prebuilt_pdfium(&prebuilt_path, &pdfium_dir)
138
- .unwrap_or_else(|err| panic!("Failed to copy Pdfium from {}: {}", prebuilt_path.display(), err));
139
- return pdfium_dir;
140
- } else {
141
- panic!(
142
- "Environment variable KREUZBERG_PDFIUM_PREBUILT points to '{}' but the directory does not exist",
143
- prebuilt_path.display()
144
- );
145
- }
146
- }
147
-
148
- // Check if library already exists (cache validation) using flexible detection
149
- let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
150
- let lib_found = find_pdfium_library(&pdfium_dir, &runtime_lib_name, runtime_subdir).is_ok();
151
-
152
- let import_lib_exists = if target.contains("windows") {
153
- let lib_dir = pdfium_dir.join("lib");
154
- lib_dir.join("pdfium.lib").exists() || lib_dir.join("pdfium.dll.lib").exists()
155
- } else {
156
- true
157
- };
158
-
159
- if !lib_found || !import_lib_exists {
160
- tracing::debug!("Pdfium library not found, downloading for target: {}", target);
161
- tracing::debug!("Download URL: {}", download_url);
162
- download_and_extract_pdfium(&download_url, &pdfium_dir);
163
- } else {
164
- tracing::debug!("Pdfium library already cached at {}", pdfium_dir.display());
165
- }
166
-
167
- // Windows-specific: ensure pdfium.lib exists
168
- if target.contains("windows") {
169
- let lib_dir = pdfium_dir.join("lib");
170
- let dll_lib = lib_dir.join("pdfium.dll.lib");
171
- let expected_lib = lib_dir.join("pdfium.lib");
172
-
173
- if dll_lib.exists() && !expected_lib.exists() {
174
- tracing::debug!("Renaming cached {} to {}", dll_lib.display(), expected_lib.display());
175
- fs::rename(&dll_lib, &expected_lib).expect("Failed to rename pdfium.dll.lib to pdfium.lib");
176
- }
177
- }
178
-
179
- pdfium_dir
180
- }
181
-
182
- // ============================================================================
183
- // DOWNLOAD UTILITIES
184
- // ============================================================================
185
-
186
- /// Fetch the latest release version from a GitHub repository
187
- ///
188
- /// Uses curl to query the GitHub API and extract the tag_name from the
189
- /// latest release JSON response. Falls back to "7529" if API call fails.
190
- fn get_latest_version(repo: &str) -> String {
191
- let api_url = format!("https://api.github.com/repos/{}/releases/latest", repo);
192
-
193
- let output = Command::new("curl").args(["-s", &api_url]).output();
194
-
195
- if let Ok(output) = output
196
- && output.status.success()
197
- {
198
- let json = String::from_utf8_lossy(&output.stdout);
199
- if let Some(start) = json.find("\"tag_name\":") {
200
- let after_colon = &json[start + "\"tag_name\":".len()..];
201
- if let Some(opening_quote) = after_colon.find('"')
202
- && let Some(closing_quote) = after_colon[opening_quote + 1..].find('"')
203
- {
204
- let tag_start = opening_quote + 1;
205
- let tag = &after_colon[tag_start..tag_start + closing_quote];
206
- return tag.split('/').next_back().unwrap_or(tag).to_string();
207
- }
208
- }
209
- }
210
-
211
- "7529".to_string()
212
- }
213
-
214
- /// Get the download URL and library name for the target platform
215
- ///
216
- /// Determines platform/architecture from target triple and constructs
217
- /// the appropriate GitHub release download URL. Supports:
218
- /// - WASM: paulocoutinhox/pdfium-lib
219
- /// - Other platforms: bblanchon/pdfium-binaries
220
- fn get_pdfium_url_and_lib(target: &str) -> (String, String) {
221
- if target.contains("wasm") {
222
- let version = env::var("PDFIUM_WASM_VERSION")
223
- .ok()
224
- .filter(|v| !v.is_empty())
225
- .unwrap_or_else(|| get_latest_version("paulocoutinhox/pdfium-lib"));
226
- tracing::debug!("Using pdfium-lib version: {}", version);
227
-
228
- // WASM builds use a single 'wasm.tgz' asset regardless of architecture
229
- // The archive contains both wasm32 and wasm64 if available
230
- return (
231
- format!(
232
- "https://github.com/paulocoutinhox/pdfium-lib/releases/download/{}/wasm.tgz",
233
- version
234
- ),
235
- "pdfium".to_string(),
236
- );
237
- }
238
-
239
- let (platform, arch) = if target.contains("darwin") {
240
- let arch = if target.contains("aarch64") { "arm64" } else { "x64" };
241
- ("mac", arch)
242
- } else if target.contains("linux") {
243
- let arch = if target.contains("aarch64") {
244
- "arm64"
245
- } else if target.contains("arm") {
246
- "arm"
247
- } else {
248
- "x64"
249
- };
250
- ("linux", arch)
251
- } else if target.contains("windows") {
252
- let arch = if target.contains("aarch64") {
253
- "arm64"
254
- } else if target.contains("i686") {
255
- "x86"
256
- } else {
257
- "x64"
258
- };
259
- ("win", arch)
260
- } else {
261
- panic!("Unsupported target platform: {}", target);
262
- };
263
-
264
- let version = env::var("PDFIUM_VERSION")
265
- .ok()
266
- .filter(|v| !v.is_empty())
267
- .unwrap_or_else(|| get_latest_version("bblanchon/pdfium-binaries"));
268
- tracing::debug!("Using pdfium-binaries version: {}", version);
269
-
270
- let url = format!(
271
- "https://github.com/bblanchon/pdfium-binaries/releases/download/chromium/{}/pdfium-{}-{}.tgz",
272
- version, platform, arch
273
- );
274
-
275
- (url, "pdfium".to_string())
276
- }
277
-
278
- /// Download and extract PDFium archive with retry logic
279
- ///
280
- /// Features:
281
- /// - Exponential backoff retry (configurable via env vars)
282
- /// - File type validation (gzip check)
283
- /// - Windows-specific import library handling (pdfium.dll.lib -> pdfium.lib)
284
- /// - Environment variables:
285
- /// - KREUZBERG_PDFIUM_DOWNLOAD_RETRIES: number of retries (default: 5)
286
- /// - KREUZBERG_PDFIUM_DOWNLOAD_BACKOFF_SECS: initial backoff in seconds (default: 2)
287
- fn download_and_extract_pdfium(url: &str, dest_dir: &Path) {
288
- fs::create_dir_all(dest_dir).expect("Failed to create pdfium directory");
289
-
290
- let archive_path = dest_dir.join("pdfium.tar.gz");
291
- let retries = env::var("KREUZBERG_PDFIUM_DOWNLOAD_RETRIES")
292
- .ok()
293
- .and_then(|value| value.parse::<u32>().ok())
294
- .filter(|value| *value > 0)
295
- .unwrap_or(5);
296
- let base_delay = env::var("KREUZBERG_PDFIUM_DOWNLOAD_BACKOFF_SECS")
297
- .ok()
298
- .and_then(|value| value.parse::<u64>().ok())
299
- .filter(|value| *value > 0)
300
- .unwrap_or(2);
301
-
302
- let archive_path_str = archive_path
303
- .to_str()
304
- .unwrap_or_else(|| panic!("Non-UTF8 path for archive: {}", archive_path.display()));
305
- let mut last_error = String::new();
306
-
307
- for attempt in 1..=retries {
308
- let _ = fs::remove_file(&archive_path);
309
- tracing::debug!(
310
- "Downloading Pdfium archive from: {} (attempt {}/{})",
311
- url,
312
- attempt,
313
- retries
314
- );
315
-
316
- let status = Command::new("curl")
317
- .args(["-f", "-L", "-o", archive_path_str, url])
318
- .status();
319
-
320
- match status {
321
- Ok(code) if code.success() => {
322
- last_error.clear();
323
- break;
324
- }
325
- Ok(code) => {
326
- last_error = format!("curl exited with {:?}", code.code());
327
- }
328
- Err(err) => {
329
- last_error = format!("failed to spawn curl: {err}");
330
- }
331
- }
332
-
333
- if attempt == retries {
334
- panic!(
335
- "Failed to download Pdfium from {} after {} attempts. Last error: {}",
336
- url, retries, last_error
337
- );
338
- }
339
-
340
- let exponent = u32::min(attempt, 5);
341
- let multiplier = 1u64 << exponent;
342
- let delay_secs = base_delay.saturating_mul(multiplier).min(30);
343
- println!(
344
- "cargo:warning=Pdfium download failed (attempt {}/{}) - {}. Retrying in {}s",
345
- attempt, retries, last_error, delay_secs
346
- );
347
- thread::sleep(Duration::from_secs(delay_secs));
348
- }
349
-
350
- let file_type = Command::new("file")
351
- .arg(archive_path.to_str().unwrap())
352
- .output()
353
- .expect("Failed to check file type");
354
-
355
- let file_type_output = String::from_utf8_lossy(&file_type.stdout);
356
- tracing::debug!("Downloaded file type: {}", file_type_output.trim());
357
-
358
- if !file_type_output.to_lowercase().contains("gzip") && !file_type_output.to_lowercase().contains("compressed") {
359
- fs::remove_file(&archive_path).ok();
360
- panic!(
361
- "Downloaded file is not a valid gzip archive. URL may be incorrect or version unavailable: {}",
362
- url
363
- );
364
- }
365
-
366
- tracing::debug!("Extracting Pdfium archive...");
367
- let status = Command::new("tar")
368
- .args(["-xzf", archive_path.to_str().unwrap(), "-C", dest_dir.to_str().unwrap()])
369
- .status()
370
- .expect("Failed to execute tar");
371
-
372
- if !status.success() {
373
- fs::remove_file(&archive_path).ok();
374
- panic!("Failed to extract Pdfium archive from {}", url);
375
- }
376
-
377
- fs::remove_file(&archive_path).ok();
378
-
379
- let target = env::var("TARGET").unwrap();
380
- if target.contains("windows") {
381
- let lib_dir = dest_dir.join("lib");
382
- let dll_lib = lib_dir.join("pdfium.dll.lib");
383
- let expected_lib = lib_dir.join("pdfium.lib");
384
-
385
- if dll_lib.exists() {
386
- tracing::debug!("Ensuring Windows import library at {}", expected_lib.display());
387
- if let Err(err) = fs::copy(&dll_lib, &expected_lib) {
388
- panic!("Failed to copy pdfium.dll.lib to pdfium.lib: {err}");
389
- }
390
- } else {
391
- tracing::debug!("Warning: Expected {} not found after extraction", dll_lib.display());
392
- }
393
- }
394
-
395
- tracing::debug!("Pdfium downloaded and extracted successfully");
396
- }
397
-
398
- // ============================================================================
399
- // PREBUILT HANDLING
400
- // ============================================================================
401
-
402
- /// Prepare prebuilt PDFium by copying to destination directory
403
- ///
404
- /// Removes existing destination if present, then recursively copies
405
- /// all files from prebuilt source to destination.
406
- fn prepare_prebuilt_pdfium(prebuilt_src: &Path, dest_dir: &Path) -> io::Result<()> {
407
- if dest_dir.exists() {
408
- fs::remove_dir_all(dest_dir)?;
409
- }
410
- copy_dir_all(prebuilt_src, dest_dir)
411
- }
412
-
413
- /// Recursively copy directory tree
414
- ///
415
- /// Used by `prepare_prebuilt_pdfium()` to copy entire pdfium directory
416
- /// structure, preserving all files and subdirectories.
417
- fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
418
- fs::create_dir_all(dst)?;
419
- for entry in fs::read_dir(src)? {
420
- let entry = entry?;
421
- let file_type = entry.file_type()?;
422
- let target_path = dst.join(entry.file_name());
423
- if file_type.is_dir() {
424
- copy_dir_all(&entry.path(), &target_path)?;
425
- } else {
426
- fs::copy(entry.path(), &target_path)?;
427
- }
428
- }
429
- Ok(())
430
- }
431
-
432
- // ============================================================================
433
- // PLATFORM UTILITIES
434
- // ============================================================================
435
-
436
- /// Get platform-specific runtime library name and subdirectory
437
- ///
438
- /// Returns tuple of (library_name, subdirectory) for the target platform:
439
- /// - WASM: ("libpdfium.a", "lib")
440
- /// - Windows: ("pdfium.dll", "bin")
441
- /// - macOS: ("libpdfium.dylib", "lib")
442
- /// - Linux: ("libpdfium.so", "lib")
443
- fn runtime_library_info(target: &str) -> (String, &'static str) {
444
- if target.contains("wasm") {
445
- ("libpdfium.a".to_string(), "lib")
446
- } else if target.contains("windows") {
447
- ("pdfium.dll".to_string(), "bin")
448
- } else if target.contains("darwin") {
449
- ("libpdfium.dylib".to_string(), "lib")
450
- } else {
451
- ("libpdfium.so".to_string(), "lib")
452
- }
453
- }
454
-
455
- /// Find PDFium library in archive with flexible directory detection
456
- ///
457
- /// Attempts to locate the library at multiple possible locations:
458
- /// - {subdir}/{lib_name} (standard location)
459
- /// - {lib_name} (root of archive)
460
- /// - bin/{lib_name} (alternative location)
461
- /// - lib/{lib_name} (explicit lib directory)
462
- ///
463
- /// This handles variations in archive structure across different platform builds,
464
- /// particularly macOS ARM64 where the archive structure may differ.
465
- ///
466
- /// Returns the full path to the library if found, or an error with available files.
467
- fn find_pdfium_library(pdfium_dir: &Path, lib_name: &str, expected_subdir: &str) -> Result<PathBuf, String> {
468
- // Candidates in priority order
469
- let candidates = [
470
- pdfium_dir.join(expected_subdir).join(lib_name), // Standard: lib/libpdfium.dylib
471
- pdfium_dir.join(lib_name), // Root: libpdfium.dylib
472
- pdfium_dir.join("bin").join(lib_name), // Alternative: bin/libpdfium.dylib
473
- pdfium_dir.join("lib").join(lib_name), // Explicit lib: lib/libpdfium.dylib
474
- ];
475
-
476
- // Try each candidate
477
- for candidate in &candidates {
478
- if candidate.exists() {
479
- tracing::debug!("Found PDFium library at: {}", candidate.display());
480
- return Ok(candidate.clone());
481
- }
482
- }
483
-
484
- // Library not found - provide detailed error with directory listing
485
- let mut error_msg = format!(
486
- "PDFium library not found at expected location: {}/{}\n\n",
487
- pdfium_dir.display(),
488
- expected_subdir
489
- );
490
- error_msg.push_str("Attempted locations:\n");
491
- for candidate in &candidates {
492
- error_msg.push_str(&format!(" - {}\n", candidate.display()));
493
- }
494
-
495
- // List actual contents of pdfium directory for debugging
496
- error_msg.push_str("\nActual archive contents:\n");
497
- if let Ok(entries) = fs::read_dir(pdfium_dir) {
498
- for entry in entries.flatten() {
499
- let path = entry.path();
500
- let file_type = if path.is_dir() { "dir" } else { "file" };
501
- error_msg.push_str(&format!(" {} ({})\n", path.display(), file_type));
502
-
503
- // Show contents of subdirectories
504
- if path.is_dir()
505
- && let Ok(sub_entries) = fs::read_dir(&path)
506
- {
507
- for sub_entry in sub_entries.flatten() {
508
- let sub_path = sub_entry.path();
509
- let sub_type = if sub_path.is_dir() { "dir" } else { "file" };
510
- error_msg.push_str(&format!(" {} ({})\n", sub_path.display(), sub_type));
511
- }
512
- }
513
- }
514
- }
515
-
516
- Err(error_msg)
517
- }
518
-
519
- /// Fix macOS install name (rpath) for dynamic library
520
- ///
521
- /// Uses install_name_tool to set the install name to @rpath/{lib_name}
522
- /// to enable relative path loading on macOS.
523
- fn fix_macos_install_name(lib_path: &Path, lib_name: &str) {
524
- let new_install_name = format!("@rpath/{}", lib_name);
525
-
526
- tracing::debug!("Fixing install_name for {} to {}", lib_path.display(), new_install_name);
527
-
528
- let status = Command::new("install_name_tool")
529
- .arg("-id")
530
- .arg(&new_install_name)
531
- .arg(lib_path)
532
- .status();
533
-
534
- match status {
535
- Ok(s) if s.success() => {
536
- tracing::debug!("Successfully updated install_name");
537
- }
538
- Ok(s) => {
539
- tracing::debug!("install_name_tool failed with status: {}", s);
540
- }
541
- Err(e) => {
542
- tracing::debug!("Failed to run install_name_tool: {}", e);
543
- }
544
- }
545
- }
546
-
547
- /// Code sign binary on macOS if needed
548
- ///
549
- /// Uses codesign to sign the binary. Identity from KREUZBERG_CODESIGN_IDENTITY
550
- /// env var (default: "-" for adhoc signing). Only runs on apple-darwin targets.
551
- fn codesign_if_needed(target: &str, binary: &Path) {
552
- if !target.contains("apple-darwin") || !binary.exists() {
553
- return;
554
- }
555
-
556
- let identity = env::var("KREUZBERG_CODESIGN_IDENTITY").unwrap_or_else(|_| "-".to_string());
557
- let status = Command::new("codesign")
558
- .arg("--force")
559
- .arg("--timestamp=none")
560
- .arg("--sign")
561
- .arg(identity)
562
- .arg(binary)
563
- .status();
564
-
565
- match status {
566
- Ok(result) if result.success() => {
567
- tracing::debug!("Codesigned {}", binary.display());
568
- }
569
- Ok(result) => {
570
- tracing::debug!(
571
- "codesign exited with status {} while signing {}",
572
- result,
573
- binary.display()
574
- );
575
- }
576
- Err(err) => {
577
- tracing::debug!("Failed to run codesign for {}: {}", binary.display(), err);
578
- }
579
- }
580
- }
581
-
582
- // ============================================================================
583
- // LINKING STRATEGIES
584
- // ============================================================================
585
-
586
- /// Link PDFium dynamically (default)
587
- ///
588
- /// Sets up linker to use PDFium as a dynamic library (.dylib/.so/.dll)
589
- /// with platform-specific rpath configuration for runtime library discovery.
590
- /// Supports flexible archive structures by adding multiple possible lib directories.
591
- fn link_dynamically(pdfium_dir: &Path, target: &str) {
592
- let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
593
-
594
- // Find the actual library location (handles multiple possible archive structures)
595
- let lib_path = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
596
- Ok(path) => path.parent().unwrap_or(pdfium_dir).to_path_buf(),
597
- Err(err) => panic!("{}", err),
598
- };
599
-
600
- println!("cargo:rustc-link-search=native={}", lib_path.display());
601
- println!("cargo:rustc-link-lib=dylib=pdfium");
602
-
603
- // Also add standard lib directory for compatibility
604
- let std_lib_dir = pdfium_dir.join("lib");
605
- if std_lib_dir.exists() && std_lib_dir != lib_path {
606
- println!("cargo:rustc-link-search=native={}", std_lib_dir.display());
607
- }
608
-
609
- // Add bin directory for platforms where it might be needed
610
- let bin_dir = pdfium_dir.join("bin");
611
- if bin_dir.exists() && bin_dir != lib_path {
612
- println!("cargo:rustc-link-search=native={}", bin_dir.display());
613
- }
614
-
615
- // Set rpath for dynamic linking
616
- if target.contains("darwin") {
617
- println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
618
- println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path/.");
619
- } else if target.contains("linux") {
620
- println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
621
- println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN/.");
622
- }
623
- }
624
-
625
- /// Link PDFium statically (pdf-static feature)
626
- ///
627
- /// Embeds PDFium into the binary as a static library. Adds system
628
- /// dependencies required for static linking on Linux.
629
- /// Supports flexible archive structures by finding library in multiple locations.
630
- fn link_statically(pdfium_dir: &Path, target: &str) {
631
- let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
632
-
633
- // Find the actual library location (handles multiple possible archive structures)
634
- let lib_path = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
635
- Ok(path) => path.parent().unwrap_or(pdfium_dir).to_path_buf(),
636
- Err(err) => panic!("{}", err),
637
- };
638
-
639
- println!("cargo:rustc-link-search=native={}", lib_path.display());
640
- println!("cargo:rustc-link-lib=static=pdfium");
641
-
642
- // Also add standard lib directory for compatibility
643
- let std_lib_dir = pdfium_dir.join("lib");
644
- if std_lib_dir.exists() && std_lib_dir != lib_path {
645
- println!("cargo:rustc-link-search=native={}", std_lib_dir.display());
646
- }
647
-
648
- // Add bin directory for platforms where it might be needed
649
- let bin_dir = pdfium_dir.join("bin");
650
- if bin_dir.exists() && bin_dir != lib_path {
651
- println!("cargo:rustc-link-search=native={}", bin_dir.display());
652
- }
653
-
654
- // Static linking requires additional system dependencies
655
- if target.contains("linux") {
656
- // Linux requires additional libraries for static linking
657
- println!("cargo:rustc-link-lib=dylib=pthread");
658
- println!("cargo:rustc-link-lib=dylib=dl");
659
- }
660
- }
661
-
662
- /// Link PDFium bundled (pdf-bundled feature)
663
- ///
664
- /// Links dynamically but copies library to OUT_DIR for embedding in binary.
665
- /// Each binary extracts and uses its own copy of the PDFium library.
666
- /// Supports flexible archive structures by finding library in multiple locations.
667
- fn link_bundled(pdfium_dir: &Path, target: &str, out_dir: &Path) {
668
- // Link dynamically for build
669
- link_dynamically(pdfium_dir, target);
670
-
671
- // Copy library to OUT_DIR for bundling using flexible detection
672
- let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
673
- let src_lib = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
674
- Ok(path) => path,
675
- Err(err) => panic!("{}", err),
676
- };
677
- let bundled_lib = out_dir.join(&runtime_lib_name);
678
-
679
- fs::copy(&src_lib, &bundled_lib)
680
- .unwrap_or_else(|err| panic!("Failed to copy library to OUT_DIR for bundling: {}", err));
681
-
682
- // Emit environment variable with bundled library path
683
- let bundled_path = bundled_lib
684
- .to_str()
685
- .unwrap_or_else(|| panic!("Non-UTF8 path for bundled library: {}", bundled_lib.display()));
686
- println!("cargo:rustc-env=KREUZBERG_PDFIUM_BUNDLED_PATH={}", bundled_path);
687
-
688
- tracing::debug!("Bundled PDFium library at: {}", bundled_path);
689
- }
690
-
691
- /// Link system-installed PDFium (pdf-system feature)
692
- ///
693
- /// Attempts to find PDFium via pkg-config first, then falls back to
694
- /// environment variables (KREUZBERG_PDFIUM_SYSTEM_PATH, KREUZBERG_PDFIUM_SYSTEM_INCLUDE).
695
- fn link_system(_target: &str) {
696
- // Try pkg-config first
697
- match pkg_config::Config::new().atleast_version("5.0").probe("pdfium") {
698
- Ok(library) => {
699
- tracing::debug!("Found system pdfium via pkg-config");
700
- for include_path in &library.include_paths {
701
- println!("cargo:include={}", include_path.display());
702
- }
703
- return;
704
- }
705
- Err(err) => {
706
- tracing::debug!("pkg-config probe failed: {}", err);
707
- }
708
- }
709
-
710
- // Fallback to environment variables
711
- let lib_path = env::var("KREUZBERG_PDFIUM_SYSTEM_PATH").ok();
712
- let include_path = env::var("KREUZBERG_PDFIUM_SYSTEM_INCLUDE").ok();
713
-
714
- if let Some(lib_dir) = lib_path {
715
- let lib_dir_path = PathBuf::from(&lib_dir);
716
- if !lib_dir_path.exists() {
717
- panic!(
718
- "KREUZBERG_PDFIUM_SYSTEM_PATH points to '{}' but the directory does not exist",
719
- lib_dir
720
- );
721
- }
722
-
723
- println!("cargo:rustc-link-search=native={}", lib_dir);
724
- println!("cargo:rustc-link-lib=dylib=pdfium");
725
-
726
- if let Some(inc_dir) = include_path {
727
- println!("cargo:include={}", inc_dir);
728
- }
729
-
730
- tracing::debug!("Using system pdfium from: {}", lib_dir);
731
- return;
732
- }
733
-
734
- // No system pdfium found
735
- panic!(
736
- "pdf-system feature enabled but pdfium not found.\n\
737
- \n\
738
- Please install pdfium system-wide or provide:\n\
739
- - KREUZBERG_PDFIUM_SYSTEM_PATH: path to directory containing libpdfium\n\
740
- - KREUZBERG_PDFIUM_SYSTEM_INCLUDE: path to pdfium headers (optional)\n\
741
- \n\
742
- Alternatively, use a different linking strategy:\n\
743
- - Default (dynamic): cargo build --features pdf\n\
744
- - Static linking: cargo build --features pdf,pdf-static\n\
745
- - Bundled: cargo build --features pdf,pdf-bundled"
746
- );
747
- }
748
-
749
- /// Link system frameworks and standard libraries
750
- ///
751
- /// Adds platform-specific system libraries required for PDFium linking:
752
- /// - macOS: CoreFoundation, CoreGraphics, CoreText, AppKit, libc++
753
- /// - Linux: stdc++, libm
754
- /// - Windows: gdi32, user32, advapi32
755
- fn link_system_frameworks(target: &str) {
756
- if target.contains("darwin") {
757
- println!("cargo:rustc-link-lib=framework=CoreFoundation");
758
- println!("cargo:rustc-link-lib=framework=CoreGraphics");
759
- println!("cargo:rustc-link-lib=framework=CoreText");
760
- println!("cargo:rustc-link-lib=framework=AppKit");
761
- println!("cargo:rustc-link-lib=dylib=c++");
762
- } else if target.contains("linux") {
763
- println!("cargo:rustc-link-lib=dylib=stdc++");
764
- println!("cargo:rustc-link-lib=dylib=m");
765
- } else if target.contains("windows") {
766
- println!("cargo:rustc-link-lib=dylib=gdi32");
767
- println!("cargo:rustc-link-lib=dylib=user32");
768
- println!("cargo:rustc-link-lib=dylib=advapi32");
769
- }
770
- }
771
-
772
- // ============================================================================
773
- // LIBRARY DISTRIBUTION
774
- // ============================================================================
775
-
776
- /// Copy PDFium library to various package directories
777
- ///
778
- /// Distributes the compiled/downloaded PDFium library to:
779
- /// - CLI target directories (debug/release)
780
- /// - Python package directory
781
- /// - Node.js package directory
782
- /// - Ruby gem directory
783
- ///
784
- /// On macOS, also fixes install_name and applies code signing.
785
- /// Supports flexible archive structures by finding library in multiple locations.
786
- fn copy_lib_to_package(pdfium_dir: &Path, target: &str) {
787
- let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
788
- let src_lib = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
789
- Ok(path) => path,
790
- Err(err) => {
791
- tracing::debug!("Failed to locate PDFium library: {}", err);
792
- return;
793
- }
794
- };
795
-
796
- if target.contains("darwin") {
797
- fix_macos_install_name(&src_lib, &runtime_lib_name);
798
- codesign_if_needed(target, &src_lib);
799
- }
800
-
801
- let crate_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
802
- let workspace_root = crate_dir.parent().unwrap().parent().unwrap();
803
-
804
- if let Ok(profile) = env::var("PROFILE") {
805
- let target_dir = if let Ok(cargo_target) = env::var("TARGET") {
806
- workspace_root.join("target").join(cargo_target).join(&profile)
807
- } else {
808
- workspace_root.join("target").join(&profile)
809
- };
810
-
811
- if target_dir.exists() {
812
- copy_lib_if_needed(
813
- &src_lib,
814
- &target_dir.join(&runtime_lib_name),
815
- "CLI target directory",
816
- target,
817
- );
818
- }
819
-
820
- let simple_target_dir = workspace_root.join("target").join(&profile);
821
- if simple_target_dir != target_dir {
822
- fs::create_dir_all(&simple_target_dir).ok();
823
- copy_lib_if_needed(
824
- &src_lib,
825
- &simple_target_dir.join(&runtime_lib_name),
826
- "Java FFI target directory",
827
- target,
828
- );
829
- }
830
- }
831
-
832
- let python_dest_dir = workspace_root.join("packages").join("python").join("kreuzberg");
833
- if python_dest_dir.exists() {
834
- copy_lib_if_needed(
835
- &src_lib,
836
- &python_dest_dir.join(&runtime_lib_name),
837
- "Python package",
838
- target,
839
- );
840
- } else {
841
- tracing::debug!("Python package directory not found, skipping Python library copy");
842
- }
843
-
844
- let node_dest_dir = workspace_root.join("crates").join("kreuzberg-node");
845
- if node_dest_dir.exists() {
846
- copy_lib_if_needed(
847
- &src_lib,
848
- &node_dest_dir.join(&runtime_lib_name),
849
- "Node.js package",
850
- target,
851
- );
852
- } else {
853
- tracing::debug!("Node.js package directory not found, skipping Node library copy");
854
- }
855
-
856
- let ruby_dest_dir = workspace_root.join("packages").join("ruby").join("lib");
857
- if ruby_dest_dir.exists() {
858
- copy_lib_if_needed(&src_lib, &ruby_dest_dir.join(&runtime_lib_name), "Ruby package", target);
859
- } else {
860
- tracing::debug!("Ruby package directory not found, skipping Ruby library copy");
861
- }
862
- }
863
-
864
- /// Copy library to destination if needed (based on modification time)
865
- ///
866
- /// Only copies if destination doesn't exist or source is newer than destination.
867
- /// Applies platform-specific post-processing (code signing on macOS).
868
- fn copy_lib_if_needed(src: &Path, dest: &Path, package_name: &str, target: &str) {
869
- use std::fs;
870
-
871
- let should_copy = if dest.exists() {
872
- let src_metadata = fs::metadata(src).ok();
873
- let dest_metadata = fs::metadata(dest).ok();
874
- match (src_metadata, dest_metadata) {
875
- (Some(src), Some(dest)) => src.modified().ok() > dest.modified().ok(),
876
- _ => true,
877
- }
878
- } else {
879
- true
880
- };
881
-
882
- if should_copy {
883
- match fs::copy(src, dest) {
884
- Ok(_) => {
885
- tracing::debug!("Copied {} to {} ({})", src.display(), dest.display(), package_name);
886
- codesign_if_needed(target, dest);
887
- }
888
- Err(e) => tracing::debug!("Failed to copy library to {}: {}", package_name, e),
889
- }
890
- }
891
- }
1
+ // Kreuzberg Build Script - PDFium Linking Configuration
2
+ //
3
+ // This build script handles PDFium library downloading and linking for the kreuzberg crate.
4
+ // It supports multiple linking strategies via Cargo features:
5
+ //
6
+ // 1. Default (pdf, bundled-pdfium): Download dynamic library and embed in binary
7
+ // - Self-contained binary that extracts library at runtime
8
+ // - Larger binary size but no external .so dependency
9
+ // - No PDFIUM_*_PATH environment variables needed
10
+ //
11
+ // 2. static-pdfium: Static linking (no runtime dependency)
12
+ // - REQUIRES: PDFIUM_STATIC_LIB_PATH environment variable pointing to libpdfium.a directory
13
+ // - Reason: bblanchon/pdfium-binaries only provides dynamic libraries
14
+ // - Use case: Docker with musl, fully static binaries
15
+ // - Note: libpdfium.a must be obtained separately (e.g., paulocoutinhox/pdfium-lib)
16
+ //
17
+ // 3. system-pdfium: Use system-installed pdfium
18
+ // - Detected via pkg-config or KREUZBERG_PDFIUM_SYSTEM_PATH
19
+ //
20
+ // Environment Variables:
21
+ // - PDFIUM_STATIC_LIB_PATH: Path to directory containing libpdfium.a (for static-pdfium)
22
+ // - KREUZBERG_PDFIUM_PREBUILT: Path to prebuilt pdfium directory (skip download)
23
+ // - KREUZBERG_PDFIUM_SYSTEM_PATH: System pdfium library path (for system-pdfium)
24
+ // - PDFIUM_VERSION: Override version for bblanchon/pdfium-binaries
25
+ // - KREUZBERG_PDFIUM_DOWNLOAD_RETRIES: Number of download retries (default: 5)
26
+
27
+ use std::env;
28
+ use std::fs;
29
+ use std::io;
30
+ use std::path::{Path, PathBuf};
31
+ use std::process::Command;
32
+ use std::thread;
33
+ use std::time::Duration;
34
+
35
+ /// PDFium linking strategy
36
+ #[derive(Debug, Clone, Copy, PartialEq, Eq)]
37
+ enum PdfiumLinkStrategy {
38
+ /// Download and link statically (static-pdfium feature)
39
+ DownloadStatic,
40
+ /// Download, link dynamically, and embed in binary (bundled-pdfium feature)
41
+ Bundled,
42
+ /// Use system-installed pdfium via pkg-config (system-pdfium feature)
43
+ System,
44
+ }
45
+
46
+ // ============================================================================
47
+ // MAIN BUILD ORCHESTRATION
48
+ // ============================================================================
49
+
50
+ fn main() {
51
+ let target = env::var("TARGET").unwrap();
52
+ let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
53
+
54
+ println!("cargo::rustc-check-cfg=cfg(coverage)");
55
+
56
+ // Skip pdfium linking if the pdf feature is not enabled
57
+ if !cfg!(feature = "pdf") {
58
+ tracing::debug!("PDF feature not enabled, skipping pdfium linking");
59
+ return;
60
+ }
61
+
62
+ let strategy = determine_link_strategy(&target);
63
+
64
+ tracing::debug!("Using PDFium linking strategy: {:?}", strategy);
65
+
66
+ match strategy {
67
+ PdfiumLinkStrategy::DownloadStatic => {
68
+ let pdfium_dir = download_or_use_prebuilt(&target, &out_dir);
69
+ link_statically(&pdfium_dir, &target);
70
+ // Skip copy_lib_to_package - library embedded in binary
71
+ }
72
+ PdfiumLinkStrategy::Bundled => {
73
+ let pdfium_dir = download_or_use_prebuilt(&target, &out_dir);
74
+ link_bundled(&pdfium_dir, &target, &out_dir);
75
+ // Skip copy_lib_to_package - each binary extracts its own
76
+ }
77
+ PdfiumLinkStrategy::System => {
78
+ link_system(&target);
79
+ // No download or copy needed
80
+ }
81
+ }
82
+
83
+ link_system_frameworks(&target);
84
+ println!("cargo:rerun-if-changed=build.rs");
85
+ }
86
+
87
+ // ============================================================================
88
+ // FEATURE & STRATEGY VALIDATION
89
+ // ============================================================================
90
+
91
+ /// Determine which linking strategy to use based on features and target
92
+ fn determine_link_strategy(target: &str) -> PdfiumLinkStrategy {
93
+ // WASM handling: check for PDFIUM_WASM_LIB environment variable
94
+ if target.contains("wasm") {
95
+ if let Ok(wasm_lib) = env::var("PDFIUM_WASM_LIB") {
96
+ println!("cargo:rustc-link-search=native={}", wasm_lib);
97
+ println!("cargo:rustc-link-lib=static=pdfium");
98
+ return PdfiumLinkStrategy::DownloadStatic;
99
+ }
100
+ // For WASM without explicit PDFIUM_WASM_LIB, use bundled strategy
101
+ // This downloads pdfium-lib which provides WASM-compatible builds
102
+ println!("cargo:warning=WASM build using bundled PDFium (set PDFIUM_WASM_LIB to link custom WASM PDFium)");
103
+ return PdfiumLinkStrategy::Bundled;
104
+ }
105
+
106
+ let system_pdfium = cfg!(feature = "system-pdfium");
107
+ let bundled_pdfium = cfg!(feature = "bundled-pdfium");
108
+ let static_pdfium = cfg!(feature = "static-pdfium");
109
+
110
+ let enabled_count = usize::from(system_pdfium) + usize::from(bundled_pdfium) + usize::from(static_pdfium);
111
+ if enabled_count > 1 {
112
+ println!(
113
+ "cargo:warning=Multiple PDFium linking strategies enabled (static-pdfium={}, bundled-pdfium={}, system-pdfium={}); using bundled-pdfium for this build",
114
+ static_pdfium, bundled_pdfium, system_pdfium
115
+ );
116
+ }
117
+
118
+ // Feature-based strategy selection.
119
+ // Prefer bundled-pdfium when multiple strategies are enabled (e.g. `--all-features`) because it
120
+ // does not require external PDFIUM_STATIC_LIB_PATH and does not depend on a system install.
121
+ if bundled_pdfium {
122
+ return PdfiumLinkStrategy::Bundled;
123
+ }
124
+ if system_pdfium {
125
+ return PdfiumLinkStrategy::System;
126
+ }
127
+ if static_pdfium {
128
+ return PdfiumLinkStrategy::DownloadStatic;
129
+ }
130
+
131
+ // Default: download and link dynamically (bundled-pdfium preferred if pdf not already selected)
132
+ // When only 'pdf' feature is enabled (no linking strategy), default to bundled-pdfium
133
+ PdfiumLinkStrategy::Bundled
134
+ }
135
+
136
+ // ============================================================================
137
+ // DOWNLOAD & PREBUILT ORCHESTRATION
138
+ // ============================================================================
139
+
140
+ /// Download PDFium or use prebuilt directory
141
+ ///
142
+ /// This is the main orchestrator function that:
143
+ /// 1. Checks for `KREUZBERG_PDFIUM_PREBUILT` environment variable
144
+ /// 2. If set and valid, uses prebuilt pdfium directory
145
+ /// 3. If not set, downloads pdfium to out_dir (with caching)
146
+ /// 4. Returns PathBuf to pdfium directory
147
+ ///
148
+ /// Reuses all existing helper functions:
149
+ /// - `get_pdfium_url_and_lib()` - determines download URL for target
150
+ /// - `download_and_extract_pdfium()` - downloads with retry logic
151
+ /// - `runtime_library_info()` - platform-specific library names
152
+ /// - `prepare_prebuilt_pdfium()` - handles prebuilt copy
153
+ fn download_or_use_prebuilt(target: &str, out_dir: &Path) -> PathBuf {
154
+ let (download_url, _lib_name) = get_pdfium_url_and_lib(target);
155
+ let pdfium_dir = out_dir.join("pdfium");
156
+
157
+ // Check for prebuilt pdfium directory
158
+ if let Some(prebuilt) = env::var_os("KREUZBERG_PDFIUM_PREBUILT") {
159
+ let prebuilt_path = PathBuf::from(prebuilt);
160
+ if prebuilt_path.exists() {
161
+ prepare_prebuilt_pdfium(&prebuilt_path, &pdfium_dir)
162
+ .unwrap_or_else(|err| panic!("Failed to copy Pdfium from {}: {}", prebuilt_path.display(), err));
163
+ if target.contains("windows") {
164
+ ensure_windows_import_library(&pdfium_dir);
165
+ }
166
+ return pdfium_dir;
167
+ } else {
168
+ panic!(
169
+ "Environment variable KREUZBERG_PDFIUM_PREBUILT points to '{}' but the directory does not exist",
170
+ prebuilt_path.display()
171
+ );
172
+ }
173
+ }
174
+
175
+ // Check if library already exists (cache validation) using flexible detection
176
+ let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
177
+ let lib_found = find_pdfium_library(&pdfium_dir, &runtime_lib_name, runtime_subdir).is_ok();
178
+
179
+ let import_lib_exists = if target.contains("windows") {
180
+ let lib_dir = pdfium_dir.join("lib");
181
+ lib_dir.join("pdfium.lib").exists() || lib_dir.join("pdfium.dll.lib").exists()
182
+ } else {
183
+ true
184
+ };
185
+
186
+ if !lib_found || !import_lib_exists {
187
+ tracing::debug!("Pdfium library not found, downloading for target: {}", target);
188
+ tracing::debug!("Download URL: {}", download_url);
189
+ download_and_extract_pdfium(&download_url, &pdfium_dir);
190
+ } else {
191
+ tracing::debug!("Pdfium library already cached at {}", pdfium_dir.display());
192
+ }
193
+
194
+ // Windows-specific: ensure pdfium.lib exists
195
+ if target.contains("windows") {
196
+ ensure_windows_import_library(&pdfium_dir);
197
+ }
198
+
199
+ pdfium_dir
200
+ }
201
+
202
+ fn ensure_windows_import_library(pdfium_dir: &Path) {
203
+ let lib_dir = pdfium_dir.join("lib");
204
+ let dll_lib = lib_dir.join("pdfium.dll.lib");
205
+ let expected_lib = lib_dir.join("pdfium.lib");
206
+
207
+ if dll_lib.exists() && !expected_lib.exists() {
208
+ tracing::debug!(
209
+ "Ensuring Windows import library at {} (source: {})",
210
+ expected_lib.display(),
211
+ dll_lib.display()
212
+ );
213
+ fs::copy(&dll_lib, &expected_lib).unwrap_or_else(|err| {
214
+ panic!(
215
+ "Failed to copy Windows import library from {} to {}: {}",
216
+ dll_lib.display(),
217
+ expected_lib.display(),
218
+ err
219
+ )
220
+ });
221
+ }
222
+ }
223
+
224
+ // ============================================================================
225
+ // DOWNLOAD UTILITIES
226
+ // ============================================================================
227
+
228
/// Fetch the latest release version from a GitHub repository.
///
/// Runs `curl -s` against the GitHub "latest release" API endpoint for `repo`
/// (given as `owner/name`) and extracts the `tag_name` field from the JSON
/// response. Tags containing slashes (for example `chromium/7529`) are reduced
/// to their last path segment.
///
/// Returns `"7529"` when curl cannot be spawned, exits unsuccessfully, or the
/// response does not contain a usable, non-empty string tag.
fn get_latest_version(repo: &str) -> String {
    const FALLBACK_VERSION: &str = "7529";

    let api_url = format!("https://api.github.com/repos/{}/releases/latest", repo);

    Command::new("curl")
        .args(["-s", api_url.as_str()])
        .output()
        .ok()
        .filter(|output| output.status.success())
        .and_then(|output| parse_release_tag(&String::from_utf8_lossy(&output.stdout)))
        .unwrap_or_else(|| FALLBACK_VERSION.to_string())
}

/// Extract the version from the `tag_name` field of a release JSON document.
///
/// Returns `None` when the key is missing, when its value is not a string
/// (for example `null`), or when the resulting version would be empty.
fn parse_release_tag(json: &str) -> Option<String> {
    const KEY: &str = "\"tag_name\":";

    let after_key = &json[json.find(KEY)? + KEY.len()..];
    // The value must be a string starting right after the key (whitespace
    // aside). Searching for "the next quote" instead would pick up the
    // following JSON key whenever the value is `null`.
    let value = after_key.trim_start().strip_prefix('"')?;
    let tag = &value[..value.find('"')?];
    let version = tag.rsplit('/').next().unwrap_or(tag);

    if version.is_empty() {
        None
    } else {
        Some(version.to_string())
    }
}
255
+
256
+ /// Get the download URL and library name for the target platform
257
+ ///
258
+ /// Determines platform/architecture from target triple and constructs
259
+ /// the appropriate GitHub release download URL. Supports:
260
+ /// - WASM: paulocoutinhox/pdfium-lib
261
+ /// - Other platforms: bblanchon/pdfium-binaries
262
+ fn get_pdfium_url_and_lib(target: &str) -> (String, String) {
263
+ if target.contains("wasm") {
264
+ let version = env::var("PDFIUM_WASM_VERSION")
265
+ .ok()
266
+ .filter(|v| !v.is_empty())
267
+ .unwrap_or_else(|| get_latest_version("paulocoutinhox/pdfium-lib"));
268
+ tracing::debug!("Using pdfium-lib version: {}", version);
269
+
270
+ // WASM builds use a single 'wasm.tgz' asset regardless of architecture
271
+ // The archive contains both wasm32 and wasm64 if available
272
+ return (
273
+ format!(
274
+ "https://github.com/paulocoutinhox/pdfium-lib/releases/download/{}/wasm.tgz",
275
+ version
276
+ ),
277
+ "pdfium".to_string(),
278
+ );
279
+ }
280
+
281
+ let (platform, arch) = if target.contains("darwin") {
282
+ let arch = if target.contains("aarch64") { "arm64" } else { "x64" };
283
+ ("mac", arch)
284
+ } else if target.contains("linux") {
285
+ let arch = if target.contains("aarch64") {
286
+ "arm64"
287
+ } else if target.contains("arm") {
288
+ "arm"
289
+ } else {
290
+ "x64"
291
+ };
292
+ ("linux", arch)
293
+ } else if target.contains("windows") {
294
+ let arch = if target.contains("aarch64") {
295
+ "arm64"
296
+ } else if target.contains("i686") {
297
+ "x86"
298
+ } else {
299
+ "x64"
300
+ };
301
+ ("win", arch)
302
+ } else {
303
+ panic!("Unsupported target platform: {}", target);
304
+ };
305
+
306
+ let version = env::var("PDFIUM_VERSION")
307
+ .ok()
308
+ .filter(|v| !v.is_empty())
309
+ .unwrap_or_else(|| get_latest_version("bblanchon/pdfium-binaries"));
310
+ tracing::debug!("Using pdfium-binaries version: {}", version);
311
+
312
+ let url = format!(
313
+ "https://github.com/bblanchon/pdfium-binaries/releases/download/chromium/{}/pdfium-{}-{}.tgz",
314
+ version, platform, arch
315
+ );
316
+
317
+ (url, "pdfium".to_string())
318
+ }
319
+
320
+ /// Download and extract PDFium archive with retry logic
321
+ ///
322
+ /// Features:
323
+ /// - Exponential backoff retry (configurable via env vars)
324
+ /// - File type validation (gzip check)
325
+ /// - Windows-specific import library handling (pdfium.dll.lib -> pdfium.lib)
326
+ /// - Environment variables:
327
+ /// - KREUZBERG_PDFIUM_DOWNLOAD_RETRIES: number of retries (default: 5)
328
+ /// - KREUZBERG_PDFIUM_DOWNLOAD_BACKOFF_SECS: initial backoff in seconds (default: 2)
329
+ fn download_and_extract_pdfium(url: &str, dest_dir: &Path) {
330
+ fs::create_dir_all(dest_dir).expect("Failed to create pdfium directory");
331
+
332
+ let archive_path = dest_dir.join("pdfium.tar.gz");
333
+ let retries = env::var("KREUZBERG_PDFIUM_DOWNLOAD_RETRIES")
334
+ .ok()
335
+ .and_then(|value| value.parse::<u32>().ok())
336
+ .filter(|value| *value > 0)
337
+ .unwrap_or(5);
338
+ let base_delay = env::var("KREUZBERG_PDFIUM_DOWNLOAD_BACKOFF_SECS")
339
+ .ok()
340
+ .and_then(|value| value.parse::<u64>().ok())
341
+ .filter(|value| *value > 0)
342
+ .unwrap_or(2);
343
+
344
+ let archive_path_str = archive_path
345
+ .to_str()
346
+ .unwrap_or_else(|| panic!("Non-UTF8 path for archive: {}", archive_path.display()));
347
+ let mut last_error = String::new();
348
+
349
+ for attempt in 1..=retries {
350
+ let _ = fs::remove_file(&archive_path);
351
+ tracing::debug!(
352
+ "Downloading Pdfium archive from: {} (attempt {}/{})",
353
+ url,
354
+ attempt,
355
+ retries
356
+ );
357
+
358
+ let status = Command::new("curl")
359
+ .args(["-f", "-L", "-o", archive_path_str, url])
360
+ .status();
361
+
362
+ match status {
363
+ Ok(code) if code.success() => {
364
+ last_error.clear();
365
+ break;
366
+ }
367
+ Ok(code) => {
368
+ last_error = format!("curl exited with {:?}", code.code());
369
+ }
370
+ Err(err) => {
371
+ last_error = format!("failed to spawn curl: {err}");
372
+ }
373
+ }
374
+
375
+ if attempt == retries {
376
+ panic!(
377
+ "Failed to download Pdfium from {} after {} attempts. Last error: {}",
378
+ url, retries, last_error
379
+ );
380
+ }
381
+
382
+ let exponent = u32::min(attempt, 5);
383
+ let multiplier = 1u64 << exponent;
384
+ let delay_secs = base_delay.saturating_mul(multiplier).min(30);
385
+ println!(
386
+ "cargo:warning=Pdfium download failed (attempt {}/{}) - {}. Retrying in {}s",
387
+ attempt, retries, last_error, delay_secs
388
+ );
389
+ thread::sleep(Duration::from_secs(delay_secs));
390
+ }
391
+
392
+ let file_type = Command::new("file")
393
+ .arg(archive_path.to_str().unwrap())
394
+ .output()
395
+ .expect("Failed to check file type");
396
+
397
+ let file_type_output = String::from_utf8_lossy(&file_type.stdout);
398
+ tracing::debug!("Downloaded file type: {}", file_type_output.trim());
399
+
400
+ if !file_type_output.to_lowercase().contains("gzip") && !file_type_output.to_lowercase().contains("compressed") {
401
+ fs::remove_file(&archive_path).ok();
402
+ panic!(
403
+ "Downloaded file is not a valid gzip archive. URL may be incorrect or version unavailable: {}",
404
+ url
405
+ );
406
+ }
407
+
408
+ tracing::debug!("Extracting Pdfium archive...");
409
+ let status = Command::new("tar")
410
+ .args(["-xzf", archive_path.to_str().unwrap(), "-C", dest_dir.to_str().unwrap()])
411
+ .status()
412
+ .expect("Failed to execute tar");
413
+
414
+ if !status.success() {
415
+ fs::remove_file(&archive_path).ok();
416
+ panic!("Failed to extract Pdfium archive from {}", url);
417
+ }
418
+
419
+ fs::remove_file(&archive_path).ok();
420
+
421
+ let target = env::var("TARGET").unwrap();
422
+ if target.contains("windows") {
423
+ let lib_dir = dest_dir.join("lib");
424
+ let dll_lib = lib_dir.join("pdfium.dll.lib");
425
+ let expected_lib = lib_dir.join("pdfium.lib");
426
+
427
+ if dll_lib.exists() {
428
+ tracing::debug!("Ensuring Windows import library at {}", expected_lib.display());
429
+ if let Err(err) = fs::copy(&dll_lib, &expected_lib) {
430
+ panic!("Failed to copy pdfium.dll.lib to pdfium.lib: {err}");
431
+ }
432
+ } else {
433
+ tracing::debug!("Warning: Expected {} not found after extraction", dll_lib.display());
434
+ }
435
+ }
436
+
437
+ tracing::debug!("Pdfium downloaded and extracted successfully");
438
+ }
439
+
440
+ // ============================================================================
441
+ // PREBUILT HANDLING
442
+ // ============================================================================
443
+
444
+ /// Prepare prebuilt PDFium by copying to destination directory
445
+ ///
446
+ /// Removes existing destination if present, then recursively copies
447
+ /// all files from prebuilt source to destination.
448
+ fn prepare_prebuilt_pdfium(prebuilt_src: &Path, dest_dir: &Path) -> io::Result<()> {
449
+ if dest_dir.exists() {
450
+ fs::remove_dir_all(dest_dir)?;
451
+ }
452
+ copy_dir_all(prebuilt_src, dest_dir)
453
+ }
454
+
455
/// Copy the directory tree rooted at `src` into `dst`.
///
/// `dst` (and any missing parents) is created first. Each entry of `src` that
/// reports itself as a directory is copied recursively; every other entry is
/// copied with `fs::copy`.
///
/// # Errors
///
/// Stops at and returns the first I/O error encountered.
fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
    fs::create_dir_all(dst)?;

    fs::read_dir(src)?.try_for_each(|item| -> io::Result<()> {
        let item = item?;
        let kind = item.file_type()?;
        let from = item.path();
        let to = dst.join(item.file_name());

        if kind.is_dir() {
            copy_dir_all(&from, &to)
        } else {
            fs::copy(&from, &to).map(|_| ())
        }
    })
}
473
+
474
+ // ============================================================================
475
+ // PLATFORM UTILITIES
476
+ // ============================================================================
477
+
478
/// Map a target triple to PDFium's runtime library file name and the
/// subdirectory it is expected in.
///
/// The triple is matched by substring, in this order:
/// - contains `wasm`: `("libpdfium.a", "release/lib")`, the layout of the
///   pdfium-lib `wasm.tgz` archive
/// - contains `windows`: `("pdfium.dll", "bin")`
/// - contains `darwin`: `("libpdfium.dylib", "lib")`
/// - anything else: `("libpdfium.so", "lib")`
fn runtime_library_info(target: &str) -> (String, &'static str) {
    // (substring of the triple, library file name, subdirectory); first match wins.
    const KNOWN_PLATFORMS: [(&str, &str, &str); 3] = [
        ("wasm", "libpdfium.a", "release/lib"),
        ("windows", "pdfium.dll", "bin"),
        ("darwin", "libpdfium.dylib", "lib"),
    ];

    let (file_name, subdir) = KNOWN_PLATFORMS
        .iter()
        .find(|(marker, _, _)| target.contains(*marker))
        .map(|&(_, file_name, subdir)| (file_name, subdir))
        .unwrap_or(("libpdfium.so", "lib"));

    (file_name.to_string(), subdir)
}
497
+
498
+ /// Find PDFium library in archive with flexible directory detection
499
+ ///
500
+ /// Attempts to locate the library at multiple possible locations:
501
+ /// - {subdir}/{lib_name} (standard location)
502
+ /// - {lib_name} (root of archive)
503
+ /// - bin/{lib_name} (alternative location)
504
+ /// - lib/{lib_name} (explicit lib directory)
505
+ ///
506
+ /// This handles variations in archive structure across different platform builds,
507
+ /// particularly macOS ARM64 where the archive structure may differ.
508
+ ///
509
+ /// Returns the full path to the library if found, or an error with available files.
510
+ fn find_pdfium_library(pdfium_dir: &Path, lib_name: &str, expected_subdir: &str) -> Result<PathBuf, String> {
511
+ // Candidates in priority order
512
+ let candidates = [
513
+ pdfium_dir.join(expected_subdir).join(lib_name), // Standard: lib/libpdfium.dylib
514
+ pdfium_dir.join(lib_name), // Root: libpdfium.dylib
515
+ pdfium_dir.join("bin").join(lib_name), // Alternative: bin/libpdfium.dylib
516
+ pdfium_dir.join("lib").join(lib_name), // Explicit lib: lib/libpdfium.dylib
517
+ ];
518
+
519
+ // Try each candidate
520
+ for candidate in &candidates {
521
+ if candidate.exists() {
522
+ tracing::debug!("Found PDFium library at: {}", candidate.display());
523
+ return Ok(candidate.clone());
524
+ }
525
+ }
526
+
527
+ // Library not found - provide detailed error with directory listing
528
+ let mut error_msg = format!(
529
+ "PDFium library not found at expected location: {}/{}\n\n",
530
+ pdfium_dir.display(),
531
+ expected_subdir
532
+ );
533
+ error_msg.push_str("Attempted locations:\n");
534
+ for candidate in &candidates {
535
+ error_msg.push_str(&format!(" - {}\n", candidate.display()));
536
+ }
537
+
538
+ // List actual contents of pdfium directory for debugging
539
+ error_msg.push_str("\nActual archive contents:\n");
540
+ if let Ok(entries) = fs::read_dir(pdfium_dir) {
541
+ for entry in entries.flatten() {
542
+ let path = entry.path();
543
+ let file_type = if path.is_dir() { "dir" } else { "file" };
544
+ error_msg.push_str(&format!(" {} ({})\n", path.display(), file_type));
545
+
546
+ // Show contents of subdirectories
547
+ if path.is_dir()
548
+ && let Ok(sub_entries) = fs::read_dir(&path)
549
+ {
550
+ for sub_entry in sub_entries.flatten() {
551
+ let sub_path = sub_entry.path();
552
+ let sub_type = if sub_path.is_dir() { "dir" } else { "file" };
553
+ error_msg.push_str(&format!(" {} ({})\n", sub_path.display(), sub_type));
554
+ }
555
+ }
556
+ }
557
+ }
558
+
559
+ Err(error_msg)
560
+ }
561
+
562
+ // ============================================================================
563
+ // LINKING STRATEGIES
564
+ // ============================================================================
565
+
566
+ /// Link PDFium dynamically (default)
567
+ ///
568
+ /// Sets up linker to use PDFium as a dynamic library (.dylib/.so/.dll)
569
+ /// with platform-specific rpath configuration for runtime library discovery.
570
+ /// Supports flexible archive structures by adding multiple possible lib directories.
571
+ fn link_dynamically(pdfium_dir: &Path, target: &str) {
572
+ let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
573
+
574
+ // Find the actual library location (handles multiple possible archive structures)
575
+ let lib_path = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
576
+ Ok(path) => path.parent().unwrap_or(pdfium_dir).to_path_buf(),
577
+ Err(err) => panic!("{}", err),
578
+ };
579
+
580
+ println!("cargo:rustc-link-search=native={}", lib_path.display());
581
+ println!("cargo:rustc-link-lib=dylib=pdfium");
582
+
583
+ // Also add standard lib directory for compatibility
584
+ let std_lib_dir = pdfium_dir.join("lib");
585
+ if std_lib_dir.exists() && std_lib_dir != lib_path {
586
+ println!("cargo:rustc-link-search=native={}", std_lib_dir.display());
587
+ }
588
+
589
+ // Add bin directory for platforms where it might be needed
590
+ let bin_dir = pdfium_dir.join("bin");
591
+ if bin_dir.exists() && bin_dir != lib_path {
592
+ println!("cargo:rustc-link-search=native={}", bin_dir.display());
593
+ }
594
+
595
+ // Set rpath for dynamic linking
596
+ if target.contains("darwin") {
597
+ println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
598
+ println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path/.");
599
+ } else if target.contains("linux") {
600
+ println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
601
+ println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN/.");
602
+ }
603
+ }
604
+
605
+ /// Link PDFium statically (static-pdfium feature)
606
+ ///
607
+ /// Embeds PDFium into the binary as a static library. Adds system
608
+ /// dependencies required for static linking on Linux.
609
+ /// Supports flexible archive structures by finding library in multiple locations.
610
+ ///
611
+ /// Environment Variables:
612
+ /// - `PDFIUM_STATIC_LIB_PATH`: Path to directory containing libpdfium.a (for Docker/musl builds)
613
+ ///
614
+ /// Note: bblanchon/pdfium-binaries only provides dynamic libraries.
615
+ /// On macOS, this will fallback to dynamic linking with a warning.
616
+ /// On Linux, you must provide PDFIUM_STATIC_LIB_PATH pointing to a static build.
617
+ fn link_statically(pdfium_dir: &Path, target: &str) {
618
+ // For static linking, we need libpdfium.a (not .dylib or .so)
619
+ let static_lib_name = "libpdfium.a";
620
+ let lib_subdir = if target.contains("wasm") { "release/lib" } else { "lib" };
621
+
622
+ // First, check if user provided a static library path via environment variable
623
+ if let Ok(custom_path) = env::var("PDFIUM_STATIC_LIB_PATH") {
624
+ let custom_lib_dir = PathBuf::from(&custom_path);
625
+
626
+ if !custom_lib_dir.exists() {
627
+ panic!(
628
+ "PDFIUM_STATIC_LIB_PATH points to '{}' but the directory does not exist",
629
+ custom_path
630
+ );
631
+ }
632
+
633
+ let custom_lib = custom_lib_dir.join(static_lib_name);
634
+ if !custom_lib.exists() {
635
+ panic!(
636
+ "PDFIUM_STATIC_LIB_PATH points to '{}' but {} not found.\n\
637
+ Expected to find: {}",
638
+ custom_path,
639
+ static_lib_name,
640
+ custom_lib.display()
641
+ );
642
+ }
643
+
644
+ tracing::debug!("Using custom static PDFium from: {}", custom_lib.display());
645
+ println!("cargo:rustc-link-search=native={}", custom_lib_dir.display());
646
+ println!("cargo:rustc-link-lib=static=pdfium");
647
+
648
+ // Static linking requires additional system dependencies
649
+ if target.contains("linux") {
650
+ println!("cargo:rustc-link-lib=dylib=pthread");
651
+ println!("cargo:rustc-link-lib=dylib=dl");
652
+ } else if target.contains("windows") {
653
+ println!("cargo:rustc-link-lib=dylib=ws2_32");
654
+ println!("cargo:rustc-link-lib=dylib=userenv");
655
+ }
656
+
657
+ return;
658
+ }
659
+
660
+ // Find the actual library location (handles multiple possible archive structures)
661
+ let lib_path = match find_pdfium_library(pdfium_dir, static_lib_name, lib_subdir) {
662
+ Ok(path) => path.parent().unwrap_or(pdfium_dir).to_path_buf(),
663
+ Err(_err) => {
664
+ // Static library not found - check if we're on macOS and can fallback
665
+ if target.contains("darwin") {
666
+ eprintln!("cargo:warning=Static PDFium library (libpdfium.a) not found for macOS.");
667
+ eprintln!("cargo:warning=bblanchon/pdfium-binaries only provides dynamic libraries.");
668
+ eprintln!("cargo:warning=Falling back to dynamic linking for local development.");
669
+ eprintln!("cargo:warning=Production Linux builds require PDFIUM_STATIC_LIB_PATH.");
670
+
671
+ // Fallback to dynamic linking on macOS
672
+ link_dynamically(pdfium_dir, target);
673
+ return;
674
+ } else {
675
+ // On Linux/Windows, provide helpful error with actionable steps
676
+ panic!(
677
+ "Static PDFium library (libpdfium.a) not found.\n\n\
678
+ bblanchon/pdfium-binaries only provides dynamic libraries.\n\n\
679
+ For static linking (required for Docker with musl), you must:\n\n\
680
+ 1. Build static PDFium or obtain from a source that provides it\n\
681
+ - See: https://github.com/ajrcarey/pdfium-render/issues/53\n\
682
+ - Or use: https://github.com/paulocoutinhox/pdfium-lib (provides static builds)\n\n\
683
+ 2. Set environment variable pointing to the directory containing libpdfium.a:\n\
684
+ export PDFIUM_STATIC_LIB_PATH=/path/to/pdfium/lib\n\n\
685
+ 3. Or use alternative features:\n\
686
+ - 'pdf' (dynamic linking, requires .so at runtime)\n\
687
+ - 'bundled-pdfium' (embeds dynamic library in binary)\n\
688
+ - 'system-pdfium' (use system-installed pdfium)\n\n\
689
+ Example Dockerfile pattern:\n\
690
+ FROM alpine:latest as pdfium-builder\n\
691
+ # Download/build static libpdfium.a\n\
692
+ \n\
693
+ FROM rust:alpine as builder\n\
694
+ ENV PDFIUM_STATIC_LIB_PATH=/pdfium/lib\n\
695
+ COPY --from=pdfium-builder /path/to/libpdfium.a /pdfium/lib/"
696
+ );
697
+ }
698
+ }
699
+ };
700
+
701
+ println!("cargo:rustc-link-search=native={}", lib_path.display());
702
+ println!("cargo:rustc-link-lib=static=pdfium");
703
+
704
+ // Also add standard lib directory for compatibility
705
+ let std_lib_dir = pdfium_dir.join("lib");
706
+ if std_lib_dir.exists() && std_lib_dir != lib_path {
707
+ println!("cargo:rustc-link-search=native={}", std_lib_dir.display());
708
+ }
709
+
710
+ // Add bin directory for platforms where it might be needed
711
+ let bin_dir = pdfium_dir.join("bin");
712
+ if bin_dir.exists() && bin_dir != lib_path {
713
+ println!("cargo:rustc-link-search=native={}", bin_dir.display());
714
+ }
715
+
716
+ // Static linking requires additional system dependencies
717
+ if target.contains("linux") {
718
+ println!("cargo:rustc-link-lib=dylib=pthread");
719
+ println!("cargo:rustc-link-lib=dylib=dl");
720
+ } else if target.contains("windows") {
721
+ println!("cargo:rustc-link-lib=dylib=ws2_32");
722
+ println!("cargo:rustc-link-lib=dylib=userenv");
723
+ }
724
+ }
725
+
726
+ /// Link PDFium bundled (bundled-pdfium feature)
727
+ ///
728
+ /// Links dynamically but copies library to OUT_DIR for embedding in binary.
729
+ /// Each binary extracts and uses its own copy of the PDFium library.
730
+ /// Supports flexible archive structures by finding library in multiple locations.
731
+ ///
732
+ /// For WASM targets, links statically using the bundled static library.
733
+ fn link_bundled(pdfium_dir: &Path, target: &str, out_dir: &Path) {
734
+ // Copy library to OUT_DIR for bundling using flexible detection
735
+ let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
736
+ let src_lib = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
737
+ Ok(path) => path,
738
+ Err(err) => panic!("{}", err),
739
+ };
740
+ let bundled_lib = out_dir.join(&runtime_lib_name);
741
+
742
+ fs::copy(&src_lib, &bundled_lib)
743
+ .unwrap_or_else(|err| panic!("Failed to copy library to OUT_DIR for bundling: {}", err));
744
+
745
+ // Emit environment variable with bundled library path
746
+ let bundled_path = bundled_lib
747
+ .to_str()
748
+ .unwrap_or_else(|| panic!("Non-UTF8 path for bundled library: {}", bundled_lib.display()));
749
+ println!("cargo:rustc-env=KREUZBERG_PDFIUM_BUNDLED_PATH={}", bundled_path);
750
+
751
+ // For WASM, link statically using the bundled library
752
+ if target.contains("wasm") {
753
+ let lib_dir = bundled_lib
754
+ .parent()
755
+ .unwrap_or_else(|| panic!("Invalid bundled library path: {}", bundled_lib.display()));
756
+ println!("cargo:rustc-link-search=native={}", lib_dir.display());
757
+ println!("cargo:rustc-link-lib=static=pdfium");
758
+ tracing::debug!("Bundled PDFium static library linked for WASM at: {}", bundled_path);
759
+ } else {
760
+ tracing::debug!("Bundled PDFium library at: {}", bundled_path);
761
+ }
762
+ }
763
+
764
+ /// Link system-installed PDFium (system-pdfium feature)
765
+ ///
766
+ /// Attempts to find PDFium via pkg-config first, then falls back to
767
+ /// environment variables (KREUZBERG_PDFIUM_SYSTEM_PATH, KREUZBERG_PDFIUM_SYSTEM_INCLUDE).
768
+ fn link_system(_target: &str) {
769
+ // Try pkg-config first
770
+ match pkg_config::Config::new().atleast_version("5.0").probe("pdfium") {
771
+ Ok(library) => {
772
+ tracing::debug!("Found system pdfium via pkg-config");
773
+ for include_path in &library.include_paths {
774
+ println!("cargo:include={}", include_path.display());
775
+ }
776
+ return;
777
+ }
778
+ Err(err) => {
779
+ tracing::debug!("pkg-config probe failed: {}", err);
780
+ }
781
+ }
782
+
783
+ // Fallback to environment variables
784
+ let lib_path = env::var("KREUZBERG_PDFIUM_SYSTEM_PATH").ok();
785
+ let include_path = env::var("KREUZBERG_PDFIUM_SYSTEM_INCLUDE").ok();
786
+
787
+ if let Some(lib_dir) = lib_path {
788
+ let lib_dir_path = PathBuf::from(&lib_dir);
789
+ if !lib_dir_path.exists() {
790
+ panic!(
791
+ "KREUZBERG_PDFIUM_SYSTEM_PATH points to '{}' but the directory does not exist",
792
+ lib_dir
793
+ );
794
+ }
795
+
796
+ println!("cargo:rustc-link-search=native={}", lib_dir);
797
+ println!("cargo:rustc-link-lib=dylib=pdfium");
798
+
799
+ if let Some(inc_dir) = include_path {
800
+ println!("cargo:include={}", inc_dir);
801
+ }
802
+
803
+ tracing::debug!("Using system pdfium from: {}", lib_dir);
804
+ return;
805
+ }
806
+
807
+ // No system pdfium found
808
+ panic!(
809
+ "system-pdfium feature enabled but pdfium not found.\n\
810
+ \n\
811
+ Please install pdfium system-wide or provide:\n\
812
+ - KREUZBERG_PDFIUM_SYSTEM_PATH: path to directory containing libpdfium\n\
813
+ - KREUZBERG_PDFIUM_SYSTEM_INCLUDE: path to pdfium headers (optional)\n\
814
+ \n\
815
+ Alternatively, use a different linking strategy:\n\
816
+ - Default (dynamic): cargo build --features pdf\n\
817
+ - Static linking: cargo build --features pdf,static-pdfium\n\
818
+ - Bundled: cargo build --features pdf,bundled-pdfium"
819
+ );
820
+ }
821
+
822
/// Emit link directives for the platform libraries used alongside PDFium.
///
/// The target triple is matched by substring:
/// - `darwin`: CoreFoundation, CoreGraphics, CoreText and AppKit frameworks,
///   plus `c++`
/// - `linux`: `stdc++` and `m`
/// - `windows`: `gdi32`, `user32` and `advapi32`
///
/// Any other target emits nothing.
fn link_system_frameworks(target: &str) {
    let directives: &[&str] = if target.contains("darwin") {
        &[
            "cargo:rustc-link-lib=framework=CoreFoundation",
            "cargo:rustc-link-lib=framework=CoreGraphics",
            "cargo:rustc-link-lib=framework=CoreText",
            "cargo:rustc-link-lib=framework=AppKit",
            "cargo:rustc-link-lib=dylib=c++",
        ]
    } else if target.contains("linux") {
        &[
            "cargo:rustc-link-lib=dylib=stdc++",
            "cargo:rustc-link-lib=dylib=m",
        ]
    } else if target.contains("windows") {
        &[
            "cargo:rustc-link-lib=dylib=gdi32",
            "cargo:rustc-link-lib=dylib=user32",
            "cargo:rustc-link-lib=dylib=advapi32",
        ]
    } else {
        &[]
    };

    for directive in directives {
        println!("{}", directive);
    }
}