kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,154 @@
1
+ use std::any::Any;
2
+ use std::time::{SystemTime, UNIX_EPOCH};
3
+
4
/// Snapshot of where and when a panic was captured.
///
/// Bundles the source location, the extracted panic message and a capture
/// timestamp so a failure can be reported across FFI boundaries.
#[derive(Clone, Debug)]
pub struct PanicContext {
    /// Path of the source file in which the panic occurred.
    pub file: &'static str,
    /// Line number at which the panic occurred.
    pub line: u32,
    /// Name of the function in which the panic occurred.
    pub function: &'static str,
    /// Message text extracted from the panic payload.
    pub message: String,
    /// Time at which the panic was captured.
    pub timestamp: SystemTime,
}
21
+
22
+ impl PanicContext {
23
+ /// Creates a new PanicContext with the given parameters.
24
+ ///
25
+ /// # Arguments
26
+ ///
27
+ /// * `file` - Source file path
28
+ /// * `line` - Line number
29
+ /// * `function` - Function name
30
+ /// * `panic_info` - The panic payload to extract message from
31
+ pub fn new(file: &'static str, line: u32, function: &'static str, panic_info: &dyn Any) -> Self {
32
+ let timestamp = std::panic::catch_unwind(SystemTime::now).unwrap_or(UNIX_EPOCH);
33
+
34
+ Self {
35
+ file,
36
+ line,
37
+ function,
38
+ message: extract_panic_message(panic_info),
39
+ timestamp,
40
+ }
41
+ }
42
+
43
+ /// Formats the panic context as a human-readable string.
44
+ pub fn format(&self) -> String {
45
+ format!(
46
+ "Panic at {}:{}:{} - {}",
47
+ self.file, self.line, self.function, self.message
48
+ )
49
+ }
50
+ }
51
+
52
/// Maximum number of message bytes kept from a panic payload.
///
/// Bounding the length prevents DoS attacks via extremely large panic messages.
const MAX_PANIC_MESSAGE_LEN: usize = 4096;

/// Extracts a human-readable message from a panic payload.
///
/// Attempts to downcast the payload to the common message types (`String`,
/// `&str`). A payload that is itself a `Box<dyn Any + Send>` is unwrapped and
/// its contents are inspected instead. Any other payload type yields the
/// fixed text `"Unknown panic payload"`.
///
/// Messages longer than [`MAX_PANIC_MESSAGE_LEN`] bytes are cut at the nearest
/// UTF-8 character boundary at or below that limit and suffixed with
/// `"... [truncated]"`. The oversized payload is only borrowed, never copied
/// in full.
///
/// # Arguments
///
/// * `panic_info` - The panic payload from `catch_unwind`
///
/// # Returns
///
/// A string representation of the panic message (truncated if necessary)
pub fn extract_panic_message(panic_info: &dyn Any) -> String {
    // A `&Box<dyn Any + Send>` can coerce to `&dyn Any` with the box itself as
    // the concrete type; look inside it so that call shape still yields the
    // real message.
    if let Some(boxed) = panic_info.downcast_ref::<Box<dyn Any + Send>>() {
        return extract_panic_message(&**boxed);
    }

    // Borrow the text rather than cloning it: the size bound below would be
    // pointless if the whole payload had already been copied.
    let msg: &str = if let Some(s) = panic_info.downcast_ref::<String>() {
        s.as_str()
    } else if let Some(s) = panic_info.downcast_ref::<&str>() {
        *s
    } else {
        "Unknown panic payload"
    };

    if msg.len() <= MAX_PANIC_MESSAGE_LEN {
        return msg.to_string();
    }

    // Step back to the closest character boundary so the slice below cannot
    // split a multi-byte character. Index 0 is always a boundary, so this
    // terminates.
    let mut truncate_at = MAX_PANIC_MESSAGE_LEN;
    while !msg.is_char_boundary(truncate_at) {
        truncate_at -= 1;
    }

    format!("{}... [truncated]", &msg[..truncate_at])
}
85
+
86
#[cfg(test)]
mod tests {
    use super::*;

    /// Marker expected at the end of every truncated message.
    const TRUNCATION_SUFFIX: &str = "... [truncated]";

    /// A `String` payload is returned verbatim.
    #[test]
    fn test_extract_panic_message_string() {
        let payload = String::from("test panic");
        assert_eq!(extract_panic_message(&payload), "test panic");
    }

    /// A `&str` payload is returned verbatim.
    #[test]
    fn test_extract_panic_message_str() {
        let payload: &str = "test panic";
        assert_eq!(extract_panic_message(&payload), "test panic");
    }

    /// Payloads of any other type fall back to the placeholder text.
    #[test]
    fn test_extract_panic_message_unknown() {
        let payload = 42i32;
        assert_eq!(extract_panic_message(&payload), "Unknown panic payload");
    }

    /// The formatted context mentions file, line, function and message.
    #[test]
    fn test_panic_context_format() {
        let payload = String::from("test error");
        let rendered = PanicContext::new("test.rs", 42, "test_function", &payload).format();

        for expected in ["test.rs", "42", "test_function", "test error"] {
            assert!(rendered.contains(expected));
        }
    }

    /// Oversized messages are shortened and carry the truncation marker.
    #[test]
    fn test_panic_message_truncation() {
        let oversized = "x".repeat(5000);
        let extracted = extract_panic_message(&oversized);
        assert!(extracted.len() <= MAX_PANIC_MESSAGE_LEN + 20);
        assert!(extracted.ends_with(TRUNCATION_SUFFIX));
    }

    /// Truncation never splits a multi-byte character straddling the limit.
    #[test]
    fn test_panic_message_truncation_utf8_boundary() {
        let mut payload = "x".repeat(4093);
        payload.push('🦀');
        payload.push_str("yyy");

        let extracted = extract_panic_message(&payload);

        assert!(extracted.ends_with(TRUNCATION_SUFFIX));
        assert!(std::str::from_utf8(extracted.as_bytes()).is_ok());
        assert!(!extracted.contains("🦀"));
        assert!(!extracted.contains("yyy"));
    }

    /// Short messages pass through without a truncation marker.
    #[test]
    fn test_panic_message_no_truncation_needed() {
        let payload = String::from("short");
        let extracted = extract_panic_message(&payload);
        assert_eq!(extracted, "short");
        assert!(!extracted.contains("[truncated]"));
    }
}
@@ -0,0 +1,122 @@
1
+ use std::fmt;
2
+
3
/// Errors reported by the PDF handling code.
///
/// Variants carrying a `String` hold a free-form detail message that is
/// appended to the variant's fixed prefix when the error is displayed.
#[derive(Clone, Debug)]
pub enum PdfError {
    /// The input is not a valid PDF; displayed as `Invalid PDF: <detail>`.
    InvalidPdf(String),
    /// The PDF is password-protected.
    PasswordRequired,
    /// The password provided was invalid.
    InvalidPassword,
    /// Displayed as `Encryption not supported: <detail>`.
    EncryptionNotSupported(String),
    /// The page with the given number was not found.
    PageNotFound(usize),
    /// Displayed as `Text extraction failed: <detail>`.
    TextExtractionFailed(String),
    /// Displayed as `Page rendering failed: <detail>`.
    RenderingFailed(String),
    /// Displayed as `Metadata extraction failed: <detail>`.
    MetadataExtractionFailed(String),
    /// Displayed as `I/O error: <detail>`.
    IOError(String),
}

impl fmt::Display for PdfError {
    /// Writes the fixed, human-readable message for the variant, followed by
    /// its detail payload where one exists.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            // Variants without a payload have a constant message.
            Self::PasswordRequired => f.write_str("PDF is password-protected"),
            Self::InvalidPassword => f.write_str("Invalid password provided"),
            Self::PageNotFound(page) => write!(f, "Page {} not found", page),
            Self::InvalidPdf(detail) => write!(f, "Invalid PDF: {}", detail),
            Self::EncryptionNotSupported(detail) => write!(f, "Encryption not supported: {}", detail),
            Self::TextExtractionFailed(detail) => write!(f, "Text extraction failed: {}", detail),
            Self::RenderingFailed(detail) => write!(f, "Page rendering failed: {}", detail),
            Self::MetadataExtractionFailed(detail) => write!(f, "Metadata extraction failed: {}", detail),
            Self::IOError(detail) => write!(f, "I/O error: {}", detail),
        }
    }
}

impl std::error::Error for PdfError {}
37
+
38
+ // NOTE: No From<std::io::Error> impl - IO errors must bubble up unchanged per error handling policy
39
+
40
+ impl From<lopdf::Error> for PdfError {
41
+ fn from(err: lopdf::Error) -> Self {
42
+ match err {
43
+ lopdf::Error::IO(io_err) => PdfError::IOError(io_err.to_string()),
44
+ _ => PdfError::InvalidPdf(err.to_string()),
45
+ }
46
+ }
47
+ }
48
+
49
+ pub type Result<T> = std::result::Result<T, PdfError>;
50
+
51
#[cfg(test)]
mod tests {
    use super::*;

    /// `InvalidPdf` shows its prefix followed by the detail text.
    #[test]
    fn test_invalid_pdf_error() {
        let rendered = PdfError::InvalidPdf(String::from("corrupted header")).to_string();
        assert_eq!(rendered, "Invalid PDF: corrupted header");
    }

    /// `PasswordRequired` has a fixed message.
    #[test]
    fn test_password_required_error() {
        assert_eq!(PdfError::PasswordRequired.to_string(), "PDF is password-protected");
    }

    /// `InvalidPassword` has a fixed message.
    #[test]
    fn test_invalid_password_error() {
        assert_eq!(PdfError::InvalidPassword.to_string(), "Invalid password provided");
    }

    /// `EncryptionNotSupported` shows its prefix followed by the detail text.
    #[test]
    fn test_encryption_not_supported_error() {
        let rendered = PdfError::EncryptionNotSupported(String::from("AES-256")).to_string();
        assert_eq!(rendered, "Encryption not supported: AES-256");
    }

    /// `PageNotFound` embeds the page number.
    #[test]
    fn test_page_not_found_error() {
        assert_eq!(PdfError::PageNotFound(5).to_string(), "Page 5 not found");
    }

    /// `TextExtractionFailed` shows its prefix followed by the detail text.
    #[test]
    fn test_text_extraction_failed_error() {
        let rendered = PdfError::TextExtractionFailed(String::from("no text layer")).to_string();
        assert_eq!(rendered, "Text extraction failed: no text layer");
    }

    /// `RenderingFailed` shows its prefix followed by the detail text.
    #[test]
    fn test_rendering_failed_error() {
        let rendered = PdfError::RenderingFailed(String::from("out of memory")).to_string();
        assert_eq!(rendered, "Page rendering failed: out of memory");
    }

    /// `MetadataExtractionFailed` shows its prefix followed by the detail text.
    #[test]
    fn test_metadata_extraction_failed_error() {
        let rendered = PdfError::MetadataExtractionFailed(String::from("invalid metadata")).to_string();
        assert_eq!(rendered, "Metadata extraction failed: invalid metadata");
    }

    /// `IOError` shows its prefix followed by the detail text.
    #[test]
    fn test_io_error() {
        let rendered = PdfError::IOError(String::from("read failed")).to_string();
        assert_eq!(rendered, "I/O error: read failed");
    }

    /// The derived `Debug` output names the variant.
    #[test]
    fn test_error_debug() {
        let rendered = format!("{:?}", PdfError::InvalidPassword);
        assert!(rendered.contains("InvalidPassword"));
    }

    /// A cloned error displays the same text as the original.
    #[test]
    fn test_error_clone() {
        let original = PdfError::PageNotFound(3);
        let duplicate = original.clone();
        assert_eq!(original.to_string(), duplicate.to_string());
    }
}
@@ -0,0 +1,139 @@
1
+ use super::error::{PdfError, Result};
2
+ use lopdf::Document;
3
+ use serde::{Deserialize, Serialize};
4
+
5
+ #[derive(Debug, Clone, Serialize, Deserialize)]
6
+ pub struct PdfImage {
7
+ pub page_number: usize,
8
+ pub image_index: usize,
9
+ pub width: i64,
10
+ pub height: i64,
11
+ pub color_space: Option<String>,
12
+ pub bits_per_component: Option<i64>,
13
+ pub filters: Vec<String>,
14
+ pub data: Vec<u8>,
15
+ }
16
+
17
+ #[derive(Debug)]
18
+ pub struct PdfImageExtractor {
19
+ document: Document,
20
+ }
21
+
22
+ impl PdfImageExtractor {
23
+ pub fn new(pdf_bytes: &[u8]) -> Result<Self> {
24
+ Self::new_with_password(pdf_bytes, None)
25
+ }
26
+
27
+ pub fn new_with_password(pdf_bytes: &[u8], password: Option<&str>) -> Result<Self> {
28
+ let mut doc =
29
+ Document::load_mem(pdf_bytes).map_err(|e| PdfError::InvalidPdf(format!("Failed to load PDF: {}", e)))?;
30
+
31
+ if doc.is_encrypted() {
32
+ if let Some(pwd) = password {
33
+ doc.decrypt(pwd).map_err(|_| PdfError::InvalidPassword)?;
34
+ } else {
35
+ return Err(PdfError::PasswordRequired);
36
+ }
37
+ }
38
+
39
+ Ok(Self { document: doc })
40
+ }
41
+
42
+ pub fn extract_images(&self) -> Result<Vec<PdfImage>> {
43
+ let mut all_images = Vec::new();
44
+ let pages = self.document.get_pages();
45
+
46
+ for (page_num, page_id) in pages.iter() {
47
+ let images = self
48
+ .document
49
+ .get_page_images(*page_id)
50
+ .map_err(|e| PdfError::MetadataExtractionFailed(format!("Failed to get page images: {}", e)))?;
51
+
52
+ for (img_index, img) in images.iter().enumerate() {
53
+ let filters = img.filters.clone().unwrap_or_default();
54
+
55
+ all_images.push(PdfImage {
56
+ page_number: *page_num as usize,
57
+ image_index: img_index + 1,
58
+ width: img.width,
59
+ height: img.height,
60
+ color_space: img.color_space.clone(),
61
+ bits_per_component: img.bits_per_component,
62
+ filters,
63
+ data: img.content.to_vec(),
64
+ });
65
+ }
66
+ }
67
+
68
+ Ok(all_images)
69
+ }
70
+
71
+ pub fn extract_images_from_page(&self, page_number: u32) -> Result<Vec<PdfImage>> {
72
+ let pages = self.document.get_pages();
73
+ let page_id = pages
74
+ .get(&page_number)
75
+ .ok_or(PdfError::PageNotFound(page_number as usize))?;
76
+
77
+ let images = self
78
+ .document
79
+ .get_page_images(*page_id)
80
+ .map_err(|e| PdfError::MetadataExtractionFailed(format!("Failed to get page images: {}", e)))?;
81
+
82
+ let mut page_images = Vec::new();
83
+ for (img_index, img) in images.iter().enumerate() {
84
+ let filters = img.filters.clone().unwrap_or_default();
85
+
86
+ page_images.push(PdfImage {
87
+ page_number: page_number as usize,
88
+ image_index: img_index + 1,
89
+ width: img.width,
90
+ height: img.height,
91
+ color_space: img.color_space.clone(),
92
+ bits_per_component: img.bits_per_component,
93
+ filters,
94
+ data: img.content.to_vec(),
95
+ });
96
+ }
97
+
98
+ Ok(page_images)
99
+ }
100
+
101
+ pub fn get_image_count(&self) -> Result<usize> {
102
+ let images = self.extract_images()?;
103
+ Ok(images.len())
104
+ }
105
+ }
106
+
107
+ pub fn extract_images_from_pdf(pdf_bytes: &[u8]) -> Result<Vec<PdfImage>> {
108
+ let extractor = PdfImageExtractor::new(pdf_bytes)?;
109
+ extractor.extract_images()
110
+ }
111
+
112
+ pub fn extract_images_from_pdf_with_password(pdf_bytes: &[u8], password: &str) -> Result<Vec<PdfImage>> {
113
+ let extractor = PdfImageExtractor::new_with_password(pdf_bytes, Some(password))?;
114
+ extractor.extract_images()
115
+ }
116
+
117
#[cfg(test)]
mod tests {
    use super::*;

    /// Bytes that are not a PDF are rejected with `InvalidPdf`.
    #[test]
    fn test_extractor_creation() {
        let outcome = PdfImageExtractor::new(b"not a pdf");
        assert!(outcome.is_err());
        assert!(matches!(outcome.unwrap_err(), PdfError::InvalidPdf(_)));
    }

    /// The convenience function fails on bytes that are not a PDF.
    #[test]
    fn test_extract_images_invalid_pdf() {
        assert!(extract_images_from_pdf(b"not a pdf").is_err());
    }

    /// The convenience function fails on empty input.
    #[test]
    fn test_extract_images_empty_pdf() {
        assert!(extract_images_from_pdf(b"").is_err());
    }
}