kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,332 @@
1
+ //! Support for reporting Rust memory usage to the Ruby GC.
2
+
3
+ use std::{
4
+ fmt::Formatter,
5
+ sync::{
6
+ Arc,
7
+ atomic::{AtomicIsize, Ordering},
8
+ },
9
+ };
10
+
11
#[cfg(ruby_engine = "mri")]
mod mri {
    use crate::{rb_gc_adjust_memory_usage, utils::is_ruby_vm_started};
    use std::alloc::{GlobalAlloc, Layout, System};

    /// Allocator that forwards every request to [`System`] and mirrors the
    /// size change to the Ruby GC, so the GC has a more accurate view of the
    /// process' memory usage when deciding whether to run.
    #[derive(Debug)]
    pub struct TrackingAllocator;

    /// Report `delta` to the GC, but only for a successful allocator call
    /// (`succeeded`) that actually changed the amount of memory in use.
    #[inline]
    fn report_change(succeeded: bool, delta: isize) {
        if succeeded && delta != 0 {
            TrackingAllocator::adjust_memory_usage(delta);
        }
    }

    impl TrackingAllocator {
        /// Create a new [`TrackingAllocator`].
        #[allow(clippy::new_without_default)]
        pub const fn new() -> Self {
            Self
        }

        /// Create a new [`TrackingAllocator`] with default values.
        pub const fn default() -> Self {
            Self::new()
        }

        /// Adjust the memory usage reported to the Ruby GC by `delta`. Useful for
        /// tracking allocations invisible to the Rust allocator, such as `mmap` or
        /// direct `malloc` calls.
        ///
        /// Returns the amount that was reported: `0` when `delta` is zero or
        /// when the Ruby VM has not been started, otherwise `delta`.
        ///
        /// # Example
        /// ```
        /// use rb_sys::TrackingAllocator;
        ///
        /// // Allocate 1024 bytes of memory using `mmap` or `malloc`...
        /// TrackingAllocator::adjust_memory_usage(1024);
        ///
        /// // ...and then after the memory is freed, adjust the memory usage again.
        /// TrackingAllocator::adjust_memory_usage(-1024);
        /// ```
        #[inline]
        pub fn adjust_memory_usage(delta: isize) -> isize {
            if delta == 0 {
                return 0;
            }

            // Convert to the fixed-width integer that matches the pointer size.
            #[cfg(target_pointer_width = "32")]
            let amount = delta as i32;

            #[cfg(target_pointer_width = "64")]
            let amount = delta as i64;

            unsafe {
                // Nothing is reported until the VM exists.
                if !is_ruby_vm_started() {
                    return 0;
                }

                // On Windows, ssize_t is i32 even on 64-bit, so cast i64 to i32
                #[cfg(all(target_pointer_width = "64", target_os = "windows"))]
                rb_gc_adjust_memory_usage(amount as i32);

                #[cfg(not(all(target_pointer_width = "64", target_os = "windows")))]
                rb_gc_adjust_memory_usage(amount);
            }

            amount as isize
        }
    }

    unsafe impl GlobalAlloc for TrackingAllocator {
        /// Allocate via [`System`]; on success report `layout.size()` bytes.
        #[inline]
        unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
            let block = System.alloc(layout);
            report_change(!block.is_null(), layout.size() as isize);
            block
        }

        /// Zeroed allocation via [`System`]; on success report `layout.size()` bytes.
        #[inline]
        unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 {
            let block = System.alloc_zeroed(layout);
            report_change(!block.is_null(), layout.size() as isize);
            block
        }

        /// Free via [`System`], then report `layout.size()` bytes as released.
        #[inline]
        unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
            System.dealloc(ptr, layout);
            report_change(true, -(layout.size() as isize));
        }

        /// Resize via [`System`]; on success report the signed size difference.
        #[inline]
        unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 {
            let block = System.realloc(ptr, layout, new_size);
            let growth = new_size as isize - layout.size() as isize;
            report_change(!block.is_null(), growth);
            block
        }
    }
}
125
+
126
#[cfg(not(ruby_engine = "mri"))]
mod non_mri {
    use std::alloc::{GlobalAlloc, Layout, System};

    /// A simple wrapper over [`System`] as a fallback for non-MRI Ruby engines.
    ///
    /// Nothing is reported to a GC here: every allocator call is forwarded to
    /// [`System`] unchanged. `Debug` is derived so the type offers the same
    /// traits as the MRI variant regardless of which engine is compiled in.
    #[derive(Debug)]
    pub struct TrackingAllocator;

    impl TrackingAllocator {
        /// Create a new [`TrackingAllocator`].
        #[allow(clippy::new_without_default)]
        pub const fn new() -> Self {
            Self
        }

        /// Create a new [`TrackingAllocator`] with default values.
        pub const fn default() -> Self {
            Self::new()
        }

        /// No-op counterpart of the MRI implementation: the delta is ignored
        /// and `0` is always returned, i.e. nothing was reported.
        pub fn adjust_memory_usage(_delta: isize) -> isize {
            0
        }
    }

    unsafe impl GlobalAlloc for TrackingAllocator {
        /// Forward to [`System::alloc`](GlobalAlloc::alloc).
        #[inline]
        unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
            System.alloc(layout)
        }

        /// Forward to [`System::alloc_zeroed`](GlobalAlloc::alloc_zeroed).
        #[inline]
        unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 {
            System.alloc_zeroed(layout)
        }

        /// Forward to [`System::dealloc`](GlobalAlloc::dealloc).
        #[inline]
        unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
            System.dealloc(ptr, layout)
        }

        /// Forward to [`System::realloc`](GlobalAlloc::realloc).
        #[inline]
        unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 {
            System.realloc(ptr, layout, new_size)
        }
    }
}
170
+
171
+ #[cfg(ruby_engine = "mri")]
172
+ pub use mri::*;
173
+
174
+ #[cfg(not(ruby_engine = "mri"))]
175
+ pub use non_mri::*;
176
+
177
/// Install [`TrackingAllocator`] as the `#[global_allocator]` of the crate
/// that invokes this macro.
///
/// The macro expands to a single static item, so it must be invoked at item
/// level (for example at the top of `lib.rs`).
///
/// # Example
/// ```
/// // File: ext/my_gem/src/lib.rs
/// use rb_sys::set_global_tracking_allocator;
///
/// set_global_tracking_allocator!();
/// ```
#[macro_export]
macro_rules! set_global_tracking_allocator {
    () => {
        #[global_allocator]
        static RUBY_GLOBAL_TRACKING_ALLOCATOR: $crate::tracking_allocator::TrackingAllocator =
            $crate::tracking_allocator::TrackingAllocator::new();
    };
}
194
+
195
+ #[derive(Debug)]
196
+ #[repr(transparent)]
197
+ struct MemsizeDelta(Arc<AtomicIsize>);
198
+
199
+ impl MemsizeDelta {
200
+ fn new(delta: isize) -> Self {
201
+ let delta = TrackingAllocator::adjust_memory_usage(delta);
202
+ Self(Arc::new(AtomicIsize::new(delta)))
203
+ }
204
+
205
+ fn add(&self, delta: usize) {
206
+ if delta == 0 {
207
+ return;
208
+ }
209
+
210
+ let delta = TrackingAllocator::adjust_memory_usage(delta as _);
211
+ self.0.fetch_add(delta as _, Ordering::SeqCst);
212
+ }
213
+
214
+ fn sub(&self, delta: usize) {
215
+ if delta == 0 {
216
+ return;
217
+ }
218
+
219
+ let delta = TrackingAllocator::adjust_memory_usage(-(delta as isize));
220
+ self.0.fetch_add(delta, Ordering::SeqCst);
221
+ }
222
+
223
+ fn get(&self) -> isize {
224
+ self.0.load(Ordering::SeqCst)
225
+ }
226
+ }
227
+
228
+ impl Clone for MemsizeDelta {
229
+ fn clone(&self) -> Self {
230
+ Self(Arc::clone(&self.0))
231
+ }
232
+ }
233
+
234
+ impl Drop for MemsizeDelta {
235
+ fn drop(&mut self) {
236
+ let memsize = self.0.swap(0, Ordering::SeqCst);
237
+ TrackingAllocator::adjust_memory_usage(0 - memsize);
238
+ }
239
+ }
240
+
241
+ /// A guard which adjusts the memory usage reported to the Ruby GC by `delta`.
242
+ /// This allows you to track resources which are invisible to the Rust
243
+ /// allocator, such as items that are known to internally use `mmap` or direct
244
+ /// `malloc` in their implementation.
245
+ ///
246
+ /// Internally, it uses an [`Arc<AtomicIsize>`] to track the memory usage delta,
247
+ /// and is safe to clone when `T` is [`Clone`].
248
+ ///
249
+ /// # Example
250
+ /// ```
251
+ /// use rb_sys::tracking_allocator::ManuallyTracked;
252
+ ///
253
+ /// type SomethingThatUsedMmap = ();
254
+ ///
255
+ /// // Will tell the Ruby GC that 1024 bytes were allocated.
256
+ /// let item = ManuallyTracked::new(SomethingThatUsedMmap, 1024);
257
+ ///
258
+ /// // Will tell the Ruby GC that 1024 bytes were freed.
259
+ /// std::mem::drop(item);
260
+ /// ```
261
+ pub struct ManuallyTracked<T> {
262
+ item: T,
263
+ memsize_delta: MemsizeDelta,
264
+ }
265
+
266
+ impl<T> ManuallyTracked<T> {
267
+ /// Create a new `ManuallyTracked<T>`, and immediately report that `memsize`
268
+ /// bytes were allocated.
269
+ pub fn wrap(item: T, memsize: usize) -> Self {
270
+ Self {
271
+ item,
272
+ memsize_delta: MemsizeDelta::new(memsize as _),
273
+ }
274
+ }
275
+
276
+ /// Increase the memory usage reported to the Ruby GC by `memsize` bytes.
277
+ pub fn increase_memory_usage(&self, memsize: usize) {
278
+ self.memsize_delta.add(memsize);
279
+ }
280
+
281
+ /// Decrease the memory usage reported to the Ruby GC by `memsize` bytes.
282
+ pub fn decrease_memory_usage(&self, memsize: usize) {
283
+ self.memsize_delta.sub(memsize);
284
+ }
285
+
286
+ /// Get the current memory usage delta.
287
+ pub fn memsize_delta(&self) -> isize {
288
+ self.memsize_delta.get()
289
+ }
290
+
291
+ /// Get a shared reference to the inner `T`.
292
+ pub fn get(&self) -> &T {
293
+ &self.item
294
+ }
295
+
296
+ /// Get a mutable reference to the inner `T`.
297
+ pub fn get_mut(&mut self) -> &mut T {
298
+ &mut self.item
299
+ }
300
+ }
301
+
302
+ impl ManuallyTracked<()> {
303
+ /// Create a new `ManuallyTracked<()>`, and immediately report that
304
+ /// `memsize` bytes were allocated.
305
+ pub fn new(memsize: usize) -> Self {
306
+ Self::wrap((), memsize)
307
+ }
308
+ }
309
+
310
+ impl Default for ManuallyTracked<()> {
311
+ fn default() -> Self {
312
+ Self::wrap((), 0)
313
+ }
314
+ }
315
+
316
+ impl<T: Clone> Clone for ManuallyTracked<T> {
317
+ fn clone(&self) -> Self {
318
+ Self {
319
+ item: self.item.clone(),
320
+ memsize_delta: self.memsize_delta.clone(),
321
+ }
322
+ }
323
+ }
324
+
325
+ impl<T: std::fmt::Debug> std::fmt::Debug for ManuallyTracked<T> {
326
+ fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
327
+ f.debug_struct("ManuallyTracked")
328
+ .field("item", &self.item)
329
+ .field("memsize_delta", &self.memsize_delta)
330
+ .finish()
331
+ }
332
+ }
@@ -0,0 +1,89 @@
1
+ //! Internal utility functions.
2
+
3
+ /// Check if the Ruby VM is globally available.
4
+ ///
5
+ /// Unfortunately there is no public API for this check, but there's a hidden
6
+ /// `ruby_current_vm_ptr` symbol in libruby 2.5 - 3.2 which we can use to
7
+ /// determine if the VM has been initialized, or shut down.
8
+ ///
9
+ /// # Notes
10
+ ///
11
+ /// Ruby 2.4 and below don't have a global VM pointer, so we can't check if it's
12
+ /// null. Ruby 2.4 is EOL, and support will be dropped soon anyway.
13
+ //
14
+ /// In Ruby 3.3, the global VM pointer is no longer exported, so there's no
15
+ /// simple way to check the global VM pointer, so instead we check if known
16
+ /// static value is non-zero.
17
+ ///
18
+ /// On Ruby < 3.3, we also need to check if the global VM pointer is null to
19
+ /// ensure the VM hasn't stopped, which makes the function name a bit of a
20
+ /// misnomer... but in actuality this function can only guarantee that the
21
+ /// VM is started, not that it's still running.
22
+ #[allow(dead_code)]
23
+ pub(crate) unsafe fn is_ruby_vm_started() -> bool {
24
+ #[cfg(ruby_engine = "mri")]
25
+ let ret = {
26
+ #[cfg(all(ruby_gt_2_4, ruby_lte_3_2))]
27
+ let ret = !crate::hidden::ruby_current_vm_ptr.is_null();
28
+
29
+ #[cfg(any(ruby_lte_2_4, ruby_gt_3_2))]
30
+ let ret = crate::rb_cBasicObject != 0;
31
+
32
+ ret
33
+ };
34
+
35
+ #[cfg(ruby_engine = "truffleruby")]
36
+ let ret = crate::rb_cBasicObject != 0;
37
+
38
+ ret
39
+ }
40
+
41
/// Assert that a Ruby value is a non-special-const object of the expected
/// builtin type, but only when Ruby was built with `RUBY_DEBUG` (exposed here
/// as the `ruby_ruby_debug = "true"` cfg). This matches Ruby's behavior of
/// only checking types in debug mode.
///
/// Without that cfg the three arguments are evaluated and discarded, so no
/// check is performed. With it, the value expression appears twice in the
/// assertion and is therefore evaluated more than once.
#[macro_export]
macro_rules! debug_ruby_assert_type {
    ($value:expr, $expected:expr, $msg:expr) => {
        #[cfg(not(ruby_ruby_debug = "true"))]
        {
            // Prevent unused variable warnings
            let _ = ($value, $expected, $msg);
        }
        #[cfg(ruby_ruby_debug = "true")]
        {
            #[allow(clippy::macro_metavars_in_unsafe)]
            unsafe {
                assert!(
                    !$crate::SPECIAL_CONST_P($value)
                        && $crate::RB_BUILTIN_TYPE($value) == $expected,
                    $msg
                );
            }
        }
    };
}
62
+
63
#[cfg(test)]
mod tests {
    use super::*;
    use rusty_fork::rusty_fork_test;

    // NOTE(review): `rusty_fork_test!` presumably isolates the test in its own
    // process, since setting up the VM is process-global — confirm.
    rusty_fork_test! {
        #[test]
        fn test_is_ruby_vm_started() {
            // Before any setup the VM must not be reported as started.
            let started_before = unsafe { is_ruby_vm_started() };
            assert!(!started_before);

            #[cfg(windows)]
            {
                let mut argc = 0;
                let mut argv: [*mut std::os::raw::c_char; 0] = [];
                let mut argv = argv.as_mut_ptr();
                unsafe { rb_sys::rb_w32_sysinit(&mut argc, &mut argv) };
            }

            // A non-zero status from `ruby_setup` means setup failed.
            let status = unsafe { crate::ruby_setup() };
            if status != 0 {
                panic!("Failed to setup Ruby (error code: {})", status);
            }

            let started_after = unsafe { is_ruby_vm_started() };
            assert!(started_after);
        }
    }
}
@@ -0,0 +1,7 @@
1
+ #![allow(rustdoc::broken_intra_doc_links)]
2
+ //! Definitions for Ruby's special constants.
3
+ //!
4
+ //! Makes it easier to reference important Ruby constants, without having to dig
5
+ //! around in bindgen's output.
6
+
7
+ pub use crate::ruby_value_type::*;