kreuzberg 4.0.0.pre.rc.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +538 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +157 -0
- data/README.md +426 -0
- data/Rakefile +25 -0
- data/Steepfile +47 -0
- data/examples/async_patterns.rb +341 -0
- data/ext/kreuzberg_rb/extconf.rb +45 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
- data/ext/kreuzberg_rb/native/README.md +425 -0
- data/ext/kreuzberg_rb/native/build.rs +15 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
- data/ext/kreuzberg_rb/native/include/strings.h +20 -0
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
- data/extconf.rb +28 -0
- data/kreuzberg.gemspec +148 -0
- data/lib/kreuzberg/api_proxy.rb +142 -0
- data/lib/kreuzberg/cache_api.rb +46 -0
- data/lib/kreuzberg/cli.rb +55 -0
- data/lib/kreuzberg/cli_proxy.rb +127 -0
- data/lib/kreuzberg/config.rb +691 -0
- data/lib/kreuzberg/error_context.rb +32 -0
- data/lib/kreuzberg/errors.rb +118 -0
- data/lib/kreuzberg/extraction_api.rb +85 -0
- data/lib/kreuzberg/mcp_proxy.rb +186 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
- data/lib/kreuzberg/post_processor_protocol.rb +86 -0
- data/lib/kreuzberg/result.rb +216 -0
- data/lib/kreuzberg/setup_lib_path.rb +80 -0
- data/lib/kreuzberg/validator_protocol.rb +89 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +103 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +520 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_spec.rb +345 -0
- data/spec/binding/config_validation_spec.rb +283 -0
- data/spec/binding/error_handling_spec.rb +213 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +274 -0
- data/spec/fixtures/config.toml +39 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +4 -0
- data/spec/smoke/package_spec.rb +178 -0
- data/spec/spec_helper.rb +42 -0
- data/vendor/kreuzberg/Cargo.toml +204 -0
- data/vendor/kreuzberg/README.md +175 -0
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
- data/vendor/kreuzberg/build.rs +474 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -0
- data/vendor/kreuzberg/src/api/handlers.rs +199 -0
- data/vendor/kreuzberg/src/api/mod.rs +79 -0
- data/vendor/kreuzberg/src/api/server.rs +353 -0
- data/vendor/kreuzberg/src/api/types.rs +170 -0
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
- data/vendor/kreuzberg/src/core/config.rs +1032 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -0
- data/vendor/kreuzberg/src/core/mime.rs +605 -0
- data/vendor/kreuzberg/src/core/mod.rs +45 -0
- data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
- data/vendor/kreuzberg/src/embeddings.rs +432 -0
- data/vendor/kreuzberg/src/error.rs +431 -0
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
- data/vendor/kreuzberg/src/extraction/email.rs +854 -0
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
- data/vendor/kreuzberg/src/extraction/html.rs +553 -0
- data/vendor/kreuzberg/src/extraction/image.rs +368 -0
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
- data/vendor/kreuzberg/src/extraction/table.rs +328 -0
- data/vendor/kreuzberg/src/extraction/text.rs +269 -0
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
- data/vendor/kreuzberg/src/extractors/email.rs +143 -0
- data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
- data/vendor/kreuzberg/src/extractors/html.rs +393 -0
- data/vendor/kreuzberg/src/extractors/image.rs +198 -0
- data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
- data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
- data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
- data/vendor/kreuzberg/src/extractors/security.rs +484 -0
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
- data/vendor/kreuzberg/src/extractors/text.rs +260 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
- data/vendor/kreuzberg/src/image/dpi.rs +164 -0
- data/vendor/kreuzberg/src/image/mod.rs +6 -0
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
- data/vendor/kreuzberg/src/image/resize.rs +89 -0
- data/vendor/kreuzberg/src/keywords/config.rs +154 -0
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
- data/vendor/kreuzberg/src/keywords/types.rs +68 -0
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
- data/vendor/kreuzberg/src/lib.rs +105 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
- data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
- data/vendor/kreuzberg/src/ocr/error.rs +37 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
- data/vendor/kreuzberg/src/ocr/types.rs +393 -0
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
- data/vendor/kreuzberg/src/panic_context.rs +154 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
- data/vendor/kreuzberg/src/pdf/table.rs +393 -0
- data/vendor/kreuzberg/src/pdf/text.rs +158 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
- data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
- data/vendor/kreuzberg/src/text/mod.rs +19 -0
- data/vendor/kreuzberg/src/text/quality.rs +697 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
- data/vendor/kreuzberg/src/types.rs +903 -0
- data/vendor/kreuzberg/src/utils/mod.rs +17 -0
- data/vendor/kreuzberg/src/utils/quality.rs +959 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -0
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
- data/vendor/kreuzberg/tests/config_features.rs +598 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
- data/vendor/kreuzberg/tests/core_integration.rs +510 -0
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -0
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -0
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
- data/vendor/kreuzberg/tests/security_validation.rs +415 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
- data/vendor/rb-sys/.cargo-ok +1 -0
- data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
- data/vendor/rb-sys/Cargo.lock +393 -0
- data/vendor/rb-sys/Cargo.toml +70 -0
- data/vendor/rb-sys/Cargo.toml.orig +57 -0
- data/vendor/rb-sys/LICENSE-APACHE +190 -0
- data/vendor/rb-sys/LICENSE-MIT +21 -0
- data/vendor/rb-sys/bin/release.sh +21 -0
- data/vendor/rb-sys/build/features.rs +108 -0
- data/vendor/rb-sys/build/main.rs +246 -0
- data/vendor/rb-sys/build/stable_api_config.rs +153 -0
- data/vendor/rb-sys/build/version.rs +48 -0
- data/vendor/rb-sys/readme.md +36 -0
- data/vendor/rb-sys/src/bindings.rs +21 -0
- data/vendor/rb-sys/src/hidden.rs +11 -0
- data/vendor/rb-sys/src/lib.rs +34 -0
- data/vendor/rb-sys/src/macros.rs +371 -0
- data/vendor/rb-sys/src/memory.rs +53 -0
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
- data/vendor/rb-sys/src/special_consts.rs +31 -0
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
- data/vendor/rb-sys/src/stable_api.rs +261 -0
- data/vendor/rb-sys/src/symbol.rs +31 -0
- data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
- data/vendor/rb-sys/src/utils.rs +89 -0
- data/vendor/rb-sys/src/value_type.rs +7 -0
- metadata +536 -0
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
//! Support for reporting Rust memory usage to the Ruby GC.
|
|
2
|
+
|
|
3
|
+
use std::{
|
|
4
|
+
fmt::Formatter,
|
|
5
|
+
sync::{
|
|
6
|
+
Arc,
|
|
7
|
+
atomic::{AtomicIsize, Ordering},
|
|
8
|
+
},
|
|
9
|
+
};
|
|
10
|
+
|
|
11
|
+
#[cfg(ruby_engine = "mri")]
mod mri {
    use crate::{rb_gc_adjust_memory_usage, utils::is_ruby_vm_started};
    use std::alloc::{GlobalAlloc, Layout, System};

    /// A simple wrapper over [`System`] which reports memory usage to
    /// the Ruby GC. This gives the GC a more accurate picture of the process'
    /// memory usage so it can make better decisions about when to run.
    #[derive(Debug)]
    pub struct TrackingAllocator;

    impl TrackingAllocator {
        /// Create a new [`TrackingAllocator`].
        #[allow(clippy::new_without_default)]
        pub const fn new() -> Self {
            Self
        }

        /// Create a new [`TrackingAllocator`] with default values.
        ///
        /// This is an inherent `const fn` (not the `Default` trait) so it can
        /// be used in `static` initializers.
        pub const fn default() -> Self {
            Self::new()
        }

        /// Adjust the memory usage reported to the Ruby GC by `delta`. Useful for
        /// tracking allocations invisible to the Rust allocator, such as `mmap` or
        /// direct `malloc` calls.
        ///
        /// Returns the delta that was actually reported: `delta` when the Ruby
        /// VM is started, `0` when `delta` is zero or the VM is not started (in
        /// which case nothing is reported).
        ///
        /// # Example
        /// ```
        /// use rb_sys::tracking_allocator::TrackingAllocator;
        ///
        /// // Allocate 1024 bytes of memory using `mmap` or `malloc`...
        /// TrackingAllocator::adjust_memory_usage(1024);
        ///
        /// // ...and then after the memory is freed, adjust the memory usage again.
        /// TrackingAllocator::adjust_memory_usage(-1024);
        /// ```
        #[inline]
        pub fn adjust_memory_usage(delta: isize) -> isize {
            if delta == 0 {
                return 0;
            }

            #[cfg(target_pointer_width = "32")]
            let delta = delta as i32;

            #[cfg(target_pointer_width = "64")]
            let delta = delta as i64;

            // SAFETY: `is_ruby_vm_started` is an `unsafe fn` that only inspects
            // global Ruby state, and the GC is only notified once it reports the
            // VM as started. NOTE(review): assumes `rb_gc_adjust_memory_usage`
            // may be called from whichever thread is allocating -- confirm.
            unsafe {
                if is_ruby_vm_started() {
                    // On Windows, ssize_t is i32 even on 64-bit. A plain
                    // `delta as i32` would wrap for |delta| > i32::MAX and report
                    // the wrong size (even the wrong sign), so report the delta
                    // in i32-sized chunks instead.
                    #[cfg(all(target_pointer_width = "64", target_os = "windows"))]
                    {
                        let mut remaining = delta;
                        while remaining != 0 {
                            let chunk = remaining.clamp(i64::from(i32::MIN), i64::from(i32::MAX));
                            // `chunk` is within the i32 range, so this cast is lossless.
                            rb_gc_adjust_memory_usage(chunk as i32);
                            remaining -= chunk;
                        }
                    }

                    #[cfg(not(all(target_pointer_width = "64", target_os = "windows")))]
                    rb_gc_adjust_memory_usage(delta);

                    delta as isize
                } else {
                    0
                }
            }
        }
    }

    // Every method forwards to `System` and then reports the size change to the
    // Ruby GC. Failed (null) allocations and zero-sized changes are not reported.
    unsafe impl GlobalAlloc for TrackingAllocator {
        /// Allocate via [`System`] and report `layout.size()` bytes as allocated.
        #[inline]
        unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
            let ret = System.alloc(layout);
            let delta = layout.size() as isize;

            if !ret.is_null() && delta != 0 {
                Self::adjust_memory_usage(delta);
            }

            ret
        }

        /// Allocate zeroed memory via [`System`] and report `layout.size()`
        /// bytes as allocated.
        #[inline]
        unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 {
            let ret = System.alloc_zeroed(layout);
            let delta = layout.size() as isize;

            if !ret.is_null() && delta != 0 {
                Self::adjust_memory_usage(delta);
            }

            ret
        }

        /// Free via [`System`] and report `layout.size()` bytes as released.
        #[inline]
        unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
            System.dealloc(ptr, layout);
            let delta = -(layout.size() as isize);

            if delta != 0 {
                Self::adjust_memory_usage(delta);
            }
        }

        /// Resize via [`System`] and report the difference between the new and
        /// old size. Nothing is reported when the reallocation fails.
        #[inline]
        unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 {
            let ret = System.realloc(ptr, layout, new_size);
            let delta = new_size as isize - layout.size() as isize;

            if !ret.is_null() && delta != 0 {
                Self::adjust_memory_usage(delta);
            }

            ret
        }
    }
}
|
|
125
|
+
|
|
126
|
+
#[cfg(not(ruby_engine = "mri"))]
mod non_mri {
    use std::alloc::{GlobalAlloc, Layout, System};

    /// A simple wrapper over [`System`] as a fallback for non-MRI Ruby engines.
    ///
    /// It forwards every request to [`System`] unchanged and never reports
    /// anything to a garbage collector.
    #[derive(Debug)]
    pub struct TrackingAllocator;

    impl TrackingAllocator {
        /// Create a new [`TrackingAllocator`].
        #[allow(clippy::new_without_default)]
        pub const fn new() -> Self {
            Self
        }

        /// Create a new [`TrackingAllocator`] with default values.
        ///
        /// This is an inherent `const fn` (not the `Default` trait) so it can
        /// be used in `static` initializers.
        pub const fn default() -> Self {
            Self::new()
        }

        /// No-op counterpart of the MRI implementation: nothing is reported,
        /// so the reported delta is always `0`.
        #[inline]
        pub fn adjust_memory_usage(_delta: isize) -> isize {
            0
        }
    }

    // Pure pass-through to the system allocator.
    unsafe impl GlobalAlloc for TrackingAllocator {
        #[inline]
        unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
            System.alloc(layout)
        }

        #[inline]
        unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 {
            System.alloc_zeroed(layout)
        }

        #[inline]
        unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
            System.dealloc(ptr, layout)
        }

        #[inline]
        unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 {
            System.realloc(ptr, layout, new_size)
        }
    }
}
|
|
170
|
+
|
|
171
|
+
#[cfg(ruby_engine = "mri")]
|
|
172
|
+
pub use mri::*;
|
|
173
|
+
|
|
174
|
+
#[cfg(not(ruby_engine = "mri"))]
|
|
175
|
+
pub use non_mri::*;
|
|
176
|
+
|
|
177
|
+
/// Install [`TrackingAllocator`] as the `#[global_allocator]` of the crate
/// that invokes this macro.
///
/// The macro expands to a single `static` item, so it must be invoked at item
/// (module) level.
///
/// # Example
/// ```
/// // File: ext/my_gem/src/lib.rs
/// use rb_sys::set_global_tracking_allocator;
///
/// set_global_tracking_allocator!();
/// ```
#[macro_export]
macro_rules! set_global_tracking_allocator {
    () => {
        #[global_allocator]
        static RUBY_GLOBAL_TRACKING_ALLOCATOR: $crate::tracking_allocator::TrackingAllocator =
            $crate::tracking_allocator::TrackingAllocator::new();
    };
}
|
|
194
|
+
|
|
195
|
+
#[derive(Debug)]
|
|
196
|
+
#[repr(transparent)]
|
|
197
|
+
struct MemsizeDelta(Arc<AtomicIsize>);
|
|
198
|
+
|
|
199
|
+
impl MemsizeDelta {
|
|
200
|
+
fn new(delta: isize) -> Self {
|
|
201
|
+
let delta = TrackingAllocator::adjust_memory_usage(delta);
|
|
202
|
+
Self(Arc::new(AtomicIsize::new(delta)))
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
fn add(&self, delta: usize) {
|
|
206
|
+
if delta == 0 {
|
|
207
|
+
return;
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
let delta = TrackingAllocator::adjust_memory_usage(delta as _);
|
|
211
|
+
self.0.fetch_add(delta as _, Ordering::SeqCst);
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
fn sub(&self, delta: usize) {
|
|
215
|
+
if delta == 0 {
|
|
216
|
+
return;
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
let delta = TrackingAllocator::adjust_memory_usage(-(delta as isize));
|
|
220
|
+
self.0.fetch_add(delta, Ordering::SeqCst);
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
fn get(&self) -> isize {
|
|
224
|
+
self.0.load(Ordering::SeqCst)
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
impl Clone for MemsizeDelta {
|
|
229
|
+
fn clone(&self) -> Self {
|
|
230
|
+
Self(Arc::clone(&self.0))
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
impl Drop for MemsizeDelta {
|
|
235
|
+
fn drop(&mut self) {
|
|
236
|
+
let memsize = self.0.swap(0, Ordering::SeqCst);
|
|
237
|
+
TrackingAllocator::adjust_memory_usage(0 - memsize);
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
/// A guard which adjusts the memory usage reported to the Ruby GC by `delta`.
/// This allows you to track resources which are invisible to the Rust
/// allocator, such as items that are known to internally use `mmap` or direct
/// `malloc` in their implementation.
///
/// Internally, it uses a reference-counted atomic counter to track the memory
/// usage delta, and is safe to clone when `T` is [`Clone`].
///
/// Use [`ManuallyTracked::wrap`] to guard a value, or [`ManuallyTracked::new`]
/// when there is no value to hold on to.
///
/// # Example
/// ```
/// use rb_sys::tracking_allocator::ManuallyTracked;
///
/// struct SomethingThatUsedMmap;
///
/// // Will tell the Ruby GC that 1024 bytes were allocated.
/// let item = ManuallyTracked::wrap(SomethingThatUsedMmap, 1024);
///
/// // Will tell the Ruby GC that 1024 bytes were freed.
/// std::mem::drop(item);
/// ```
pub struct ManuallyTracked<T> {
    // The wrapped value.
    item: T,
    // Bytes reported to the Ruby GC on behalf of `item`.
    memsize_delta: MemsizeDelta,
}
|
|
265
|
+
|
|
266
|
+
impl<T> ManuallyTracked<T> {
|
|
267
|
+
/// Create a new `ManuallyTracked<T>`, and immediately report that `memsize`
|
|
268
|
+
/// bytes were allocated.
|
|
269
|
+
pub fn wrap(item: T, memsize: usize) -> Self {
|
|
270
|
+
Self {
|
|
271
|
+
item,
|
|
272
|
+
memsize_delta: MemsizeDelta::new(memsize as _),
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
/// Increase the memory usage reported to the Ruby GC by `memsize` bytes.
|
|
277
|
+
pub fn increase_memory_usage(&self, memsize: usize) {
|
|
278
|
+
self.memsize_delta.add(memsize);
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
/// Decrease the memory usage reported to the Ruby GC by `memsize` bytes.
|
|
282
|
+
pub fn decrease_memory_usage(&self, memsize: usize) {
|
|
283
|
+
self.memsize_delta.sub(memsize);
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
/// Get the current memory usage delta.
|
|
287
|
+
pub fn memsize_delta(&self) -> isize {
|
|
288
|
+
self.memsize_delta.get()
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
/// Get a shared reference to the inner `T`.
|
|
292
|
+
pub fn get(&self) -> &T {
|
|
293
|
+
&self.item
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
/// Get a mutable reference to the inner `T`.
|
|
297
|
+
pub fn get_mut(&mut self) -> &mut T {
|
|
298
|
+
&mut self.item
|
|
299
|
+
}
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
impl ManuallyTracked<()> {
|
|
303
|
+
/// Create a new `ManuallyTracked<()>`, and immediately report that
|
|
304
|
+
/// `memsize` bytes were allocated.
|
|
305
|
+
pub fn new(memsize: usize) -> Self {
|
|
306
|
+
Self::wrap((), memsize)
|
|
307
|
+
}
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
impl Default for ManuallyTracked<()> {
|
|
311
|
+
fn default() -> Self {
|
|
312
|
+
Self::wrap((), 0)
|
|
313
|
+
}
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
impl<T: Clone> Clone for ManuallyTracked<T> {
|
|
317
|
+
fn clone(&self) -> Self {
|
|
318
|
+
Self {
|
|
319
|
+
item: self.item.clone(),
|
|
320
|
+
memsize_delta: self.memsize_delta.clone(),
|
|
321
|
+
}
|
|
322
|
+
}
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
impl<T: std::fmt::Debug> std::fmt::Debug for ManuallyTracked<T> {
|
|
326
|
+
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
|
327
|
+
f.debug_struct("ManuallyTracked")
|
|
328
|
+
.field("item", &self.item)
|
|
329
|
+
.field("memsize_delta", &self.memsize_delta)
|
|
330
|
+
.finish()
|
|
331
|
+
}
|
|
332
|
+
}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
//! Internal utility functions.
|
|
2
|
+
|
|
3
|
+
/// Check if the Ruby VM is globally available.
///
/// Unfortunately there is no public API for this check, but there's a hidden
/// `ruby_current_vm_ptr` symbol in libruby 2.5 - 3.2 which we can use to
/// determine if the VM has been initialized, or shut down.
///
/// # Notes
///
/// Ruby 2.4 and below don't have a global VM pointer, so we can't check if it's
/// null. Ruby 2.4 is EOL, and support will be dropped soon anyway.
///
/// In Ruby 3.3, the global VM pointer is no longer exported, so there's no
/// simple way to check the global VM pointer, so instead we check if a known
/// static value (`rb_cBasicObject`) is non-zero.
///
/// On Ruby < 3.3, we also need to check if the global VM pointer is null to
/// ensure the VM hasn't stopped, which makes the function name a bit of a
/// misnomer... but in actuality this function can only guarantee that the
/// VM is started, not that it's still running.
///
/// # Safety
///
/// This function reads global Ruby state (`ruby_current_vm_ptr` or
/// `rb_cBasicObject`) directly. NOTE(review): presumably the caller must make
/// sure nothing is concurrently initializing or tearing down the VM -- confirm.
#[allow(dead_code)]
pub(crate) unsafe fn is_ruby_vm_started() -> bool {
    #[cfg(ruby_engine = "mri")]
    let ret = {
        // Ruby 2.5 - 3.2: the hidden global VM pointer is available.
        #[cfg(all(ruby_gt_2_4, ruby_lte_3_2))]
        let ret = !crate::hidden::ruby_current_vm_ptr.is_null();

        // Ruby <= 2.4 and >= 3.3: fall back to checking a known static value.
        #[cfg(any(ruby_lte_2_4, ruby_gt_3_2))]
        let ret = crate::rb_cBasicObject != 0;

        ret
    };

    // TruffleRuby uses the same known-static-value check.
    #[cfg(ruby_engine = "truffleruby")]
    let ret = crate::rb_cBasicObject != 0;

    ret
}
|
|
40
|
+
|
|
41
|
+
/// Macro for conditionally asserting type checks in Ruby, only active when RUBY_DEBUG is enabled.
/// This matches Ruby's behavior of only checking types in debug mode.
///
/// Takes the object to check (`$obj`), the expected builtin type (`$type`) and
/// the assertion message (`$message`). When the `ruby_ruby_debug = "true"` cfg
/// is not set, the arguments are only evaluated and discarded.
#[macro_export]
macro_rules! debug_ruby_assert_type {
    ($obj:expr, $type:expr, $message:expr) => {
        // Debug builds of Ruby: the object must not be a special constant and
        // its builtin type must equal `$type`.
        #[cfg(ruby_ruby_debug = "true")]
        {
            #[allow(clippy::macro_metavars_in_unsafe)]
            unsafe {
                assert!(
                    !$crate::SPECIAL_CONST_P($obj) && $crate::RB_BUILTIN_TYPE($obj) == $type,
                    $message
                );
            }
        }
        // Otherwise: no check is performed.
        #[cfg(not(ruby_ruby_debug = "true"))]
        {
            let _ = ($obj, $type, $message); // Prevent unused variable warnings
        }
    };
}
|
|
62
|
+
|
|
63
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use rusty_fork::rusty_fork_test;

    // Declared through `rusty_fork_test!` -- presumably so the test gets its
    // own process, since it sets up the Ruby VM; confirm against rusty_fork docs.
    rusty_fork_test! {
        /// `is_ruby_vm_started` must be false before `ruby_setup` and true after it.
        #[test]
        fn test_is_ruby_vm_started() {
            // Nothing has set up the VM yet.
            assert!(!unsafe { is_ruby_vm_started() });

            // Windows-only initialization, run before `ruby_setup`.
            #[cfg(windows)]
            {
                let mut argc = 0;
                let mut argv: [*mut std::os::raw::c_char; 0] = [];
                let mut argv = argv.as_mut_ptr();
                unsafe { rb_sys::rb_w32_sysinit(&mut argc, &mut argv) };
            }

            // A zero return code is treated as success; anything else fails the test.
            match unsafe { crate::ruby_setup() } {
                0 => {}
                code => panic!("Failed to setup Ruby (error code: {})", code),
            };

            assert!(unsafe { is_ruby_vm_started() });
        }
    }
}
|