parquet-tyfoom 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/Cargo.lock +1854 -0
  3. data/Cargo.toml +3 -0
  4. data/Gemfile +21 -0
  5. data/LICENSE +21 -0
  6. data/README.md +428 -0
  7. data/Rakefile +43 -0
  8. data/ext/parquet/Cargo.toml +39 -0
  9. data/ext/parquet/build.rs +5 -0
  10. data/ext/parquet/extconf.rb +4 -0
  11. data/ext/parquet/src/adapter_ffi.rs +297 -0
  12. data/ext/parquet/src/allocator.rs +13 -0
  13. data/ext/parquet/src/lib.rs +24 -0
  14. data/ext/parquet-core/Cargo.toml +24 -0
  15. data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
  16. data/ext/parquet-core/src/error.rs +189 -0
  17. data/ext/parquet-core/src/lib.rs +60 -0
  18. data/ext/parquet-core/src/reader.rs +368 -0
  19. data/ext/parquet-core/src/schema.rs +452 -0
  20. data/ext/parquet-core/src/test_utils.rs +308 -0
  21. data/ext/parquet-core/src/traits/mod.rs +5 -0
  22. data/ext/parquet-core/src/traits/schema.rs +190 -0
  23. data/ext/parquet-core/src/value.rs +220 -0
  24. data/ext/parquet-core/src/writer.rs +1241 -0
  25. data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
  26. data/ext/parquet-core/tests/binary_data.rs +437 -0
  27. data/ext/parquet-core/tests/column_projection.rs +557 -0
  28. data/ext/parquet-core/tests/complex_types.rs +821 -0
  29. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  30. data/ext/parquet-core/tests/concurrent_access.rs +431 -0
  31. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  32. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  33. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
  34. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  35. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  36. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  37. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  38. data/ext/parquet-core/tests/review_regressions.rs +787 -0
  39. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  40. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
  41. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  42. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  43. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  44. data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
  45. data/ext/parquet-ruby-adapter/build.rs +5 -0
  46. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  47. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  48. data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
  49. data/ext/parquet-ruby-adapter/src/error.rs +141 -0
  50. data/ext/parquet-ruby-adapter/src/io.rs +432 -0
  51. data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
  52. data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
  53. data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
  54. data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
  55. data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
  56. data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
  57. data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
  58. data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
  59. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  60. data/ext/parquet-ruby-adapter/src/types.rs +98 -0
  61. data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
  62. data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
  63. data/lib/parquet/schema.rb +262 -0
  64. data/lib/parquet/version.rb +3 -0
  65. data/lib/parquet.rb +11 -0
  66. data/lib/parquet.rbi +181 -0
  67. metadata +165 -0
@@ -0,0 +1,91 @@
1
+ use crate::{error::Result, RubyAdapterError};
2
+ use magnus::{value::ReprValue, IntoValue, Ruby, Value};
3
+
4
+ /// Trait for converting Rust values to Ruby values with error handling
5
+ ///
6
+ /// This is similar to Magnus's `IntoValue` trait but allows for returning errors
7
+ /// instead of panicking or returning invalid values.
8
+ pub trait TryIntoValue: Sized {
9
+ /// Convert `self` to a Ruby value with error handling
10
+ fn try_into_value(self, handle: &Ruby) -> Result<Value>;
11
+
12
+ /// Convert `self` to a Ruby value with error handling, using the Ruby runtime from the current thread
13
+ fn try_into_value_with_current_thread(self) -> Result<Value> {
14
+ let ruby =
15
+ Ruby::get().map_err(|_| RubyAdapterError::runtime("Failed to get Ruby runtime"))?;
16
+ self.try_into_value(&ruby)
17
+ }
18
+ }
19
+
20
+ // Note: We don't provide a blanket implementation for all IntoValue types
21
+ // because some types may want to provide custom error handling.
22
+ // Types that need TryIntoValue should implement it explicitly.
23
+
24
+ // Convenience implementations for common types
25
+ impl TryIntoValue for String {
26
+ fn try_into_value(self, handle: &Ruby) -> Result<Value> {
27
+ Ok(self.into_value_with(handle))
28
+ }
29
+ }
30
+
31
+ impl TryIntoValue for &str {
32
+ fn try_into_value(self, handle: &Ruby) -> Result<Value> {
33
+ Ok(self.into_value_with(handle))
34
+ }
35
+ }
36
+
37
+ impl TryIntoValue for i32 {
38
+ fn try_into_value(self, handle: &Ruby) -> Result<Value> {
39
+ Ok(self.into_value_with(handle))
40
+ }
41
+ }
42
+
43
+ impl TryIntoValue for i64 {
44
+ fn try_into_value(self, handle: &Ruby) -> Result<Value> {
45
+ Ok(self.into_value_with(handle))
46
+ }
47
+ }
48
+
49
+ impl TryIntoValue for f32 {
50
+ fn try_into_value(self, handle: &Ruby) -> Result<Value> {
51
+ Ok(self.into_value_with(handle))
52
+ }
53
+ }
54
+
55
+ impl TryIntoValue for f64 {
56
+ fn try_into_value(self, handle: &Ruby) -> Result<Value> {
57
+ Ok(self.into_value_with(handle))
58
+ }
59
+ }
60
+
61
+ impl TryIntoValue for bool {
62
+ fn try_into_value(self, handle: &Ruby) -> Result<Value> {
63
+ Ok(self.into_value_with(handle))
64
+ }
65
+ }
66
+
67
+ impl<T> TryIntoValue for Vec<T>
68
+ where
69
+ T: TryIntoValue,
70
+ {
71
+ fn try_into_value(self, handle: &Ruby) -> Result<Value> {
72
+ let array = handle.ary_new();
73
+ for item in self {
74
+ let ruby_value = item.try_into_value(handle)?;
75
+ array.push(ruby_value)?;
76
+ }
77
+ Ok(handle.into_value(array))
78
+ }
79
+ }
80
+
81
+ impl<T> TryIntoValue for Option<T>
82
+ where
83
+ T: TryIntoValue,
84
+ {
85
+ fn try_into_value(self, handle: &Ruby) -> Result<Value> {
86
+ match self {
87
+ Some(value) => value.try_into_value(handle),
88
+ None => Ok(handle.qnil().as_value()),
89
+ }
90
+ }
91
+ }
@@ -0,0 +1,98 @@
1
+ use crate::string_storage::StringStorageConfig;
2
+ use magnus::Value;
3
+ use std::fs::File;
4
+ use std::str::FromStr;
5
+ use tempfile::NamedTempFile;
6
+
7
+ /// Arguments for writing Parquet files
8
+ #[derive(Debug)]
9
+ pub struct ParquetWriteArgs {
10
+ pub read_from: Value,
11
+ pub write_to: Value,
12
+ pub schema_value: Value,
13
+ pub batch_size: Option<usize>,
14
+ pub flush_threshold: Option<usize>,
15
+ pub compression: Option<String>,
16
+ pub sample_size: Option<usize>,
17
+ pub logger: Option<Value>,
18
+ /// Requested string-cache capacity; `None` means the cache is disabled.
19
+ pub string_cache: Option<usize>,
20
+ }
21
+
22
+ /// Arguments for creating row enumerators
23
+ pub struct RowEnumeratorArgs {
24
+ pub rb_self: Value,
25
+ pub to_read: Value,
26
+ pub result_type: ParserResultType,
27
+ pub columns: Option<Vec<String>>,
28
+ pub strict: bool,
29
+ pub string_storage: StringStorageConfig,
30
+ pub logger: Option<Value>,
31
+ }
32
+
33
+ /// Arguments for creating column enumerators
34
+ pub struct ColumnEnumeratorArgs {
35
+ pub rb_self: Value,
36
+ pub to_read: Value,
37
+ pub result_type: ParserResultType,
38
+ pub columns: Option<Vec<String>>,
39
+ pub batch_size: Option<usize>,
40
+ pub strict: bool,
41
+ pub string_storage: StringStorageConfig,
42
+ pub logger: Option<Value>,
43
+ }
44
+
45
+ /// Enum to handle different writer outputs
46
+ pub enum WriterOutput {
47
+ File(parquet_core::Writer<File>),
48
+ TempFile(parquet_core::Writer<File>, NamedTempFile, Value), // Writer, temp file, IO object
49
+ }
50
+
51
/// Output representation selected for the parser: `Hash` or `Array`.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum ParserResultType {
    Hash,
    Array,
}

impl ParserResultType {
    /// Iterate over every variant, `Hash` first and then `Array`.
    pub fn iter() -> impl Iterator<Item = Self> {
        IntoIterator::into_iter([Self::Hash, Self::Array])
    }

    /// Lowercase textual name shared by parsing and `Display`.
    fn name(self) -> &'static str {
        match self {
            Self::Hash => "hash",
            Self::Array => "array",
        }
    }
}

/// Parsing via `str::parse` defers to the `TryFrom<&str>` conversion.
impl FromStr for ParserResultType {
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Self::try_from(s)
    }
}

/// Accepts exactly `"hash"` or `"array"` (case-sensitive); any other input
/// yields an error message that quotes the rejected text.
impl TryFrom<&str> for ParserResultType {
    type Error = String;

    fn try_from(value: &str) -> Result<Self, Self::Error> {
        Self::iter()
            .find(|candidate| candidate.name() == value)
            .ok_or_else(|| format!("Invalid parser result type: {}", value))
    }
}

/// Owned-string convenience wrapper around the `&str` conversion.
impl TryFrom<String> for ParserResultType {
    type Error = String;

    fn try_from(value: String) -> Result<Self, Self::Error> {
        Self::try_from(value.as_str())
    }
}

/// Prints the same lowercase name that parsing accepts.
impl std::fmt::Display for ParserResultType {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.name())
    }
}
@@ -0,0 +1,280 @@
1
+ use magnus::value::ReprValue;
2
+ use magnus::{
3
+ scan_args::{get_kwargs, scan_args},
4
+ Error as MagnusError, KwArgs, Ruby, TryConvert, Value,
5
+ };
6
+ use parquet::basic::Compression;
7
+ use parquet_core::{MAX_BATCH_SIZE, MAX_SAMPLE_SIZE};
8
+
9
+ use crate::string_cache::{DEFAULT_STRING_CACHE_CAPACITY, STRING_CACHE_CAPACITY_MAX};
10
+ use crate::string_storage::{
11
+ StringStorageConfig, StringStorageMode, DEFAULT_SHARED_MAX_ENTRIES,
12
+ DEFAULT_SHARED_MAX_VALUE_BYTES,
13
+ };
14
+ use crate::types::{ColumnEnumeratorArgs, ParquetWriteArgs, RowEnumeratorArgs};
15
+
16
+ /// Reconstruct the `string_storage:` kwarg value for an enumerator so a
17
+ /// block-less call round-trips losslessly: a plain symbol for the mode, or a
18
+ /// hash when a `:shared` budget differs from the default. Returns `None` for the
19
+ /// default (`:copy`) config so the kwarg is simply omitted.
20
+ fn string_storage_kwarg(
21
+ ruby: &Ruby,
22
+ config: StringStorageConfig,
23
+ ) -> Result<Option<Value>, MagnusError> {
24
+ if config == StringStorageConfig::default() {
25
+ return Ok(None);
26
+ }
27
+ let default_budget = config.shared_max_entries == DEFAULT_SHARED_MAX_ENTRIES
28
+ && config.shared_max_value_bytes == DEFAULT_SHARED_MAX_VALUE_BYTES;
29
+ if config.mode == StringStorageMode::Shared && !default_budget {
30
+ let hash = ruby.hash_new();
31
+ hash.aset(
32
+ ruby.to_symbol("mode"),
33
+ ruby.to_symbol(config.mode.to_string()),
34
+ )?;
35
+ hash.aset(ruby.to_symbol("max_entries"), config.shared_max_entries)?;
36
+ hash.aset(
37
+ ruby.to_symbol("max_value_bytes"),
38
+ config.shared_max_value_bytes,
39
+ )?;
40
+ Ok(Some(hash.as_value()))
41
+ } else {
42
+ Ok(Some(ruby.to_symbol(config.mode.to_string()).as_value()))
43
+ }
44
+ }
45
+
46
+ /// Parse compression type from string
47
+ pub fn parse_compression(
48
+ ruby: &Ruby,
49
+ compression: Option<String>,
50
+ ) -> Result<Compression, MagnusError> {
51
+ match compression.map(|s| s.to_lowercase()).as_deref() {
52
+ Some("none") | Some("uncompressed") => Ok(Compression::UNCOMPRESSED),
53
+ Some("snappy") => Ok(Compression::SNAPPY),
54
+ Some("gzip") => Ok(Compression::GZIP(parquet::basic::GzipLevel::default())),
55
+ Some("lz4") => Ok(Compression::LZ4),
56
+ Some("zstd") => Ok(Compression::ZSTD(parquet::basic::ZstdLevel::default())),
57
+ Some("brotli") => Ok(Compression::BROTLI(parquet::basic::BrotliLevel::default())),
58
+ None => Ok(Compression::SNAPPY), // Default to SNAPPY
59
+ Some(other) => Err(MagnusError::new(
60
+ ruby.exception_arg_error(),
61
+ format!("Invalid compression option: '{}'. Valid options are: none, snappy, gzip, lz4, zstd, brotli", other),
62
+ )),
63
+ }
64
+ }
65
+
66
+ /// Parse arguments for Parquet writing
67
+ pub fn parse_parquet_write_args(
68
+ ruby: &Ruby,
69
+ args: &[Value],
70
+ ) -> Result<ParquetWriteArgs, MagnusError> {
71
+ let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
72
+ let (read_from,) = parsed_args.required;
73
+
74
+ let kwargs = get_kwargs::<
75
+ _,
76
+ (Value, Value),
77
+ (
78
+ Option<Option<usize>>,
79
+ Option<Option<usize>>,
80
+ Option<Option<String>>,
81
+ Option<Option<usize>>,
82
+ Option<Option<Value>>,
83
+ Option<Option<Value>>,
84
+ ),
85
+ (),
86
+ >(
87
+ parsed_args.keywords,
88
+ &["schema", "write_to"],
89
+ &[
90
+ "batch_size",
91
+ "flush_threshold",
92
+ "compression",
93
+ "sample_size",
94
+ "logger",
95
+ "string_cache",
96
+ ],
97
+ )?;
98
+
99
+ Ok(ParquetWriteArgs {
100
+ read_from,
101
+ write_to: kwargs.required.1,
102
+ schema_value: kwargs.required.0,
103
+ batch_size: parse_positive_bounded_usize(
104
+ ruby,
105
+ "batch_size",
106
+ kwargs.optional.0.flatten(),
107
+ MAX_BATCH_SIZE,
108
+ )?,
109
+ // A zero threshold would flush a row group per row; reject it like the
110
+ // other sizing options rather than producing a pathological file.
111
+ flush_threshold: parse_positive_bounded_usize(
112
+ ruby,
113
+ "flush_threshold",
114
+ kwargs.optional.1.flatten(),
115
+ usize::MAX,
116
+ )?,
117
+ compression: kwargs.optional.2.flatten(),
118
+ sample_size: parse_positive_bounded_usize(
119
+ ruby,
120
+ "sample_size",
121
+ kwargs.optional.3.flatten(),
122
+ MAX_SAMPLE_SIZE,
123
+ )?,
124
+ logger: kwargs.optional.4.flatten(),
125
+ string_cache: parse_string_cache(ruby, kwargs.optional.5.flatten())?,
126
+ })
127
+ }
128
+
129
+ fn parse_positive_bounded_usize(
130
+ ruby: &Ruby,
131
+ name: &str,
132
+ value: Option<usize>,
133
+ max: usize,
134
+ ) -> Result<Option<usize>, MagnusError> {
135
+ let Some(value) = value else {
136
+ return Ok(None);
137
+ };
138
+ if value == 0 {
139
+ return Err(MagnusError::new(
140
+ ruby.exception_arg_error(),
141
+ format!("{name} must be positive"),
142
+ ));
143
+ }
144
+ if value > max {
145
+ return Err(MagnusError::new(
146
+ ruby.exception_arg_error(),
147
+ format!("{name} must be at most {max}"),
148
+ ));
149
+ }
150
+ Ok(Some(value))
151
+ }
152
+
153
+ /// Parse the `string_cache:` write option. `false`/`nil`/absent disables it,
154
+ /// `true` enables it with the default capacity, and a positive Integer enables
155
+ /// it with that capacity. Returns the requested capacity, or `None` when
156
+ /// disabled.
157
+ pub fn parse_string_cache(ruby: &Ruby, value: Option<Value>) -> Result<Option<usize>, MagnusError> {
158
+ let Some(value) = value else {
159
+ return Ok(None);
160
+ };
161
+ // Strict: only true/false/nil and Integer are accepted (no Ruby truthiness
162
+ // coercion, so a stray String is a clear error rather than "enabled").
163
+ if value.is_nil() || value.eql(ruby.qfalse())? {
164
+ return Ok(None);
165
+ }
166
+ if value.eql(ruby.qtrue())? {
167
+ return Ok(Some(DEFAULT_STRING_CACHE_CAPACITY));
168
+ }
169
+ if value.is_kind_of(ruby.class_integer()) {
170
+ let capacity: usize = TryConvert::try_convert(value)?;
171
+ if capacity == 0 {
172
+ return Err(MagnusError::new(
173
+ ruby.exception_arg_error(),
174
+ "string_cache capacity must be positive",
175
+ ));
176
+ }
177
+ if capacity > STRING_CACHE_CAPACITY_MAX {
178
+ return Err(MagnusError::new(
179
+ ruby.exception_arg_error(),
180
+ format!(
181
+ "string_cache capacity must be at most {}",
182
+ STRING_CACHE_CAPACITY_MAX
183
+ ),
184
+ ));
185
+ }
186
+ return Ok(Some(capacity));
187
+ }
188
+ Err(MagnusError::new(
189
+ ruby.exception_type_error(),
190
+ "string_cache must be true, false, or a positive Integer",
191
+ ))
192
+ }
193
+
194
+ /// Convert a Ruby Value to a String, handling both String and Symbol types
195
+ pub fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, MagnusError> {
196
+ if value.is_nil() {
197
+ Ok(None)
198
+ } else if value.is_kind_of(ruby.class_string()) || value.is_kind_of(ruby.class_symbol()) {
199
+ let stringed = value.to_r_string()?.to_string()?;
200
+ Ok(Some(stringed))
201
+ } else {
202
+ Err(MagnusError::new(
203
+ ruby.exception_type_error(),
204
+ "Value must be a String or Symbol",
205
+ ))
206
+ }
207
+ }
208
+
209
+ /// Handle block or enumerator creation
210
+ pub fn handle_block_or_enum<F, T>(
211
+ block_given: bool,
212
+ create_enum: F,
213
+ ) -> Result<Option<T>, MagnusError>
214
+ where
215
+ F: FnOnce() -> Result<T, MagnusError>,
216
+ {
217
+ if !block_given {
218
+ let enum_value = create_enum()?;
219
+ return Ok(Some(enum_value));
220
+ }
221
+ Ok(None)
222
+ }
223
+
224
+ /// Create a row enumerator
225
+ pub fn create_row_enumerator(
226
+ ruby: &Ruby,
227
+ args: RowEnumeratorArgs,
228
+ ) -> Result<magnus::Enumerator, MagnusError> {
229
+ let kwargs = ruby.hash_new();
230
+ kwargs.aset(
231
+ ruby.to_symbol("result_type"),
232
+ ruby.to_symbol(args.result_type.to_string()),
233
+ )?;
234
+ if let Some(columns) = args.columns {
235
+ kwargs.aset(ruby.to_symbol("columns"), ruby.ary_from_vec(columns))?;
236
+ }
237
+ if args.strict {
238
+ kwargs.aset(ruby.to_symbol("strict"), true)?;
239
+ }
240
+ if let Some(value) = string_storage_kwarg(ruby, args.string_storage)? {
241
+ kwargs.aset(ruby.to_symbol("string_storage"), value)?;
242
+ }
243
+ if let Some(logger) = args.logger {
244
+ kwargs.aset(ruby.to_symbol("logger"), logger)?;
245
+ }
246
+ Ok(args
247
+ .rb_self
248
+ .enumeratorize("each_row", (args.to_read, KwArgs(kwargs))))
249
+ }
250
+
251
+ /// Create a column enumerator
252
+ #[inline]
253
+ pub fn create_column_enumerator(
254
+ ruby: &Ruby,
255
+ args: ColumnEnumeratorArgs,
256
+ ) -> Result<magnus::Enumerator, MagnusError> {
257
+ let kwargs = ruby.hash_new();
258
+ kwargs.aset(
259
+ ruby.to_symbol("result_type"),
260
+ ruby.to_symbol(args.result_type.to_string()),
261
+ )?;
262
+ if let Some(columns) = args.columns {
263
+ kwargs.aset(ruby.to_symbol("columns"), ruby.ary_from_vec(columns))?;
264
+ }
265
+ if let Some(batch_size) = args.batch_size {
266
+ kwargs.aset(ruby.to_symbol("batch_size"), batch_size)?;
267
+ }
268
+ if args.strict {
269
+ kwargs.aset(ruby.to_symbol("strict"), true)?;
270
+ }
271
+ if let Some(value) = string_storage_kwarg(ruby, args.string_storage)? {
272
+ kwargs.aset(ruby.to_symbol("string_storage"), value)?;
273
+ }
274
+ if let Some(logger) = args.logger {
275
+ kwargs.aset(ruby.to_symbol("logger"), logger)?;
276
+ }
277
+ Ok(args
278
+ .rb_self
279
+ .enumeratorize("each_column", (args.to_read, KwArgs(kwargs))))
280
+ }