parquet-tyfoom 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/Cargo.lock +1854 -0
  3. data/Cargo.toml +3 -0
  4. data/Gemfile +21 -0
  5. data/LICENSE +21 -0
  6. data/README.md +428 -0
  7. data/Rakefile +43 -0
  8. data/ext/parquet/Cargo.toml +39 -0
  9. data/ext/parquet/build.rs +5 -0
  10. data/ext/parquet/extconf.rb +4 -0
  11. data/ext/parquet/src/adapter_ffi.rs +297 -0
  12. data/ext/parquet/src/allocator.rs +13 -0
  13. data/ext/parquet/src/lib.rs +24 -0
  14. data/ext/parquet-core/Cargo.toml +24 -0
  15. data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
  16. data/ext/parquet-core/src/error.rs +189 -0
  17. data/ext/parquet-core/src/lib.rs +60 -0
  18. data/ext/parquet-core/src/reader.rs +368 -0
  19. data/ext/parquet-core/src/schema.rs +452 -0
  20. data/ext/parquet-core/src/test_utils.rs +308 -0
  21. data/ext/parquet-core/src/traits/mod.rs +5 -0
  22. data/ext/parquet-core/src/traits/schema.rs +190 -0
  23. data/ext/parquet-core/src/value.rs +220 -0
  24. data/ext/parquet-core/src/writer.rs +1241 -0
  25. data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
  26. data/ext/parquet-core/tests/binary_data.rs +437 -0
  27. data/ext/parquet-core/tests/column_projection.rs +557 -0
  28. data/ext/parquet-core/tests/complex_types.rs +821 -0
  29. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  30. data/ext/parquet-core/tests/concurrent_access.rs +431 -0
  31. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  32. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  33. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
  34. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  35. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  36. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  37. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  38. data/ext/parquet-core/tests/review_regressions.rs +787 -0
  39. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  40. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
  41. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  42. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  43. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  44. data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
  45. data/ext/parquet-ruby-adapter/build.rs +5 -0
  46. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  47. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  48. data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
  49. data/ext/parquet-ruby-adapter/src/error.rs +141 -0
  50. data/ext/parquet-ruby-adapter/src/io.rs +432 -0
  51. data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
  52. data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
  53. data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
  54. data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
  55. data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
  56. data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
  57. data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
  58. data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
  59. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  60. data/ext/parquet-ruby-adapter/src/types.rs +98 -0
  61. data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
  62. data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
  63. data/lib/parquet/schema.rb +262 -0
  64. data/lib/parquet/version.rb +3 -0
  65. data/lib/parquet.rb +11 -0
  66. data/lib/parquet.rbi +181 -0
  67. metadata +165 -0
@@ -0,0 +1,297 @@
1
+ use magnus::r_hash::ForEach;
2
+ use magnus::scan_args::{get_kwargs, scan_args};
3
+ use magnus::value::ReprValue;
4
+ use magnus::{Error as MagnusError, RHash, Ruby, TryConvert, Value};
5
+ use parquet_ruby_adapter::utils::parse_string_or_symbol;
6
+ use parquet_ruby_adapter::{
7
+ logger::RubyLogger, types::ParserResultType, utils::parse_parquet_write_args,
8
+ StringStorageConfig, StringStorageMode, DEFAULT_SHARED_MAX_ENTRIES,
9
+ DEFAULT_SHARED_MAX_VALUE_BYTES,
10
+ };
11
+
12
+ fn arg_error(message: impl Into<String>) -> MagnusError {
13
+ // Only ever called while constructing an error to return to Ruby, i.e. on the
14
+ // Ruby thread with the GVL held, so a handle is always available.
15
+ let ruby = Ruby::get().expect("arg_error built while the Ruby GVL is held");
16
+ MagnusError::new(ruby.exception_arg_error(), message.into())
17
+ }
18
+
19
+ /// Parse the optional `string_storage:` keyword into a config. Accepts a symbol
20
+ /// or string naming the mode (`:copy`/`:intern`/`:shared`), or a hash
21
+ /// `{ mode:, max_entries:, max_value_bytes: }` to also set the `:shared` leak
22
+ /// budget. Defaults to the historical copy-per-value behavior when absent.
23
+ fn parse_string_storage(
24
+ ruby: &Ruby,
25
+ value: Option<Value>,
26
+ ) -> Result<StringStorageConfig, MagnusError> {
27
+ let Some(value) = value else {
28
+ return Ok(StringStorageConfig::default());
29
+ };
30
+ if value.is_kind_of(ruby.class_hash()) {
31
+ return parse_string_storage_hash(ruby, value);
32
+ }
33
+ let mode = parse_storage_mode(ruby, value)?;
34
+ Ok(StringStorageConfig::from_mode(mode))
35
+ }
36
+
37
+ fn parse_storage_mode(ruby: &Ruby, value: Value) -> Result<StringStorageMode, MagnusError> {
38
+ parse_string_or_symbol(ruby, value)?
39
+ .ok_or_else(|| arg_error("string_storage mode cannot be nil"))?
40
+ .parse()
41
+ .map_err(arg_error)
42
+ }
43
+
44
+ fn parse_string_storage_hash(
45
+ ruby: &Ruby,
46
+ value: Value,
47
+ ) -> Result<StringStorageConfig, MagnusError> {
48
+ let hash: RHash = TryConvert::try_convert(value)?;
49
+ reject_unknown_string_storage_keys(ruby, hash)?;
50
+ let mode = match hash.get(ruby.to_symbol("mode")) {
51
+ Some(mode_value) => parse_storage_mode(ruby, mode_value)?,
52
+ None => return Err(arg_error("string_storage hash requires a :mode")),
53
+ };
54
+ // The leak budget only applies to :shared. Reject it for other modes rather
55
+ // than silently ignoring it — that also keeps every parsed config in a state
56
+ // the symbol/hash round-trip can reproduce (only :shared carries a budget).
57
+ if mode != StringStorageMode::Shared
58
+ && (has_key(ruby, &hash, "max_entries") || has_key(ruby, &hash, "max_value_bytes"))
59
+ {
60
+ return Err(arg_error(
61
+ "string_storage :max_entries/:max_value_bytes are only valid with mode: :shared",
62
+ ));
63
+ }
64
+ Ok(StringStorageConfig {
65
+ mode,
66
+ shared_max_entries: positive_usize(ruby, &hash, "max_entries", DEFAULT_SHARED_MAX_ENTRIES)?,
67
+ shared_max_value_bytes: positive_usize(
68
+ ruby,
69
+ &hash,
70
+ "max_value_bytes",
71
+ DEFAULT_SHARED_MAX_VALUE_BYTES,
72
+ )?,
73
+ })
74
+ }
75
+
76
+ fn reject_unknown_string_storage_keys(ruby: &Ruby, hash: RHash) -> Result<(), MagnusError> {
77
+ hash.foreach(|key: Value, _value: Value| {
78
+ let key_name = parse_string_or_symbol(ruby, key)?
79
+ .ok_or_else(|| arg_error("string_storage option keys cannot be nil"))?;
80
+ match key_name.as_str() {
81
+ "mode" | "max_entries" | "max_value_bytes" => Ok(ForEach::Continue),
82
+ other => Err(arg_error(format!("unknown string_storage option :{other}"))),
83
+ }
84
+ })?;
85
+ Ok(())
86
+ }
87
+
88
+ fn has_key(ruby: &Ruby, hash: &RHash, key: &str) -> bool {
89
+ hash.get(ruby.to_symbol(key))
90
+ .is_some_and(|value| !value.is_nil())
91
+ }
92
+
93
+ /// Read a positive-integer value from `hash[:key]`, falling back to `default`
94
+ /// when the key is absent or nil.
95
+ fn positive_usize(
96
+ ruby: &Ruby,
97
+ hash: &RHash,
98
+ key: &str,
99
+ default: usize,
100
+ ) -> Result<usize, MagnusError> {
101
+ match hash.get(ruby.to_symbol(key)) {
102
+ Some(value) if !value.is_nil() => {
103
+ let parsed: usize = TryConvert::try_convert(value).map_err(|_| {
104
+ arg_error(format!(
105
+ "string_storage :{} must be a positive Integer",
106
+ key
107
+ ))
108
+ })?;
109
+ if parsed == 0 {
110
+ return Err(arg_error(format!(
111
+ "string_storage :{} must be positive",
112
+ key
113
+ )));
114
+ }
115
+ Ok(parsed)
116
+ }
117
+ _ => Ok(default),
118
+ }
119
+ }
120
+
121
+ pub fn each_row(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
122
+ let ruby = Ruby::get().expect("Ruby FFI entry point runs while the Ruby GVL is held");
123
+
124
+ // Parse arguments
125
+ let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
126
+ let (to_read,) = parsed_args.required;
127
+
128
+ // Parse keyword arguments
129
+ let kwargs = get_kwargs::<
130
+ _,
131
+ (),
132
+ (
133
+ Option<Option<Value>>, // result_type
134
+ Option<Option<Vec<String>>>, // columns
135
+ Option<Option<bool>>, // strict
136
+ Option<Option<Value>>, // string_storage
137
+ Option<Option<Value>>, // logger
138
+ ),
139
+ (),
140
+ >(
141
+ parsed_args.keywords,
142
+ &[],
143
+ &[
144
+ "result_type",
145
+ "columns",
146
+ "strict",
147
+ "string_storage",
148
+ "logger",
149
+ ],
150
+ )?;
151
+
152
+ let result_type: ParserResultType = if let Some(rt_value) = kwargs.optional.0.flatten() {
153
+ parse_string_or_symbol(&ruby, rt_value)?
154
+ .ok_or_else(|| {
155
+ MagnusError::new(ruby.exception_arg_error(), "result_type cannot be nil")
156
+ })?
157
+ .parse()
158
+ .map_err(|_| {
159
+ MagnusError::new(ruby.exception_arg_error(), "Invalid result_type value")
160
+ })?
161
+ } else {
162
+ ParserResultType::Hash
163
+ };
164
+ let columns = kwargs.optional.1.flatten();
165
+ let strict = kwargs.optional.2.flatten().unwrap_or(true);
166
+ let string_storage = parse_string_storage(&ruby, kwargs.optional.3.flatten())?;
167
+ let logger = RubyLogger::new(kwargs.optional.4.flatten())?;
168
+
169
+ // Delegate to parquet_ruby_adapter
170
+ parquet_ruby_adapter::reader::each_row(
171
+ &ruby,
172
+ rb_self,
173
+ to_read,
174
+ result_type,
175
+ columns,
176
+ strict,
177
+ string_storage,
178
+ logger,
179
+ )
180
+ }
181
+
182
+ pub fn each_column(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
183
+ let ruby = Ruby::get().expect("Ruby FFI entry point runs while the Ruby GVL is held");
184
+
185
+ // Parse arguments
186
+ let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
187
+ let (to_read,) = parsed_args.required;
188
+
189
+ // Parse keyword arguments
190
+ let kwargs = get_kwargs::<
191
+ _,
192
+ (),
193
+ (
194
+ Option<Option<Value>>, // result_type
195
+ Option<Option<Vec<String>>>, // columns
196
+ Option<Option<usize>>, // batch_size
197
+ Option<Option<bool>>, // strict
198
+ Option<Option<Value>>, // string_storage
199
+ Option<Option<Value>>, // logger
200
+ ),
201
+ (),
202
+ >(
203
+ parsed_args.keywords,
204
+ &[],
205
+ &[
206
+ "result_type",
207
+ "columns",
208
+ "batch_size",
209
+ "strict",
210
+ "string_storage",
211
+ "logger",
212
+ ],
213
+ )?;
214
+
215
+ let result_type: ParserResultType = if let Some(rt_value) = kwargs.optional.0.flatten() {
216
+ parse_string_or_symbol(&ruby, rt_value)?
217
+ .ok_or_else(|| {
218
+ MagnusError::new(ruby.exception_arg_error(), "result_type cannot be nil")
219
+ })?
220
+ .parse()
221
+ .map_err(|_| {
222
+ MagnusError::new(ruby.exception_arg_error(), "Invalid result_type value")
223
+ })?
224
+ } else {
225
+ ParserResultType::Hash
226
+ };
227
+ let columns = kwargs.optional.1.flatten();
228
+ let batch_size = if let Some(bs) = kwargs.optional.2.flatten() {
229
+ if bs == 0 {
230
+ return Err(MagnusError::new(
231
+ ruby.exception_arg_error(),
232
+ "batch_size must be greater than 0",
233
+ ));
234
+ }
235
+ Some(bs)
236
+ } else {
237
+ None
238
+ };
239
+ let strict = kwargs.optional.3.flatten().unwrap_or(true);
240
+ let string_storage = parse_string_storage(&ruby, kwargs.optional.4.flatten())?;
241
+ let logger = RubyLogger::new(kwargs.optional.5.flatten())?;
242
+
243
+ // Delegate to parquet_ruby_adapter
244
+ parquet_ruby_adapter::reader::each_column(
245
+ &ruby,
246
+ rb_self,
247
+ to_read,
248
+ result_type,
249
+ columns,
250
+ batch_size,
251
+ strict,
252
+ string_storage,
253
+ logger,
254
+ )
255
+ }
256
+
257
+ pub fn write_rows(args: &[Value]) -> Result<Value, MagnusError> {
258
+ let ruby = Ruby::get().expect("Ruby FFI entry point runs while the Ruby GVL is held");
259
+
260
+ // Parse arguments using the new parser
261
+ let write_args = parse_parquet_write_args(&ruby, args)?;
262
+
263
+ // Delegate to parquet_ruby_adapter
264
+ parquet_ruby_adapter::writer::write_rows(&ruby, write_args)
265
+ }
266
+
267
+ pub fn write_columns(args: &[Value]) -> Result<Value, MagnusError> {
268
+ let ruby = Ruby::get().expect("Ruby FFI entry point runs while the Ruby GVL is held");
269
+
270
+ // Parse arguments using the new parser
271
+ let write_args = parse_parquet_write_args(&ruby, args)?;
272
+ reject_row_only_column_write_options(&write_args)?;
273
+
274
+ // Delegate to parquet_ruby_adapter
275
+ parquet_ruby_adapter::writer::write_columns(&ruby, write_args)
276
+ }
277
+
278
+ fn reject_row_only_column_write_options(
279
+ write_args: &parquet_ruby_adapter::types::ParquetWriteArgs,
280
+ ) -> Result<(), MagnusError> {
281
+ if write_args.batch_size.is_some() {
282
+ return Err(arg_error(
283
+ "write_columns does not accept batch_size; split input into column batches instead",
284
+ ));
285
+ }
286
+ if write_args.sample_size.is_some() {
287
+ return Err(arg_error(
288
+ "write_columns does not accept sample_size; sample_size only applies to write_rows",
289
+ ));
290
+ }
291
+ if write_args.string_cache.is_some() {
292
+ return Err(arg_error(
293
+ "write_columns does not accept string_cache; string_cache only applies to write_rows",
294
+ ));
295
+ }
296
+ Ok(())
297
+ }
@@ -0,0 +1,13 @@
1
+ #[cfg(target_os = "linux")]
2
+ use jemallocator::Jemalloc;
3
+
4
+ #[cfg(not(any(target_os = "linux", target_os = "windows")))]
5
+ use mimalloc::MiMalloc;
6
+
7
+ #[global_allocator]
8
+ #[cfg(target_os = "linux")]
9
+ static ALLOC: Jemalloc = Jemalloc;
10
+
11
+ #[global_allocator]
12
+ #[cfg(not(any(target_os = "linux", target_os = "windows")))]
13
+ static ALLOC: MiMalloc = MiMalloc;
@@ -0,0 +1,24 @@
1
+ mod adapter_ffi;
2
+ mod allocator;
3
+
4
+ use magnus::{function, method, Error, Ruby};
5
+
6
+ use crate::adapter_ffi::{each_column, each_row, write_columns, write_rows};
7
+ use parquet_ruby_adapter::metadata::parse_metadata;
8
+
9
+ /// Initializes the Ruby extension and defines methods.
10
+ #[magnus::init]
11
+ fn init(ruby: &Ruby) -> Result<(), Error> {
12
+ ruby.require("time")?;
13
+ ruby.require("bigdecimal")?;
14
+
15
+ let module = ruby.define_module("Parquet")?;
16
+
17
+ module.define_module_function("metadata", function!(parse_metadata, 1))?;
18
+ module.define_module_function("each_row", method!(each_row, -1))?;
19
+ module.define_module_function("each_column", method!(each_column, -1))?;
20
+ module.define_module_function("write_rows", function!(write_rows, -1))?;
21
+ module.define_module_function("write_columns", function!(write_columns, -1))?;
22
+
23
+ Ok(())
24
+ }
@@ -0,0 +1,24 @@
1
+ [package]
2
+ name = "parquet-core"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+
6
+ [dependencies]
7
+ arrow = "58.3.0"
8
+ arrow-array = "58.3.0"
9
+ arrow-buffer = "58.3.0"
10
+ arrow-schema = { version = "58.3.0", features = ["canonical_extension_types"] }
11
+ bytes = "1.5"
12
+ indexmap = "2.2"
13
+ jiff = "0.2"
14
+ num = "0.4.3"
15
+ ordered-float = "5.0.0"
16
+ parquet = { version = "58.3.0", features = ["arrow", "zstd", "lz4", "snap"] }
17
+ rand = "0.9.1"
18
+ serde = { version = "1.0", features = ["derive"] }
19
+ thiserror = "2.0"
20
+ triomphe = "0.1.15"
21
+ uuid = { version = "1.0", features = ["v4"] }
22
+
23
+ [dev-dependencies]
24
+ tempfile = "3.8"