parquet-tyfoom 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/Cargo.lock +1854 -0
  3. data/Cargo.toml +3 -0
  4. data/Gemfile +21 -0
  5. data/LICENSE +21 -0
  6. data/README.md +428 -0
  7. data/Rakefile +43 -0
  8. data/ext/parquet/Cargo.toml +39 -0
  9. data/ext/parquet/build.rs +5 -0
  10. data/ext/parquet/extconf.rb +4 -0
  11. data/ext/parquet/src/adapter_ffi.rs +297 -0
  12. data/ext/parquet/src/allocator.rs +13 -0
  13. data/ext/parquet/src/lib.rs +24 -0
  14. data/ext/parquet-core/Cargo.toml +24 -0
  15. data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
  16. data/ext/parquet-core/src/error.rs +189 -0
  17. data/ext/parquet-core/src/lib.rs +60 -0
  18. data/ext/parquet-core/src/reader.rs +368 -0
  19. data/ext/parquet-core/src/schema.rs +452 -0
  20. data/ext/parquet-core/src/test_utils.rs +308 -0
  21. data/ext/parquet-core/src/traits/mod.rs +5 -0
  22. data/ext/parquet-core/src/traits/schema.rs +190 -0
  23. data/ext/parquet-core/src/value.rs +220 -0
  24. data/ext/parquet-core/src/writer.rs +1241 -0
  25. data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
  26. data/ext/parquet-core/tests/binary_data.rs +437 -0
  27. data/ext/parquet-core/tests/column_projection.rs +557 -0
  28. data/ext/parquet-core/tests/complex_types.rs +821 -0
  29. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  30. data/ext/parquet-core/tests/concurrent_access.rs +431 -0
  31. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  32. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  33. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
  34. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  35. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  36. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  37. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  38. data/ext/parquet-core/tests/review_regressions.rs +787 -0
  39. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  40. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
  41. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  42. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  43. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  44. data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
  45. data/ext/parquet-ruby-adapter/build.rs +5 -0
  46. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  47. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  48. data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
  49. data/ext/parquet-ruby-adapter/src/error.rs +141 -0
  50. data/ext/parquet-ruby-adapter/src/io.rs +432 -0
  51. data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
  52. data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
  53. data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
  54. data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
  55. data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
  56. data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
  57. data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
  58. data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
  59. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  60. data/ext/parquet-ruby-adapter/src/types.rs +98 -0
  61. data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
  62. data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
  63. data/lib/parquet/schema.rb +262 -0
  64. data/lib/parquet/version.rb +3 -0
  65. data/lib/parquet.rb +11 -0
  66. data/lib/parquet.rbi +181 -0
  67. metadata +165 -0
@@ -0,0 +1,339 @@
1
+ use magnus::value::{BoxValue, ReprValue};
2
+ use magnus::{Error as MagnusError, IntoValue, RArray, RHash, Ruby, TryConvert, Value};
3
+ use parquet_core::reader::Reader;
4
+
5
+ use crate::{
6
+ converter::parquet_to_ruby,
7
+ io::{RubyIOReader, ThreadSafeRubyIOReader},
8
+ logger::RubyLogger,
9
+ string_storage::{StringStorage, StringStorageConfig},
10
+ types::{ColumnEnumeratorArgs, ParserResultType, RowEnumeratorArgs},
11
+ utils::{create_column_enumerator, create_row_enumerator, handle_block_or_enum},
12
+ CloneableChunkReader,
13
+ };
14
+ use std::collections::HashSet;
15
+
16
+ /// Read parquet file row by row
17
+ pub fn each_row(
18
+ ruby: &Ruby,
19
+ rb_self: Value,
20
+ to_read: Value,
21
+ result_type: ParserResultType,
22
+ columns: Option<Vec<String>>,
23
+ strict: bool,
24
+ string_storage: StringStorageConfig,
25
+ logger: RubyLogger,
26
+ ) -> Result<Value, MagnusError> {
27
+ if let Some(enum_value) = handle_block_or_enum(ruby.block_given(), || {
28
+ create_row_enumerator(
29
+ ruby,
30
+ RowEnumeratorArgs {
31
+ rb_self,
32
+ to_read,
33
+ result_type,
34
+ columns: columns.clone(),
35
+ strict,
36
+ string_storage,
37
+ logger: logger.inner(),
38
+ },
39
+ )
40
+ .map(|yield_enum| yield_enum.into_value_with(ruby))
41
+ })? {
42
+ return Ok(enum_value);
43
+ }
44
+
45
+ // Log start of processing
46
+ let _ = logger.info(|| "Starting to read parquet file".to_string());
47
+
48
+ // Create a streaming reader based on input type
49
+ let chunk_reader = if to_read.is_kind_of(ruby.class_string()) {
50
+ let path_str: String = TryConvert::try_convert(to_read)?;
51
+ let _ = logger.debug(|| format!("Reading from file: {}", path_str));
52
+ CloneableChunkReader::from_path(&path_str)
53
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?
54
+ } else if to_read.respond_to("read", false)? {
55
+ // Handle IO objects with streaming
56
+ let _ = logger.debug(|| "Reading from IO object".to_string());
57
+ let ruby_reader = RubyIOReader::new(to_read)
58
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
59
+ let thread_safe_reader = ThreadSafeRubyIOReader::new(ruby_reader);
60
+
61
+ CloneableChunkReader::from_ruby_io(thread_safe_reader)
62
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?
63
+ } else {
64
+ return Err(MagnusError::new(
65
+ ruby.exception_runtime_error(),
66
+ format!(
67
+ "Invalid input type: expected String or IO object with read method, got {}",
68
+ to_read.class()
69
+ ),
70
+ ));
71
+ };
72
+
73
+ let reader = Reader::new(chunk_reader.clone());
74
+ let mut reader_for_metadata = Reader::new(chunk_reader);
75
+
76
+ // Get metadata to extract column names
77
+ let metadata = reader_for_metadata
78
+ .metadata()
79
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
80
+ let schema = metadata.schema();
81
+ let all_column_names: Vec<String> = schema
82
+ .get_fields()
83
+ .iter()
84
+ .map(|f| f.name().to_string())
85
+ .collect();
86
+
87
+ let _ = logger.info(|| format!("Processing {} columns", all_column_names.len()));
88
+
89
+ // Get the row iterator. Projected rows are yielded in file-schema order, not
90
+ // request order, so the hash keys must follow file order too — derive them by
91
+ // filtering the file columns, never from the request-ordered `cols`.
92
+ let (row_iter, column_names) = if let Some(ref cols) = columns {
93
+ let requested = cols.iter().map(String::as_str).collect::<HashSet<_>>();
94
+ let projected_names = all_column_names
95
+ .iter()
96
+ .filter(|name| requested.contains(name.as_str()))
97
+ .cloned()
98
+ .collect::<Vec<_>>();
99
+ let iter = reader
100
+ .read_rows_with_projection(cols)
101
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
102
+ (iter, projected_names)
103
+ } else {
104
+ let iter = reader
105
+ .read_rows()
106
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
107
+ (iter, all_column_names)
108
+ };
109
+
110
+ let mut row_count = 0u64;
111
+ let mut string_storage = StringStorage::new(string_storage);
112
+
113
+ // BoxValue registers each interned key with the GC so it survives a
114
+ // GC.compact triggered by user code inside the yield loop; a bare RString
115
+ // held in this Vec would be relocated and dangle.
116
+ let interned_column_names = column_names
117
+ .iter()
118
+ .map(|name| BoxValue::new(ruby.str_new(name).to_interned_str()))
119
+ .collect::<Vec<_>>();
120
+
121
+ for row_result in row_iter {
122
+ let row = row_result
123
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
124
+
125
+ // Convert row to Ruby value based on result_type
126
+ let ruby_row = match result_type {
127
+ ParserResultType::Array => {
128
+ let array: RArray = ruby.ary_new_capa(row.len());
129
+ for value in row {
130
+ let ruby_value = parquet_to_ruby(value, &mut string_storage).map_err(|e| {
131
+ MagnusError::new(ruby.exception_runtime_error(), e.to_string())
132
+ })?;
133
+ array.push(ruby_value)?;
134
+ }
135
+ array.as_value()
136
+ }
137
+ ParserResultType::Hash => {
138
+ let hash: RHash = ruby.hash_new_capa(row.len());
139
+ for (idx, value) in row.into_iter().enumerate() {
140
+ if idx < interned_column_names.len() {
141
+ let ruby_value =
142
+ parquet_to_ruby(value, &mut string_storage).map_err(|e| {
143
+ MagnusError::new(ruby.exception_runtime_error(), e.to_string())
144
+ })?;
145
+ hash.aset(interned_column_names[idx].as_value(), ruby_value)?;
146
+ }
147
+ }
148
+ hash.as_value()
149
+ }
150
+ };
151
+
152
+ let _: Value = ruby.yield_value(ruby_row)?;
153
+
154
+ row_count += 1;
155
+ if row_count % 1000 == 0 {
156
+ let _ = logger.debug(|| format!("Processed {} rows", row_count));
157
+ }
158
+ }
159
+
160
+ let _ = logger.info(|| format!("Finished processing {} rows", row_count));
161
+
162
+ Ok(ruby.qnil().as_value())
163
+ }
164
+
165
+ /// Arguments for each_column function
166
+ struct EachColumnArgs {
167
+ rb_self: Value,
168
+ to_read: Value,
169
+ result_type: ParserResultType,
170
+ columns: Option<Vec<String>>,
171
+ batch_size: Option<usize>,
172
+ strict: bool,
173
+ string_storage: StringStorageConfig,
174
+ logger: RubyLogger,
175
+ }
176
+
177
+ /// Read parquet file column by column
178
+ #[allow(clippy::too_many_arguments)]
179
+ pub fn each_column(
180
+ ruby: &Ruby,
181
+ rb_self: Value,
182
+ to_read: Value,
183
+ result_type: ParserResultType,
184
+ columns: Option<Vec<String>>,
185
+ batch_size: Option<usize>,
186
+ strict: bool,
187
+ string_storage: StringStorageConfig,
188
+ logger: RubyLogger,
189
+ ) -> Result<Value, MagnusError> {
190
+ let args = EachColumnArgs {
191
+ rb_self,
192
+ to_read,
193
+ result_type,
194
+ columns,
195
+ batch_size,
196
+ strict,
197
+ string_storage,
198
+ logger,
199
+ };
200
+ each_column_impl(ruby, args)
201
+ }
202
+
203
+ fn each_column_impl(ruby: &Ruby, args: EachColumnArgs) -> Result<Value, MagnusError> {
204
+ if let Some(enum_value) = handle_block_or_enum(ruby.block_given(), || {
205
+ create_column_enumerator(
206
+ ruby,
207
+ ColumnEnumeratorArgs {
208
+ rb_self: args.rb_self,
209
+ to_read: args.to_read,
210
+ result_type: args.result_type,
211
+ columns: args.columns.clone(),
212
+ batch_size: args.batch_size,
213
+ strict: args.strict,
214
+ string_storage: args.string_storage,
215
+ logger: args.logger.inner(),
216
+ },
217
+ )
218
+ .map(|yield_enum| yield_enum.into_value_with(ruby))
219
+ })? {
220
+ return Ok(enum_value);
221
+ }
222
+
223
+ // Log start of processing
224
+ let _ = args
225
+ .logger
226
+ .info(|| "Starting to read parquet file columns".to_string());
227
+
228
+ // Create a streaming reader based on input type
229
+ let chunk_reader = if args.to_read.is_kind_of(ruby.class_string()) {
230
+ let path_str: String = TryConvert::try_convert(args.to_read)?;
231
+ let _ = args
232
+ .logger
233
+ .debug(|| format!("Reading columns from file: {}", path_str));
234
+ CloneableChunkReader::from_path(&path_str)
235
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?
236
+ } else if args.to_read.respond_to("read", false)? {
237
+ // Handle IO objects with streaming
238
+ let _ = args
239
+ .logger
240
+ .debug(|| "Reading columns from IO object".to_string());
241
+ let ruby_reader = RubyIOReader::new(args.to_read)
242
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
243
+ let thread_safe_reader = ThreadSafeRubyIOReader::new(ruby_reader);
244
+
245
+ CloneableChunkReader::from_ruby_io(thread_safe_reader)
246
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?
247
+ } else {
248
+ return Err(MagnusError::new(
249
+ ruby.exception_runtime_error(),
250
+ format!(
251
+ "Invalid input type: expected String or IO object with read method, got {}",
252
+ args.to_read.class()
253
+ ),
254
+ ));
255
+ };
256
+
257
+ let reader = Reader::new(chunk_reader.clone());
258
+ let mut reader_for_metadata = Reader::new(chunk_reader);
259
+
260
+ // Get metadata to extract column names
261
+ let metadata = reader_for_metadata
262
+ .metadata()
263
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
264
+ let schema = metadata.schema();
265
+ let all_column_names: Vec<String> = schema
266
+ .get_fields()
267
+ .iter()
268
+ .map(|f| f.name().to_string())
269
+ .collect();
270
+
271
+ // Get the column iterator
272
+ let (col_iter, _column_names) = if let Some(ref cols) = args.columns {
273
+ let iter = reader
274
+ .read_columns_with_projection(cols, args.batch_size)
275
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
276
+ (iter, cols.clone())
277
+ } else {
278
+ let iter = reader
279
+ .read_columns(args.batch_size)
280
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
281
+ (iter, all_column_names)
282
+ };
283
+
284
+ let mut batch_count = 0u64;
285
+ let mut string_storage = StringStorage::new(args.string_storage);
286
+
287
+ for batch_result in col_iter {
288
+ let batch = batch_result
289
+ .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
290
+
291
+ // Convert batch to Ruby value based on result_type
292
+ let ruby_batch = match args.result_type {
293
+ ParserResultType::Array => {
294
+ let array: RArray = ruby.ary_new_capa(batch.columns.len());
295
+ for (_name, values) in batch.columns {
296
+ let col_array: RArray = ruby.ary_new_capa(values.len());
297
+ for value in values {
298
+ let ruby_value =
299
+ parquet_to_ruby(value, &mut string_storage).map_err(|e| {
300
+ MagnusError::new(ruby.exception_runtime_error(), e.to_string())
301
+ })?;
302
+ col_array.push(ruby_value)?;
303
+ }
304
+ array.push(col_array)?;
305
+ }
306
+ array.as_value()
307
+ }
308
+ ParserResultType::Hash => {
309
+ let hash: RHash = ruby.hash_new();
310
+ for (name, values) in batch.columns {
311
+ let col_array: RArray = ruby.ary_new_capa(values.len());
312
+ for value in values {
313
+ let ruby_value =
314
+ parquet_to_ruby(value, &mut string_storage).map_err(|e| {
315
+ MagnusError::new(ruby.exception_runtime_error(), e.to_string())
316
+ })?;
317
+ col_array.push(ruby_value)?;
318
+ }
319
+ let ruby_key = string_storage.ruby_key(ruby, &name);
320
+ hash.aset(ruby_key, col_array)?;
321
+ }
322
+ hash.as_value()
323
+ }
324
+ };
325
+
326
+ let _: Value = ruby.yield_value(ruby_batch)?;
327
+
328
+ batch_count += 1;
329
+ let _ = args
330
+ .logger
331
+ .debug(|| format!("Processed batch {}", batch_count));
332
+ }
333
+
334
+ let _ = args
335
+ .logger
336
+ .info(|| format!("Finished processing {} batches", batch_count));
337
+
338
+ Ok(ruby.qnil().as_value())
339
+ }