parquet 0.5.12 → 0.6.0

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +8 -5
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -603
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
data/ext/parquet-ruby-adapter/src/reader.rs (new file)
@@ -0,0 +1,317 @@
+use magnus::value::ReprValue;
+use magnus::{Error as MagnusError, IntoValue, RArray, RHash, Ruby, TryConvert, Value};
+use parquet_core::reader::Reader;
+
+use crate::{
+    converter::parquet_to_ruby,
+    io::{RubyIOReader, ThreadSafeRubyIOReader},
+    logger::RubyLogger,
+    types::{ColumnEnumeratorArgs, ParserResultType, RowEnumeratorArgs},
+    utils::{create_column_enumerator, create_row_enumerator, handle_block_or_enum},
+    CloneableChunkReader,
+};
+
+/// Read parquet file row by row
+pub fn each_row(
+    ruby: &Ruby,
+    rb_self: Value,
+    to_read: Value,
+    result_type: ParserResultType,
+    columns: Option<Vec<String>>,
+    strict: bool,
+    logger: RubyLogger,
+) -> Result<Value, MagnusError> {
+    if let Some(enum_value) = handle_block_or_enum(ruby.block_given(), || {
+        create_row_enumerator(RowEnumeratorArgs {
+            rb_self,
+            to_read,
+            result_type,
+            columns: columns.clone(),
+            strict,
+            logger: logger.inner(),
+        })
+        .map(|yield_enum| yield_enum.into_value_with(ruby))
+    })? {
+        return Ok(enum_value);
+    }
+
+    // Log start of processing
+    let _ = logger.info(|| "Starting to read parquet file".to_string());
+
+    // Create a streaming reader based on input type
+    let chunk_reader = if to_read.is_kind_of(ruby.class_string()) {
+        let path_str: String = TryConvert::try_convert(to_read)?;
+        let _ = logger.debug(|| format!("Reading from file: {}", path_str));
+        CloneableChunkReader::from_path(&path_str)
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?
+    } else if to_read.respond_to("read", false)? {
+        // Handle IO objects with streaming
+        let _ = logger.debug(|| "Reading from IO object".to_string());
+        let ruby_reader = RubyIOReader::new(to_read)
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+        let thread_safe_reader = ThreadSafeRubyIOReader::new(ruby_reader);
+
+        CloneableChunkReader::from_ruby_io(thread_safe_reader)
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?
+    } else {
+        return Err(MagnusError::new(
+            ruby.exception_runtime_error(),
+            format!(
+                "Invalid input type: expected String or IO object with read method, got {}",
+                to_read.class()
+            ),
+        ));
+    };
+
+    let reader = Reader::new(chunk_reader.clone());
+    let mut reader_for_metadata = Reader::new(chunk_reader);
+
+    // Get metadata to extract column names
+    let metadata = reader_for_metadata
+        .metadata()
+        .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+    let schema = metadata.schema();
+    let all_column_names: Vec<String> = schema
+        .get_fields()
+        .iter()
+        .map(|f| f.name().to_string())
+        .collect();
+
+    let _ = logger.info(|| format!("Processing {} columns", all_column_names.len()));
+
+    // Get the row iterator
+    let (row_iter, column_names) = if let Some(ref cols) = columns {
+        let iter = reader
+            .read_rows_with_projection(cols)
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+        (iter, cols.clone())
+    } else {
+        let iter = reader
+            .read_rows()
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+        (iter, all_column_names)
+    };
+
+    // Process with block
+    let proc = ruby.block_proc().map_err(|e| {
+        MagnusError::new(
+            ruby.exception_runtime_error(),
+            format!("Failed to get block: {}", e),
+        )
+    })?;
+    let mut row_count = 0u64;
+
+    for row_result in row_iter {
+        let row = row_result
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+
+        // Convert row to Ruby value based on result_type
+        let ruby_row = match result_type {
+            ParserResultType::Array => {
+                let array: RArray = ruby.ary_new_capa(row.len());
+                for value in row {
+                    let ruby_value = parquet_to_ruby(value).map_err(|e| {
+                        MagnusError::new(ruby.exception_runtime_error(), e.to_string())
+                    })?;
+                    array.push(ruby_value)?;
+                }
+                array.as_value()
+            }
+            ParserResultType::Hash => {
+                let hash: RHash = ruby.hash_new();
+                for (idx, value) in row.into_iter().enumerate() {
+                    if idx < column_names.len() {
+                        let ruby_value = parquet_to_ruby(value).map_err(|e| {
+                            MagnusError::new(ruby.exception_runtime_error(), e.to_string())
+                        })?;
+                        hash.aset(column_names[idx].as_str(), ruby_value)?;
+                    }
+                }
+                hash.as_value()
+            }
+        };
+
+        proc.call::<_, Value>((ruby_row,))?;
+
+        row_count += 1;
+        if row_count % 1000 == 0 {
+            let _ = logger.debug(|| format!("Processed {} rows", row_count));
+        }
+    }
+
+    let _ = logger.info(|| format!("Finished processing {} rows", row_count));
+
+    Ok(ruby.qnil().as_value())
+}
+
+/// Arguments for each_column function
+struct EachColumnArgs {
+    rb_self: Value,
+    to_read: Value,
+    result_type: ParserResultType,
+    columns: Option<Vec<String>>,
+    batch_size: Option<usize>,
+    strict: bool,
+    logger: RubyLogger,
+}
+
+/// Read parquet file column by column
+#[allow(clippy::too_many_arguments)]
+pub fn each_column(
+    ruby: &Ruby,
+    rb_self: Value,
+    to_read: Value,
+    result_type: ParserResultType,
+    columns: Option<Vec<String>>,
+    batch_size: Option<usize>,
+    strict: bool,
+    logger: RubyLogger,
+) -> Result<Value, MagnusError> {
+    let args = EachColumnArgs {
+        rb_self,
+        to_read,
+        result_type,
+        columns,
+        batch_size,
+        strict,
+        logger,
+    };
+    each_column_impl(ruby, args)
+}
+
+fn each_column_impl(ruby: &Ruby, args: EachColumnArgs) -> Result<Value, MagnusError> {
+    if let Some(enum_value) = handle_block_or_enum(ruby.block_given(), || {
+        create_column_enumerator(ColumnEnumeratorArgs {
+            rb_self: args.rb_self,
+            to_read: args.to_read,
+            result_type: args.result_type,
+            columns: args.columns.clone(),
+            batch_size: args.batch_size,
+            strict: args.strict,
+            logger: args.logger.inner(),
+        })
+        .map(|yield_enum| yield_enum.into_value_with(ruby))
+    })? {
+        return Ok(enum_value);
+    }
+
+    // Log start of processing
+    let _ = args
+        .logger
+        .info(|| "Starting to read parquet file columns".to_string());
+
+    // Create a streaming reader based on input type
+    let chunk_reader = if args.to_read.is_kind_of(ruby.class_string()) {
+        let path_str: String = TryConvert::try_convert(args.to_read)?;
+        let _ = args
+            .logger
+            .debug(|| format!("Reading columns from file: {}", path_str));
+        CloneableChunkReader::from_path(&path_str)
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?
+    } else if args.to_read.respond_to("read", false)? {
+        // Handle IO objects with streaming
+        let _ = args
+            .logger
+            .debug(|| "Reading columns from IO object".to_string());
+        let ruby_reader = RubyIOReader::new(args.to_read)
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+        let thread_safe_reader = ThreadSafeRubyIOReader::new(ruby_reader);
+
+        CloneableChunkReader::from_ruby_io(thread_safe_reader)
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?
+    } else {
+        return Err(MagnusError::new(
+            ruby.exception_runtime_error(),
+            format!(
+                "Invalid input type: expected String or IO object with read method, got {}",
+                args.to_read.class()
+            ),
+        ));
+    };
+
+    let reader = Reader::new(chunk_reader.clone());
+    let mut reader_for_metadata = Reader::new(chunk_reader);
+
+    // Get metadata to extract column names
+    let metadata = reader_for_metadata
+        .metadata()
+        .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+    let schema = metadata.schema();
+    let all_column_names: Vec<String> = schema
+        .get_fields()
+        .iter()
+        .map(|f| f.name().to_string())
+        .collect();
+
+    // Get the column iterator
+    let (col_iter, _column_names) = if let Some(ref cols) = args.columns {
+        let iter = reader
+            .read_columns_with_projection(cols, args.batch_size)
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+        (iter, cols.clone())
+    } else {
+        let iter = reader
+            .read_columns(args.batch_size)
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+        (iter, all_column_names)
+    };
+
+    // Process with block
+    let proc = ruby.block_proc().map_err(|e| {
+        MagnusError::new(
+            ruby.exception_runtime_error(),
+            format!("Failed to get block: {}", e),
+        )
+    })?;
+    let mut batch_count = 0u64;
+
+    for batch_result in col_iter {
+        let batch = batch_result
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+
+        // Convert batch to Ruby value based on result_type
+        let ruby_batch = match args.result_type {
+            ParserResultType::Array => {
+                let array: RArray = ruby.ary_new_capa(batch.columns.len());
+                for (_name, values) in batch.columns {
+                    let col_array: RArray = ruby.ary_new_capa(values.len());
+                    for value in values {
+                        let ruby_value = parquet_to_ruby(value).map_err(|e| {
+                            MagnusError::new(ruby.exception_runtime_error(), e.to_string())
+                        })?;
+                        col_array.push(ruby_value)?;
+                    }
+                    array.push(col_array)?;
+                }
+                array.as_value()
+            }
+            ParserResultType::Hash => {
+                let hash: RHash = ruby.hash_new();
+                for (name, values) in batch.columns {
+                    let col_array: RArray = ruby.ary_new_capa(values.len());
+                    for value in values {
+                        let ruby_value = parquet_to_ruby(value).map_err(|e| {
+                            MagnusError::new(ruby.exception_runtime_error(), e.to_string())
+                        })?;
+                        col_array.push(ruby_value)?;
+                    }
+                    hash.aset(name, col_array)?;
+                }
+                hash.as_value()
+            }
+        };
+
+        proc.call::<_, Value>((ruby_batch,))?;
+
+        batch_count += 1;
+        let _ = args
+            .logger
+            .debug(|| format!("Processed batch {}", batch_count));
+    }
+
+    let _ = args
+        .logger
+        .info(|| format!("Finished processing {} batches", batch_count));
+
+    Ok(ruby.qnil().as_value())
+}
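
One detail worth noting in the hunk above: both each_row and each_column build two Reader instances from a single CloneableChunkReader (one consumed for metadata, one for the row/column iterator), which only works because the chunk reader is cheap to clone. The implementation of CloneableChunkReader is not part of this hunk; the sketch below is a minimal, hypothetical illustration of that pattern, assuming an Arc-shared source so a clone only bumps a reference count rather than reopening the file or duplicating the buffer.

    use std::sync::Arc;

    // Hypothetical stand-in for the real CloneableChunkReader: an Arc-shared
    // byte buffer makes Clone cheap, so one handle can feed a metadata reader
    // while its clone feeds the row/column iterator over the same source.
    #[derive(Clone)]
    struct CloneableChunkReader {
        data: Arc<Vec<u8>>,
    }

    impl CloneableChunkReader {
        fn from_bytes(bytes: Vec<u8>) -> Self {
            Self { data: Arc::new(bytes) }
        }

        fn len(&self) -> usize {
            self.data.len()
        }
    }

    fn main() {
        let chunk_reader = CloneableChunkReader::from_bytes(vec![0u8; 1024]);
        // Mirrors the pattern in each_row: clone for one reader, move the
        // original into the other; both see the same underlying bytes.
        let for_rows = chunk_reader.clone();
        let for_metadata = chunk_reader;
        assert_eq!(for_rows.len(), for_metadata.len());
    }

Under that assumption, the same approach also explains why the IO path wraps the Ruby object in a ThreadSafeRubyIOReader first: a shared, synchronized handle is what makes the subsequent clones safe to hand to independent readers.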