parquet 0.5.13 → 0.6.0
This diff shows the changes between package versions as published to their public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/Cargo.lock +295 -98
- data/Cargo.toml +1 -1
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/ext/parquet/Cargo.toml +3 -0
- data/ext/parquet/src/adapter_ffi.rs +156 -0
- data/ext/parquet/src/lib.rs +13 -21
- data/ext/parquet-core/Cargo.toml +23 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
- data/ext/parquet-core/src/error.rs +163 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +263 -0
- data/ext/parquet-core/src/schema.rs +283 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +151 -0
- data/ext/parquet-core/src/value.rs +209 -0
- data/ext/parquet-core/src/writer.rs +839 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +430 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
- data/ext/parquet-ruby-adapter/src/error.rs +148 -0
- data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
- data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +94 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +50 -24
- data/ext/parquet/src/enumerator.rs +0 -68
- data/ext/parquet/src/header_cache.rs +0 -99
- data/ext/parquet/src/logger.rs +0 -171
- data/ext/parquet/src/reader/common.rs +0 -111
- data/ext/parquet/src/reader/mod.rs +0 -211
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
- data/ext/parquet/src/reader/unified/mod.rs +0 -363
- data/ext/parquet/src/types/core_types.rs +0 -120
- data/ext/parquet/src/types/mod.rs +0 -100
- data/ext/parquet/src/types/parquet_value.rs +0 -1275
- data/ext/parquet/src/types/record_types.rs +0 -605
- data/ext/parquet/src/types/schema_converter.rs +0 -290
- data/ext/parquet/src/types/schema_node.rs +0 -424
- data/ext/parquet/src/types/timestamp.rs +0 -285
- data/ext/parquet/src/types/type_conversion.rs +0 -1949
- data/ext/parquet/src/types/writer_types.rs +0 -329
- data/ext/parquet/src/utils.rs +0 -184
- data/ext/parquet/src/writer/mod.rs +0 -505
- data/ext/parquet/src/writer/write_columns.rs +0 -238
- data/ext/parquet/src/writer/write_rows.rs +0 -488
data/ext/parquet-ruby-adapter/src/reader.rs (new file)

@@ -0,0 +1,317 @@
+use magnus::value::ReprValue;
+use magnus::{Error as MagnusError, IntoValue, RArray, RHash, Ruby, TryConvert, Value};
+use parquet_core::reader::Reader;
+
+use crate::{
+    converter::parquet_to_ruby,
+    io::{RubyIOReader, ThreadSafeRubyIOReader},
+    logger::RubyLogger,
+    types::{ColumnEnumeratorArgs, ParserResultType, RowEnumeratorArgs},
+    utils::{create_column_enumerator, create_row_enumerator, handle_block_or_enum},
+    CloneableChunkReader,
+};
+
+/// Read parquet file row by row
+pub fn each_row(
+    ruby: &Ruby,
+    rb_self: Value,
+    to_read: Value,
+    result_type: ParserResultType,
+    columns: Option<Vec<String>>,
+    strict: bool,
+    logger: RubyLogger,
+) -> Result<Value, MagnusError> {
+    if let Some(enum_value) = handle_block_or_enum(ruby.block_given(), || {
+        create_row_enumerator(RowEnumeratorArgs {
+            rb_self,
+            to_read,
+            result_type,
+            columns: columns.clone(),
+            strict,
+            logger: logger.inner(),
+        })
+        .map(|yield_enum| yield_enum.into_value_with(ruby))
+    })? {
+        return Ok(enum_value);
+    }
+
+    // Log start of processing
+    let _ = logger.info(|| "Starting to read parquet file".to_string());
+
+    // Create a streaming reader based on input type
+    let chunk_reader = if to_read.is_kind_of(ruby.class_string()) {
+        let path_str: String = TryConvert::try_convert(to_read)?;
+        let _ = logger.debug(|| format!("Reading from file: {}", path_str));
+        CloneableChunkReader::from_path(&path_str)
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?
+    } else if to_read.respond_to("read", false)? {
+        // Handle IO objects with streaming
+        let _ = logger.debug(|| "Reading from IO object".to_string());
+        let ruby_reader = RubyIOReader::new(to_read)
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+        let thread_safe_reader = ThreadSafeRubyIOReader::new(ruby_reader);
+
+        CloneableChunkReader::from_ruby_io(thread_safe_reader)
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?
+    } else {
+        return Err(MagnusError::new(
+            ruby.exception_runtime_error(),
+            format!(
+                "Invalid input type: expected String or IO object with read method, got {}",
+                to_read.class()
+            ),
+        ));
+    };
+
+    let reader = Reader::new(chunk_reader.clone());
+    let mut reader_for_metadata = Reader::new(chunk_reader);
+
+    // Get metadata to extract column names
+    let metadata = reader_for_metadata
+        .metadata()
+        .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+    let schema = metadata.schema();
+    let all_column_names: Vec<String> = schema
+        .get_fields()
+        .iter()
+        .map(|f| f.name().to_string())
+        .collect();
+
+    let _ = logger.info(|| format!("Processing {} columns", all_column_names.len()));
+
+    // Get the row iterator
+    let (row_iter, column_names) = if let Some(ref cols) = columns {
+        let iter = reader
+            .read_rows_with_projection(cols)
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+        (iter, cols.clone())
+    } else {
+        let iter = reader
+            .read_rows()
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+        (iter, all_column_names)
+    };
+
+    // Process with block
+    let proc = ruby.block_proc().map_err(|e| {
+        MagnusError::new(
+            ruby.exception_runtime_error(),
+            format!("Failed to get block: {}", e),
+        )
+    })?;
+    let mut row_count = 0u64;
+
+    for row_result in row_iter {
+        let row = row_result
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+
+        // Convert row to Ruby value based on result_type
+        let ruby_row = match result_type {
+            ParserResultType::Array => {
+                let array: RArray = ruby.ary_new_capa(row.len());
+                for value in row {
+                    let ruby_value = parquet_to_ruby(value).map_err(|e| {
+                        MagnusError::new(ruby.exception_runtime_error(), e.to_string())
+                    })?;
+                    array.push(ruby_value)?;
+                }
+                array.as_value()
+            }
+            ParserResultType::Hash => {
+                let hash: RHash = ruby.hash_new();
+                for (idx, value) in row.into_iter().enumerate() {
+                    if idx < column_names.len() {
+                        let ruby_value = parquet_to_ruby(value).map_err(|e| {
+                            MagnusError::new(ruby.exception_runtime_error(), e.to_string())
+                        })?;
+                        hash.aset(column_names[idx].as_str(), ruby_value)?;
+                    }
+                }
+                hash.as_value()
+            }
+        };
+
+        proc.call::<_, Value>((ruby_row,))?;
+
+        row_count += 1;
+        if row_count % 1000 == 0 {
+            let _ = logger.debug(|| format!("Processed {} rows", row_count));
+        }
+    }
+
+    let _ = logger.info(|| format!("Finished processing {} rows", row_count));
+
+    Ok(ruby.qnil().as_value())
+}
+
+/// Arguments for each_column function
+struct EachColumnArgs {
+    rb_self: Value,
+    to_read: Value,
+    result_type: ParserResultType,
+    columns: Option<Vec<String>>,
+    batch_size: Option<usize>,
+    strict: bool,
+    logger: RubyLogger,
+}
+
+/// Read parquet file column by column
+#[allow(clippy::too_many_arguments)]
+pub fn each_column(
+    ruby: &Ruby,
+    rb_self: Value,
+    to_read: Value,
+    result_type: ParserResultType,
+    columns: Option<Vec<String>>,
+    batch_size: Option<usize>,
+    strict: bool,
+    logger: RubyLogger,
+) -> Result<Value, MagnusError> {
+    let args = EachColumnArgs {
+        rb_self,
+        to_read,
+        result_type,
+        columns,
+        batch_size,
+        strict,
+        logger,
+    };
+    each_column_impl(ruby, args)
+}
+
+fn each_column_impl(ruby: &Ruby, args: EachColumnArgs) -> Result<Value, MagnusError> {
+    if let Some(enum_value) = handle_block_or_enum(ruby.block_given(), || {
+        create_column_enumerator(ColumnEnumeratorArgs {
+            rb_self: args.rb_self,
+            to_read: args.to_read,
+            result_type: args.result_type,
+            columns: args.columns.clone(),
+            batch_size: args.batch_size,
+            strict: args.strict,
+            logger: args.logger.inner(),
+        })
+        .map(|yield_enum| yield_enum.into_value_with(ruby))
+    })? {
+        return Ok(enum_value);
+    }
+
+    // Log start of processing
+    let _ = args
+        .logger
+        .info(|| "Starting to read parquet file columns".to_string());
+
+    // Create a streaming reader based on input type
+    let chunk_reader = if args.to_read.is_kind_of(ruby.class_string()) {
+        let path_str: String = TryConvert::try_convert(args.to_read)?;
+        let _ = args
+            .logger
+            .debug(|| format!("Reading columns from file: {}", path_str));
+        CloneableChunkReader::from_path(&path_str)
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?
+    } else if args.to_read.respond_to("read", false)? {
+        // Handle IO objects with streaming
+        let _ = args
+            .logger
+            .debug(|| "Reading columns from IO object".to_string());
+        let ruby_reader = RubyIOReader::new(args.to_read)
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+        let thread_safe_reader = ThreadSafeRubyIOReader::new(ruby_reader);
+
+        CloneableChunkReader::from_ruby_io(thread_safe_reader)
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?
+    } else {
+        return Err(MagnusError::new(
+            ruby.exception_runtime_error(),
+            format!(
+                "Invalid input type: expected String or IO object with read method, got {}",
+                args.to_read.class()
+            ),
+        ));
+    };
+
+    let reader = Reader::new(chunk_reader.clone());
+    let mut reader_for_metadata = Reader::new(chunk_reader);
+
+    // Get metadata to extract column names
+    let metadata = reader_for_metadata
+        .metadata()
+        .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+    let schema = metadata.schema();
+    let all_column_names: Vec<String> = schema
+        .get_fields()
+        .iter()
+        .map(|f| f.name().to_string())
+        .collect();
+
+    // Get the column iterator
+    let (col_iter, _column_names) = if let Some(ref cols) = args.columns {
+        let iter = reader
+            .read_columns_with_projection(cols, args.batch_size)
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+        (iter, cols.clone())
+    } else {
+        let iter = reader
+            .read_columns(args.batch_size)
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+        (iter, all_column_names)
+    };
+
+    // Process with block
+    let proc = ruby.block_proc().map_err(|e| {
+        MagnusError::new(
+            ruby.exception_runtime_error(),
+            format!("Failed to get block: {}", e),
+        )
+    })?;
+    let mut batch_count = 0u64;
+
+    for batch_result in col_iter {
+        let batch = batch_result
+            .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
+
+        // Convert batch to Ruby value based on result_type
+        let ruby_batch = match args.result_type {
+            ParserResultType::Array => {
+                let array: RArray = ruby.ary_new_capa(batch.columns.len());
+                for (_name, values) in batch.columns {
+                    let col_array: RArray = ruby.ary_new_capa(values.len());
+                    for value in values {
+                        let ruby_value = parquet_to_ruby(value).map_err(|e| {
+                            MagnusError::new(ruby.exception_runtime_error(), e.to_string())
+                        })?;
+                        col_array.push(ruby_value)?;
+                    }
+                    array.push(col_array)?;
+                }
+                array.as_value()
+            }
+            ParserResultType::Hash => {
+                let hash: RHash = ruby.hash_new();
+                for (name, values) in batch.columns {
+                    let col_array: RArray = ruby.ary_new_capa(values.len());
+                    for value in values {
+                        let ruby_value = parquet_to_ruby(value).map_err(|e| {
+                            MagnusError::new(ruby.exception_runtime_error(), e.to_string())
+                        })?;
+                        col_array.push(ruby_value)?;
+                    }
+                    hash.aset(name, col_array)?;
+                }
+                hash.as_value()
+            }
+        };
+
+        proc.call::<_, Value>((ruby_batch,))?;
+
+        batch_count += 1;
+        let _ = args
+            .logger
+            .debug(|| format!("Processed batch {}", batch_count));
+    }
+
+    let _ = args
+        .logger
+        .info(|| format!("Finished processing {} batches", batch_count));
+
+    Ok(ruby.qnil().as_value())
+}
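Both entry points open with the same guard: if the caller passed no block, they return an Enumerator instead of iterating, mirroring Ruby's `return enum_for(:each) unless block_given?` idiom. The diff above does not include `handle_block_or_enum` itself (it comes from `parquet-ruby-adapter/src/utils.rs`), but its call sites imply a shape like the following minimal sketch; the generic bounds and return type here are inferred, not the published source:

+// Hypothetical sketch of the block-or-enumerator guard, inferred from the
+// `each_row`/`each_column` call sites above; not the actual utils.rs code.
+use magnus::{Error as MagnusError, Value};
+
+fn handle_block_or_enum<F>(
+    block_given: bool,
+    create_enum: F,
+) -> Result<Option<Value>, MagnusError>
+where
+    F: FnOnce() -> Result<Value, MagnusError>,
+{
+    if block_given {
+        // A block was passed: signal the caller to fall through and
+        // stream rows/batches into it.
+        Ok(None)
+    } else {
+        // No block: build and hand back an Enumerator so iteration
+        // stays lazy, as with `enum_for` in plain Ruby.
+        create_enum().map(Some)
+    }
+}

Read this way, the later unconditional `ruby.block_proc()` calls make sense: by the time they run, the no-block path has already returned the Enumerator.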