parquet 0.0.5 → 0.2.6

@@ -0,0 +1,403 @@
+ use std::{
+     fs::File,
+     io::{self, BufReader, BufWriter},
+     sync::Arc,
+ };
+
+ use arrow_array::{Array, RecordBatch};
+ use arrow_schema::{DataType, Field, Schema, TimeUnit};
+ use magnus::{
+     scan_args::{get_kwargs, scan_args},
+     value::ReprValue,
+     Error as MagnusError, RArray, Ruby, TryConvert, Value,
+ };
+ use parquet::arrow::ArrowWriter;
+ use tempfile::NamedTempFile;
+
+ use crate::{
+     convert_ruby_array_to_arrow,
+     types::{ColumnCollector, ParquetErrorWrapper, WriterOutput},
+     IoLikeValue, ParquetSchemaType, ParquetWriteArgs, SchemaField, SendableWrite,
+ };
+
+ const DEFAULT_BATCH_SIZE: usize = 1000;
+
+ /// Parse arguments for Parquet writing
+ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError> {
+     let ruby = unsafe { Ruby::get_unchecked() };
+     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
+     let (read_from,) = parsed_args.required;
+
+     let kwargs = get_kwargs::<_, (Value, Value), (Option<Option<usize>>,), ()>(
+         parsed_args.keywords,
+         &["schema", "write_to"],
+         &["batch_size"],
+     )?;
+
+     let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
+         MagnusError::new(
+             magnus::exception::type_error(),
+             "schema must be an array of hashes",
+         )
+     })?;
+
+     let mut schema = Vec::with_capacity(schema_array.len());
+
+     for (idx, field_hash) in schema_array.into_iter().enumerate() {
+         if !field_hash.is_kind_of(ruby.class_hash()) {
+             return Err(MagnusError::new(
+                 magnus::exception::type_error(),
+                 format!("schema[{}] must be a hash", idx),
+             ));
+         }
+
+         let entries: Vec<(Value, Value)> = field_hash.funcall("to_a", ())?;
+         if entries.len() != 1 {
+             return Err(MagnusError::new(
+                 magnus::exception::type_error(),
+                 format!("schema[{}] must contain exactly one key-value pair", idx),
+             ));
+         }
+
+         let (name, type_str) = &entries[0];
+         let name = String::try_convert(name.clone())?;
+         let type_ = ParquetSchemaType::try_convert(type_str.clone())?;
+
+         schema.push(SchemaField { name, type_ });
+     }
+
+     Ok(ParquetWriteArgs {
+         read_from,
+         write_to: kwargs.required.1,
+         schema,
+         batch_size: kwargs.optional.0.flatten(),
+     })
+ }
+
+ #[inline]
+ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
+     let ruby = unsafe { Ruby::get_unchecked() };
+
+     let ParquetWriteArgs {
+         read_from,
+         write_to,
+         schema,
+         batch_size,
+     } = parse_parquet_write_args(args)?;
+
+     let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
+
+     // Convert schema to Arrow schema
+     let arrow_fields: Vec<Field> = schema
+         .iter()
+         .map(|field| {
+             Field::new(
+                 &field.name,
+                 match field.type_ {
+                     ParquetSchemaType::Int8 => DataType::Int8,
+                     ParquetSchemaType::Int16 => DataType::Int16,
+                     ParquetSchemaType::Int32 => DataType::Int32,
+                     ParquetSchemaType::Int64 => DataType::Int64,
+                     ParquetSchemaType::UInt8 => DataType::UInt8,
+                     ParquetSchemaType::UInt16 => DataType::UInt16,
+                     ParquetSchemaType::UInt32 => DataType::UInt32,
+                     ParquetSchemaType::UInt64 => DataType::UInt64,
+                     ParquetSchemaType::Float => DataType::Float32,
+                     ParquetSchemaType::Double => DataType::Float64,
+                     ParquetSchemaType::String => DataType::Utf8,
+                     ParquetSchemaType::Binary => DataType::Binary,
+                     ParquetSchemaType::Boolean => DataType::Boolean,
+                     ParquetSchemaType::Date32 => DataType::Date32,
+                     ParquetSchemaType::TimestampMillis => {
+                         DataType::Timestamp(TimeUnit::Millisecond, None)
+                     }
+                     ParquetSchemaType::TimestampMicros => {
+                         DataType::Timestamp(TimeUnit::Microsecond, None)
+                     }
+                     ParquetSchemaType::List(_) => unimplemented!("List type not yet supported"),
+                     ParquetSchemaType::Map(_) => unimplemented!("Map type not yet supported"),
+                 },
+                 true,
+             )
+         })
+         .collect();
+     let arrow_schema = Arc::new(Schema::new(arrow_fields));
+
+     // Create the writer
+     let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+
+     if read_from.is_kind_of(ruby.class_enumerator()) {
+         // Create collectors for each column
+         let mut column_collectors: Vec<ColumnCollector> = schema
+             .into_iter()
+             .map(|field| ColumnCollector::new(field.name, field.type_))
+             .collect();
+
+         let mut rows_in_batch = 0;
+
+         loop {
+             match read_from.funcall::<_, _, Value>("next", ()) {
+                 Ok(row) => {
+                     let row_array = RArray::from_value(row).ok_or_else(|| {
+                         MagnusError::new(ruby.exception_type_error(), "Row must be an array")
+                     })?;
+
+                     // Validate row length matches schema
+                     if row_array.len() != column_collectors.len() {
+                         return Err(MagnusError::new(
+                             magnus::exception::type_error(),
+                             format!(
+                                 "Row length ({}) does not match schema length ({}). Schema expects columns: {:?}",
+                                 row_array.len(),
+                                 column_collectors.len(),
+                                 column_collectors.iter().map(|c| c.name.as_str()).collect::<Vec<_>>()
+                             ),
+                         ));
+                     }
+
+                     // Process each value in the row immediately
+                     for (collector, value) in column_collectors.iter_mut().zip(row_array) {
+                         collector.push_value(value)?;
+                     }
+
+                     rows_in_batch += 1;
+
+                     // When we reach batch size, write the batch
+                     if rows_in_batch >= batch_size {
+                         write_batch(&mut writer, &mut column_collectors)?;
+                         rows_in_batch = 0;
+                     }
+                 }
+                 Err(e) => {
+                     if e.is_kind_of(ruby.exception_stop_iteration()) {
+                         // Write any remaining rows
+                         if rows_in_batch > 0 {
+                             write_batch(&mut writer, &mut column_collectors)?;
+                         }
+                         break;
+                     }
+                     return Err(e);
+                 }
+             }
+         }
+     } else {
+         return Err(MagnusError::new(
+             magnus::exception::type_error(),
+             "read_from must be an Enumerator",
+         ));
+     }
+
+     // Ensure everything is written and get the temp file if it exists
+     if let Some(temp_file) = writer.close().map_err(|e| ParquetErrorWrapper(e))? {
+         // If we got a temp file back, we need to copy its contents to the IO-like object
+         copy_temp_file_to_io_like(temp_file, IoLikeValue(write_to))?;
+     }
+
+     Ok(())
+ }
+
+ #[inline]
+ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
+     let ruby = unsafe { Ruby::get_unchecked() };
+
+     let ParquetWriteArgs {
+         read_from,
+         write_to,
+         schema,
+         batch_size: _, // Batch size is determined by the input
+     } = parse_parquet_write_args(args)?;
+
+     // Convert schema to Arrow schema
+     let arrow_fields: Vec<Field> = schema
+         .iter()
+         .map(|field| {
+             Field::new(
+                 &field.name,
+                 match field.type_ {
+                     ParquetSchemaType::Int8 => DataType::Int8,
+                     ParquetSchemaType::Int16 => DataType::Int16,
+                     ParquetSchemaType::Int32 => DataType::Int32,
+                     ParquetSchemaType::Int64 => DataType::Int64,
+                     ParquetSchemaType::UInt8 => DataType::UInt8,
+                     ParquetSchemaType::UInt16 => DataType::UInt16,
+                     ParquetSchemaType::UInt32 => DataType::UInt32,
+                     ParquetSchemaType::UInt64 => DataType::UInt64,
+                     ParquetSchemaType::Float => DataType::Float32,
+                     ParquetSchemaType::Double => DataType::Float64,
+                     ParquetSchemaType::String => DataType::Utf8,
+                     ParquetSchemaType::Binary => DataType::Binary,
+                     ParquetSchemaType::Boolean => DataType::Boolean,
+                     ParquetSchemaType::Date32 => DataType::Date32,
+                     ParquetSchemaType::TimestampMillis => {
+                         DataType::Timestamp(TimeUnit::Millisecond, None)
+                     }
+                     ParquetSchemaType::TimestampMicros => {
+                         DataType::Timestamp(TimeUnit::Microsecond, None)
+                     }
+                     ParquetSchemaType::List(_) => unimplemented!("List type not yet supported"),
+                     ParquetSchemaType::Map(_) => unimplemented!("Map type not yet supported"),
+                 },
+                 true,
+             )
+         })
+         .collect();
+     let arrow_schema = Arc::new(Schema::new(arrow_fields));
+
+     // Create the writer
+     let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+
+     if read_from.is_kind_of(ruby.class_enumerator()) {
+         loop {
+             match read_from.funcall::<_, _, Value>("next", ()) {
+                 Ok(batch) => {
+                     let batch_array = RArray::from_value(batch).ok_or_else(|| {
+                         MagnusError::new(ruby.exception_type_error(), "Batch must be an array")
+                     })?;
+
+                     // Validate batch length matches schema
+                     if batch_array.len() != schema.len() {
+                         return Err(MagnusError::new(
+                             magnus::exception::type_error(),
+                             format!(
+                                 "Batch column count ({}) does not match schema length ({}). Schema expects columns: {:?}",
+                                 batch_array.len(),
+                                 schema.len(),
+                                 schema.iter().map(|f| f.name.as_str()).collect::<Vec<_>>()
+                             ),
+                         ));
+                     }
+
+                     // Convert each column in the batch to Arrow arrays
+                     let arrow_arrays: Vec<(String, Arc<dyn Array>)> = schema
+                         .iter()
+                         .zip(batch_array)
+                         .map(|(field, column)| {
+                             let column_array = RArray::from_value(column).ok_or_else(|| {
+                                 MagnusError::new(
+                                     magnus::exception::type_error(),
+                                     format!("Column '{}' must be an array", field.name),
+                                 )
+                             })?;
+
+                             Ok((
+                                 field.name.clone(),
+                                 convert_ruby_array_to_arrow(column_array, &field.type_)?,
+                             ))
+                         })
+                         .collect::<Result<_, MagnusError>>()?;
+
+                     // Create and write record batch
+                     let record_batch = RecordBatch::try_from_iter(arrow_arrays).map_err(|e| {
+                         MagnusError::new(
+                             magnus::exception::runtime_error(),
+                             format!("Failed to create record batch: {}", e),
+                         )
+                     })?;
+
+                     writer
+                         .write(&record_batch)
+                         .map_err(|e| ParquetErrorWrapper(e))?;
+                 }
+                 Err(e) => {
+                     if e.is_kind_of(ruby.exception_stop_iteration()) {
+                         break;
+                     }
+                     return Err(e);
+                 }
+             }
+         }
+     } else {
+         return Err(MagnusError::new(
+             magnus::exception::type_error(),
+             "read_from must be an Enumerator",
+         ));
+     }
+
+     // Ensure everything is written and get the temp file if it exists
+     if let Some(temp_file) = writer.close().map_err(|e| ParquetErrorWrapper(e))? {
+         // If we got a temp file back, we need to copy its contents to the IO-like object
+         copy_temp_file_to_io_like(temp_file, IoLikeValue(write_to))?;
+     }
+
+     Ok(())
+ }
+
+ fn create_writer(
+     ruby: &Ruby,
+     write_to: &Value,
+     schema: Arc<Schema>,
+ ) -> Result<WriterOutput, MagnusError> {
+     if write_to.is_kind_of(ruby.class_string()) {
+         let path = write_to.to_r_string()?.to_string()?;
+         let file: Box<dyn SendableWrite> = Box::new(File::create(path).unwrap());
+         let writer =
+             ArrowWriter::try_new(file, schema, None).map_err(|e| ParquetErrorWrapper(e))?;
+         Ok(WriterOutput::File(writer))
+     } else {
+         // Create a temporary file to write to instead of directly to the IoLikeValue
+         let temp_file = NamedTempFile::new().map_err(|e| {
+             MagnusError::new(
+                 magnus::exception::runtime_error(),
+                 format!("Failed to create temporary file: {}", e),
+             )
+         })?;
+         let file: Box<dyn SendableWrite> = Box::new(temp_file.reopen().map_err(|e| {
+             MagnusError::new(
+                 magnus::exception::runtime_error(),
+                 format!("Failed to reopen temporary file: {}", e),
+             )
+         })?);
+         let writer =
+             ArrowWriter::try_new(file, schema, None).map_err(|e| ParquetErrorWrapper(e))?;
+         Ok(WriterOutput::TempFile(writer, temp_file))
+     }
+ }
+
+ // Helper function to copy temp file contents to IoLikeValue
+ fn copy_temp_file_to_io_like(
+     temp_file: NamedTempFile,
+     io_like: IoLikeValue,
+ ) -> Result<(), MagnusError> {
+     let file = temp_file.reopen().map_err(|e| {
+         MagnusError::new(
+             magnus::exception::runtime_error(),
+             format!("Failed to reopen temporary file: {}", e),
+         )
+     })?;
+     let mut buf_reader = BufReader::new(file);
+     let mut buf_writer = BufWriter::new(io_like);
+
+     io::copy(&mut buf_reader, &mut buf_writer).map_err(|e| {
+         MagnusError::new(
+             magnus::exception::runtime_error(),
+             format!("Failed to copy temp file to io_like: {}", e),
+         )
+     })?;
+
+     Ok(())
+ }
+
+ fn write_batch(
+     writer: &mut WriterOutput,
+     collectors: &mut [ColumnCollector],
+ ) -> Result<(), MagnusError> {
+     // Convert columns to Arrow arrays
+     let arrow_arrays: Vec<(String, Arc<dyn Array>)> = collectors
+         .iter_mut()
+         .map(|collector| Ok((collector.name.clone(), collector.take_array()?)))
+         .collect::<Result<_, MagnusError>>()?;
+
+     // Create and write record batch
+     let record_batch = RecordBatch::try_from_iter(arrow_arrays).map_err(|e| {
+         MagnusError::new(
+             magnus::exception::runtime_error(),
+             format!("Failed to create record batch: {}", e),
+         )
+     })?;
+
+     writer
+         .write(&record_batch)
+         .map_err(|e| ParquetErrorWrapper(e))?;
+
+     Ok(())
+ }
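
For orientation, the writer above expects `schema` as an array of single-entry hashes mapping a column name to a type, `read_from` as an Enumerator of row arrays, and `write_to` as either a file path or an IO-like object. A minimal Ruby usage sketch follows; the exact type-name strings ("int64", "string") and the file name are illustrative assumptions, not taken from this diff.

    rows = [[1, "alice"], [2, "bob"]].each  # Array#each with no block returns an Enumerator
    Parquet.write_rows(
      rows,
      schema: [{ "id" => "int64" }, { "name" => "string" }],
      write_to: "users.parquet",
      batch_size: 500                       # optional; the Rust code defaults to 1000
    )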
@@ -1,3 +1,3 @@
  module Parquet
-   VERSION = "0.0.5"
+   VERSION = "0.2.6"
  end
data/lib/parquet.rbi CHANGED
@@ -1,4 +1,5 @@
  # typed: strict
+
  module Parquet
    # Options:
    # - `input`: String, File, or IO object containing parquet data
@@ -12,7 +13,7 @@ module Parquet
        result_type: T.nilable(T.any(String, Symbol)),
        columns: T.nilable(T::Array[String]),
        blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
-     ).returns(T.any(Enumerator, NilClass))
+     ).returns(T.any(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])], NilClass))
    end
    def self.each_row(input, result_type: nil, columns: nil, &blk)
    end
@@ -31,8 +32,38 @@ module Parquet
        batch_size: T.nilable(Integer),
        blk:
          T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
-     ).returns(T.any(Enumerator, NilClass))
+     ).returns(T.any(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])], NilClass))
    end
    def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, &blk)
    end
+
+   # Options:
+   # - `read_from`: An Enumerator yielding arrays of values representing each row
+   # - `schema`: Array of hashes specifying column names and types
+   # - `write_to`: String path or IO object to write the parquet file to
+   # - `batch_size`: Optional batch size for writing (defaults to 1000)
+   sig do
+     params(
+       read_from: T::Enumerator[T::Array[T.untyped]],
+       schema: T::Array[T::Hash[String, String]],
+       write_to: T.any(String, IO),
+       batch_size: T.nilable(Integer)
+     ).void
+   end
+   def self.write_rows(read_from, schema:, write_to:, batch_size: nil)
+   end
+
+   # Options:
+   # - `read_from`: An Enumerator yielding arrays of column batches
+   # - `schema`: Array of hashes specifying column names and types
+   # - `write_to`: String path or IO object to write the parquet file to
+   sig do
+     params(
+       read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
+       schema: T::Array[T::Hash[String, String]],
+       write_to: T.any(String, IO)
+     ).void
+   end
+   def self.write_columns(read_from, schema:, write_to:)
+   end
  end
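
The column-oriented API mirrors the row-based one: `write_columns` takes an Enumerator whose items are batches, each batch being one array per column in schema order. The sketch below is illustrative only; the type strings, file name, and the assumption that the default `result_type` of `each_row` yields hash rows are not confirmed by this diff.

    batches = [
      [[1, 2, 3], ["alice", "bob", "carol"]]  # one batch: id column, then name column
    ].each
    Parquet.write_columns(
      batches,
      schema: [{ "id" => "int64" }, { "name" => "string" }],
      write_to: "users.parquet"
    )

    # Reading the file back with the existing API
    Parquet.each_row("users.parquet") { |row| puts row["id"] }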
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: parquet
  version: !ruby/object:Gem::Version
-   version: 0.0.5
+   version: 0.2.6
  platform: ruby
  authors:
  - Nathan Jaremko
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2025-01-06 00:00:00.000000000 Z
+ date: 2025-01-13 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: rb_sys
@@ -60,13 +60,20 @@ files:
  - ext/parquet/src/enumerator.rs
  - ext/parquet/src/header_cache.rs
  - ext/parquet/src/lib.rs
- - ext/parquet/src/parquet_column_reader.rs
- - ext/parquet/src/parquet_row_reader.rs
- - ext/parquet/src/reader.rs
+ - ext/parquet/src/reader/mod.rs
+ - ext/parquet/src/reader/parquet_column_reader.rs
+ - ext/parquet/src/reader/parquet_row_reader.rs
  - ext/parquet/src/ruby_integration.rs
  - ext/parquet/src/ruby_reader.rs
- - ext/parquet/src/types.rs
+ - ext/parquet/src/types/core_types.rs
+ - ext/parquet/src/types/mod.rs
+ - ext/parquet/src/types/parquet_value.rs
+ - ext/parquet/src/types/record_types.rs
+ - ext/parquet/src/types/timestamp.rs
+ - ext/parquet/src/types/type_conversion.rs
+ - ext/parquet/src/types/writer_types.rs
  - ext/parquet/src/utils.rs
+ - ext/parquet/src/writer/mod.rs
  - lib/parquet.rb
  - lib/parquet.rbi
  - lib/parquet/version.rb