parquet 0.4.2 → 0.5.1

This diff shows the changes between publicly released versions of the package, as they appear in the public registry, and is provided for informational purposes only.
@@ -0,0 +1,484 @@
+ use super::{
+     build_column_collectors_from_dsl, copy_temp_file_to_io_like, create_writer,
+     parse_parquet_write_args, DEFAULT_MEMORY_THRESHOLD, INITIAL_BATCH_SIZE, MIN_BATCH_SIZE,
+     MIN_SAMPLES_FOR_ESTIMATE, SAMPLE_SIZE,
+ };
+ use crate::{
+     logger::RubyLogger,
+     types::{
+         schema_node::build_arrow_schema, ColumnCollector, ParquetGemError, ParquetSchemaType,
+         PrimitiveType, WriterOutput,
+     },
+     IoLikeValue, ParquetWriteArgs,
+ };
+ use arrow_array::{Array, RecordBatch};
+ use magnus::{
+     value::ReprValue, Error as MagnusError, RArray, RHash, Ruby, Symbol, TryConvert, Value,
+ };
+ use rand::Rng;
+ use std::sync::Arc;
+
+ #[inline]
+ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
+     let ruby = unsafe { Ruby::get_unchecked() };
+     write_rows_impl(Arc::new(ruby), args).map_err(|e| {
+         let z: MagnusError = e.into();
+         z
+     })?;
+     Ok(())
+ }
+
+ #[inline]
+ fn write_rows_impl(ruby: Arc<Ruby>, args: &[Value]) -> Result<(), ParquetGemError> {
+     let ParquetWriteArgs {
+         read_from,
+         write_to,
+         schema,
+         batch_size: user_batch_size,
+         compression,
+         flush_threshold,
+         sample_size: user_sample_size,
+         logger,
+     } = parse_parquet_write_args(&ruby, args)?;
+
+     let logger = RubyLogger::new(&ruby, logger)?;
+     let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
+
+     // Get the Arrow schema from the SchemaNode (we only have DSL schema now, since legacy is converted)
+     let arrow_schema = build_arrow_schema(&schema, &logger).map_err(|e| {
+         MagnusError::new(
+             magnus::exception::runtime_error(),
+             format!("Failed to build Arrow schema from DSL schema: {}", e),
+         )
+     })?;
+
+     // Create the writer
+     let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
+
+     if read_from.is_kind_of(ruby.class_enumerator()) {
+         // Build column collectors - we only have DSL schema now
+         let mut column_collectors =
+             build_column_collectors_from_dsl(&ruby, &arrow_schema, &schema)?;
+
+         let mut rows_in_batch = 0;
+         let mut total_rows = 0;
+         let mut rng = rand::rng();
+         let sample_size = user_sample_size.unwrap_or(SAMPLE_SIZE);
+         let mut size_samples = Vec::with_capacity(sample_size);
+         let mut current_batch_size = user_batch_size.unwrap_or(INITIAL_BATCH_SIZE);
+
+         loop {
+             match read_from.funcall::<_, _, Value>("next", ()) {
+                 Ok(row) => {
+                     // Process the row
+                     process_row(&ruby, row, &mut column_collectors)?;
+
+                     // Update row sampling for dynamic batch sizing
+                     if size_samples.len() < sample_size {
+                         // estimate row size
+                         let row_array = RArray::from_value(row).ok_or_else(|| {
+                             MagnusError::new(ruby.exception_type_error(), "Row must be an array")
+                         })?;
+                         let row_size = estimate_single_row_size(&row_array, &column_collectors)?;
+                         size_samples.push(row_size);
+                     } else if rng.random_range(0..=total_rows) < sample_size as usize {
+                         let idx = rng.random_range(0..sample_size as usize);
+                         let row_array = RArray::from_value(row).ok_or_else(|| {
+                             MagnusError::new(ruby.exception_type_error(), "Row must be an array")
+                         })?;
+                         let row_size = estimate_single_row_size(&row_array, &column_collectors)?;
+                         size_samples[idx] = row_size;
+                     }
+
+                     rows_in_batch += 1;
+                     total_rows += 1;
+
+                     // Calculate batch size progressively once we have minimum samples
+                     if user_batch_size.is_none() && size_samples.len() >= MIN_SAMPLES_FOR_ESTIMATE {
+                         current_batch_size =
+                             update_batch_size(&size_samples, flush_threshold, MIN_BATCH_SIZE);
+                     }
+
+                     // When we reach batch size, write the batch
+                     if rows_in_batch >= current_batch_size {
+                         write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
+                         rows_in_batch = 0;
+                     }
+                 }
+                 Err(e) => {
+                     if e.is_kind_of(ruby.exception_stop_iteration()) {
+                         // Write any remaining rows
+                         if rows_in_batch > 0 {
+                             write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
+                         }
+                         break;
+                     }
+                     return Err(e)?;
+                 }
+             }
+         }
+     } else {
+         return Err(MagnusError::new(
+             magnus::exception::type_error(),
+             "read_from must be an Enumerator".to_string(),
+         ))?;
+     }
+
+     // Ensure everything is written and get the temp file if it exists
+     if let Some(temp_file) = writer.close()? {
+         // If we got a temp file back, we need to copy its contents to the IO-like object
+         copy_temp_file_to_io_like(temp_file, IoLikeValue(write_to))?;
+     }
+
+     Ok(())
+ }
+
+ // Processes a single data row and adds values to the corresponding column collectors
+ // This function is called for each row of input data when writing in row-wise mode.
+ // It performs important validation to ensure the row structure matches the schema:
+ // - Verifies that the number of columns in the row matches the schema
+ // - Distributes each value to the appropriate ColumnCollector
+ //
+ // Each ColumnCollector handles type conversion and accumulation for its specific column,
+ // allowing this function to focus on row-level validation and distribution.
+ fn process_row(
+     ruby: &Ruby,
+     row: Value,
+     column_collectors: &mut [ColumnCollector],
+ ) -> Result<(), MagnusError> {
+     let row_array = RArray::from_value(row)
+         .ok_or_else(|| MagnusError::new(ruby.exception_type_error(), "Row must be an array"))?;
+
+     // Validate row length matches schema
+     if row_array.len() != column_collectors.len() {
+         return Err(MagnusError::new(
+             magnus::exception::runtime_error(),
+             format!(
+                 "Row length ({}) does not match schema length ({}). Schema expects columns: {:?}",
+                 row_array.len(),
+                 column_collectors.len(),
+                 column_collectors
+                     .iter()
+                     .map(|c| c.name.as_str())
+                     .collect::<Vec<_>>()
+             ),
+         ));
+     }
+
+     // Process each value in the row
+     for (collector, value) in column_collectors.iter_mut().zip(row_array) {
+         collector.push_value(value)?;
+     }
+
+     Ok(())
+ }
+
+ // Converts all accumulated data from ColumnCollectors into an Arrow RecordBatch
+ // and writes it to the Parquet file/output. This is a crucial function that bridges
+ // between our Ruby-oriented data collectors and the Arrow/Parquet ecosystem.
+ //
+ // The function:
+ // 1. Takes all collected values from each ColumnCollector and converts them to Arrow arrays
+ // 2. Creates a RecordBatch from these arrays (column-oriented data format)
+ // 3. Writes the batch to the ParquetWriter
+ // 4. Flushes the writer if the accumulated memory exceeds the threshold
+ //
+ // This approach enables efficient batch-wise writing while controlling memory usage.
+ fn write_batch(
+     writer: &mut WriterOutput,
+     collectors: &mut [ColumnCollector],
+     flush_threshold: usize,
+ ) -> Result<(), ParquetGemError> {
+     // Convert columns to Arrow arrays
+     let arrow_arrays: Vec<(String, Arc<dyn Array>)> = collectors
+         .iter_mut()
+         .map(|c| {
+             let arr = c.take_array()?;
+             Ok((c.name.clone(), arr))
+         })
+         .collect::<Result<_, ParquetGemError>>()?;
+
+     let record_batch = RecordBatch::try_from_iter(arrow_arrays.clone()).map_err(|e| {
+         MagnusError::new(
+             magnus::exception::runtime_error(),
+             format!("Failed to create RecordBatch: {}", e),
+         )
+     })?;
+
+     writer.write(&record_batch)?;
+
+     // Check if we need to flush based on memory usage thresholds
+     match writer {
+         WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
+             if w.in_progress_size() >= flush_threshold || w.memory_size() >= flush_threshold {
+                 w.flush()?;
+             }
+         }
+     }
+     Ok(())
+ }
+
+ // Estimates the memory size of a single row by examining each value
+ // This is used for dynamic batch sizing to optimize memory usage during writes
+ // by adapting batch sizes based on the actual data being processed.
+ pub fn estimate_single_row_size(
+     row_array: &RArray,
+     collectors: &[ColumnCollector],
+ ) -> Result<usize, MagnusError> {
+     let mut size = 0;
+     for (idx, val) in row_array.into_iter().enumerate() {
+         let col_type = &collectors[idx].type_;
+         // Calculate size based on the type-specific estimation
+         size += estimate_value_size(val, col_type)?;
+     }
+     Ok(size)
+ }
+
+ // Estimates the memory footprint of a single value based on its schema type
+ // This provides type-specific size estimates that help with dynamic batch sizing
+ // For complex types like lists, maps, and structs, we use reasonable approximations
+ pub fn estimate_value_size(
+     value: Value,
+     schema_type: &ParquetSchemaType,
+ ) -> Result<usize, MagnusError> {
+     use ParquetSchemaType as PST;
+     if value.is_nil() {
+         return Ok(0); // nil => minimal
+     }
+     match schema_type {
+         PST::Primitive(PrimitiveType::Int8) | PST::Primitive(PrimitiveType::UInt8) => Ok(1),
+         PST::Primitive(PrimitiveType::Int16) | PST::Primitive(PrimitiveType::UInt16) => Ok(2),
+         PST::Primitive(PrimitiveType::Int32)
+         | PST::Primitive(PrimitiveType::UInt32)
+         | PST::Primitive(PrimitiveType::Float32) => Ok(4),
+         PST::Primitive(PrimitiveType::Int64)
+         | PST::Primitive(PrimitiveType::UInt64)
+         | PST::Primitive(PrimitiveType::Float64) => Ok(8),
+         PST::Primitive(PrimitiveType::Boolean) => Ok(1),
+         PST::Primitive(PrimitiveType::Date32)
+         | PST::Primitive(PrimitiveType::TimestampMillis)
+         | PST::Primitive(PrimitiveType::TimestampMicros) => Ok(8),
+         PST::Primitive(PrimitiveType::String) | PST::Primitive(PrimitiveType::Binary) => {
+             if let Ok(s) = String::try_convert(value) {
+                 // Account for string length plus Rust String's capacity+pointer overhead
+                 Ok(s.len() + std::mem::size_of::<usize>() * 3)
+             } else {
+                 // Try to convert the value to a string using to_s for non-string types
+                 // This handles numeric values that will be converted to strings later
+                 match value.funcall::<_, _, Value>("to_s", ()) {
+                     Ok(str_val) => {
+                         if let Ok(s) = String::try_convert(str_val) {
+                             Ok(s.len() + std::mem::size_of::<usize>() * 3)
+                         } else {
+                             // If to_s conversion fails, just use a reasonable default
+                             Ok(8) // Reasonable size estimate for small values
+                         }
+                     }
+                     Err(_) => {
+                         // If to_s method fails, use a default size
+                         Ok(8) // Reasonable size estimate for small values
+                     }
+                 }
+             }
+         }
+         PST::List(item_type) => {
+             if let Ok(arr) = RArray::try_convert(value) {
+                 let len = arr.len();
+
+                 // Base overhead for the array structure (pointer, length, capacity)
+                 let base_size = std::mem::size_of::<usize>() * 3;
+
+                 // If empty, just return the base size
+                 if len == 0 {
+                     return Ok(base_size);
+                 }
+
+                 // Sample up to 5 elements to get average element size
+                 let sample_count = std::cmp::min(len, 5);
+                 let mut total_sample_size = 0;
+
+                 for i in 0..sample_count {
+                     let element = arr.entry(i as isize)?;
+                     let element_size = estimate_value_size(element, &item_type.item_type)?;
+                     total_sample_size += element_size;
+                 }
+
+                 // If we couldn't sample any elements properly, that's an error
+                 if sample_count > 0 && total_sample_size == 0 {
+                     return Err(MagnusError::new(
+                         magnus::exception::runtime_error(),
+                         "Failed to estimate size of list elements",
+                     ));
+                 }
+
+                 // Calculate average element size from samples
+                 let avg_element_size = if sample_count > 0 {
+                     total_sample_size as f64 / sample_count as f64
+                 } else {
+                     return Err(MagnusError::new(
+                         magnus::exception::runtime_error(),
+                         "Failed to sample list elements for size estimation",
+                     ));
+                 };
+
+                 // Estimate total size based on average element size * length + base overhead
+                 Ok(base_size + (avg_element_size as usize * len))
+             } else {
+                 // Instead of assuming it's a small list, return an error
+                 Err(MagnusError::new(
+                     magnus::exception::runtime_error(),
+                     format!("Expected array for List type but got: {:?}", value),
+                 ))
+             }
+         }
+         PST::Map(map_field) => {
+             if let Ok(hash) = RHash::try_convert(value) {
+                 let size_estimate = hash.funcall::<_, _, usize>("size", ())?;
+
+                 // Base overhead for the hash structure
+                 let base_size = std::mem::size_of::<usize>() * 4;
+
+                 // If empty, just return the base size
+                 if size_estimate == 0 {
+                     return Ok(base_size);
+                 }
+
+                 // Sample up to 5 key-value pairs to estimate average sizes
+                 let mut key_sample_size = 0;
+                 let mut value_sample_size = 0;
+                 let mut sample_count = 0;
+
+                 // Get an enumerator for the hash
+                 let enumerator = hash.funcall::<_, _, Value>("to_enum", ())?;
+
+                 // Sample up to 5 entries
+                 for _ in 0..std::cmp::min(size_estimate, 5) {
+                     match enumerator.funcall::<_, _, Value>("next", ()) {
+                         Ok(pair) => {
+                             if let Ok(pair_array) = RArray::try_convert(pair) {
+                                 if pair_array.len() == 2 {
+                                     let key = pair_array.entry(0)?;
+                                     let val = pair_array.entry(1)?;
+
+                                     key_sample_size +=
+                                         estimate_value_size(key, &map_field.key_type)?;
+                                     value_sample_size +=
+                                         estimate_value_size(val, &map_field.value_type)?;
+                                     sample_count += 1;
+                                 }
+                             }
+                         }
+                         Err(_) => break, // Stop if we reach the end
+                     }
+                 }
+
+                 // If we couldn't sample any pairs, return an error
+                 if size_estimate > 0 && sample_count == 0 {
+                     return Err(MagnusError::new(
+                         magnus::exception::runtime_error(),
+                         "Failed to sample map entries for size estimation",
+                     ));
+                 }
+
+                 // Calculate average key and value sizes
+                 let (avg_key_size, avg_value_size) = if sample_count > 0 {
+                     (
+                         key_sample_size as f64 / sample_count as f64,
+                         value_sample_size as f64 / sample_count as f64,
+                     )
+                 } else {
+                     return Err(MagnusError::new(
+                         magnus::exception::runtime_error(),
+                         "Failed to sample hash key-value pairs for size estimation",
+                     ));
+                 };
+
+                 // Each entry has overhead (node pointers, etc.) in a hash map
+                 let entry_overhead = std::mem::size_of::<usize>() * 2;
+
+                 // Estimate total size:
+                 // base size + (key_size + value_size + entry_overhead) * count
+                 Ok(base_size
+                     + ((avg_key_size + avg_value_size + entry_overhead as f64) as usize
+                         * size_estimate))
+             } else {
+                 // Instead of assuming a small map, return an error
+                 Err(MagnusError::new(
+                     magnus::exception::runtime_error(),
+                     format!("Expected hash for Map type but got: {:?}", value),
+                 ))
+             }
+         }
+         PST::Struct(struct_field) => {
+             if let Ok(hash) = RHash::try_convert(value) {
+                 // Base overhead for the struct
+                 let base_size = std::mem::size_of::<usize>() * 3;
+
+                 // Estimate size for each field
+                 let mut total_fields_size = 0;
+
+                 for field in &struct_field.fields {
+                     // Try to get the field value from the hash
+                     match hash.get(Symbol::new(&field.name)) {
+                         Some(field_value) => {
+                             total_fields_size += estimate_value_size(field_value, &field.type_)?;
+                         }
+                         None => {
+                             if let Some(field_value) = hash.get(&*field.name) {
+                                 total_fields_size +=
+                                     estimate_value_size(field_value, &field.type_)?;
+                             } else {
+                                 if field.nullable {
+                                     total_fields_size += 0;
+                                 } else {
+                                     return Err(MagnusError::new(
+                                         magnus::exception::runtime_error(),
+                                         format!("Missing field: {} in hash {:?}", field.name, hash),
+                                     ));
+                                 }
+                             }
+                         }
+                     }
+                 }
+
+                 // We no longer error on missing fields during size estimation
+                 Ok(base_size + total_fields_size)
+             } else {
+                 // Instead of trying instance_variables or assuming a default, return an error
+                 Err(MagnusError::new(
+                     magnus::exception::runtime_error(),
+                     format!("Expected hash for Struct type but got: {:?}", value),
+                 ))
+             }
+         }
+     }
+ }
+
+ // Dynamically calculates an optimal batch size based on estimated row sizes
+ // and memory constraints. This function enables the writer to adapt to different
+ // data characteristics for optimal performance.
+ //
+ // The algorithm:
+ // 1. Requires a minimum number of samples to make a reliable estimate
+ // 2. Calculates the average row size from the samples
+ // 3. Determines a batch size that would consume approximately the target memory threshold
+ // 4. Ensures the batch size doesn't go below a minimum value for efficiency
+ //
+ // This approach balances memory usage with processing efficiency by targeting
+ // a specific memory footprint per batch.
+ fn update_batch_size(
+     size_samples: &[usize],
+     flush_threshold: usize,
+     min_batch_size: usize,
+ ) -> usize {
+     if size_samples.len() < MIN_SAMPLES_FOR_ESTIMATE {
+         return min_batch_size;
+     }
+
+     let total_size = size_samples.iter().sum::<usize>();
+     // Safe because we know we have at least MIN_SAMPLES_FOR_ESTIMATE samples
+     let avg_row_size = total_size as f64 / size_samples.len() as f64;
+     let avg_row_size = avg_row_size.max(1.0); // Ensure we don't divide by zero
+     let suggested_batch_size = (flush_threshold as f64 / avg_row_size).floor() as usize;
+     suggested_batch_size.max(min_batch_size)
+ }
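
For reference, the dynamic batch sizing implemented by update_batch_size above reduces to a few lines of arithmetic: average the sampled row sizes, divide the flush threshold by that average, and clamp the result to a minimum batch size. Below is a minimal Ruby sketch of the same calculation; the constants are illustrative placeholders, not the extension's actual MIN_SAMPLES_FOR_ESTIMATE, MIN_BATCH_SIZE, or DEFAULT_MEMORY_THRESHOLD values, which are defined elsewhere in the crate.

# Sketch of the batch-size heuristic; the constants below are assumptions for illustration.
MIN_SAMPLES_FOR_ESTIMATE = 10 # assumed sample floor
MIN_BATCH_SIZE = 100          # assumed batch floor

def update_batch_size(size_samples, flush_threshold, min_batch_size)
  return min_batch_size if size_samples.length < MIN_SAMPLES_FOR_ESTIMATE

  avg_row_size = [size_samples.sum.to_f / size_samples.length, 1.0].max # guard against divide-by-zero
  suggested = (flush_threshold / avg_row_size).floor                    # rows that fit in ~flush_threshold bytes
  [suggested, min_batch_size].max
end

# e.g. ten sampled rows of ~2 KiB against a 64 MiB threshold suggests batches of 32_768 rows
update_batch_size(Array.new(10, 2048), 64 * 1024 * 1024, MIN_BATCH_SIZE) # => 32768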
@@ -0,0 +1,154 @@
+ # frozen_string_literal: true
+
+ module Parquet
+   # Schema definition for Parquet files
+   class Schema
+     # Define a new schema using the DSL
+     # @return [Hash] schema definition hash
+     #
+     # @example Define a schema with nullable and non-nullable fields
+     #   Parquet::Schema.define do
+     #     field :id, :int64, nullable: false  # ID cannot be null
+     #     field :name, :string                # Default nullable: true
+     #
+     #     # List with non-nullable items
+     #     field :scores, :list, item: :float, item_nullable: false
+     #
+     #     # Map with nullable values
+     #     field :metadata, :map,
+     #           key: :string,
+     #           value: :string,
+     #           value_nullable: true
+     #
+     #     # Nested struct with non-nullable fields
+     #     field :address, :struct, nullable: true do
+     #       field :street, :string, nullable: false
+     #       field :city, :string, nullable: false
+     #       field :zip, :string, nullable: false
+     #     end
+     #   end
+     def self.define(&block)
+       builder = SchemaBuilder.new
+       builder.instance_eval(&block)
+
+       # Return a structured hash representing the schema
+       { type: :struct, fields: builder.fields }
+     end
+
+     # Internal builder class that provides the DSL methods
+     class SchemaBuilder
+       attr_reader :fields
+
+       def initialize
+         @fields = []
+       end
+
+       # Define a field in the schema
+       # @param name [String, Symbol] field name
+       # @param type [Symbol] data type (:int32, :int64, :string, :list, :map, :struct, etc)
+       # @param nullable [Boolean] whether the field can be null (default: true)
+       # @param kwargs [Hash] additional options depending on type
+       #
+       # Additional keyword args:
+       #   - `item:` if type == :list
+       #   - `item_nullable:` controls nullability of list items (default: true)
+       #   - `key:, value:` if type == :map
+       #   - `key_nullable:, value_nullable:` controls nullability of map keys/values (default: true)
+       #   - `format:` if you want to store some format string
+       #   - `nullable:` defaults to true if not specified
+       def field(name, type, nullable: true, **kwargs, &block)
+         field_hash = { name: name.to_s, type: type, nullable: !!nullable }
+
+         # Possibly store a format if provided
+         field_hash[:format] = kwargs[:format] if kwargs.key?(:format)
+
+         case type
+         when :struct
+           # We'll parse subfields from the block
+           sub_builder = SchemaBuilder.new
+           sub_builder.instance_eval(&block) if block
+           field_hash[:fields] = sub_builder.fields
+         when :list
+           item_type = kwargs[:item]
+           raise ArgumentError, "list field `#{name}` requires `item:` type" unless item_type
+           # Pass item_nullable if provided, otherwise use true as default
+           item_nullable = kwargs[:item_nullable].nil? ? true : !!kwargs[:item_nullable]
+           field_hash[:item] = wrap_subtype(item_type, nullable: item_nullable, &block)
+         when :map
+           # user must specify key:, value:
+           key_type = kwargs[:key]
+           value_type = kwargs[:value]
+           raise ArgumentError, "map field `#{name}` requires `key:` and `value:`" if key_type.nil? || value_type.nil?
+           # Pass key_nullable and value_nullable if provided, otherwise use true as default
+           key_nullable = kwargs[:key_nullable].nil? ? true : !!kwargs[:key_nullable]
+           value_nullable = kwargs[:value_nullable].nil? ? true : !!kwargs[:value_nullable]
+           field_hash[:key] = wrap_subtype(key_type, nullable: key_nullable)
+           field_hash[:value] = wrap_subtype(value_type, nullable: value_nullable, &block)
+         else
+           # primitive type: :int32, :int64, :string, etc.
+           # do nothing else special
+         end
+
+         @fields << field_hash
+       end
+
+       def build_map(key_type, value_type, key_nullable: false, value_nullable: true, nullable: true, &block)
+         # Wrap the key type (maps typically use non-nullable keys)
+         key = wrap_subtype(key_type, nullable: key_nullable)
+
+         # Handle the case where value_type is a complex type (:struct or :list) and a block is provided
+         value =
+           if (value_type == :struct || value_type == :list) && block
+             wrap_subtype(value_type, nullable: value_nullable, &block)
+           else
+             wrap_subtype(value_type, nullable: value_nullable)
+           end
+
+         # Map is represented as a list of key/value pairs in Parquet
+         {
+           type: :map,
+           nullable: nullable,
+           item: {
+             type: :struct,
+             nullable: false,
+             name: "key_value",
+             fields: [key, value]
+           }
+         }
+       end
+
+       private
+
+       # If user said: field "something", :list, item: :struct do ... end
+       # we want to recursively parse that sub-struct from the block.
+       # So wrap_subtype might be:
+       def wrap_subtype(t, nullable: true, &block)
+         if t == :struct
+           sub_builder = SchemaBuilder.new
+           sub_builder.instance_eval(&block) if block
+
+           # Validate that the struct has at least one field
+           if sub_builder.fields.empty?
+             raise ArgumentError, "Cannot create a struct with zero fields. Parquet doesn't support empty structs."
+           end
+
+           { type: :struct, nullable: nullable, name: "item", fields: sub_builder.fields }
+         elsif t == :list && block
+           # Handle nested lists by processing the block to define the item type
+           sub_builder = SchemaBuilder.new
+           sub_builder.instance_eval(&block) if block
+
+           # We expect a single field named "item" that defines the inner list's item type
+           if sub_builder.fields.empty? || sub_builder.fields.length > 1 || sub_builder.fields[0][:name] != "item"
+             raise ArgumentError, "Nested list must define exactly one field named 'item' for the inner list's item type"
+           end
+
+           { type: :list, nullable: nullable, name: "item", item: sub_builder.fields[0] }
+         else
+           # e.g. :int32 => { type: :int32, nullable: true }
+           { type: t, nullable: nullable, name: "item" }
+         end
+       end
+     end
+   end
+ end
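
To make the DSL's output concrete, here is a small usage sketch based on the code above; the resulting hash follows directly from SchemaBuilder#field and wrap_subtype.

schema = Parquet::Schema.define do
  field :id, :int64, nullable: false
  field :scores, :list, item: :float, item_nullable: false
end

# schema ==
# {
#   type: :struct,
#   fields: [
#     { name: "id", type: :int64, nullable: false },
#     { name: "scores", type: :list, nullable: true,
#       item: { type: :float, nullable: false, name: "item" } }
#   ]
# }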
@@ -1,3 +1,3 @@
  module Parquet
-   VERSION = "0.4.2"
+   VERSION = "0.5.1"
  end
data/lib/parquet.rb CHANGED
@@ -1,4 +1,5 @@
  require_relative "parquet/version"
+ require_relative "parquet/schema"

  begin
    require "parquet/#{RUBY_VERSION.to_f}/parquet"