parquet 0.5.9 → 0.5.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +3 -0
- data/ext/parquet/Cargo.toml +2 -0
- data/ext/parquet/build.rs +1 -1
- data/ext/parquet/src/lib.rs +3 -0
- data/ext/parquet/src/reader/arrow_reader.rs +579 -0
- data/ext/parquet/src/reader/common.rs +65 -11
- data/ext/parquet/src/reader/format_detector.rs +69 -0
- data/ext/parquet/src/reader/mod.rs +7 -2
- data/ext/parquet/src/reader/unified/mod.rs +82 -14
- data/ext/parquet/src/types/core_types.rs +1 -0
- data/ext/parquet/src/types/mod.rs +11 -4
- data/ext/parquet/src/types/parquet_value.rs +290 -73
- data/ext/parquet/src/types/record_types.rs +92 -8
- data/ext/parquet/src/types/schema_node.rs +11 -5
- data/ext/parquet/src/types/type_conversion.rs +216 -0
- data/ext/parquet/src/types/writer_types.rs +50 -0
- data/ext/parquet/src/writer/mod.rs +3 -0
- data/ext/parquet/src/writer/write_columns.rs +3 -0
- data/ext/parquet/src/writer/write_rows.rs +1 -0
- data/lib/parquet/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 114891cfa5fa190e1f00d44803327f1c90cc11f64ba23f7f2a9cc9f9379da787
+  data.tar.gz: 9168b2be960faa93ce9c84d170c6e8f73819535bcedbf3d3b26869ff9829ecc6
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f07f99a188ac5fa0663616fba00b1990a2cbd6bb14462383915f0e1617c26c5ca481840c16179958f2b3760b334f176e2e4542d95e3cc922379948ac2b0bfa61
+  data.tar.gz: 42c7b0779d6e3fa46addc5fa92420f326418a54962d391e9b063db8378f8a5f8c2916b43f356649fc127e8fc582aa1e98d7afd71f0bc5f9700a0664ed46313f6
data/Cargo.lock
CHANGED
@@ -126,6 +126,7 @@ dependencies = [
  "arrow-data",
  "arrow-schema",
  "flatbuffers",
+ "lz4_flex",
 ]
 
 [[package]]
@@ -842,6 +843,8 @@ version = "0.1.0"
 dependencies = [
  "ahash",
  "arrow-array",
+ "arrow-buffer",
+ "arrow-ipc",
  "arrow-schema",
  "bytes",
  "either",
data/ext/parquet/Cargo.toml
CHANGED
@@ -12,6 +12,8 @@ rb-sys-env = "^0.2"
 [dependencies]
 ahash = "0.8"
 arrow-array = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time" }
+arrow-buffer = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time" }
+arrow-ipc = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time", features = ["lz4"] }
 arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time" }
 bytes = "^1.9"
 either = "1.9"
data/ext/parquet/build.rs
CHANGED
data/ext/parquet/src/lib.rs
CHANGED
@@ -19,6 +19,9 @@ use writer::write_rows;
 /// Initializes the Ruby extension and defines methods.
 #[magnus::init]
 fn init(ruby: &Ruby) -> Result<(), Error> {
+    // Require 'time' for Time.parse method
+    ruby.require("time")?;
+
     let module = ruby.define_module("Parquet")?;
     module.define_module_function("metadata", magnus::method!(reader::parse_metadata, -1))?;
     module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
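
Editor's note: the new ruby.require("time") call is needed because Time.parse comes from Ruby's 'time' standard library, not from core Time, so any conversion path that parses timestamp strings into Time objects would raise NoMethodError unless 'time' has been loaded first. A minimal sketch of that dependency, using a hypothetical helper and a fixed example timestamp (not code from the gem):

use magnus::{Error, Ruby, Value};

// Hypothetical helper: parse an ISO 8601 string into a Ruby Time object.
// Time.parse only exists once the 'time' stdlib has been required.
fn parse_timestamp(ruby: &Ruby) -> Result<Value, Error> {
    ruby.require("time")?; // same call that init() now performs up front
    ruby.eval::<Value>("Time.parse('2024-01-02T03:04:05Z')")
}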
data/ext/parquet/src/reader/arrow_reader.rs
ADDED
@@ -0,0 +1,579 @@
+use crate::header_cache::StringCache;
+use crate::logger::RubyLogger;
+use crate::types::ArrayWrapper;
+use crate::types::{
+    ColumnRecord, ParquetGemError, ParquetValueVec, ParserResultType, RowRecord, TryIntoValue,
+};
+use ahash::RandomState;
+use arrow_array::RecordBatch;
+use arrow_ipc::reader::{FileReader, StreamReader};
+use arrow_schema::Schema;
+use magnus::{Ruby, Value};
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::Read;
+use std::rc::Rc;
+use std::sync::{Arc, OnceLock};
+
+/// Process Arrow IPC file data for column-based parsing
+pub fn process_arrow_column_data<R: Read>(
+    ruby: Rc<Ruby>,
+    reader: StreamReader<R>,
+    columns: &Option<Vec<String>>,
+    result_type: ParserResultType,
+    _batch_size: Option<usize>,
+    strict: bool,
+    ruby_logger: &RubyLogger,
+) -> Result<(), ParquetGemError> {
+    let schema = reader.schema();
+    ruby_logger.debug(|| format!("Arrow schema loaded: {:?}", schema))?;
+
+    // Filter schema if columns are specified
+    let _filtered_schema = if let Some(cols) = columns {
+        let mut fields = Vec::new();
+        for field in schema.fields() {
+            if cols.contains(&field.name().to_string()) {
+                fields.push(field.clone());
+            }
+        }
+        Arc::new(Schema::new(fields))
+    } else {
+        schema.clone()
+    };
+
+    match result_type {
+        ParserResultType::Hash => {
+            let headers = OnceLock::new();
+
+            for batch_result in reader {
+                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
+
+                // Filter columns if needed
+                let batch = if let Some(cols) = columns {
+                    filter_record_batch(&batch, cols)?
+                } else {
+                    batch
+                };
+
+                let local_headers = headers
+                    .get_or_init(|| {
+                        let schema = batch.schema();
+                        let fields = schema.fields();
+                        let mut header_string = Vec::with_capacity(fields.len());
+                        for field in fields {
+                            header_string.push(field.name().to_owned());
+                        }
+                        StringCache::intern_many(&header_string)
+                    })
+                    .as_ref()
+                    .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
+
+                let mut map =
+                    HashMap::with_capacity_and_hasher(local_headers.len(), RandomState::default());
+
+                batch
+                    .columns()
+                    .iter()
+                    .enumerate()
+                    .try_for_each(|(i, column)| {
+                        let header = local_headers[i];
+                        let values = ParquetValueVec::try_from(ArrayWrapper {
+                            array: column,
+                            strict,
+                        })?;
+                        map.insert(header, values.into_inner());
+                        Ok::<_, ParquetGemError>(())
+                    })?;
+
+                let record = ColumnRecord::Map::<RandomState>(map);
+                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+            }
+        }
+        ParserResultType::Array => {
+            for batch_result in reader {
+                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
+
+                // Filter columns if needed
+                let batch = if let Some(cols) = columns {
+                    filter_record_batch(&batch, cols)?
+                } else {
+                    batch
+                };
+
+                let vec = batch
+                    .columns()
+                    .iter()
+                    .map(|column| {
+                        let values = ParquetValueVec::try_from(ArrayWrapper {
+                            array: column,
+                            strict,
+                        })?;
+                        Ok::<_, ParquetGemError>(values.into_inner())
+                    })
+                    .collect::<Result<Vec<_>, _>>()?;
+
+                let record = ColumnRecord::Vec::<RandomState>(vec);
+                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Process Arrow IPC file data for row-based parsing
+pub fn process_arrow_row_data<R: Read>(
+    ruby: Rc<Ruby>,
+    reader: StreamReader<R>,
+    columns: &Option<Vec<String>>,
+    result_type: ParserResultType,
+    strict: bool,
+    ruby_logger: &RubyLogger,
+) -> Result<(), ParquetGemError> {
+    let schema = reader.schema();
+    ruby_logger.debug(|| format!("Arrow schema loaded: {:?}", schema))?;
+
+    match result_type {
+        ParserResultType::Hash => {
+            let headers = OnceLock::new();
+
+            for batch_result in reader {
+                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
+
+                // Filter columns if needed
+                let batch = if let Some(cols) = columns {
+                    filter_record_batch(&batch, cols)?
+                } else {
+                    batch
+                };
+
+                let local_headers = headers
+                    .get_or_init(|| {
+                        let schema = batch.schema();
+                        let fields = schema.fields();
+                        let mut header_string = Vec::with_capacity(fields.len());
+                        for field in fields {
+                            header_string.push(field.name().to_owned());
+                        }
+                        StringCache::intern_many(&header_string)
+                    })
+                    .as_ref()
+                    .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
+
+                // Convert columnar data to rows
+                for row_idx in 0..batch.num_rows() {
+                    let mut map = HashMap::with_capacity_and_hasher(
+                        local_headers.len(),
+                        RandomState::default(),
+                    );
+
+                    for (col_idx, column) in batch.columns().iter().enumerate() {
+                        let header = local_headers[col_idx];
+                        let value = extract_value_at_index(column, row_idx, strict)?;
+                        map.insert(header, value);
+                    }
+
+                    let record = RowRecord::Map::<RandomState>(map);
+                    let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+                }
+            }
+        }
+        ParserResultType::Array => {
+            for batch_result in reader {
+                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
+
+                // Filter columns if needed
+                let batch = if let Some(cols) = columns {
+                    filter_record_batch(&batch, cols)?
+                } else {
+                    batch
+                };
+
+                // Convert columnar data to rows
+                for row_idx in 0..batch.num_rows() {
+                    let mut row_vec = Vec::with_capacity(batch.num_columns());
+
+                    for column in batch.columns() {
+                        let value = extract_value_at_index(column, row_idx, strict)?;
+                        row_vec.push(value);
+                    }
+
+                    let record = RowRecord::Vec::<RandomState>(row_vec);
+                    let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Process Arrow IPC file with FileReader for row-based parsing
+pub fn process_arrow_file_row_data(
+    ruby: Rc<Ruby>,
+    reader: FileReader<File>,
+    columns: &Option<Vec<String>>,
+    result_type: ParserResultType,
+    strict: bool,
+    ruby_logger: &RubyLogger,
+) -> Result<(), ParquetGemError> {
+    let schema = reader.schema();
+    ruby_logger.debug(|| format!("Arrow file schema loaded: {:?}", schema))?;
+
+    match result_type {
+        ParserResultType::Hash => {
+            let headers = OnceLock::new();
+
+            for batch_result in reader {
+                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
+
+                // Filter columns if needed
+                let batch = if let Some(cols) = columns {
+                    filter_record_batch(&batch, cols)?
+                } else {
+                    batch
+                };
+
+                let local_headers = headers
+                    .get_or_init(|| {
+                        let schema = batch.schema();
+                        let fields = schema.fields();
+                        let mut header_string = Vec::with_capacity(fields.len());
+                        for field in fields {
+                            header_string.push(field.name().to_owned());
+                        }
+                        StringCache::intern_many(&header_string)
+                    })
+                    .as_ref()
+                    .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
+
+                // Convert columnar data to rows
+                for row_idx in 0..batch.num_rows() {
+                    let mut map = HashMap::with_capacity_and_hasher(
+                        local_headers.len(),
+                        RandomState::default(),
+                    );
+
+                    for (col_idx, column) in batch.columns().iter().enumerate() {
+                        let header = local_headers[col_idx];
+                        let value = extract_value_at_index(column, row_idx, strict)?;
+                        map.insert(header, value);
+                    }
+
+                    let record = RowRecord::Map::<RandomState>(map);
+                    let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+                }
+            }
+        }
+        ParserResultType::Array => {
+            for batch_result in reader {
+                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
+
+                // Filter columns if needed
+                let batch = if let Some(cols) = columns {
+                    filter_record_batch(&batch, cols)?
+                } else {
+                    batch
+                };
+
+                // Convert columnar data to rows
+                for row_idx in 0..batch.num_rows() {
+                    let mut row_vec = Vec::with_capacity(batch.num_columns());
+
+                    for column in batch.columns() {
+                        let value = extract_value_at_index(column, row_idx, strict)?;
+                        row_vec.push(value);
+                    }
+
+                    let record = RowRecord::Vec::<RandomState>(row_vec);
+                    let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Process Arrow IPC file with FileReader (for seekable sources)
+pub fn process_arrow_file_column_data(
+    ruby: Rc<Ruby>,
+    file: File,
+    columns: &Option<Vec<String>>,
+    result_type: ParserResultType,
+    _batch_size: Option<usize>,
+    strict: bool,
+    ruby_logger: &RubyLogger,
+) -> Result<(), ParquetGemError> {
+    let reader =
+        FileReader::try_new(file, None).map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
+
+    let schema = reader.schema();
+    ruby_logger.debug(|| format!("Arrow file schema loaded: {:?}", schema))?;
+
+    // FileReader implements Iterator<Item = Result<RecordBatch, ArrowError>>
+    match result_type {
+        ParserResultType::Hash => {
+            let headers = OnceLock::new();
+
+            for batch_result in reader {
+                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
+
+                // Filter columns if needed
+                let batch = if let Some(cols) = columns {
+                    filter_record_batch(&batch, cols)?
+                } else {
+                    batch
+                };
+
+                let local_headers = headers
+                    .get_or_init(|| {
+                        let schema = batch.schema();
+                        let fields = schema.fields();
+                        let mut header_string = Vec::with_capacity(fields.len());
+                        for field in fields {
+                            header_string.push(field.name().to_owned());
+                        }
+                        StringCache::intern_many(&header_string)
+                    })
+                    .as_ref()
+                    .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
+
+                let mut map =
+                    HashMap::with_capacity_and_hasher(local_headers.len(), RandomState::default());
+
+                batch
+                    .columns()
+                    .iter()
+                    .enumerate()
+                    .try_for_each(|(i, column)| {
+                        let header = local_headers[i];
+                        let values = ParquetValueVec::try_from(ArrayWrapper {
+                            array: column,
+                            strict,
+                        })?;
+                        map.insert(header, values.into_inner());
+                        Ok::<_, ParquetGemError>(())
+                    })?;
+
+                let record = ColumnRecord::Map::<RandomState>(map);
+                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+            }
+        }
+        ParserResultType::Array => {
+            for batch_result in reader {
+                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
+
+                // Filter columns if needed
+                let batch = if let Some(cols) = columns {
+                    filter_record_batch(&batch, cols)?
+                } else {
+                    batch
+                };
+
+                let vec = batch
+                    .columns()
+                    .iter()
+                    .map(|column| {
+                        let values = ParquetValueVec::try_from(ArrayWrapper {
+                            array: column,
+                            strict,
+                        })?;
+                        Ok::<_, ParquetGemError>(values.into_inner())
+                    })
+                    .collect::<Result<Vec<_>, _>>()?;
+
+                let record = ColumnRecord::Vec::<RandomState>(vec);
+                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Extract a single value from an Arrow array at a specific index
+fn extract_value_at_index(
+    array: &Arc<dyn arrow_array::Array>,
+    index: usize,
+    strict: bool,
+) -> Result<crate::types::ParquetField, ParquetGemError> {
+    use crate::types::ParquetField;
+    use arrow_array::*;
+    use arrow_schema::DataType;
+    use parquet::record::Field;
+
+    // Convert Arrow array value at index to Parquet Field
+    let field = match array.data_type() {
+        DataType::Boolean => {
+            let arr = array.as_any().downcast_ref::<BooleanArray>().unwrap();
+            if arr.is_null(index) {
+                Field::Null
+            } else {
+                Field::Bool(arr.value(index))
+            }
+        }
+        DataType::Int8 => {
+            let arr = array.as_any().downcast_ref::<Int8Array>().unwrap();
+            if arr.is_null(index) {
+                Field::Null
+            } else {
+                Field::Byte(arr.value(index) as i8)
+            }
+        }
+        DataType::Int16 => {
+            let arr = array.as_any().downcast_ref::<Int16Array>().unwrap();
+            if arr.is_null(index) {
+                Field::Null
+            } else {
+                Field::Short(arr.value(index))
+            }
+        }
+        DataType::Int32 => {
+            let arr = array.as_any().downcast_ref::<Int32Array>().unwrap();
+            if arr.is_null(index) {
+                Field::Null
+            } else {
+                Field::Int(arr.value(index))
+            }
+        }
+        DataType::Int64 => {
+            let arr = array.as_any().downcast_ref::<Int64Array>().unwrap();
+            if arr.is_null(index) {
+                Field::Null
+            } else {
+                Field::Long(arr.value(index))
+            }
+        }
+        DataType::UInt8 => {
+            let arr = array.as_any().downcast_ref::<UInt8Array>().unwrap();
+            if arr.is_null(index) {
+                Field::Null
+            } else {
+                Field::UByte(arr.value(index))
+            }
+        }
+        DataType::UInt16 => {
+            let arr = array.as_any().downcast_ref::<UInt16Array>().unwrap();
+            if arr.is_null(index) {
+                Field::Null
+            } else {
+                Field::UShort(arr.value(index))
+            }
+        }
+        DataType::UInt32 => {
+            let arr = array.as_any().downcast_ref::<UInt32Array>().unwrap();
+            if arr.is_null(index) {
+                Field::Null
+            } else {
+                Field::UInt(arr.value(index))
+            }
+        }
+        DataType::UInt64 => {
+            let arr = array.as_any().downcast_ref::<UInt64Array>().unwrap();
+            if arr.is_null(index) {
+                Field::Null
+            } else {
+                Field::ULong(arr.value(index))
+            }
+        }
+        DataType::Float32 => {
+            let arr = array.as_any().downcast_ref::<Float32Array>().unwrap();
+            if arr.is_null(index) {
+                Field::Null
+            } else {
+                Field::Float(arr.value(index))
+            }
+        }
+        DataType::Float64 => {
+            let arr = array.as_any().downcast_ref::<Float64Array>().unwrap();
+            if arr.is_null(index) {
+                Field::Null
+            } else {
+                Field::Double(arr.value(index))
+            }
+        }
+        DataType::Utf8 => {
+            let arr = array.as_any().downcast_ref::<StringArray>().unwrap();
+            if arr.is_null(index) {
+                Field::Null
+            } else {
+                Field::Str(arr.value(index).to_string())
+            }
+        }
+        DataType::Binary => {
+            let arr = array.as_any().downcast_ref::<BinaryArray>().unwrap();
+            if arr.is_null(index) {
+                Field::Null
+            } else {
+                Field::Bytes(arr.value(index).into())
+            }
+        }
+        DataType::Date32 => {
+            let arr = array.as_any().downcast_ref::<Date32Array>().unwrap();
+            if arr.is_null(index) {
+                Field::Null
+            } else {
+                Field::Date(arr.value(index))
+            }
+        }
+        DataType::Timestamp(unit, _tz) => match unit {
+            arrow_schema::TimeUnit::Millisecond => {
+                let arr = array
+                    .as_any()
+                    .downcast_ref::<TimestampMillisecondArray>()
+                    .unwrap();
+                if arr.is_null(index) {
+                    Field::Null
+                } else {
+                    Field::TimestampMillis(arr.value(index))
+                }
+            }
+            arrow_schema::TimeUnit::Microsecond => {
+                let arr = array
+                    .as_any()
+                    .downcast_ref::<TimestampMicrosecondArray>()
+                    .unwrap();
+                if arr.is_null(index) {
+                    Field::Null
+                } else {
+                    Field::TimestampMicros(arr.value(index))
+                }
+            }
+            _ => Field::Null,
+        },
+        // Add more type handling as needed
+        _ => Field::Null,
+    };
+
+    // For Arrow files, we don't have Parquet logical types, so we use defaults
+    Ok(ParquetField {
+        field,
+        converted_type: parquet::basic::ConvertedType::NONE,
+        logical_type: None,
+        strict,
+    })
+}
+
+/// Filter a RecordBatch to only include specified columns
+fn filter_record_batch(
+    batch: &RecordBatch,
+    columns: &[String],
+) -> Result<RecordBatch, ParquetGemError> {
+    let schema = batch.schema();
+    let mut indices = Vec::new();
+    let mut fields = Vec::new();
+
+    for (i, field) in schema.fields().iter().enumerate() {
+        if columns.contains(&field.name().to_string()) {
+            indices.push(i);
+            fields.push(field.clone());
+        }
+    }
+
+    let new_schema = Arc::new(Schema::new(fields));
+    let new_columns: Vec<_> = indices.iter().map(|&i| batch.column(i).clone()).collect();
+
+    RecordBatch::try_new(new_schema, new_columns)
+        .map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))
+}
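
Editor's note: the module above is built on the two arrow-ipc readers it imports: FileReader for the random-access Arrow IPC "file" format (seekable sources, ARROW1 magic plus a footer) and StreamReader for the streaming format. A minimal standalone sketch of how those readers are constructed and iterated, with a hypothetical open_ipc function and path (not code from the gem):

use arrow_ipc::reader::{FileReader, StreamReader};
use std::fs::File;
use std::io::BufReader;

// Hypothetical example: open the same path with both reader flavors.
fn open_ipc(path: &str) -> Result<(), Box<dyn std::error::Error>> {
    // Random-access file format, as assumed by process_arrow_file_column_data
    // and process_arrow_file_row_data.
    let file_reader = FileReader::try_new(File::open(path)?, None)?;
    println!("file format schema: {:?}", file_reader.schema());

    // Streaming format, as assumed by process_arrow_column_data
    // and process_arrow_row_data.
    let stream_reader = StreamReader::try_new(BufReader::new(File::open(path)?), None)?;
    println!("stream format schema: {:?}", stream_reader.schema());

    // Both readers iterate Result<RecordBatch, ArrowError>.
    for batch in file_reader {
        println!("rows in batch: {}", batch?.num_rows());
    }
    Ok(())
}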