parquet 0.5.12 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +295 -98
- data/Cargo.toml +1 -1
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/ext/parquet/Cargo.toml +8 -5
- data/ext/parquet/src/adapter_ffi.rs +156 -0
- data/ext/parquet/src/lib.rs +13 -21
- data/ext/parquet-core/Cargo.toml +23 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
- data/ext/parquet-core/src/error.rs +163 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +263 -0
- data/ext/parquet-core/src/schema.rs +283 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +151 -0
- data/ext/parquet-core/src/value.rs +209 -0
- data/ext/parquet-core/src/writer.rs +839 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +430 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
- data/ext/parquet-ruby-adapter/src/error.rs +148 -0
- data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
- data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +94 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +50 -24
- data/ext/parquet/src/enumerator.rs +0 -68
- data/ext/parquet/src/header_cache.rs +0 -99
- data/ext/parquet/src/logger.rs +0 -171
- data/ext/parquet/src/reader/common.rs +0 -111
- data/ext/parquet/src/reader/mod.rs +0 -211
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
- data/ext/parquet/src/reader/unified/mod.rs +0 -363
- data/ext/parquet/src/types/core_types.rs +0 -120
- data/ext/parquet/src/types/mod.rs +0 -100
- data/ext/parquet/src/types/parquet_value.rs +0 -1275
- data/ext/parquet/src/types/record_types.rs +0 -603
- data/ext/parquet/src/types/schema_converter.rs +0 -290
- data/ext/parquet/src/types/schema_node.rs +0 -424
- data/ext/parquet/src/types/timestamp.rs +0 -285
- data/ext/parquet/src/types/type_conversion.rs +0 -1949
- data/ext/parquet/src/types/writer_types.rs +0 -329
- data/ext/parquet/src/utils.rs +0 -184
- data/ext/parquet/src/writer/mod.rs +0 -505
- data/ext/parquet/src/writer/write_columns.rs +0 -238
- data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -1,285 +0,0 @@
|
|
1
|
-
use super::*;
|
2
|
-
use magnus::{TryConvert, Value};
|
3
|
-
|
4
|
-
/// Parses a fixed offset timezone string (e.g., "+09:00", "-05:30", "+0800")
|
5
|
-
/// Returns the offset in minutes from UTC
|
6
|
-
fn parse_fixed_offset(tz: &str) -> Result<i32, ParquetGemError> {
|
7
|
-
// Remove any whitespace
|
8
|
-
let tz = tz.trim();
|
9
|
-
|
10
|
-
// Check if it starts with + or -
|
11
|
-
if !tz.starts_with('+') && !tz.starts_with('-') {
|
12
|
-
return Err(MagnusError::new(
|
13
|
-
magnus::exception::arg_error(),
|
14
|
-
format!(
|
15
|
-
"Invalid timezone offset format: '{}'. Expected format like '+09:00' or '-0530'",
|
16
|
-
tz
|
17
|
-
),
|
18
|
-
))?;
|
19
|
-
}
|
20
|
-
|
21
|
-
let sign = if tz.starts_with('-') { -1 } else { 1 };
|
22
|
-
let offset_str = &tz[1..]; // Remove the sign
|
23
|
-
|
24
|
-
// Parse different formats: "+09:00", "+0900", "+09"
|
25
|
-
let (hours, minutes) = if offset_str.contains(':') {
|
26
|
-
// Format: "+09:00" or "+9:30"
|
27
|
-
let parts: Vec<&str> = offset_str.split(':').collect();
|
28
|
-
if parts.len() != 2 {
|
29
|
-
return Err(MagnusError::new(
|
30
|
-
magnus::exception::arg_error(),
|
31
|
-
format!("Invalid timezone offset format: '{}'. Expected HH:MM", tz),
|
32
|
-
))?;
|
33
|
-
}
|
34
|
-
|
35
|
-
let h = parts[0].parse::<i32>().map_err(|e| {
|
36
|
-
MagnusError::new(
|
37
|
-
magnus::exception::arg_error(),
|
38
|
-
format!("Invalid hour in timezone offset '{}': {}", tz, e),
|
39
|
-
)
|
40
|
-
})?;
|
41
|
-
|
42
|
-
let m = parts[1].parse::<i32>().map_err(|e| {
|
43
|
-
MagnusError::new(
|
44
|
-
magnus::exception::arg_error(),
|
45
|
-
format!("Invalid minute in timezone offset '{}': {}", tz, e),
|
46
|
-
)
|
47
|
-
})?;
|
48
|
-
|
49
|
-
(h, m)
|
50
|
-
} else if offset_str.len() == 4 {
|
51
|
-
// Format: "+0900"
|
52
|
-
let h = offset_str[0..2].parse::<i32>().map_err(|e| {
|
53
|
-
MagnusError::new(
|
54
|
-
magnus::exception::arg_error(),
|
55
|
-
format!("Invalid hour in timezone offset '{}': {}", tz, e),
|
56
|
-
)
|
57
|
-
})?;
|
58
|
-
|
59
|
-
let m = offset_str[2..4].parse::<i32>().map_err(|e| {
|
60
|
-
MagnusError::new(
|
61
|
-
magnus::exception::arg_error(),
|
62
|
-
format!("Invalid minute in timezone offset '{}': {}", tz, e),
|
63
|
-
)
|
64
|
-
})?;
|
65
|
-
|
66
|
-
(h, m)
|
67
|
-
} else if offset_str.len() == 2
|
68
|
-
|| (offset_str.len() == 1 && offset_str.chars().all(|c| c.is_numeric()))
|
69
|
-
{
|
70
|
-
// Format: "+09" or "+9"
|
71
|
-
let h = offset_str.parse::<i32>().map_err(|e| {
|
72
|
-
MagnusError::new(
|
73
|
-
magnus::exception::arg_error(),
|
74
|
-
format!("Invalid hour in timezone offset '{}': {}", tz, e),
|
75
|
-
)
|
76
|
-
})?;
|
77
|
-
(h, 0)
|
78
|
-
} else {
|
79
|
-
return Err(MagnusError::new(
|
80
|
-
magnus::exception::arg_error(),
|
81
|
-
format!("Invalid timezone offset format: '{}'. Expected formats: '+HH:MM', '+HHMM', or '+HH'", tz),
|
82
|
-
))?;
|
83
|
-
};
|
84
|
-
|
85
|
-
// Validate ranges
|
86
|
-
if hours < 0 || hours > 23 {
|
87
|
-
return Err(MagnusError::new(
|
88
|
-
magnus::exception::arg_error(),
|
89
|
-
format!("Invalid hour in timezone offset: {}. Must be 0-23", hours),
|
90
|
-
))?;
|
91
|
-
}
|
92
|
-
|
93
|
-
if minutes < 0 || minutes > 59 {
|
94
|
-
return Err(MagnusError::new(
|
95
|
-
magnus::exception::arg_error(),
|
96
|
-
format!(
|
97
|
-
"Invalid minute in timezone offset: {}. Must be 0-59",
|
98
|
-
minutes
|
99
|
-
),
|
100
|
-
))?;
|
101
|
-
}
|
102
|
-
|
103
|
-
Ok(sign * (hours * 60 + minutes))
|
104
|
-
}
|
105
|
-
|
106
|
-
pub fn parse_zoned_timestamp(value: &ParquetValue) -> Result<jiff::Timestamp, ParquetGemError> {
|
107
|
-
let (ts, tz) = match value {
|
108
|
-
ParquetValue::TimestampSecond(ts, tz) => (jiff::Timestamp::from_second(*ts)?, tz),
|
109
|
-
ParquetValue::TimestampMillis(ts, tz) => (jiff::Timestamp::from_millisecond(*ts)?, tz),
|
110
|
-
ParquetValue::TimestampMicros(ts, tz) => (jiff::Timestamp::from_microsecond(*ts)?, tz),
|
111
|
-
ParquetValue::TimestampNanos(ts, tz) => {
|
112
|
-
(jiff::Timestamp::from_nanosecond(*ts as i128)?, tz)
|
113
|
-
}
|
114
|
-
_ => {
|
115
|
-
return Err(MagnusError::new(
|
116
|
-
magnus::exception::type_error(),
|
117
|
-
"Invalid timestamp value".to_string(),
|
118
|
-
))?
|
119
|
-
}
|
120
|
-
};
|
121
|
-
|
122
|
-
// If timezone is provided, convert to zoned timestamp
|
123
|
-
if let Some(tz) = tz {
|
124
|
-
// Handle fixed offset timezones first
|
125
|
-
if tz.starts_with('+') || tz.starts_with('-') {
|
126
|
-
let total_minutes = parse_fixed_offset(tz)?;
|
127
|
-
|
128
|
-
// Create fixed timezone using the parsed offset
|
129
|
-
let offset_hours = total_minutes / 60;
|
130
|
-
let offset_minutes = total_minutes % 60;
|
131
|
-
|
132
|
-
// jiff expects offset in hours, but we can be more precise
|
133
|
-
let tz = if offset_minutes == 0 {
|
134
|
-
jiff::tz::TimeZone::fixed(jiff::tz::offset(offset_hours as i8))
|
135
|
-
} else {
|
136
|
-
// For non-zero minutes, we need to create a custom offset
|
137
|
-
// jiff doesn't directly support minute-precision offsets in the simple API,
|
138
|
-
// so we'll use the timestamp directly with the offset applied
|
139
|
-
return Ok(ts);
|
140
|
-
};
|
141
|
-
|
142
|
-
Ok(ts.to_zoned(tz).timestamp())
|
143
|
-
} else if tz.eq_ignore_ascii_case("UTC") || tz.eq_ignore_ascii_case("GMT") {
|
144
|
-
// Common UTC aliases
|
145
|
-
Ok(ts)
|
146
|
-
} else {
|
147
|
-
// Try IANA timezone
|
148
|
-
match ts.in_tz(tz) {
|
149
|
-
Ok(zoned) => Ok(zoned.timestamp()),
|
150
|
-
Err(e) => {
|
151
|
-
// Log the error but don't fail - fall back to UTC
|
152
|
-
eprintln!(
|
153
|
-
"Warning: Failed to parse timezone '{}': {}. Using UTC.",
|
154
|
-
tz, e
|
155
|
-
);
|
156
|
-
Ok(ts)
|
157
|
-
}
|
158
|
-
}
|
159
|
-
}
|
160
|
-
} else {
|
161
|
-
// No timezone provided - treat as UTC
|
162
|
-
Ok(ts)
|
163
|
-
}
|
164
|
-
}
|
165
|
-
|
166
|
-
/// Validates and normalizes a timezone string
|
167
|
-
/// Returns the normalized timezone string or None if invalid
|
168
|
-
pub fn validate_timezone(tz: &str) -> Option<String> {
|
169
|
-
let tz = tz.trim();
|
170
|
-
|
171
|
-
// Check for empty timezone
|
172
|
-
if tz.is_empty() {
|
173
|
-
return None;
|
174
|
-
}
|
175
|
-
|
176
|
-
// Fixed offset timezones
|
177
|
-
if tz.starts_with('+') || tz.starts_with('-') {
|
178
|
-
// Validate it can be parsed
|
179
|
-
if parse_fixed_offset(tz).is_ok() {
|
180
|
-
return Some(tz.to_string());
|
181
|
-
}
|
182
|
-
}
|
183
|
-
|
184
|
-
// Common UTC aliases
|
185
|
-
if tz.eq_ignore_ascii_case("UTC")
|
186
|
-
|| tz.eq_ignore_ascii_case("GMT")
|
187
|
-
|| tz.eq_ignore_ascii_case("Z")
|
188
|
-
{
|
189
|
-
return Some("UTC".to_string());
|
190
|
-
}
|
191
|
-
|
192
|
-
// Try to validate as IANA timezone by attempting to use it
|
193
|
-
// This is a bit expensive but ensures we only store valid timezones
|
194
|
-
if let Ok(tz_obj) = jiff::tz::TimeZone::get(tz) {
|
195
|
-
// Use the canonical name from jiff
|
196
|
-
return Some(
|
197
|
-
tz_obj
|
198
|
-
.iana_name()
|
199
|
-
.map(|s| s.to_string())
|
200
|
-
.unwrap_or_else(|| tz.to_string()),
|
201
|
-
);
|
202
|
-
}
|
203
|
-
|
204
|
-
None
|
205
|
-
}
|
206
|
-
|
207
|
-
/// Converts a Ruby Time object to a timestamp with timezone
|
208
|
-
pub fn ruby_time_to_timestamp_with_tz(
|
209
|
-
value: Value,
|
210
|
-
unit: &str,
|
211
|
-
) -> Result<(i64, Option<Arc<str>>), MagnusError> {
|
212
|
-
// Get seconds and microseconds
|
213
|
-
let secs = i64::try_convert(value.funcall::<_, _, Value>("to_i", ())?)?;
|
214
|
-
let usecs = i64::try_convert(value.funcall::<_, _, Value>("usec", ())?)?;
|
215
|
-
|
216
|
-
// Get timezone information from Ruby Time object
|
217
|
-
let tz_str = if let Ok(zone) = value.funcall::<_, _, Value>("zone", ()) {
|
218
|
-
if zone.is_nil() {
|
219
|
-
None
|
220
|
-
} else if let Ok(s) = String::try_convert(zone) {
|
221
|
-
validate_timezone(&s).map(|tz| Arc::from(tz.as_str()))
|
222
|
-
} else {
|
223
|
-
None
|
224
|
-
}
|
225
|
-
} else {
|
226
|
-
None
|
227
|
-
};
|
228
|
-
|
229
|
-
// Convert to appropriate unit
|
230
|
-
let timestamp = match unit {
|
231
|
-
"millis" => secs * 1000 + (usecs / 1000),
|
232
|
-
"micros" => secs * 1_000_000 + usecs,
|
233
|
-
"seconds" => secs,
|
234
|
-
"nanos" => secs * 1_000_000_000 + (usecs * 1000),
|
235
|
-
_ => {
|
236
|
-
return Err(MagnusError::new(
|
237
|
-
magnus::exception::arg_error(),
|
238
|
-
format!("Invalid timestamp unit: {}", unit),
|
239
|
-
))
|
240
|
-
}
|
241
|
-
};
|
242
|
-
|
243
|
-
Ok((timestamp, tz_str))
|
244
|
-
}
|
245
|
-
|
246
|
-
/// Turns a `ParquetValue` timestamp variant into a Ruby `Time` object.
///
/// Arguments:
/// - `$value`: expression matched against `ParquetValue::$unit`,
/// - `$unit`: the expected variant name (e.g. `TimestampMillis`),
/// - `$handle`: value providing `class_time()` and used for `into_value_with`.
///
/// On a match the timestamp goes through `parse_zoned_timestamp`, is rendered with
/// `to_string()` and handed to Ruby's `Time.parse`. Any other variant produces an
/// error built with `magnus::exception::type_error()`. The expansion uses `?`, so it
/// must sit in a function whose error type accepts those errors.
#[macro_export]
macro_rules! impl_timestamp_conversion {
    ($value:expr, $unit:ident, $handle:expr) => {{
        match $value {
            ParquetValue::$unit(raw, zone) => {
                // Timezone handling (validation only) happens inside parse_zoned_timestamp.
                let instant = parse_zoned_timestamp(&ParquetValue::$unit(raw, zone.clone()))?;

                // Build the Time object by letting Ruby parse the rendered timestamp.
                Ok($handle
                    .class_time()
                    .funcall::<_, _, Value>("parse", (instant.to_string(),))?
                    .into_value_with($handle))
            }
            _ => Err(MagnusError::new(
                magnus::exception::type_error(),
                format!(
                    "Invalid timestamp type. Expected {}, got {:?}",
                    stringify!($unit),
                    $value
                ),
            ))?,
        }
    }};
}
|
276
|
-
|
277
|
-
/// Formats a day count as a `YYYY-MM-DD` string value.
///
/// `$value` is cast to `i64` and multiplied by 86400 to obtain seconds, which are
/// turned into a `jiff::Timestamp` and formatted with `%Y-%m-%d`. The string is then
/// converted with `into_value_with($handle)` and wrapped in `Ok`. The expansion uses
/// `?` for the timestamp construction, so the surrounding function must accept that error.
#[macro_export]
macro_rules! impl_date_conversion {
    ($value:expr, $handle:expr) => {{
        let epoch_seconds = ($value as i64) * 86400;
        let day = jiff::Timestamp::from_second(epoch_seconds)?;
        Ok(day
            .strftime("%Y-%m-%d")
            .to_string()
            .into_value_with($handle))
    }};
}
|