parquet 0.5.12 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +8 -5
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -603
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -1,285 +0,0 @@
1
- use super::*;
2
- use magnus::{TryConvert, Value};
3
-
4
- /// Parses a fixed offset timezone string (e.g., "+09:00", "-05:30", "+0800")
5
- /// Returns the offset in minutes from UTC
6
- fn parse_fixed_offset(tz: &str) -> Result<i32, ParquetGemError> {
7
- // Remove any whitespace
8
- let tz = tz.trim();
9
-
10
- // Check if it starts with + or -
11
- if !tz.starts_with('+') && !tz.starts_with('-') {
12
- return Err(MagnusError::new(
13
- magnus::exception::arg_error(),
14
- format!(
15
- "Invalid timezone offset format: '{}'. Expected format like '+09:00' or '-0530'",
16
- tz
17
- ),
18
- ))?;
19
- }
20
-
21
- let sign = if tz.starts_with('-') { -1 } else { 1 };
22
- let offset_str = &tz[1..]; // Remove the sign
23
-
24
- // Parse different formats: "+09:00", "+0900", "+09"
25
- let (hours, minutes) = if offset_str.contains(':') {
26
- // Format: "+09:00" or "+9:30"
27
- let parts: Vec<&str> = offset_str.split(':').collect();
28
- if parts.len() != 2 {
29
- return Err(MagnusError::new(
30
- magnus::exception::arg_error(),
31
- format!("Invalid timezone offset format: '{}'. Expected HH:MM", tz),
32
- ))?;
33
- }
34
-
35
- let h = parts[0].parse::<i32>().map_err(|e| {
36
- MagnusError::new(
37
- magnus::exception::arg_error(),
38
- format!("Invalid hour in timezone offset '{}': {}", tz, e),
39
- )
40
- })?;
41
-
42
- let m = parts[1].parse::<i32>().map_err(|e| {
43
- MagnusError::new(
44
- magnus::exception::arg_error(),
45
- format!("Invalid minute in timezone offset '{}': {}", tz, e),
46
- )
47
- })?;
48
-
49
- (h, m)
50
- } else if offset_str.len() == 4 {
51
- // Format: "+0900"
52
- let h = offset_str[0..2].parse::<i32>().map_err(|e| {
53
- MagnusError::new(
54
- magnus::exception::arg_error(),
55
- format!("Invalid hour in timezone offset '{}': {}", tz, e),
56
- )
57
- })?;
58
-
59
- let m = offset_str[2..4].parse::<i32>().map_err(|e| {
60
- MagnusError::new(
61
- magnus::exception::arg_error(),
62
- format!("Invalid minute in timezone offset '{}': {}", tz, e),
63
- )
64
- })?;
65
-
66
- (h, m)
67
- } else if offset_str.len() == 2
68
- || (offset_str.len() == 1 && offset_str.chars().all(|c| c.is_numeric()))
69
- {
70
- // Format: "+09" or "+9"
71
- let h = offset_str.parse::<i32>().map_err(|e| {
72
- MagnusError::new(
73
- magnus::exception::arg_error(),
74
- format!("Invalid hour in timezone offset '{}': {}", tz, e),
75
- )
76
- })?;
77
- (h, 0)
78
- } else {
79
- return Err(MagnusError::new(
80
- magnus::exception::arg_error(),
81
- format!("Invalid timezone offset format: '{}'. Expected formats: '+HH:MM', '+HHMM', or '+HH'", tz),
82
- ))?;
83
- };
84
-
85
- // Validate ranges
86
- if hours < 0 || hours > 23 {
87
- return Err(MagnusError::new(
88
- magnus::exception::arg_error(),
89
- format!("Invalid hour in timezone offset: {}. Must be 0-23", hours),
90
- ))?;
91
- }
92
-
93
- if minutes < 0 || minutes > 59 {
94
- return Err(MagnusError::new(
95
- magnus::exception::arg_error(),
96
- format!(
97
- "Invalid minute in timezone offset: {}. Must be 0-59",
98
- minutes
99
- ),
100
- ))?;
101
- }
102
-
103
- Ok(sign * (hours * 60 + minutes))
104
- }
105
-
106
- pub fn parse_zoned_timestamp(value: &ParquetValue) -> Result<jiff::Timestamp, ParquetGemError> {
107
- let (ts, tz) = match value {
108
- ParquetValue::TimestampSecond(ts, tz) => (jiff::Timestamp::from_second(*ts)?, tz),
109
- ParquetValue::TimestampMillis(ts, tz) => (jiff::Timestamp::from_millisecond(*ts)?, tz),
110
- ParquetValue::TimestampMicros(ts, tz) => (jiff::Timestamp::from_microsecond(*ts)?, tz),
111
- ParquetValue::TimestampNanos(ts, tz) => {
112
- (jiff::Timestamp::from_nanosecond(*ts as i128)?, tz)
113
- }
114
- _ => {
115
- return Err(MagnusError::new(
116
- magnus::exception::type_error(),
117
- "Invalid timestamp value".to_string(),
118
- ))?
119
- }
120
- };
121
-
122
- // If timezone is provided, convert to zoned timestamp
123
- if let Some(tz) = tz {
124
- // Handle fixed offset timezones first
125
- if tz.starts_with('+') || tz.starts_with('-') {
126
- let total_minutes = parse_fixed_offset(tz)?;
127
-
128
- // Create fixed timezone using the parsed offset
129
- let offset_hours = total_minutes / 60;
130
- let offset_minutes = total_minutes % 60;
131
-
132
- // jiff expects offset in hours, but we can be more precise
133
- let tz = if offset_minutes == 0 {
134
- jiff::tz::TimeZone::fixed(jiff::tz::offset(offset_hours as i8))
135
- } else {
136
- // For non-zero minutes, we need to create a custom offset
137
- // jiff doesn't directly support minute-precision offsets in the simple API,
138
- // so we'll use the timestamp directly with the offset applied
139
- return Ok(ts);
140
- };
141
-
142
- Ok(ts.to_zoned(tz).timestamp())
143
- } else if tz.eq_ignore_ascii_case("UTC") || tz.eq_ignore_ascii_case("GMT") {
144
- // Common UTC aliases
145
- Ok(ts)
146
- } else {
147
- // Try IANA timezone
148
- match ts.in_tz(tz) {
149
- Ok(zoned) => Ok(zoned.timestamp()),
150
- Err(e) => {
151
- // Log the error but don't fail - fall back to UTC
152
- eprintln!(
153
- "Warning: Failed to parse timezone '{}': {}. Using UTC.",
154
- tz, e
155
- );
156
- Ok(ts)
157
- }
158
- }
159
- }
160
- } else {
161
- // No timezone provided - treat as UTC
162
- Ok(ts)
163
- }
164
- }
165
-
166
- /// Validates and normalizes a timezone string
167
- /// Returns the normalized timezone string or None if invalid
168
- pub fn validate_timezone(tz: &str) -> Option<String> {
169
- let tz = tz.trim();
170
-
171
- // Check for empty timezone
172
- if tz.is_empty() {
173
- return None;
174
- }
175
-
176
- // Fixed offset timezones
177
- if tz.starts_with('+') || tz.starts_with('-') {
178
- // Validate it can be parsed
179
- if parse_fixed_offset(tz).is_ok() {
180
- return Some(tz.to_string());
181
- }
182
- }
183
-
184
- // Common UTC aliases
185
- if tz.eq_ignore_ascii_case("UTC")
186
- || tz.eq_ignore_ascii_case("GMT")
187
- || tz.eq_ignore_ascii_case("Z")
188
- {
189
- return Some("UTC".to_string());
190
- }
191
-
192
- // Try to validate as IANA timezone by attempting to use it
193
- // This is a bit expensive but ensures we only store valid timezones
194
- if let Ok(tz_obj) = jiff::tz::TimeZone::get(tz) {
195
- // Use the canonical name from jiff
196
- return Some(
197
- tz_obj
198
- .iana_name()
199
- .map(|s| s.to_string())
200
- .unwrap_or_else(|| tz.to_string()),
201
- );
202
- }
203
-
204
- None
205
- }
206
-
207
- /// Converts a Ruby Time object to a timestamp with timezone
208
- pub fn ruby_time_to_timestamp_with_tz(
209
- value: Value,
210
- unit: &str,
211
- ) -> Result<(i64, Option<Arc<str>>), MagnusError> {
212
- // Get seconds and microseconds
213
- let secs = i64::try_convert(value.funcall::<_, _, Value>("to_i", ())?)?;
214
- let usecs = i64::try_convert(value.funcall::<_, _, Value>("usec", ())?)?;
215
-
216
- // Get timezone information from Ruby Time object
217
- let tz_str = if let Ok(zone) = value.funcall::<_, _, Value>("zone", ()) {
218
- if zone.is_nil() {
219
- None
220
- } else if let Ok(s) = String::try_convert(zone) {
221
- validate_timezone(&s).map(|tz| Arc::from(tz.as_str()))
222
- } else {
223
- None
224
- }
225
- } else {
226
- None
227
- };
228
-
229
- // Convert to appropriate unit
230
- let timestamp = match unit {
231
- "millis" => secs * 1000 + (usecs / 1000),
232
- "micros" => secs * 1_000_000 + usecs,
233
- "seconds" => secs,
234
- "nanos" => secs * 1_000_000_000 + (usecs * 1000),
235
- _ => {
236
- return Err(MagnusError::new(
237
- magnus::exception::arg_error(),
238
- format!("Invalid timestamp unit: {}", unit),
239
- ))
240
- }
241
- };
242
-
243
- Ok((timestamp, tz_str))
244
- }
245
-
246
/// Converts a timestamp `ParquetValue` of the given variant into a Ruby `Time`.
///
/// * `$value`  - expression evaluating to a `ParquetValue`.
/// * `$unit`   - the expected variant name (e.g. `TimestampMillis`).
/// * `$handle` - expression providing `class_time()` and accepted by
///   `into_value_with`.
///
/// When `$value` is the expected variant, it is passed through
/// `parse_zoned_timestamp`, rendered with `to_string()`, and handed to the
/// `parse` method of the class returned by `$handle.class_time()`; the
/// expansion evaluates to `Ok(..)` of the resulting object. Any other variant
/// produces a type error.
///
/// The expansion uses `?`, so it must sit inside a function whose error type
/// can be built from the errors involved. `ParquetValue`,
/// `parse_zoned_timestamp`, `MagnusError` and `Value` must be in scope at the
/// call site.
#[macro_export]
macro_rules! impl_timestamp_conversion {
    ($value:expr, $unit:ident, $handle:expr) => {{
        match $value {
            ParquetValue::$unit(raw, zone) => {
                // Rebuild the variant so the shared parser can validate it.
                let instant = parse_zoned_timestamp(&ParquetValue::$unit(raw, zone.clone()))?;
                let ruby_time_class = $handle.class_time();

                // Any timezone information was already dealt with by
                // parse_zoned_timestamp; the Time object is built from the
                // timestamp's string form.
                let parsed =
                    ruby_time_class.funcall::<_, _, Value>("parse", (instant.to_string(),))?;

                Ok(parsed.into_value_with($handle))
            }
            _ => Err(MagnusError::new(
                magnus::exception::type_error(),
                format!(
                    "Invalid timestamp type. Expected {}, got {:?}",
                    stringify!($unit),
                    $value
                ),
            ))?,
        }
    }};
}
276
-
277
/// Converts a day count into a `YYYY-MM-DD` string value.
///
/// * `$value`  - numeric expression, cast to `i64` and multiplied by 86400
///   (seconds per day) to obtain a timestamp in seconds.
/// * `$handle` - expression accepted by `into_value_with`.
///
/// The expansion evaluates to `Ok(..)` of the formatted date. It uses `?` on
/// the timestamp construction, so it must sit inside a function whose error
/// type can be built from jiff's error.
#[macro_export]
macro_rules! impl_date_conversion {
    ($value:expr, $handle:expr) => {{
        let day_count = $value as i64;
        let midnight = jiff::Timestamp::from_second(day_count * 86400)?;
        Ok(midnight
            .strftime("%Y-%m-%d")
            .to_string()
            .into_value_with($handle))
    }};
}