parquet 0.5.12 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +8 -5
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -603
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
data/ext/parquet/src/header_cache.rs
@@ -1,99 +0,0 @@
- /// This module exists to avoid cloning header keys in returned HashMaps.
- /// Since the underlying RString creation already involves cloning,
- /// this caching layer aims to reduce redundant allocations.
- ///
- /// Note: Performance testing on macOS showed minimal speed improvements,
- /// so this optimization could be removed if any issues arise.
- use std::{
-     collections::HashMap,
-     sync::{LazyLock, Mutex},
- };
-
- use magnus::{IntoValue, RString, Ruby, Value};
-
- use thiserror::Error;
-
- #[derive(Debug, Clone, Error)]
- pub enum CacheError {
-     #[error("Failed to acquire lock: {0}")]
-     LockError(String),
-     #[error("Failed to convert Ruby String to interned string: {0}")]
-     RStringConversion(String),
- }
-
- static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, StringCacheKey>>> =
-     LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
-
- pub struct StringCache;
-
- #[derive(Copy, Clone)]
- pub struct StringCacheKey(&'static str);
-
- impl StringCacheKey {
-     pub fn new(string: &str) -> Result<Self, CacheError> {
-         let rstr = RString::new(string);
-         let fstr = rstr.to_interned_str();
-         Ok(Self(fstr.as_str().map_err(|e| {
-             CacheError::RStringConversion(e.to_string())
-         })?))
-     }
- }
-
- impl AsRef<str> for StringCacheKey {
-     fn as_ref(&self) -> &'static str {
-         self.0
-     }
- }
-
- impl IntoValue for StringCacheKey {
-     fn into_value_with(self, handle: &Ruby) -> Value {
-         handle.into_value(self.0)
-     }
- }
-
- impl IntoValue for &StringCacheKey {
-     fn into_value_with(self, handle: &Ruby) -> Value {
-         handle.into_value(self.0)
-     }
- }
-
- impl std::fmt::Debug for StringCacheKey {
-     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-         self.0.fmt(f)
-     }
- }
-
- impl PartialEq for StringCacheKey {
-     fn eq(&self, other: &Self) -> bool {
-         self.0 == other.0
-     }
- }
-
- impl std::cmp::Eq for StringCacheKey {}
-
- impl std::hash::Hash for StringCacheKey {
-     fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
-         self.0.hash(state);
-     }
- }
-
- impl StringCache {
-     pub fn intern_many<AsStr: AsRef<str>>(
-         strings: &[AsStr],
-     ) -> Result<Vec<StringCacheKey>, CacheError> {
-         let cache = STRING_CACHE
-             .lock()
-             .map_err(|e| CacheError::LockError(e.to_string()))?;
-
-         let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
-         for string in strings {
-             if let Some((_, interned_string)) = cache.get_key_value(string.as_ref()) {
-                 result.push(*interned_string);
-             } else {
-                 let interned = StringCacheKey::new(string.as_ref())?;
-                 result.push(interned);
-             }
-         }
-         Ok(result)
-     }
- }
data/ext/parquet/src/logger.rs
@@ -1,171 +0,0 @@
- // Logger module for Parquet gem
- // Provides a Rust wrapper for Ruby logger objects
-
- use std::str::FromStr;
-
- use magnus::{exception::runtime_error, value::ReprValue, Error as MagnusError, Ruby, Value};
-
- use crate::{types::ParquetGemError, utils::parse_string_or_symbol};
-
- /// Severity levels that match Ruby's Logger levels
- #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
- pub enum LogLevel {
-     Debug,
-     Info,
-     Warn,
-     Error,
-     Fatal,
- }
-
- impl FromStr for LogLevel {
-     type Err = MagnusError;
-
-     fn from_str(s: &str) -> Result<Self, Self::Err> {
-         Ok(match s {
-             "debug" => LogLevel::Debug,
-             "info" => LogLevel::Info,
-             "warn" => LogLevel::Warn,
-             "error" => LogLevel::Error,
-             "fatal" => LogLevel::Fatal,
-             _ => {
-                 return Err(MagnusError::new(
-                     runtime_error(),
-                     format!("Invalid log level: {}", s),
-                 ))
-             }
-         })
-     }
- }
- /// A wrapper around a Ruby logger object
- #[derive(Debug, Clone)]
- pub struct RubyLogger {
-     logger: Option<Value>,
-     level: LogLevel,
- }
-
- #[allow(dead_code)]
- impl RubyLogger {
-     pub fn new(ruby: &Ruby, logger_value: Option<Value>) -> Result<Self, ParquetGemError> {
-         let environment_level = std::env::var("PARQUET_GEM_LOG_LEVEL")
-             .unwrap_or_else(|_| "warn".to_string())
-             .parse::<LogLevel>()
-             .unwrap_or(LogLevel::Warn);
-
-         match logger_value {
-             Some(logger) => {
-                 if logger.is_nil() {
-                     return Ok(Self {
-                         logger: None,
-                         level: environment_level,
-                     });
-                 }
-
-                 let level_value = logger.funcall::<_, _, Value>("level", ())?;
-                 let level = parse_string_or_symbol(ruby, level_value)?;
-                 let level = level
-                     .map(|s| s.parse::<LogLevel>())
-                     .transpose()?
-                     .unwrap_or(environment_level);
-
-                 Ok(Self {
-                     logger: Some(logger),
-                     level,
-                 })
-             }
-             None => Ok(Self {
-                 logger: None,
-                 level: environment_level,
-             }),
-         }
-     }
-
-     /// Log a message at the given level
-     pub fn log(&self, level: LogLevel, message: &str) -> Result<(), MagnusError> {
-         let method = match level {
-             LogLevel::Debug => "debug",
-             LogLevel::Info => "info",
-             LogLevel::Warn => "warn",
-             LogLevel::Error => "error",
-             LogLevel::Fatal => "fatal",
-         };
-
-         match self.logger {
-             Some(logger) => {
-                 logger.funcall::<_, _, Value>(method, (message,))?;
-             }
-             None => eprintln!("{}", message),
-         }
-
-         Ok(())
-     }
-
-     /// Log a debug message
-     pub fn debug<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
-     where
-         F: FnOnce() -> S,
-         S: AsRef<str>,
-     {
-         if self.level <= LogLevel::Debug {
-             let message = message_fn();
-             self.log(LogLevel::Debug, message.as_ref())
-         } else {
-             Ok(())
-         }
-     }
-
-     /// Log an info message
-     pub fn info<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
-     where
-         F: FnOnce() -> S,
-         S: AsRef<str>,
-     {
-         if self.level <= LogLevel::Info {
-             let message = message_fn();
-             self.log(LogLevel::Info, message.as_ref())
-         } else {
-             Ok(())
-         }
-     }
-
-     /// Log a warning message
-     pub fn warn<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
-     where
-         F: FnOnce() -> S,
-         S: AsRef<str>,
-     {
-         if self.level <= LogLevel::Warn {
-             let message = message_fn();
-             self.log(LogLevel::Warn, message.as_ref())
-         } else {
-             Ok(())
-         }
-     }
-
-     /// Log an error message
-     pub fn error<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
-     where
-         F: FnOnce() -> S,
-         S: AsRef<str>,
-     {
-         if self.level <= LogLevel::Error {
-             let message = message_fn();
-             self.log(LogLevel::Error, message.as_ref())
-         } else {
-             Ok(())
-         }
-     }
-
-     /// Log a fatal message
-     pub fn fatal<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
-     where
-         F: FnOnce() -> S,
-         S: AsRef<str>,
-     {
-         if self.level <= LogLevel::Fatal {
-             let message = message_fn();
-             self.log(LogLevel::Fatal, message.as_ref())
-         } else {
-             Ok(())
-         }
-     }
- }
data/ext/parquet/src/reader/common.rs
@@ -1,111 +0,0 @@
- use ahash::RandomState;
- use arrow_schema::Schema;
- use either::Either;
- use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
- use parquet::arrow::ProjectionMask;
- use std::collections::HashMap;
- use std::fs::File;
- use std::rc::Rc;
- use std::sync::Arc;
-
- use magnus::value::ReprValue;
- use magnus::{Error as MagnusError, Ruby, Value};
-
- use crate::header_cache::StringCache;
- use crate::ruby_reader::{RubyReader, ThreadSafeRubyReader};
- use crate::types::{ParquetGemError, TryIntoValue};
- use crate::ColumnRecord;
-
- /// Opens a parquet file or IO-like object for reading
- ///
- /// This function handles both file paths (as strings) and IO-like objects,
- /// returning either a File or a ThreadSafeRubyReader that can be used with
- /// parquet readers.
- pub fn open_parquet_source(
-     ruby: Rc<Ruby>,
-     to_read: Value,
- ) -> Result<Either<File, ThreadSafeRubyReader>, ParquetGemError> {
-     if to_read.is_kind_of(ruby.class_string()) {
-         let path_string = to_read.to_r_string()?;
-         let file_path = unsafe { path_string.as_str()? };
-         let file = File::open(file_path).map_err(ParquetGemError::from)?;
-         Ok(Either::Left(file))
-     } else {
-         let readable = ThreadSafeRubyReader::new(RubyReader::new(ruby, to_read)?);
-         Ok(Either::Right(readable))
-     }
- }
-
- /// Helper function to check if a block is given and create an appropriate enumerator
- /// if not
- pub fn handle_block_or_enum<F, T>(
-     _ruby: &magnus::Ruby,
-     block_given: bool,
-     create_enum: F,
- ) -> Result<Option<T>, MagnusError>
- where
-     F: FnOnce() -> Result<T, MagnusError>,
- {
-     if !block_given {
-         let enum_value = create_enum()?;
-         return Ok(Some(enum_value));
-     }
-     Ok(None)
- }
-
- /// Creates a ParquetRecordBatchReader with the given columns and batch size configurations
- pub fn create_batch_reader<T: parquet::file::reader::ChunkReader + 'static>(
-     reader: T,
-     columns: &Option<Vec<String>>,
-     batch_size: Option<usize>,
- ) -> Result<(ParquetRecordBatchReader, std::sync::Arc<Schema>, i64), ParquetGemError> {
-     let mut builder =
-         ParquetRecordBatchReaderBuilder::try_new(reader).map_err(ParquetGemError::Parquet)?;
-
-     let schema = builder.schema().clone();
-     let num_rows = builder.metadata().file_metadata().num_rows();
-
-     // If columns are specified, project only those columns
-     if let Some(cols) = columns {
-         // Get the parquet schema
-         let parquet_schema = builder.parquet_schema();
-
-         // Create a projection mask from column names
-         let projection = ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
-         builder = builder.with_projection(projection);
-     }
-
-     if let Some(batch_size) = batch_size {
-         builder = builder.with_batch_size(batch_size);
-     }
-
-     let reader = builder.build().map_err(ParquetGemError::Parquet)?;
-     Ok((reader, schema, num_rows))
- }
-
- /// Handles the case of an empty parquet file (no rows) by yielding a record with empty arrays
- /// Returns true if the file was empty and was handled, false otherwise
- pub fn handle_empty_file(
-     ruby: &magnus::Ruby,
-     schema: &Arc<Schema>,
-     num_rows: i64,
- ) -> Result<bool, ParquetGemError> {
-     if num_rows == 0 {
-         let mut map =
-             HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
-         let headers: Vec<String> = schema
-             .fields()
-             .iter()
-             .map(|field| field.name().to_string())
-             .collect();
-         let interned_headers =
-             StringCache::intern_many(&headers).map_err(ParquetGemError::HeaderIntern)?;
-         for field in interned_headers.iter() {
-             map.insert(*field, vec![]);
-         }
-         let record = ColumnRecord::Map(map);
-         let _: Value = ruby.yield_value(record.try_into_value_with(ruby)?)?;
-         return Ok(true);
-     }
-     Ok(false)
- }
data/ext/parquet/src/reader/mod.rs
@@ -1,211 +0,0 @@
- mod common;
- mod parquet_column_reader;
- mod parquet_row_reader;
- mod unified;
- use std::{fs::File, rc::Rc};
-
- use magnus::{value::ReprValue, Error as MagnusError, Ruby, Value};
- use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
- pub use parquet_column_reader::parse_parquet_columns;
- pub use parquet_row_reader::parse_parquet_rows;
-
- use crate::{
-     ruby_reader::{RubyReader, ThreadSafeRubyReader},
-     types::{ParquetGemError, TryIntoValue},
- };
-
- struct RubyParquetMetaData(ParquetMetaData);
-
- impl TryIntoValue for RubyParquetMetaData {
-     fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ParquetGemError> {
-         let metadata = self.0;
-         let file_metadata = metadata.file_metadata();
-         let row_groups = metadata.row_groups();
-
-         // Construct a hash with the metadata
-         let hash = handle.hash_new();
-         hash.aset("num_rows", file_metadata.num_rows())?;
-         hash.aset("created_by", file_metadata.created_by())?;
-         // Convert key_value_metadata to a Ruby array if it exists
-         if let Some(key_value_metadata) = file_metadata.key_value_metadata() {
-             let kv_array = handle.ary_new();
-             for kv in key_value_metadata {
-                 let kv_hash = handle.hash_new();
-                 kv_hash.aset("key", kv.key.clone())?;
-                 kv_hash.aset("value", kv.value.clone())?;
-                 kv_array.push(kv_hash)?;
-             }
-             hash.aset("key_value_metadata", kv_array)?;
-         } else {
-             hash.aset("key_value_metadata", None::<Value>)?;
-         }
-
-         // Convert schema to a Ruby hash since &Type doesn't implement IntoValue
-         let schema_hash = handle.hash_new();
-         let schema = file_metadata.schema();
-         schema_hash.aset("name", schema.name())?;
-         // Add schema fields information
-         let fields_array = handle.ary_new();
-         for field in schema.get_fields() {
-             let field_hash = handle.hash_new();
-             field_hash.aset("name", field.name())?;
-
-             // Handle different field types
-             match field.as_ref() {
-                 parquet::schema::types::Type::PrimitiveType {
-                     physical_type,
-                     type_length,
-                     scale,
-                     precision,
-                     ..
-                 } => {
-                     field_hash.aset("type", "primitive")?;
-                     field_hash.aset("physical_type", format!("{:?}", physical_type))?;
-                     field_hash.aset("type_length", *type_length)?;
-                     field_hash.aset("scale", *scale)?;
-                     field_hash.aset("precision", *precision)?;
-                 }
-                 parquet::schema::types::Type::GroupType { .. } => {
-                     field_hash.aset("type", "group")?;
-                 }
-             }
-
-             // Add basic info
-             let basic_info = field.get_basic_info();
-             field_hash.aset("repetition", format!("{:?}", basic_info.repetition()))?;
-             field_hash.aset(
-                 "converted_type",
-                 format!("{:?}", basic_info.converted_type()),
-             )?;
-             if let Some(logical_type) = basic_info.logical_type() {
-                 field_hash.aset("logical_type", format!("{:?}", logical_type))?;
-             }
-
-             fields_array.push(field_hash)?;
-         }
-         schema_hash.aset("fields", fields_array)?;
-
-         hash.aset("schema", schema_hash)?;
-
-         // Convert row_groups to a Ruby array since &[RowGroupMetaData] doesn't implement IntoValue
-         let row_groups_array = handle.ary_new();
-         for row_group in row_groups.iter() {
-             let rg_hash = handle.hash_new();
-             rg_hash.aset("num_columns", row_group.num_columns())?;
-             rg_hash.aset("num_rows", row_group.num_rows())?;
-             rg_hash.aset("total_byte_size", row_group.total_byte_size())?;
-             rg_hash.aset("file_offset", row_group.file_offset())?;
-             rg_hash.aset("ordinal", row_group.ordinal())?;
-             rg_hash.aset("compressed_size", row_group.compressed_size())?;
-
-             // Add column chunks metadata
-             let columns_array = handle.ary_new();
-             for col_idx in 0..row_group.num_columns() {
-                 let column = row_group.column(col_idx);
-                 let col_hash = handle.hash_new();
-
-                 col_hash.aset("column_path", column.column_path().string())?;
-                 col_hash.aset("file_path", column.file_path())?;
-                 col_hash.aset("file_offset", column.file_offset())?;
-                 col_hash.aset("num_values", column.num_values())?;
-                 col_hash.aset("compression", format!("{:?}", column.compression()))?;
-                 col_hash.aset("total_compressed_size", column.compressed_size())?;
-                 col_hash.aset("total_uncompressed_size", column.uncompressed_size())?;
-                 col_hash.aset("data_page_offset", column.data_page_offset())?;
-
-                 if let Some(offset) = column.dictionary_page_offset() {
-                     col_hash.aset("dictionary_page_offset", offset)?;
-                 }
-
-                 if let Some(offset) = column.bloom_filter_offset() {
-                     col_hash.aset("bloom_filter_offset", offset)?;
-                 }
-
-                 if let Some(length) = column.bloom_filter_length() {
-                     col_hash.aset("bloom_filter_length", length)?;
-                 }
-
-                 if let Some(offset) = column.offset_index_offset() {
-                     col_hash.aset("offset_index_offset", offset)?;
-                 }
-
-                 if let Some(length) = column.offset_index_length() {
-                     col_hash.aset("offset_index_length", length)?;
-                 }
-
-                 if let Some(offset) = column.column_index_offset() {
-                     col_hash.aset("column_index_offset", offset)?;
-                 }
-
-                 if let Some(length) = column.column_index_length() {
-                     col_hash.aset("column_index_length", length)?;
-                 }
-
-                 // Add encodings
-                 let encodings_array = handle.ary_new();
-                 for encoding in column.encodings() {
-                     encodings_array.push(format!("{:?}", encoding))?;
-                 }
-                 col_hash.aset("encodings", encodings_array)?;
-
-                 // Add statistics if available
-                 if let Some(stats) = column.statistics() {
-                     let stats_hash = handle.hash_new();
-                     stats_hash.aset("min_is_exact", stats.min_is_exact())?;
-                     stats_hash.aset("max_is_exact", stats.max_is_exact())?;
-
-                     col_hash.aset("statistics", stats_hash)?;
-                 }
-
-                 // Add page encoding stats if available
-                 if let Some(page_encoding_stats) = column.page_encoding_stats() {
-                     let page_stats_array = handle.ary_new();
-                     for stat in page_encoding_stats {
-                         let stat_hash = handle.hash_new();
-                         stat_hash.aset("page_type", format!("{:?}", stat.page_type))?;
-                         stat_hash.aset("encoding", format!("{:?}", stat.encoding))?;
-                         stat_hash.aset("count", stat.count)?;
-                         page_stats_array.push(stat_hash)?;
-                     }
-                     col_hash.aset("page_encoding_stats", page_stats_array)?;
-                 }
-
-                 columns_array.push(col_hash)?;
-             }
-             rg_hash.aset("columns", columns_array)?;
-
-             row_groups_array.push(rg_hash)?;
-         }
-         hash.aset("row_groups", row_groups_array)?;
-
-         Ok(handle.into_value(hash))
-     }
- }
-
- pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
-     let ruby = unsafe { Ruby::get_unchecked() };
-
-     if args.len() != 1 {
-         return Err(MagnusError::new(
-             magnus::exception::arg_error(),
-             format!("metadata expects exactly 1 argument (file path or IO-like object), got {}", args.len()),
-         ));
-     }
-
-     let ruby = Rc::new(ruby);
-     let arg = args[0];
-
-     let mut reader = ParquetMetaDataReader::new();
-     if arg.is_kind_of(ruby.class_string()) {
-         let path = arg.to_r_string()?.to_string()?;
-         let file = File::open(path).map_err(ParquetGemError::FileOpen)?;
-         reader.try_parse(&file).map_err(ParquetGemError::Parquet)?;
-     } else {
-         let file = ThreadSafeRubyReader::new(RubyReader::new(ruby.clone(), arg)?);
-         reader.try_parse(&file).map_err(ParquetGemError::Parquet)?;
-     }
-
-     let metadata = reader.finish().map_err(ParquetGemError::Parquet)?;
-
-     Ok(RubyParquetMetaData(metadata).try_into_value_with(&ruby)?)
- }
data/ext/parquet/src/reader/parquet_column_reader.rs
@@ -1,44 +0,0 @@
- use crate::reader::unified::{parse_parquet_unified, ParserType, UnifiedParserArgs};
- use crate::utils::*;
- use crate::ParquetGemError;
-
- use magnus::{Error as MagnusError, Ruby, Value};
- use std::rc::Rc;
-
- #[inline]
- pub fn parse_parquet_columns(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
-     let ruby = unsafe { Ruby::get_unchecked() };
-     parse_parquet_columns_impl(Rc::new(ruby), rb_self, args).map_err(|e| {
-         let z: MagnusError = e.into();
-         z
-     })
- }
-
- #[inline]
- fn parse_parquet_columns_impl(
-     ruby: Rc<Ruby>,
-     rb_self: Value,
-     args: &[Value],
- ) -> Result<Value, ParquetGemError> {
-     let ParquetColumnsArgs {
-         to_read,
-         result_type,
-         columns,
-         batch_size,
-         strict,
-         logger,
-     } = parse_parquet_columns_args(&ruby, args)?;
-
-     // Use the unified parsing implementation
-     parse_parquet_unified(
-         ruby,
-         rb_self,
-         UnifiedParserArgs {
-             to_read,
-             result_type,
-             columns,
-             parser_type: ParserType::Column { batch_size, strict },
-             logger,
-         },
-     )
- }
data/ext/parquet/src/reader/parquet_row_reader.rs
@@ -1,43 +0,0 @@
- use crate::reader::unified::{parse_parquet_unified, ParserType, UnifiedParserArgs};
- use crate::utils::*;
- use crate::ParquetGemError;
-
- use magnus::{Error as MagnusError, Ruby, Value};
- use std::rc::Rc;
-
- #[inline]
- pub fn parse_parquet_rows(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
-     let ruby = unsafe { Ruby::get_unchecked() };
-     parse_parquet_rows_impl(Rc::new(ruby), rb_self, args).map_err(|e| {
-         let z: MagnusError = e.into();
-         z
-     })
- }
-
- #[inline]
- fn parse_parquet_rows_impl(
-     ruby: Rc<Ruby>,
-     rb_self: Value,
-     args: &[Value],
- ) -> Result<Value, ParquetGemError> {
-     let ParquetRowsArgs {
-         to_read,
-         result_type,
-         columns,
-         strict,
-         logger,
-     } = parse_parquet_rows_args(&ruby, args)?;
-
-     // Use the unified parsing implementation
-     parse_parquet_unified(
-         ruby,
-         rb_self,
-         UnifiedParserArgs {
-             to_read,
-             result_type,
-             columns,
-             parser_type: ParserType::Row { strict },
-             logger,
-         },
-     )
- }