parquet 0.4.2 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,171 @@
+// Logger module for Parquet gem
+// Provides a Rust wrapper for Ruby logger objects
+
+use std::str::FromStr;
+
+use magnus::{exception::runtime_error, value::ReprValue, Error as MagnusError, Ruby, Value};
+
+use crate::{types::ParquetGemError, utils::parse_string_or_symbol};
+
+/// Severity levels that match Ruby's Logger levels
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub enum LogLevel {
+    Debug,
+    Info,
+    Warn,
+    Error,
+    Fatal,
+}
+
+impl FromStr for LogLevel {
+    type Err = MagnusError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        Ok(match s {
+            "debug" => LogLevel::Debug,
+            "info" => LogLevel::Info,
+            "warn" => LogLevel::Warn,
+            "error" => LogLevel::Error,
+            "fatal" => LogLevel::Fatal,
+            _ => {
+                return Err(MagnusError::new(
+                    runtime_error(),
+                    format!("Invalid log level: {}", s),
+                ))
+            }
+        })
+    }
+}
+/// A wrapper around a Ruby logger object
+#[derive(Debug, Clone)]
+pub struct RubyLogger {
+    logger: Option<Value>,
+    level: LogLevel,
+}
+
+#[allow(dead_code)]
+impl RubyLogger {
+    pub fn new(ruby: &Ruby, logger_value: Option<Value>) -> Result<Self, ParquetGemError> {
+        let environment_level = std::env::var("PARQUET_GEM_LOG_LEVEL")
+            .unwrap_or_else(|_| "warn".to_string())
+            .parse::<LogLevel>()
+            .unwrap_or(LogLevel::Warn);
+
+        match logger_value {
+            Some(logger) => {
+                if logger.is_nil() {
+                    return Ok(Self {
+                        logger: None,
+                        level: environment_level,
+                    });
+                }
+
+                let level_value = logger.funcall::<_, _, Value>("level", ())?;
+                let level = parse_string_or_symbol(ruby, level_value)?;
+                let level = level
+                    .map(|s| s.parse::<LogLevel>())
+                    .transpose()?
+                    .unwrap_or(environment_level);
+
+                Ok(Self {
+                    logger: Some(logger),
+                    level,
+                })
+            }
+            None => Ok(Self {
+                logger: None,
+                level: environment_level,
+            }),
+        }
+    }
+
+    /// Log a message at the given level
+    pub fn log(&self, level: LogLevel, message: &str) -> Result<(), MagnusError> {
+        let method = match level {
+            LogLevel::Debug => "debug",
+            LogLevel::Info => "info",
+            LogLevel::Warn => "warn",
+            LogLevel::Error => "error",
+            LogLevel::Fatal => "fatal",
+        };
+
+        match self.logger {
+            Some(logger) => {
+                logger.funcall::<_, _, Value>(method, (message,))?;
+            }
+            None => eprintln!("{}", message),
+        }
+
+        Ok(())
+    }
+
+    /// Log a debug message
+    pub fn debug<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
+    where
+        F: FnOnce() -> S,
+        S: AsRef<str>,
+    {
+        if self.level <= LogLevel::Debug {
+            let message = message_fn();
+            self.log(LogLevel::Debug, message.as_ref())
+        } else {
+            Ok(())
+        }
+    }
+
+    /// Log an info message
+    pub fn info<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
+    where
+        F: FnOnce() -> S,
+        S: AsRef<str>,
+    {
+        if self.level <= LogLevel::Info {
+            let message = message_fn();
+            self.log(LogLevel::Info, message.as_ref())
+        } else {
+            Ok(())
+        }
+    }
+
+    /// Log a warning message
+    pub fn warn<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
+    where
+        F: FnOnce() -> S,
+        S: AsRef<str>,
+    {
+        if self.level <= LogLevel::Warn {
+            let message = message_fn();
+            self.log(LogLevel::Warn, message.as_ref())
+        } else {
+            Ok(())
+        }
+    }
+
+    /// Log an error message
+    pub fn error<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
+    where
+        F: FnOnce() -> S,
+        S: AsRef<str>,
+    {
+        if self.level <= LogLevel::Error {
+            let message = message_fn();
+            self.log(LogLevel::Error, message.as_ref())
+        } else {
+            Ok(())
+        }
+    }
+
+    /// Log a fatal message
+    pub fn fatal<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
+    where
+        F: FnOnce() -> S,
+        S: AsRef<str>,
+    {
+        if self.level <= LogLevel::Fatal {
+            let message = message_fn();
+            self.log(LogLevel::Fatal, message.as_ref())
+        } else {
+            Ok(())
+        }
+    }
+}
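
The logging methods above take closures rather than strings, so a suppressed message never pays for `format!`. A standalone sketch of that pattern, using a simplified two-level enum and a hypothetical `log_debug` helper (not part of the gem):

```rust
// Minimal model of the level-gated closure pattern used by RubyLogger.
#[derive(PartialEq, PartialOrd)]
enum Level {
    Debug,
    Warn,
}

fn log_debug<F: FnOnce() -> String>(current: &Level, message_fn: F) {
    // The closure only runs when the level check passes.
    if *current <= Level::Debug {
        eprintln!("{}", message_fn());
    }
}

fn main() {
    let level = Level::Warn; // the gem's default, per the PARQUET_GEM_LOG_LEVEL handling above
    // This closure is never invoked at Warn, so the large allocation is skipped entirely.
    log_debug(&level, || format!("expensive: {}", "x".repeat(1_000_000)));
}
```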
@@ -0,0 +1,110 @@
+use ahash::RandomState;
+use arrow_schema::Schema;
+use either::Either;
+use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
+use parquet::arrow::ProjectionMask;
+use std::collections::HashMap;
+use std::fs::File;
+use std::sync::Arc;
+
+use magnus::value::ReprValue;
+use magnus::{Error as MagnusError, Ruby, Value};
+
+use crate::header_cache::StringCache;
+use crate::ruby_reader::{RubyReader, ThreadSafeRubyReader};
+use crate::types::{ParquetGemError, TryIntoValue};
+use crate::ColumnRecord;
+
+/// Opens a parquet file or IO-like object for reading
+///
+/// This function handles both file paths (as strings) and IO-like objects,
+/// returning either a File or a ThreadSafeRubyReader that can be used with
+/// parquet readers.
+pub fn open_parquet_source(
+    ruby: Arc<Ruby>,
+    to_read: Value,
+) -> Result<Either<File, ThreadSafeRubyReader>, ParquetGemError> {
+    if to_read.is_kind_of(ruby.class_string()) {
+        let path_string = to_read.to_r_string()?;
+        let file_path = unsafe { path_string.as_str()? };
+        let file = File::open(file_path).map_err(ParquetGemError::from)?;
+        Ok(Either::Left(file))
+    } else {
+        let readable = ThreadSafeRubyReader::new(RubyReader::new(ruby, to_read)?);
+        Ok(Either::Right(readable))
+    }
+}
+
+/// Helper function to check if a block is given and create an appropriate enumerator
+/// if not
+pub fn handle_block_or_enum<F, T>(
+    _ruby: &magnus::Ruby,
+    block_given: bool,
+    create_enum: F,
+) -> Result<Option<T>, MagnusError>
+where
+    F: FnOnce() -> Result<T, MagnusError>,
+{
+    if !block_given {
+        let enum_value = create_enum()?;
+        return Ok(Some(enum_value));
+    }
+    Ok(None)
+}
+
+/// Creates a ParquetRecordBatchReader with the given columns and batch size configurations
+pub fn create_batch_reader<T: parquet::file::reader::ChunkReader + 'static>(
+    reader: T,
+    columns: &Option<Vec<String>>,
+    batch_size: Option<usize>,
+) -> Result<(ParquetRecordBatchReader, std::sync::Arc<Schema>, i64), ParquetGemError> {
+    let mut builder = ParquetRecordBatchReaderBuilder::try_new(reader)
+        .map_err(|e| ParquetGemError::Parquet(e))?;
+
+    let schema = builder.schema().clone();
+    let num_rows = builder.metadata().file_metadata().num_rows();
+
+    // If columns are specified, project only those columns
+    if let Some(cols) = columns {
+        // Get the parquet schema
+        let parquet_schema = builder.parquet_schema();
+
+        // Create a projection mask from column names
+        let projection = ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+        builder = builder.with_projection(projection);
+    }
+
+    if let Some(batch_size) = batch_size {
+        builder = builder.with_batch_size(batch_size);
+    }
+
+    let reader = builder.build().map_err(|e| ParquetGemError::Parquet(e))?;
+    Ok((reader, schema, num_rows))
+}
+
+/// Handles the case of an empty parquet file (no rows) by yielding a record with empty arrays
+/// Returns true if the file was empty and was handled, false otherwise
+pub fn handle_empty_file(
+    ruby: &magnus::Ruby,
+    schema: &Arc<Schema>,
+    num_rows: i64,
+) -> Result<bool, ParquetGemError> {
+    if num_rows == 0 {
+        let mut map =
+            HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
+        let headers: Vec<String> = schema
+            .fields()
+            .iter()
+            .map(|field| field.name().to_string())
+            .collect();
+        let interned_headers =
+            StringCache::intern_many(&headers).map_err(|e| ParquetGemError::HeaderIntern(e))?;
+        for field in interned_headers.iter() {
+            map.insert(*field, vec![]);
+        }
+        let record = ColumnRecord::Map(map);
+        let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+        return Ok(true);
+    }
+    Ok(false)
+}
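
`create_batch_reader` is generic over `parquet::file::reader::ChunkReader`, which both `std::fs::File` and `ThreadSafeRubyReader` implement; that is why the call sites later in this diff match on the `Either` with two near-identical arms, each monomorphizing its own reader. A standalone sketch of that shape, with toy stand-in trait and types (`either` crate assumed):

```rust
use either::Either;

// Stand-ins for ChunkReader, File, and ThreadSafeRubyReader.
trait ChunkRead {
    fn describe(&self) -> &'static str;
}
struct FileLike;
struct IoLike;
impl ChunkRead for FileLike {
    fn describe(&self) -> &'static str { "file path" }
}
impl ChunkRead for IoLike {
    fn describe(&self) -> &'static str { "IO-like object" }
}

// Generic over the reader, like create_batch_reader.
fn open_reader<T: ChunkRead>(source: T) -> String {
    format!("reading from {}", source.describe())
}

fn main() {
    let source: Either<FileLike, IoLike> = Either::Left(FileLike);
    // An Either<A, B> cannot be passed to open_reader directly, so each
    // arm calls the generic function with its own concrete type.
    let opened = match source {
        Either::Left(file) => open_reader(file),
        Either::Right(io) => open_reader(io),
    };
    println!("{}", opened);
}
```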
@@ -1,48 +1,6 @@
+mod common;
 mod parquet_column_reader;
 mod parquet_row_reader;

-use std::io;
-
-use magnus::{Error as MagnusError, Ruby};
-use thiserror::Error;
-
-use crate::header_cache::CacheError;
 pub use parquet_column_reader::parse_parquet_columns;
 pub use parquet_row_reader::parse_parquet_rows;
-
-#[derive(Error, Debug)]
-pub enum ReaderError {
-    #[error("Failed to get file descriptor: {0}")]
-    FileDescriptor(String),
-    #[error("Invalid file descriptor")]
-    InvalidFileDescriptor,
-    #[error("Failed to open file: {0}")]
-    FileOpen(#[from] io::Error),
-    #[error("Failed to intern headers: {0}")]
-    HeaderIntern(#[from] CacheError),
-    #[error("Ruby error: {0}")]
-    Ruby(String),
-    #[error("Parquet error: {0}")]
-    Parquet(#[from] parquet::errors::ParquetError),
-    #[error("Arrow error: {0}")]
-    Arrow(#[from] arrow_schema::ArrowError),
-    #[error("UTF-8 error: {0}")]
-    Utf8Error(#[from] simdutf8::basic::Utf8Error),
-    #[error("Jiff error: {0}")]
-    Jiff(#[from] jiff::Error),
-}
-
-impl From<MagnusError> for ReaderError {
-    fn from(err: MagnusError) -> Self {
-        Self::Ruby(err.to_string())
-    }
-}
-
-impl From<ReaderError> for MagnusError {
-    fn from(err: ReaderError) -> Self {
-        MagnusError::new(
-            Ruby::get().unwrap().exception_runtime_error(),
-            err.to_string(),
-        )
-    }
-}
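
The removed `ReaderError` is superseded by `ParquetGemError`, which lives in `crate::types` and is not shown in this diff. Judging from the call sites, it keeps the same bridge pattern: a `thiserror` enum whose conversion into `MagnusError` lets `?` surface Rust failures as Ruby exceptions. A minimal sketch of that pattern under those assumptions, with `RubyError` standing in for `magnus::Error`:

```rust
use thiserror::Error;

#[derive(Error, Debug)]
enum GemError {
    #[error("Failed to open file: {0}")]
    FileOpen(#[from] std::io::Error),
    #[error("Parquet error: {0}")]
    Parquet(String),
}

// Stand-in for magnus::Error; the real conversion raises a Ruby RuntimeError.
#[derive(Debug)]
struct RubyError(String);

impl From<GemError> for RubyError {
    fn from(err: GemError) -> Self {
        RubyError(err.to_string())
    }
}

fn read_footer() -> Result<(), GemError> {
    Err(GemError::Parquet("bad footer".into()))
}

// Entry points return the Ruby-facing error; `?` converts automatically.
fn entry_point() -> Result<(), RubyError> {
    read_footer()?;
    Ok(())
}

fn main() {
    println!("{:?}", entry_point());
}
```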
@@ -1,119 +1,83 @@
 use crate::header_cache::StringCache;
-use crate::ruby_reader::{RubyReader, ThreadSafeRubyReader};
-use crate::types::{ArrayWrapper, TryIntoValue};
+use crate::logger::RubyLogger;
+use crate::types::{ArrayWrapper, ParquetGemError, TryIntoValue};
 use crate::{
     create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ParquetValueVec,
     ParserResultType,
 };
 use ahash::RandomState;
-use magnus::value::ReprValue;
+use either::Either;
 use magnus::IntoValue;
 use magnus::{Error as MagnusError, Ruby, Value};
-use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
-use parquet::arrow::ProjectionMask;
 use std::collections::HashMap;
-use std::fs::File;
-use std::sync::OnceLock;
+use std::sync::{Arc, OnceLock};

-use super::ReaderError;
+use super::common::{
+    create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
+};

 #[inline]
 pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
+    Ok(
+        parse_parquet_columns_impl(Arc::new(ruby), rb_self, args).map_err(|e| {
+            let z: MagnusError = e.into();
+            z
+        })?,
+    )
+}

+#[inline]
+fn parse_parquet_columns_impl<'a>(
+    ruby: Arc<Ruby>,
+    rb_self: Value,
+    args: &[Value],
+) -> Result<Value, ParquetGemError> {
     let ParquetColumnsArgs {
         to_read,
         result_type,
         columns,
         batch_size,
         strict,
+        logger,
     } = parse_parquet_columns_args(&ruby, args)?;

-    if !ruby.block_given() {
-        return create_column_enumerator(ColumnEnumeratorArgs {
+    // Initialize the logger if provided
+    let ruby_logger = RubyLogger::new(&ruby, logger)?;
+    if let Some(ref bs) = batch_size {
+        ruby_logger.debug(|| format!("Using batch size: {}", bs))?;
+    }
+
+    // Clone values for the closure to avoid move issues
+    let columns_clone = columns.clone();
+
+    // Handle block or create enumerator
+    if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
+        create_column_enumerator(ColumnEnumeratorArgs {
             rb_self,
             to_read,
             result_type,
-            columns,
+            columns: columns_clone,
             batch_size,
             strict,
+            logger: logger.as_ref().map(|_| to_read),
         })
-        .map(|yield_enum| yield_enum.into_value_with(&ruby));
+        .map(|yield_enum| yield_enum.into_value_with(&ruby))
+    })? {
+        return Ok(enum_value);
     }

-    let (batch_reader, schema, num_rows) = if to_read.is_kind_of(ruby.class_string()) {
-        let path_string = to_read.to_r_string()?;
-        let file_path = unsafe { path_string.as_str()? };
-        let file = File::open(file_path).map_err(|e| ReaderError::FileOpen(e))?;
+    let source = open_parquet_source(ruby.clone(), to_read)?;

-        let mut builder =
-            ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
-        let schema = builder.schema().clone();
-        let num_rows = builder.metadata().file_metadata().num_rows();
-
-        // If columns are specified, project only those columns
-        if let Some(cols) = &columns {
-            // Get the parquet schema
-            let parquet_schema = builder.parquet_schema();
-
-            // Create a projection mask from column names
-            let projection =
-                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
-
-            builder = builder.with_projection(projection);
-        }
-
-        if let Some(batch_size) = batch_size {
-            builder = builder.with_batch_size(batch_size);
-        }
+    // Use the common function to create the batch reader

-        let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
-
-        (reader, schema, num_rows)
-    } else {
-        let readable = ThreadSafeRubyReader::new(RubyReader::try_from(to_read)?);
-
-        let mut builder =
-            ParquetRecordBatchReaderBuilder::try_new(readable).map_err(ReaderError::from)?;
-        let schema = builder.schema().clone();
-        let num_rows = builder.metadata().file_metadata().num_rows();
-
-        // If columns are specified, project only those columns
-        if let Some(cols) = &columns {
-            // Get the parquet schema
-            let parquet_schema = builder.parquet_schema();
-
-            // Create a projection mask from column names
-            let projection =
-                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
-
-            builder = builder.with_projection(projection);
-        }
-
-        if let Some(batch_size) = batch_size {
-            builder = builder.with_batch_size(batch_size);
-        }
-
-        let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
-
-        (reader, schema, num_rows)
+    let (batch_reader, schema, num_rows) = match source {
+        Either::Left(file) => create_batch_reader(file, &columns, batch_size)?,
+        Either::Right(readable) => create_batch_reader(readable, &columns, batch_size)?,
     };

-    if num_rows == 0 {
-        let mut map =
-            HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
-        let headers: Vec<String> = schema
-            .fields()
-            .iter()
-            .map(|field| field.name().to_string())
-            .collect();
-        let interned_headers =
-            StringCache::intern_many(&headers).map_err(|e| ReaderError::HeaderIntern(e))?;
-        for field in interned_headers.iter() {
-            map.insert(*field, vec![]);
-        }
-        let record = ColumnRecord::Map(map);
-        let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+    // Handle empty file case
+    if handle_empty_file(&ruby, &schema, num_rows)? {
         return Ok(ruby.qnil().into_value_with(&ruby));
     }

@@ -122,7 +86,7 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
             let headers = OnceLock::new();
             let headers_clone = headers.clone();
             let iter = batch_reader.map(move |batch| {
-                batch.map_err(ReaderError::Arrow).and_then(|batch| {
+                batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
                     let local_headers = headers_clone
                         .get_or_init(|| {
                             let schema = batch.schema();
@@ -134,7 +98,7 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
                            StringCache::intern_many(&header_string)
                        })
                        .as_ref()
-                        .map_err(|e| ReaderError::HeaderIntern(e.clone()))?;
+                        .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;

                    let mut map = HashMap::with_capacity_and_hasher(
                        local_headers.len(),
@@ -152,7 +116,7 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
                                strict: strict,
                            })?;
                            map.insert(header, values.into_inner());
-                            Ok::<_, ReaderError>(())
+                            Ok::<_, ParquetGemError>(())
                        })?;

                    Ok(ColumnRecord::Map::<RandomState>(map))
@@ -166,7 +130,7 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
            }
            ParserResultType::Array => {
                let iter = batch_reader.map(|batch| {
-                    batch.map_err(ReaderError::Arrow).and_then(|batch| {
+                    batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
                        let vec = batch
                            .columns()
                            .into_iter()
@@ -175,7 +139,7 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
                                array: &*column,
                                strict: strict,
                            })?;
-                            Ok::<_, ReaderError>(values.into_inner())
+                            Ok::<_, ParquetGemError>(values.into_inner())
                        })
                        .collect::<Result<Vec<_>, _>>()?;
                    Ok(ColumnRecord::Vec::<RandomState>(vec))
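
One retained detail worth noting in the column reader: interned headers are cached in a `OnceLock`, so the schema-to-header interning runs for the first batch only. A standalone sketch of that caching pattern, where `headers_from` is a stand-in for the real interning step:

```rust
use std::sync::OnceLock;

// Stand-in for the schema -> interned header-name extraction.
fn headers_from(fields: &[&str]) -> Vec<String> {
    println!("interning headers (runs once)");
    fields.iter().map(|f| f.to_string()).collect()
}

fn main() {
    let headers: OnceLock<Vec<String>> = OnceLock::new();
    for batch in 0..3 {
        // get_or_init runs the closure on the first call only;
        // later batches reuse the cached vector.
        let cached = headers.get_or_init(|| headers_from(&["id", "name"]));
        println!("batch {}: {:?}", batch, cached);
    }
}
```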
@@ -1,55 +1,84 @@
 use crate::header_cache::StringCache;
-use crate::ruby_reader::{RubyReader, ThreadSafeRubyReader};
+use crate::logger::RubyLogger;
 use crate::types::TryIntoValue;
 use crate::{
-    create_row_enumerator, utils::*, ParquetField, ParserResultType, ReaderError,
+    create_row_enumerator, utils::*, ParquetField, ParquetGemError, ParserResultType,
     RowEnumeratorArgs, RowRecord,
 };
 use ahash::RandomState;
-use magnus::value::ReprValue;
+use either::Either;
 use magnus::IntoValue;
 use magnus::{Error as MagnusError, Ruby, Value};
 use parquet::file::reader::{FileReader, SerializedFileReader};
 use parquet::record::reader::RowIter as ParquetRowIter;
 use parquet::schema::types::{Type as SchemaType, TypePtr};
 use std::collections::HashMap;
-use std::fs::File;
-use std::sync::OnceLock;
+use std::sync::{Arc, OnceLock};
+
+use super::common::{handle_block_or_enum, open_parquet_source};

 #[inline]
 pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
+    Ok(
+        parse_parquet_rows_impl(Arc::new(ruby), rb_self, args).map_err(|e| {
+            let z: MagnusError = e.into();
+            z
+        })?,
+    )
+}

+#[inline]
+fn parse_parquet_rows_impl<'a>(
+    ruby: Arc<Ruby>,
+    rb_self: Value,
+    args: &[Value],
+) -> Result<Value, ParquetGemError> {
     let ParquetRowsArgs {
         to_read,
         result_type,
         columns,
         strict,
+        logger,
     } = parse_parquet_rows_args(&ruby, args)?;

-    if !ruby.block_given() {
-        return create_row_enumerator(RowEnumeratorArgs {
+    // Initialize the logger if provided
+    let ruby_logger = RubyLogger::new(&ruby, logger)?;
+
+    // Clone values for the closure to avoid move issues
+    let columns_clone = columns.clone();
+
+    // Handle block or create enumerator
+    if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
+        create_row_enumerator(RowEnumeratorArgs {
             rb_self,
             to_read,
             result_type,
-            columns,
+            columns: columns_clone,
             strict,
+            logger,
         })
-        .map(|yield_enum| yield_enum.into_value_with(&ruby));
+        .map(|yield_enum| yield_enum.into_value_with(&ruby))
+    })? {
+        return Ok(enum_value);
     }

-    let reader: Box<dyn FileReader> = if to_read.is_kind_of(ruby.class_string()) {
-        let path_string = to_read.to_r_string()?;
-        let file_path = unsafe { path_string.as_str()? };
-        let file = File::open(file_path).map_err(ReaderError::from)?;
-        Box::new(SerializedFileReader::new(file).map_err(ReaderError::from)?)
-    } else {
-        let readable = ThreadSafeRubyReader::new(RubyReader::try_from(to_read)?);
-        Box::new(SerializedFileReader::new(readable).map_err(ReaderError::from)?)
+    let source = open_parquet_source(ruby.clone(), to_read)?;
+    let reader: Box<dyn FileReader> = match source {
+        Either::Left(file) => {
+            Box::new(SerializedFileReader::new(file).map_err(ParquetGemError::from)?)
+        }
+        Either::Right(readable) => {
+            Box::new(SerializedFileReader::new(readable).map_err(ParquetGemError::from)?)
+        }
     };
+
     let schema = reader.metadata().file_metadata().schema().clone();
+    ruby_logger.debug(|| format!("Schema loaded: {:?}", schema))?;
+
     let mut iter = ParquetRowIter::from_file_into(reader);
     if let Some(cols) = columns {
+        ruby_logger.debug(|| format!("Projecting columns: {:?}", cols))?;
         let projection = create_projection_schema(&schema, &cols);
         iter = iter.project(Some(projection.to_owned())).map_err(|e| {
             MagnusError::new(
@@ -81,13 +110,13 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M

                    let mut map =
                        HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
-                    row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
+                    for (i, (_, v)) in row.get_column_iter().enumerate() {
                        map.insert(headers[i], ParquetField(v.clone(), strict));
-                    });
+                    }
                    Ok(map)
                })
                .and_then(|row| Ok(RowRecord::Map::<RandomState>(row)))
-                .map_err(|e| ReaderError::from(e))
+                .map_err(|e| ParquetGemError::from(e))
            });

            for result in iter {
@@ -100,12 +129,13 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
                row.and_then(|row| {
                    let column_count = row.get_column_iter().count();
                    let mut vec = Vec::with_capacity(column_count);
-                    row.get_column_iter()
-                        .for_each(|(_, v)| vec.push(ParquetField(v.clone(), strict)));
+                    for (_, v) in row.get_column_iter() {
+                        vec.push(ParquetField(v.clone(), strict));
+                    }
                    Ok(vec)
                })
                .and_then(|row| Ok(RowRecord::Vec::<RandomState>(row)))
-                .map_err(|e| ReaderError::from(e))
+                .map_err(|e| ParquetGemError::from(e))
            });

            for result in iter {
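
The two `for_each`-to-`for` rewrites in the row reader are behavior-preserving, and a plain `for` loop also leaves room for `?` inside the body, which a closure passed to `for_each` does not. A standalone sketch of that distinction, with a toy `parse` function (not from the gem):

```rust
// for_each takes an infallible closure, so `?` cannot be used inside it;
// a plain for loop propagates errors naturally.
fn parse(s: &str) -> Result<i32, std::num::ParseIntError> {
    s.parse()
}

fn sum_all(items: &[&str]) -> Result<i32, std::num::ParseIntError> {
    let mut total = 0;
    for item in items {
        total += parse(item)?; // would not compile inside .for_each(|item| ...)
    }
    Ok(total)
}

fn main() {
    println!("{:?}", sum_all(&["1", "2", "3"]));
    println!("{:?}", sum_all(&["1", "x"]));
}
```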