parquet 0.2.12-arm64-darwin → 0.3.0-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,77 +0,0 @@
1
- use std::{
2
- fs::File,
3
- io::{BufReader, SeekFrom},
4
- mem::ManuallyDrop,
5
- };
6
-
7
- use bytes::Bytes;
8
- use magnus::{value::Opaque, Ruby, Value};
9
- use parquet::{
10
- errors::ParquetError,
11
- file::reader::{ChunkReader, Length},
12
- };
13
- use std::io::Read;
14
-
15
- use crate::ruby_reader::{build_ruby_reader, SeekableRead};
16
-
17
- const READ_BUFFER_SIZE: usize = 16 * 1024;
18
-
19
- pub struct SeekableRubyValue(pub Opaque<Value>);
20
-
21
- impl Length for SeekableRubyValue {
22
- fn len(&self) -> u64 {
23
- let ruby = unsafe { Ruby::get_unchecked() };
24
- let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
25
- let current_pos = reader.seek(SeekFrom::Current(0)).unwrap();
26
- let file_len = reader.seek(SeekFrom::End(0)).unwrap();
27
- reader.seek(SeekFrom::Start(current_pos)).unwrap();
28
- file_len
29
- }
30
- }
31
-
32
- impl ChunkReader for SeekableRubyValue {
33
- type T = BufReader<Box<dyn SeekableRead>>;
34
-
35
- fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
36
- let ruby = unsafe { Ruby::get_unchecked() };
37
- let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
38
- reader.seek(SeekFrom::Start(start))?;
39
- Ok(BufReader::with_capacity(READ_BUFFER_SIZE, reader))
40
- }
41
-
42
- fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
43
- let ruby = unsafe { Ruby::get_unchecked() };
44
- let mut buffer = Vec::with_capacity(length);
45
- let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
46
- reader.seek(SeekFrom::Start(start))?;
47
- let read = reader.take(length as _).read_to_end(&mut buffer)?;
48
-
49
- if read != length {
50
- return Err(ParquetError::EOF(format!(
51
- "Expected to read {} bytes, read only {}",
52
- length, read
53
- )));
54
- }
55
- Ok(buffer.into())
56
- }
57
- }
58
-
59
- pub struct ForgottenFileHandle(pub ManuallyDrop<File>);
60
-
61
- impl Length for ForgottenFileHandle {
62
- fn len(&self) -> u64 {
63
- self.0.len()
64
- }
65
- }
66
-
67
- impl ChunkReader for ForgottenFileHandle {
68
- type T = BufReader<File>;
69
-
70
- fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
71
- self.0.get_read(start)
72
- }
73
-
74
- fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
75
- self.0.get_bytes(start, length)
76
- }
77
- }
@@ -1,171 +0,0 @@
1
- use magnus::{
2
- value::{Opaque, ReprValue},
3
- RClass, RString, Ruby, Value,
4
- };
5
- use std::io::{self, Read, Seek, SeekFrom, Write};
6
- use std::sync::OnceLock;
7
-
8
- static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
9
-
10
/// Adapter exposing a Ruby-side input (String, StringIO, or an IO-like object)
/// through Rust's standard I/O traits.
pub struct RubyReader<T> {
    /// The wrapped Ruby object.
    inner: T,
    /// Current read position in bytes; used by the string-backed reader.
    offset: usize,
}
16
-
17
/// Marker trait combining [`Read`] and [`Seek`] so that both can sit behind a
/// single trait object.
pub trait SeekableRead: Read + Seek {}

/// Every type that is both readable and seekable is a `SeekableRead`.
impl<T> SeekableRead for T where T: Read + Seek {}
19
-
20
- pub fn build_ruby_reader(
21
- ruby: &Ruby,
22
- input: Value,
23
- ) -> Result<Box<dyn SeekableRead>, magnus::Error> {
24
- if RubyReader::is_string_io(ruby, &input) {
25
- RubyReader::from_string_io(ruby, input)
26
- } else if RubyReader::is_io_like(&input) {
27
- RubyReader::from_io(input)
28
- } else {
29
- RubyReader::from_string_like(input)
30
- }
31
- }
32
-
33
- impl Seek for RubyReader<Value> {
34
- fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
35
- let (whence, offset) = match pos {
36
- SeekFrom::Start(i) => (0, i as i64),
37
- SeekFrom::Current(i) => (1, i),
38
- SeekFrom::End(i) => (2, i),
39
- };
40
-
41
- let new_position = self
42
- .inner
43
- .funcall("seek", (offset, whence))
44
- .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
45
-
46
- Ok(new_position)
47
- }
48
- }
49
-
50
- impl Write for RubyReader<Value> {
51
- fn write(&mut self, buf: &[u8]) -> Result<usize, io::Error> {
52
- let ruby_bytes = RString::from_slice(buf);
53
-
54
- let bytes_written = self
55
- .inner
56
- .funcall::<_, _, usize>("write", (ruby_bytes,))
57
- .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
58
-
59
- Ok(bytes_written)
60
- }
61
-
62
- fn flush(&mut self) -> Result<(), io::Error> {
63
- self.inner
64
- .funcall::<_, _, Value>("flush", ())
65
- .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
66
-
67
- Ok(())
68
- }
69
- }
70
-
71
- impl Seek for RubyReader<RString> {
72
- fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
73
- match pos {
74
- io::SeekFrom::Start(offset) => self.offset = offset as usize,
75
- io::SeekFrom::Current(offset) => self.offset = (self.offset as i64 + offset) as usize,
76
- io::SeekFrom::End(offset) => self.offset = self.inner.len() - offset as usize,
77
- }
78
- Ok(self.offset as u64)
79
- }
80
- }
81
-
82
- impl RubyReader<Value> {
83
- fn from_io(input: Value) -> Result<Box<dyn SeekableRead>, magnus::Error> {
84
- if Self::is_io_like(&input) {
85
- Ok(Box::new(Self::from_io_like(input)))
86
- } else {
87
- Err(magnus::Error::new(
88
- magnus::exception::type_error(),
89
- "Input is not an IO-like object",
90
- ))
91
- }
92
- }
93
-
94
- fn is_io_like(input: &Value) -> bool {
95
- input.respond_to("read", false).unwrap_or(false)
96
- }
97
-
98
- fn from_io_like(input: Value) -> Self {
99
- Self {
100
- inner: input,
101
- offset: 0,
102
- }
103
- }
104
- }
105
-
106
- impl RubyReader<RString> {
107
- pub fn from_string_io(
108
- ruby: &Ruby,
109
- input: Value,
110
- ) -> Result<Box<dyn SeekableRead>, magnus::Error> {
111
- if !Self::is_string_io(ruby, &input) {
112
- return Err(magnus::Error::new(
113
- magnus::exception::type_error(),
114
- "Input is not a StringIO",
115
- ));
116
- }
117
-
118
- let string_content = input.funcall::<_, _, RString>("string", ()).unwrap();
119
- Ok(Box::new(Self {
120
- inner: string_content,
121
- offset: 0,
122
- }))
123
- }
124
-
125
- fn is_string_io(ruby: &Ruby, input: &Value) -> bool {
126
- let string_io_class = STRING_IO_CLASS.get_or_init(|| {
127
- let class = RClass::from_value(ruby.eval("StringIO").unwrap()).unwrap();
128
- Opaque::from(class)
129
- });
130
- input.is_kind_of(ruby.get_inner(*string_io_class))
131
- }
132
-
133
- fn from_string_like(input: Value) -> Result<Box<dyn SeekableRead>, magnus::Error> {
134
- // Try calling `to_str`, and if that fails, try `to_s`
135
- let string_content = input
136
- .funcall::<_, _, RString>("to_str", ())
137
- .or_else(|_| input.funcall::<_, _, RString>("to_s", ()))?;
138
- Ok(Box::new(Self {
139
- inner: string_content,
140
- offset: 0,
141
- }))
142
- }
143
- }
144
-
145
- impl Read for RubyReader<Value> {
146
- fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
147
- let bytes = self
148
- .inner
149
- .funcall::<_, _, RString>("read", (buf.len(),))
150
- .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
151
-
152
- buf.write_all(unsafe { bytes.as_slice() })?;
153
-
154
- Ok(bytes.len())
155
- }
156
- }
157
-
158
- impl Read for RubyReader<RString> {
159
- fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
160
- let string_buffer = unsafe { self.inner.as_slice() };
161
- if self.offset >= string_buffer.len() {
162
- return Ok(0); // EOF
163
- }
164
-
165
- let remaining = string_buffer.len() - self.offset;
166
- let copy_size = remaining.min(buf.len());
167
- buf[..copy_size].copy_from_slice(&string_buffer[self.offset..self.offset + copy_size]);
168
- self.offset += copy_size;
169
- Ok(copy_size)
170
- }
171
- }
@@ -1,75 +0,0 @@
1
/// Selects the shape in which parsed results are produced: `Hash` or `Array`.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum ParserResultType {
    Hash,
    Array,
}

impl ParserResultType {
    /// Yields every variant, `Hash` first and `Array` second.
    pub fn iter() -> impl Iterator<Item = Self> {
        IntoIterator::into_iter([Self::Hash, Self::Array])
    }
}

impl TryFrom<&str> for ParserResultType {
    type Error = String;

    /// Parses the exact lowercase names `"hash"` and `"array"`; any other input
    /// yields an error message that includes the rejected value.
    fn try_from(value: &str) -> Result<Self, Self::Error> {
        if value == "hash" {
            return Ok(Self::Hash);
        }
        if value == "array" {
            return Ok(Self::Array);
        }
        Err(format!("Invalid parser result type: {}", value))
    }
}

impl TryFrom<String> for ParserResultType {
    type Error = String;

    /// Forwards to the `&str` conversion.
    fn try_from(value: String) -> Result<Self, Self::Error> {
        <Self as TryFrom<&str>>::try_from(value.as_str())
    }
}

impl std::fmt::Display for ParserResultType {
    /// Writes the same lowercase name that the `TryFrom<&str>` conversion accepts.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match *self {
            Self::Hash => "hash",
            Self::Array => "array",
        };
        f.write_str(label)
    }
}
41
-
42
/// Payload of [`ParquetSchemaType::List`]: the element type plus an optional
/// format string.
#[derive(Debug, Clone)]
pub struct ListField<'a> {
    /// Type of each list element.
    pub item_type: ParquetSchemaType<'a>,
    /// Optional borrowed format string (presumably applied to the elements — confirm at the use sites).
    pub format: Option<&'a str>,
}

/// Payload of [`ParquetSchemaType::Map`]: the key and value types plus an
/// optional format string.
#[derive(Debug, Clone)]
pub struct MapField<'a> {
    /// Type of the map keys.
    pub key_type: ParquetSchemaType<'a>,
    /// Type of the map values.
    pub value_type: ParquetSchemaType<'a>,
    /// Optional borrowed format string (usage not visible here — confirm at the use sites).
    pub format: Option<&'a str>,
}

/// Column types that a schema can name. The `List` and `Map` variants are boxed
/// because they contain further `ParquetSchemaType` values.
#[derive(Debug, Clone)]
pub enum ParquetSchemaType<'a> {
    Int8,
    Int16,
    Int32,
    Int64,
    UInt8,
    UInt16,
    UInt32,
    UInt64,
    Float,
    Double,
    String,
    Binary,
    Boolean,
    Date32,
    TimestampMillis,
    TimestampMicros,
    List(Box<ListField<'a>>),
    Map(Box<MapField<'a>>),
}
@@ -1,30 +0,0 @@
1
- // Re-export all public items from submodules
2
- mod core_types;
3
- mod parquet_value;
4
- mod record_types;
5
- mod timestamp;
6
- mod type_conversion;
7
- mod writer_types;
8
-
9
- pub use core_types::*;
10
- pub use parquet_value::*;
11
- pub use record_types::*;
12
- pub use timestamp::*;
13
- pub use type_conversion::*;
14
- pub use writer_types::*;
15
-
16
- // Common imports used across the module
17
- use arrow_array::cast::downcast_array;
18
- use arrow_array::{
19
- Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Float16Array, Float32Array,
20
- Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, NullArray, StringArray,
21
- StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
22
- TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
23
- };
24
- use arrow_schema::{DataType, TimeUnit};
25
- use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, TryConvert, Value};
26
- use parquet::data_type::Decimal;
27
- use parquet::record::Field;
28
- use std::{collections::HashMap, hash::BuildHasher, sync::Arc};
29
-
30
- use crate::header_cache::StringCacheKey;