parquet 0.0.5 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 398a8ab4fe6b9c8e82d63ec832aa73163e75874c39080d87291a60397756df42
4
- data.tar.gz: cace20e14d0eddc6e3185b2f9294253cb57c1689ec463ff66bc903d3c780af13
3
+ metadata.gz: 90e876ca198a0e1871f692a382f09ceaeec670d162da26f2c102ea4eca4244bf
4
+ data.tar.gz: 96743e260cbd2fb55f6cdeaf256fbb1e915c57651fdc3f20fdd58b6a34596544
5
5
  SHA512:
6
- metadata.gz: 72ae6542b367fe433016f06fa109aaa77fe360bbc1df64e5c997db8fcc0a00aa166aa19a37240a706b3f443612770b80bc387dd41b34ee4a94ab26c3b0e74832
7
- data.tar.gz: f69b10c6d4c8d879cdd3fce7c3b44933a99569358d1adfa3106760bd7c66036a2fef86737cf4dc6369be46234c124b9f2ef66e82fab118e36b5b079e9d23e10b
6
+ metadata.gz: 1609a37c5a9bd9f1d57bb31dd02b2fdb5b608a7c044686e6ef2513c95e53e830bd7bf7048a36904465a32a5915425c7b6bf581c5b35a4fb19f950cbca20913b2
7
+ data.tar.gz: 96ec18377fc5944556760329c126f440de61d3b378bfa976a66437db03f0a51220c880afd14098a5b1968daa968d2e836c50f83bef21507789ba4df314c48148
data/Cargo.lock CHANGED
@@ -387,6 +387,22 @@ version = "1.13.0"
387
387
  source = "registry+https://github.com/rust-lang/crates.io-index"
388
388
  checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
389
389
 
390
+ [[package]]
391
+ name = "errno"
392
+ version = "0.3.10"
393
+ source = "registry+https://github.com/rust-lang/crates.io-index"
394
+ checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d"
395
+ dependencies = [
396
+ "libc",
397
+ "windows-sys",
398
+ ]
399
+
400
+ [[package]]
401
+ name = "fastrand"
402
+ version = "2.3.0"
403
+ source = "registry+https://github.com/rust-lang/crates.io-index"
404
+ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
405
+
390
406
  [[package]]
391
407
  name = "flatbuffers"
392
408
  version = "24.12.23"
@@ -934,6 +950,12 @@ dependencies = [
934
950
  "libc",
935
951
  ]
936
952
 
953
+ [[package]]
954
+ name = "linux-raw-sys"
955
+ version = "0.4.15"
956
+ source = "registry+https://github.com/rust-lang/crates.io-index"
957
+ checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
958
+
937
959
  [[package]]
938
960
  name = "litemap"
939
961
  version = "0.7.4"
@@ -1185,6 +1207,7 @@ dependencies = [
1185
1207
  "mimalloc",
1186
1208
  "parquet 54.0.0",
1187
1209
  "rb-sys",
1210
+ "tempfile",
1188
1211
  "thiserror",
1189
1212
  ]
1190
1213
 
@@ -1377,6 +1400,19 @@ dependencies = [
1377
1400
  "semver",
1378
1401
  ]
1379
1402
 
1403
+ [[package]]
1404
+ name = "rustix"
1405
+ version = "0.38.43"
1406
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1407
+ checksum = "a78891ee6bf2340288408954ac787aa063d8e8817e9f53abb37c695c6d834ef6"
1408
+ dependencies = [
1409
+ "bitflags 2.6.0",
1410
+ "errno",
1411
+ "libc",
1412
+ "linux-raw-sys",
1413
+ "windows-sys",
1414
+ ]
1415
+
1380
1416
  [[package]]
1381
1417
  name = "ryu"
1382
1418
  version = "1.0.18"
@@ -1530,6 +1566,20 @@ dependencies = [
1530
1566
  "syn",
1531
1567
  ]
1532
1568
 
1569
+ [[package]]
1570
+ name = "tempfile"
1571
+ version = "3.15.0"
1572
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1573
+ checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704"
1574
+ dependencies = [
1575
+ "cfg-if",
1576
+ "fastrand",
1577
+ "getrandom",
1578
+ "once_cell",
1579
+ "rustix",
1580
+ "windows-sys",
1581
+ ]
1582
+
1533
1583
  [[package]]
1534
1584
  name = "thiserror"
1535
1585
  version = "2.0.9"
data/README.md CHANGED
@@ -4,8 +4,6 @@
4
4
 
5
5
  This project is a Ruby library wrapping the [parquet-rs](https://github.com/apache/parquet-rs) Rust crate.
6
6
 
7
- At the moment, it only supports iterating rows as either a hash or an array.
8
-
9
7
  ## Usage
10
8
 
11
9
  This library provides high-level bindings to parquet-rs with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with a subset of columns.
@@ -83,3 +81,95 @@ Additional arguments for `each_column`:
83
81
  - `batch_size`: Number of rows per batch (defaults to an implementation-defined value)
84
82
 
85
83
  When no block is given, both methods return an Enumerator.
84
+
85
+ ### Writing Row-wise Data
86
+
87
+ The `write_rows` method allows you to write data row by row:
88
+
89
+ ```ruby
90
+ require "parquet"
91
+
92
+ # Define the schema for your data
93
+ schema = [
94
+ { "id" => "int64" },
95
+ { "name" => "string" },
96
+ { "score" => "double" }
97
+ ]
98
+
99
+ # Create an enumerator that yields arrays of row values
100
+ rows = [
101
+ [1, "Alice", 95.5],
102
+ [2, "Bob", 82.3],
103
+ [3, "Charlie", 88.7]
104
+ ].each
105
+
106
+ # Write to a file
107
+ Parquet.write_rows(rows, schema: schema, write_to: "data.parquet")
108
+
109
+ # Write to an IO object
110
+ File.open("data.parquet", "wb") do |file|
111
+ Parquet.write_rows(rows, schema: schema, write_to: file)
112
+ end
113
+
114
+ # Optionally specify batch size (default is 1000)
115
+ Parquet.write_rows(rows,
116
+ schema: schema,
117
+ write_to: "data.parquet",
118
+ batch_size: 500
119
+ )
120
+ ```
121
+
122
+ ### Writing Column-wise Data
123
+
124
+ The `write_columns` method provides a more efficient way to write data in column-oriented batches:
125
+
126
+ ```ruby
127
+ require "parquet"
128
+
129
+ # Define the schema
130
+ schema = [
131
+ { "id" => "int64" },
132
+ { "name" => "string" },
133
+ { "score" => "double" }
134
+ ]
135
+
136
+ # Create batches of column data
137
+ batches = [
138
+ # First batch
139
+ [
140
+ [1, 2], # id column
141
+ ["Alice", "Bob"], # name column
142
+ [95.5, 82.3] # score column
143
+ ],
144
+ # Second batch
145
+ [
146
+ [3], # id column
147
+ ["Charlie"], # name column
148
+ [88.7] # score column
149
+ ]
150
+ ]
151
+
152
+ # Create an enumerator from the batches
153
+ columns = batches.each
154
+
155
+ # Write to a parquet file
156
+ Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
157
+
158
+ # Write to an IO object
159
+ File.open("data.parquet", "wb") do |file|
160
+ Parquet.write_columns(columns, schema: schema, write_to: file)
161
+ end
162
+ ```
163
+
164
+ The following data types are supported in the schema:
165
+
166
+ - `int8`, `int16`, `int32`, `int64`
167
+ - `uint8`, `uint16`, `uint32`, `uint64`
168
+ - `float`, `double`
169
+ - `string`
170
+ - `binary`
171
+ - `boolean`
172
+ - `date32`
173
+ - `timestamp_millis`, `timestamp_micros`
174
+
175
+ Note: List and Map types are currently not supported.
@@ -17,6 +17,7 @@ magnus = { version = "0.7", features = ["rb-sys"] }
17
17
  parquet = { version = "^54.0", features = ["json", "object_store"] }
18
18
  rb-sys = "^0.9"
19
19
  thiserror = "2.0"
20
+ tempfile = "^3.15"
20
21
 
21
22
  [target.'cfg(target_os = "linux")'.dependencies]
22
23
  jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
@@ -6,9 +6,7 @@ mod ruby_integration;
6
6
  mod ruby_reader;
7
7
  mod types;
8
8
  mod utils;
9
-
10
- mod parquet_column_reader;
11
- mod parquet_row_reader;
9
+ mod writer;
12
10
 
13
11
  use crate::enumerator::*;
14
12
  use crate::reader::*;
@@ -16,6 +14,8 @@ use crate::ruby_integration::*;
16
14
  use crate::types::*;
17
15
 
18
16
  use magnus::{Error, Ruby};
17
+ use writer::write_columns;
18
+ use writer::write_rows;
19
19
 
20
20
  /// Initializes the Ruby extension and defines methods.
21
21
  #[magnus::init]
@@ -23,5 +23,7 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
23
23
  let module = ruby.define_module("Parquet")?;
24
24
  module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
25
25
  module.define_module_function("each_column", magnus::method!(parse_parquet_columns, -1))?;
26
+ module.define_module_function("write_rows", magnus::function!(write_rows, -1))?;
27
+ module.define_module_function("write_columns", magnus::function!(write_columns, -1))?;
26
28
  Ok(())
27
29
  }
@@ -1,11 +1,14 @@
1
+ mod parquet_column_reader;
2
+ mod parquet_row_reader;
3
+
1
4
  use std::io;
2
5
 
3
6
  use magnus::{Error as MagnusError, Ruby};
4
7
  use thiserror::Error;
5
8
 
6
9
  use crate::header_cache::CacheError;
7
- pub use crate::parquet_column_reader::parse_parquet_columns;
8
- pub use crate::parquet_row_reader::parse_parquet_rows;
10
+ pub use parquet_column_reader::parse_parquet_columns;
11
+ pub use parquet_row_reader::parse_parquet_rows;
9
12
 
10
13
  #[derive(Error, Debug)]
11
14
  pub enum ReaderError {
@@ -0,0 +1,73 @@
1
+ #[derive(Copy, Clone, Debug, PartialEq, Eq)]
2
+ pub enum ParserResultType {
3
+ Hash,
4
+ Array,
5
+ }
6
+
7
+ impl ParserResultType {
8
+ pub fn iter() -> impl Iterator<Item = Self> {
9
+ [Self::Hash, Self::Array].into_iter()
10
+ }
11
+ }
12
+
13
+ impl TryFrom<&str> for ParserResultType {
14
+ type Error = String;
15
+
16
+ fn try_from(value: &str) -> Result<Self, Self::Error> {
17
+ match value {
18
+ "hash" => Ok(ParserResultType::Hash),
19
+ "array" => Ok(ParserResultType::Array),
20
+ _ => Err(format!("Invalid parser result type: {}", value)),
21
+ }
22
+ }
23
+ }
24
+
25
+ impl TryFrom<String> for ParserResultType {
26
+ type Error = String;
27
+
28
+ fn try_from(value: String) -> Result<Self, Self::Error> {
29
+ Self::try_from(value.as_str())
30
+ }
31
+ }
32
+
33
+ impl std::fmt::Display for ParserResultType {
34
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
35
+ match self {
36
+ ParserResultType::Hash => write!(f, "hash"),
37
+ ParserResultType::Array => write!(f, "array"),
38
+ }
39
+ }
40
+ }
41
+
42
+ #[derive(Debug, Clone)]
43
+ pub struct ListField {
44
+ pub item_type: ParquetSchemaType,
45
+ }
46
+
47
+ #[derive(Debug, Clone)]
48
+ pub struct MapField {
49
+ pub key_type: ParquetSchemaType,
50
+ pub value_type: ParquetSchemaType,
51
+ }
52
+
53
+ #[derive(Debug, Clone)]
54
+ pub enum ParquetSchemaType {
55
+ Int8,
56
+ Int16,
57
+ Int32,
58
+ Int64,
59
+ UInt8,
60
+ UInt16,
61
+ UInt32,
62
+ UInt64,
63
+ Float,
64
+ Double,
65
+ String,
66
+ Binary,
67
+ Boolean,
68
+ Date32,
69
+ TimestampMillis,
70
+ TimestampMicros,
71
+ List(Box<ListField>),
72
+ Map(Box<MapField>),
73
+ }
@@ -0,0 +1,30 @@
1
+ // Re-export all public items from submodules
2
+ mod core_types;
3
+ mod parquet_value;
4
+ mod record_types;
5
+ mod timestamp;
6
+ mod type_conversion;
7
+ mod writer_types;
8
+
9
+ pub use core_types::*;
10
+ pub use parquet_value::*;
11
+ pub use record_types::*;
12
+ pub use timestamp::*;
13
+ pub use type_conversion::*;
14
+ pub use writer_types::*;
15
+
16
+ // Common imports used across the module
17
+ use arrow_array::cast::downcast_array;
18
+ use arrow_array::{
19
+ Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Float16Array, Float32Array,
20
+ Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, NullArray, StringArray,
21
+ StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
22
+ TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
23
+ };
24
+ use arrow_schema::{DataType, TimeUnit};
25
+ use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, TryConvert, Value};
26
+ use parquet::data_type::Decimal;
27
+ use parquet::record::Field;
28
+ use std::{collections::HashMap, hash::BuildHasher, sync::Arc};
29
+
30
+ use crate::header_cache::StringCacheKey;