parquet 0.0.5 → 0.2.6

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 398a8ab4fe6b9c8e82d63ec832aa73163e75874c39080d87291a60397756df42
-  data.tar.gz: cace20e14d0eddc6e3185b2f9294253cb57c1689ec463ff66bc903d3c780af13
+  metadata.gz: 794d11142b73d13b665ecdb4ffd46df6ab7d97e5f99336e2bc91b79dbb55a514
+  data.tar.gz: eb2843d724e7aad70445a8b992a527e3bee0a79055fdeab7f2ebd2cdfb6247d6
 SHA512:
-  metadata.gz: 72ae6542b367fe433016f06fa109aaa77fe360bbc1df64e5c997db8fcc0a00aa166aa19a37240a706b3f443612770b80bc387dd41b34ee4a94ab26c3b0e74832
-  data.tar.gz: f69b10c6d4c8d879cdd3fce7c3b44933a99569358d1adfa3106760bd7c66036a2fef86737cf4dc6369be46234c124b9f2ef66e82fab118e36b5b079e9d23e10b
+  metadata.gz: 8b97550fb18f2ab4db0b5fbb170d12448237665d9372242d4027760f1c697be0d1e7a8bb47d43886f704e0923ddf57544961fe5af29c596b49aac188f714b9e6
+  data.tar.gz: 1ea56a23e39a084d40690d4e7bd108ec2a4cb20b61714bd564e68600d3f3edda3ffd5c3e646d49d4bb85632ad14f2c7d5735e645610e7a863d9e25d6f1d2b90d
data/Cargo.lock CHANGED
@@ -387,6 +387,22 @@ version = "1.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"

+[[package]]
+name = "errno"
+version = "0.3.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d"
+dependencies = [
+ "libc",
+ "windows-sys",
+]
+
+[[package]]
+name = "fastrand"
+version = "2.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
+
 [[package]]
 name = "flatbuffers"
 version = "24.12.23"
@@ -934,6 +950,12 @@ dependencies = [
  "libc",
 ]

+[[package]]
+name = "linux-raw-sys"
+version = "0.4.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
+
 [[package]]
 name = "litemap"
 version = "0.7.4"
@@ -1185,6 +1207,7 @@ dependencies = [
  "mimalloc",
  "parquet 54.0.0",
  "rb-sys",
+ "tempfile",
  "thiserror",
 ]

@@ -1377,6 +1400,19 @@ dependencies = [
  "semver",
 ]

+[[package]]
+name = "rustix"
+version = "0.38.43"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a78891ee6bf2340288408954ac787aa063d8e8817e9f53abb37c695c6d834ef6"
+dependencies = [
+ "bitflags 2.6.0",
+ "errno",
+ "libc",
+ "linux-raw-sys",
+ "windows-sys",
+]
+
 [[package]]
 name = "ryu"
 version = "1.0.18"
@@ -1530,6 +1566,20 @@ dependencies = [
  "syn",
 ]

+[[package]]
+name = "tempfile"
+version = "3.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704"
+dependencies = [
+ "cfg-if",
+ "fastrand",
+ "getrandom",
+ "once_cell",
+ "rustix",
+ "windows-sys",
+]
+
 [[package]]
 name = "thiserror"
 version = "2.0.9"
data/README.md CHANGED
@@ -4,8 +4,6 @@

 This project is a Ruby library wrapping the [parquet-rs](https://github.com/apache/parquet-rs) rust crate.

-At the moment, it only supports iterating rows as either a hash or an array.
-
 ## Usage

 This library provides high-level bindings to parquet-rs with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with a subset of columns.
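To make the two reading APIs concrete, here is a minimal sketch. The method names `each_row` and `each_column` and the `batch_size` argument appear elsewhere in this diff; the positional file-path argument and block forms are assumptions based on this README, since the corresponding example sections are not part of the diff:

```ruby
require "parquet"

# Row-wise: iterate the file one row at a time.
Parquet.each_row("data.parquet") do |row|
  puts row
end

# Column-wise: iterate batches of column values; generally faster when only
# a few columns are needed. `batch_size` is described below.
Parquet.each_column("data.parquet", batch_size: 1024) do |batch|
  puts batch
end
```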
@@ -83,3 +81,95 @@ Additional arguments for `each_column`:
 - `batch_size`: Number of rows per batch (defaults to an implementation-defined value)

 When no block is given, both methods return an Enumerator.
+
+### Writing Row-wise Data
+
+The `write_rows` method allows you to write data row by row:
+
+```ruby
+require "parquet"
+
+# Define the schema for your data
+schema = [
+  { "id" => "int64" },
+  { "name" => "string" },
+  { "score" => "double" }
+]
+
+# Create an enumerator that yields arrays of row values
+rows = [
+  [1, "Alice", 95.5],
+  [2, "Bob", 82.3],
+  [3, "Charlie", 88.7]
+].each
+
+# Write to a file
+Parquet.write_rows(rows, schema: schema, write_to: "data.parquet")
+
+# Write to an IO object
+File.open("data.parquet", "wb") do |file|
+  Parquet.write_rows(rows, schema: schema, write_to: file)
+end
+
+# Optionally specify batch size (default is 1000)
+Parquet.write_rows(rows,
+  schema: schema,
+  write_to: "data.parquet",
+  batch_size: 500
+)
+```
+
+### Writing Column-wise Data
+
+The `write_columns` method provides a more efficient way to write data in column-oriented batches:
+
+```ruby
+require "parquet"
+
+# Define the schema
+schema = [
+  { "id" => "int64" },
+  { "name" => "string" },
+  { "score" => "double" }
+]
+
+# Create batches of column data
+batches = [
+  # First batch
+  [
+    [1, 2],           # id column
+    ["Alice", "Bob"], # name column
+    [95.5, 82.3]      # score column
+  ],
+  # Second batch
+  [
+    [3],         # id column
+    ["Charlie"], # name column
+    [88.7]       # score column
+  ]
+]
+
+# Create an enumerator from the batches
+columns = batches.each
+
+# Write to a parquet file
+Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
+
+# Write to an IO object
+File.open("data.parquet", "wb") do |file|
+  Parquet.write_columns(columns, schema: schema, write_to: file)
+end
+```
+
+The following data types are supported in the schema:
+
+- `int8`, `int16`, `int32`, `int64`
+- `uint8`, `uint16`, `uint32`, `uint64`
+- `float`, `double`
+- `string`
+- `binary`
+- `boolean`
+- `date32`
+- `timestamp_millis`, `timestamp_micros`
+
+Note: List and Map types are currently not supported.
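Combining the writer API with the type list above, a hedged round-trip sketch; how Ruby objects such as `Date` and `Time` map onto `date32` and `timestamp_millis`, and reading back with `each_row`, are assumptions rather than behavior shown in this diff:

```ruby
require "date"
require "parquet"

# Schema exercising a few more of the supported types.
schema = [
  { "id" => "int64" },
  { "active" => "boolean" },
  { "signup" => "date32" },
  { "last_seen" => "timestamp_millis" }
]

rows = [
  [1, true, Date.new(2024, 1, 15), Time.now],
  [2, false, Date.new(2024, 2, 1), Time.now]
].each

Parquet.write_rows(rows, schema: schema, write_to: "users.parquet")

# With no block, each_row returns an Enumerator, so the file can be
# inspected lazily, e.g. by taking just the first row.
first_row = Parquet.each_row("users.parquet").first
```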
@@ -17,6 +17,7 @@ magnus = { version = "0.7", features = ["rb-sys"] }
 parquet = { version = "^54.0", features = ["json", "object_store"] }
 rb-sys = "^0.9"
 thiserror = "2.0"
+tempfile = "^3.15"

 [target.'cfg(target_os = "linux")'.dependencies]
 jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
@@ -6,9 +6,7 @@ mod ruby_integration;
 mod ruby_reader;
 mod types;
 mod utils;
-
-mod parquet_column_reader;
-mod parquet_row_reader;
+mod writer;

 use crate::enumerator::*;
 use crate::reader::*;
@@ -16,6 +14,8 @@ use crate::ruby_integration::*;
 use crate::types::*;

 use magnus::{Error, Ruby};
+use writer::write_columns;
+use writer::write_rows;

 /// Initializes the Ruby extension and defines methods.
 #[magnus::init]
@@ -23,5 +23,7 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
     let module = ruby.define_module("Parquet")?;
     module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
     module.define_module_function("each_column", magnus::method!(parse_parquet_columns, -1))?;
+    module.define_module_function("write_rows", magnus::function!(write_rows, -1))?;
+    module.define_module_function("write_columns", magnus::function!(write_columns, -1))?;
     Ok(())
 }
@@ -1,11 +1,14 @@
+mod parquet_column_reader;
+mod parquet_row_reader;
+
 use std::io;

 use magnus::{Error as MagnusError, Ruby};
 use thiserror::Error;

 use crate::header_cache::CacheError;
-pub use crate::parquet_column_reader::parse_parquet_columns;
-pub use crate::parquet_row_reader::parse_parquet_rows;
+pub use parquet_column_reader::parse_parquet_columns;
+pub use parquet_row_reader::parse_parquet_rows;

 #[derive(Error, Debug)]
 pub enum ReaderError {
@@ -0,0 +1,73 @@
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum ParserResultType {
+    Hash,
+    Array,
+}
+
+impl ParserResultType {
+    pub fn iter() -> impl Iterator<Item = Self> {
+        [Self::Hash, Self::Array].into_iter()
+    }
+}
+
+impl TryFrom<&str> for ParserResultType {
+    type Error = String;
+
+    fn try_from(value: &str) -> Result<Self, Self::Error> {
+        match value {
+            "hash" => Ok(ParserResultType::Hash),
+            "array" => Ok(ParserResultType::Array),
+            _ => Err(format!("Invalid parser result type: {}", value)),
+        }
+    }
+}
+
+impl TryFrom<String> for ParserResultType {
+    type Error = String;
+
+    fn try_from(value: String) -> Result<Self, Self::Error> {
+        Self::try_from(value.as_str())
+    }
+}
+
+impl std::fmt::Display for ParserResultType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            ParserResultType::Hash => write!(f, "hash"),
+            ParserResultType::Array => write!(f, "array"),
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct ListField {
+    pub item_type: ParquetSchemaType,
+}
+
+#[derive(Debug, Clone)]
+pub struct MapField {
+    pub key_type: ParquetSchemaType,
+    pub value_type: ParquetSchemaType,
+}
+
+#[derive(Debug, Clone)]
+pub enum ParquetSchemaType {
+    Int8,
+    Int16,
+    Int32,
+    Int64,
+    UInt8,
+    UInt16,
+    UInt32,
+    UInt64,
+    Float,
+    Double,
+    String,
+    Binary,
+    Boolean,
+    Date32,
+    TimestampMillis,
+    TimestampMicros,
+    List(Box<ListField>),
+    Map(Box<MapField>),
+}
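The `ParserResultType` enum above corresponds to the hash/array row representations mentioned in the README; a hedged sketch of how that choice might be passed from Ruby, assuming a `result_type:` keyword whose exact Ruby-side name is not shown in this diff:

```ruby
require "parquet"

# "hash" and "array" are the strings accepted by ParserResultType::try_from;
# the `result_type:` keyword name itself is an assumption.
Parquet.each_row("data.parquet", result_type: "hash") do |row|
  puts row["id"]
end

Parquet.each_row("data.parquet", result_type: "array") do |row|
  puts row[0]
end
```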
@@ -0,0 +1,30 @@
+// Re-export all public items from submodules
+mod core_types;
+mod parquet_value;
+mod record_types;
+mod timestamp;
+mod type_conversion;
+mod writer_types;
+
+pub use core_types::*;
+pub use parquet_value::*;
+pub use record_types::*;
+pub use timestamp::*;
+pub use type_conversion::*;
+pub use writer_types::*;
+
+// Common imports used across the module
+use arrow_array::cast::downcast_array;
+use arrow_array::{
+    Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Float16Array, Float32Array,
+    Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, NullArray, StringArray,
+    StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
+    TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
+};
+use arrow_schema::{DataType, TimeUnit};
+use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, TryConvert, Value};
+use parquet::data_type::Decimal;
+use parquet::record::Field;
+use std::{collections::HashMap, hash::BuildHasher, sync::Arc};
+
+use crate::header_cache::StringCacheKey;