deltalake-rb 0.1.1 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/Cargo.lock +504 -337
- data/README.md +11 -11
- data/ext/deltalake/Cargo.toml +5 -4
- data/ext/deltalake/src/error.rs +62 -15
- data/ext/deltalake/src/features.rs +67 -0
- data/ext/deltalake/src/lib.rs +632 -61
- data/ext/deltalake/src/merge.rs +205 -0
- data/lib/deltalake/table.rb +77 -28
- data/lib/deltalake/table_alterer.rb +33 -0
- data/lib/deltalake/table_merger.rb +38 -0
- data/lib/deltalake/table_optimizer.rb +20 -4
- data/lib/deltalake/utils.rb +59 -0
- data/lib/deltalake/version.rb +1 -1
- data/lib/deltalake.rb +34 -59
- metadata +6 -2
data/README.md
CHANGED
@@ -21,15 +21,15 @@ It can take 5-10 minutes to compile the gem.
|
|
21
21
|
Write data
|
22
22
|
|
23
23
|
```ruby
|
24
|
-
df = Polars::DataFrame.new({"
|
25
|
-
DeltaLake.write("./
|
24
|
+
df = Polars::DataFrame.new({"id" => [1, 2], "value" => [3.0, 4.0]})
|
25
|
+
DeltaLake.write("./events", df)
|
26
26
|
```
|
27
27
|
|
28
28
|
Load a table
|
29
29
|
|
30
30
|
```ruby
|
31
|
-
dt = DeltaLake::Table.new("./
|
32
|
-
|
31
|
+
dt = DeltaLake::Table.new("./events")
|
32
|
+
df = dt.to_polars
|
33
33
|
```
|
34
34
|
|
35
35
|
Get a lazy frame
|
@@ -41,31 +41,31 @@ lf = dt.to_polars(eager: false)
|
|
41
41
|
Append rows
|
42
42
|
|
43
43
|
```ruby
|
44
|
-
DeltaLake.write("./
|
44
|
+
DeltaLake.write("./events", df, mode: "append")
|
45
45
|
```
|
46
46
|
|
47
47
|
Overwrite a table
|
48
48
|
|
49
49
|
```ruby
|
50
|
-
DeltaLake.write("./
|
50
|
+
DeltaLake.write("./events", df, mode: "overwrite")
|
51
51
|
```
|
52
52
|
|
53
53
|
Add a constraint
|
54
54
|
|
55
55
|
```ruby
|
56
|
-
dt.alter.add_constraint({"
|
56
|
+
dt.alter.add_constraint({"id_gt_0" => "id > 0"})
|
57
57
|
```
|
58
58
|
|
59
59
|
Drop a constraint
|
60
60
|
|
61
61
|
```ruby
|
62
|
-
dt.alter.drop_constraint("
|
62
|
+
dt.alter.drop_constraint("id_gt_0")
|
63
63
|
```
|
64
64
|
|
65
65
|
Delete rows
|
66
66
|
|
67
67
|
```ruby
|
68
|
-
dt.delete("
|
68
|
+
dt.delete("id > 1")
|
69
69
|
```
|
70
70
|
|
71
71
|
Vacuum
|
@@ -83,13 +83,13 @@ dt.optimize.compact
|
|
83
83
|
Colocate similar data in the same files
|
84
84
|
|
85
85
|
```ruby
|
86
|
-
dt.optimize.z_order(["
|
86
|
+
dt.optimize.z_order(["category"])
|
87
87
|
```
|
88
88
|
|
89
89
|
Load a previous version of a table
|
90
90
|
|
91
91
|
```ruby
|
92
|
-
dt = DeltaLake::Table.new("./
|
92
|
+
dt = DeltaLake::Table.new("./events", version: 1)
|
93
93
|
# or
|
94
94
|
dt.load_as_version(1)
|
95
95
|
```
|
data/ext/deltalake/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "deltalake"
|
3
|
-
version = "0.1.1"
|
3
|
+
version = "0.1.3"
|
4
4
|
license = "Apache-2.0"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
@@ -11,10 +11,11 @@ publish = false
|
|
11
11
|
crate-type = ["cdylib"]
|
12
12
|
|
13
13
|
[dependencies]
|
14
|
-
arrow = { version = "
|
15
|
-
arrow-schema = { version = "
|
14
|
+
arrow = { version = "53", features = ["ffi"] }
|
15
|
+
arrow-schema = { version = "53", features = ["serde"] }
|
16
16
|
chrono = "0.4"
|
17
|
-
|
17
|
+
delta_kernel = "0.4"
|
18
|
+
deltalake = { version = "=0.22.3", features = ["azure", "datafusion", "gcs", "s3"] }
|
18
19
|
futures = "0.3"
|
19
20
|
magnus = "0.7"
|
20
21
|
num_cpus = "1"
|
data/ext/deltalake/src/error.rs
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
use arrow_schema::ArrowError;
|
2
|
+
use deltalake::protocol::ProtocolError;
|
2
3
|
use deltalake::{errors::DeltaTableError, ObjectStoreError};
|
3
|
-
use magnus::{exception, Error, Module, RModule, Ruby};
|
4
|
+
use magnus::{exception, Error as RbErr, Module, RModule, Ruby};
|
4
5
|
use std::borrow::Cow;
|
5
6
|
|
6
7
|
macro_rules! create_exception {
|
@@ -8,7 +9,7 @@ macro_rules! create_exception {
|
|
8
9
|
pub struct $type {}
|
9
10
|
|
10
11
|
impl $type {
|
11
|
-
pub fn new_err<T>(message: T) -> Error
|
12
|
+
pub fn new_err<T>(message: T) -> RbErr
|
12
13
|
where
|
13
14
|
T: Into<Cow<'static, str>>,
|
14
15
|
{
|
@@ -19,7 +20,7 @@ macro_rules! create_exception {
|
|
19
20
|
.unwrap()
|
20
21
|
.const_get($name)
|
21
22
|
.unwrap();
|
22
|
-
|
23
|
+
RbErr::new(class, message)
|
23
24
|
}
|
24
25
|
}
|
25
26
|
};
|
@@ -31,7 +32,7 @@ create_exception!(DeltaProtocolError, "DeltaProtocolError");
|
|
31
32
|
create_exception!(CommitFailedError, "CommitFailedError");
|
32
33
|
create_exception!(SchemaMismatchError, "SchemaMismatchError");
|
33
34
|
|
34
|
-
fn inner_to_rb_err(err: DeltaTableError) -> Error {
|
35
|
+
fn inner_to_rb_err(err: DeltaTableError) -> RbErr {
|
35
36
|
match err {
|
36
37
|
DeltaTableError::NotATable(msg) => TableNotFoundError::new_err(msg),
|
37
38
|
DeltaTableError::InvalidTableLocation(msg) => TableNotFoundError::new_err(msg),
|
@@ -48,7 +49,7 @@ fn inner_to_rb_err(err: DeltaTableError) -> Error {
|
|
48
49
|
|
49
50
|
// ruby exceptions
|
50
51
|
DeltaTableError::ObjectStore { source } => object_store_to_rb(source),
|
51
|
-
DeltaTableError::Io { source } =>
|
52
|
+
DeltaTableError::Io { source } => RbIOError::new_err(source.to_string()),
|
52
53
|
|
53
54
|
DeltaTableError::Arrow { source } => arrow_to_rb(source),
|
54
55
|
|
@@ -56,31 +57,50 @@ fn inner_to_rb_err(err: DeltaTableError) -> Error {
|
|
56
57
|
}
|
57
58
|
}
|
58
59
|
|
59
|
-
fn object_store_to_rb(err: ObjectStoreError) -> Error {
|
60
|
+
fn object_store_to_rb(err: ObjectStoreError) -> RbErr {
|
60
61
|
match err {
|
61
|
-
ObjectStoreError::NotFound { .. } =>
|
62
|
+
ObjectStoreError::NotFound { .. } => RbIOError::new_err(err.to_string()),
|
62
63
|
ObjectStoreError::Generic { source, .. }
|
63
64
|
if source.to_string().contains("AWS_S3_ALLOW_UNSAFE_RENAME") =>
|
64
65
|
{
|
65
66
|
DeltaProtocolError::new_err(source.to_string())
|
66
67
|
}
|
67
|
-
_ =>
|
68
|
+
_ => RbIOError::new_err(err.to_string()),
|
68
69
|
}
|
69
70
|
}
|
70
71
|
|
71
|
-
fn arrow_to_rb(err: ArrowError) -> Error {
|
72
|
+
fn arrow_to_rb(err: ArrowError) -> RbErr {
|
72
73
|
match err {
|
73
|
-
ArrowError::IoError(msg, _) =>
|
74
|
-
ArrowError::DivideByZero =>
|
75
|
-
ArrowError::InvalidArgumentError(msg) =>
|
76
|
-
ArrowError::NotYetImplemented(msg) =>
|
74
|
+
ArrowError::IoError(msg, _) => RbIOError::new_err(msg),
|
75
|
+
ArrowError::DivideByZero => RbValueError::new_err("division by zero"),
|
76
|
+
ArrowError::InvalidArgumentError(msg) => RbValueError::new_err(msg),
|
77
|
+
ArrowError::NotYetImplemented(msg) => RbNotImplementedError::new_err(msg),
|
77
78
|
ArrowError::SchemaError(msg) => SchemaMismatchError::new_err(msg),
|
78
|
-
other =>
|
79
|
+
other => RbException::new_err(other.to_string()),
|
80
|
+
}
|
81
|
+
}
|
82
|
+
|
83
|
+
fn checkpoint_to_rb(err: ProtocolError) -> RbErr {
|
84
|
+
match err {
|
85
|
+
ProtocolError::Arrow { source } => arrow_to_rb(source),
|
86
|
+
ProtocolError::ObjectStore { source } => object_store_to_rb(source),
|
87
|
+
ProtocolError::EndOfLog => DeltaProtocolError::new_err("End of log"),
|
88
|
+
ProtocolError::NoMetaData => DeltaProtocolError::new_err("Table metadata missing"),
|
89
|
+
ProtocolError::CheckpointNotFound => DeltaProtocolError::new_err(err.to_string()),
|
90
|
+
ProtocolError::InvalidField(err) => RbValueError::new_err(err),
|
91
|
+
ProtocolError::InvalidRow(err) => RbValueError::new_err(err),
|
92
|
+
ProtocolError::InvalidDeletionVectorStorageType(err) => RbValueError::new_err(err),
|
93
|
+
ProtocolError::SerializeOperation { source } => RbValueError::new_err(source.to_string()),
|
94
|
+
ProtocolError::ParquetParseError { source } => RbIOError::new_err(source.to_string()),
|
95
|
+
ProtocolError::IO { source } => RbIOError::new_err(source.to_string()),
|
96
|
+
ProtocolError::Generic(msg) => DeltaError::new_err(msg),
|
97
|
+
ProtocolError::Kernel { source } => DeltaError::new_err(source.to_string()),
|
79
98
|
}
|
80
99
|
}
|
81
100
|
|
82
101
|
pub enum RubyError {
|
83
102
|
DeltaTable(DeltaTableError),
|
103
|
+
Protocol(ProtocolError),
|
84
104
|
}
|
85
105
|
|
86
106
|
impl From<DeltaTableError> for RubyError {
|
@@ -89,10 +109,37 @@ impl From<DeltaTableError> for RubyError {
|
|
89
109
|
}
|
90
110
|
}
|
91
111
|
|
92
|
-
impl From<RubyError> for Error {
|
112
|
+
impl From<ProtocolError> for RubyError {
|
113
|
+
fn from(err: ProtocolError) -> Self {
|
114
|
+
RubyError::Protocol(err)
|
115
|
+
}
|
116
|
+
}
|
117
|
+
|
118
|
+
impl From<RubyError> for RbErr {
|
93
119
|
fn from(value: RubyError) -> Self {
|
94
120
|
match value {
|
95
121
|
RubyError::DeltaTable(err) => inner_to_rb_err(err),
|
122
|
+
RubyError::Protocol(err) => checkpoint_to_rb(err),
|
96
123
|
}
|
97
124
|
}
|
98
125
|
}
|
126
|
+
|
127
|
+
macro_rules! create_builtin_exception {
|
128
|
+
($type:ident, $class:expr) => {
|
129
|
+
pub struct $type {}
|
130
|
+
|
131
|
+
impl $type {
|
132
|
+
pub fn new_err<T>(message: T) -> RbErr
|
133
|
+
where
|
134
|
+
T: Into<Cow<'static, str>>,
|
135
|
+
{
|
136
|
+
RbErr::new($class, message)
|
137
|
+
}
|
138
|
+
}
|
139
|
+
};
|
140
|
+
}
|
141
|
+
|
142
|
+
create_builtin_exception!(RbException, exception::runtime_error());
|
143
|
+
create_builtin_exception!(RbIOError, exception::io_error());
|
144
|
+
create_builtin_exception!(RbNotImplementedError, exception::not_imp_error());
|
145
|
+
create_builtin_exception!(RbValueError, exception::arg_error());
|
@@ -0,0 +1,67 @@
|
|
1
|
+
use crate::{RbResult, RbValueError};
|
2
|
+
use deltalake::kernel::TableFeatures as KernelTableFeatures;
|
3
|
+
use magnus::{prelude::*, TryConvert, Value};
|
4
|
+
|
5
|
+
/// High level table features
|
6
|
+
#[derive(Clone)]
|
7
|
+
pub enum TableFeatures {
|
8
|
+
/// Mapping of one column to another
|
9
|
+
ColumnMapping,
|
10
|
+
/// Deletion vectors for merge, update, delete
|
11
|
+
DeletionVectors,
|
12
|
+
/// timestamps without timezone support
|
13
|
+
TimestampWithoutTimezone,
|
14
|
+
/// version 2 of checkpointing
|
15
|
+
V2Checkpoint,
|
16
|
+
/// Append Only Tables
|
17
|
+
AppendOnly,
|
18
|
+
/// Table invariants
|
19
|
+
Invariants,
|
20
|
+
/// Check constraints on columns
|
21
|
+
CheckConstraints,
|
22
|
+
/// CDF on a table
|
23
|
+
ChangeDataFeed,
|
24
|
+
/// Columns with generated values
|
25
|
+
GeneratedColumns,
|
26
|
+
/// ID Columns
|
27
|
+
IdentityColumns,
|
28
|
+
/// Row tracking on tables
|
29
|
+
RowTracking,
|
30
|
+
/// domain specific metadata
|
31
|
+
DomainMetadata,
|
32
|
+
/// Iceberg compatibility support
|
33
|
+
IcebergCompatV1,
|
34
|
+
}
|
35
|
+
|
36
|
+
impl From<TableFeatures> for KernelTableFeatures {
|
37
|
+
fn from(value: TableFeatures) -> Self {
|
38
|
+
match value {
|
39
|
+
TableFeatures::ColumnMapping => KernelTableFeatures::ColumnMapping,
|
40
|
+
TableFeatures::DeletionVectors => KernelTableFeatures::DeletionVectors,
|
41
|
+
TableFeatures::TimestampWithoutTimezone => {
|
42
|
+
KernelTableFeatures::TimestampWithoutTimezone
|
43
|
+
}
|
44
|
+
TableFeatures::V2Checkpoint => KernelTableFeatures::V2Checkpoint,
|
45
|
+
TableFeatures::AppendOnly => KernelTableFeatures::AppendOnly,
|
46
|
+
TableFeatures::Invariants => KernelTableFeatures::Invariants,
|
47
|
+
TableFeatures::CheckConstraints => KernelTableFeatures::CheckConstraints,
|
48
|
+
TableFeatures::ChangeDataFeed => KernelTableFeatures::ChangeDataFeed,
|
49
|
+
TableFeatures::GeneratedColumns => KernelTableFeatures::GeneratedColumns,
|
50
|
+
TableFeatures::IdentityColumns => KernelTableFeatures::IdentityColumns,
|
51
|
+
TableFeatures::RowTracking => KernelTableFeatures::RowTracking,
|
52
|
+
TableFeatures::DomainMetadata => KernelTableFeatures::DomainMetadata,
|
53
|
+
TableFeatures::IcebergCompatV1 => KernelTableFeatures::IcebergCompatV1,
|
54
|
+
}
|
55
|
+
}
|
56
|
+
}
|
57
|
+
|
58
|
+
impl TryConvert for TableFeatures {
|
59
|
+
fn try_convert(val: Value) -> RbResult<Self> {
|
60
|
+
// TODO add more features
|
61
|
+
let feature = match unsafe { val.to_r_string()?.as_str()? } {
|
62
|
+
"append_only" => TableFeatures::AppendOnly,
|
63
|
+
_ => return Err(RbValueError::new_err("Invalid feature")),
|
64
|
+
};
|
65
|
+
Ok(feature)
|
66
|
+
}
|
67
|
+
}
|