deltalake-rb 0.1.0 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/Cargo.lock +506 -337
- data/README.md +33 -3
- data/ext/deltalake/Cargo.toml +7 -4
- data/ext/deltalake/src/error.rs +62 -15
- data/ext/deltalake/src/features.rs +67 -0
- data/ext/deltalake/src/lib.rs +1114 -48
- data/ext/deltalake/src/merge.rs +205 -0
- data/lib/deltalake/table.rb +170 -10
- data/lib/deltalake/table_alterer.rb +58 -0
- data/lib/deltalake/table_merger.rb +38 -0
- data/lib/deltalake/table_optimizer.rb +67 -0
- data/lib/deltalake/utils.rb +59 -0
- data/lib/deltalake/version.rb +1 -1
- data/lib/deltalake.rb +50 -12
- metadata +8 -2
data/README.md
CHANGED
@@ -14,7 +14,7 @@ Add this line to your application’s Gemfile:
|
|
14
14
|
gem "deltalake-rb"
|
15
15
|
```
|
16
16
|
|
17
|
-
It can take
|
17
|
+
It can take 5-10 minutes to compile the gem.
|
18
18
|
|
19
19
|
## Getting Started
|
20
20
|
|
@@ -50,6 +50,18 @@ Overwrite a table
|
|
50
50
|
DeltaLake.write("./data/delta", df, mode: "overwrite")
|
51
51
|
```
|
52
52
|
|
53
|
+
Add a constraint
|
54
|
+
|
55
|
+
```ruby
|
56
|
+
dt.alter.add_constraint({"a_gt_0" => "a > 0"})
|
57
|
+
```
|
58
|
+
|
59
|
+
Drop a constraint
|
60
|
+
|
61
|
+
```ruby
|
62
|
+
dt.alter.drop_constraint("a_gt_0")
|
63
|
+
```
|
64
|
+
|
53
65
|
Delete rows
|
54
66
|
|
55
67
|
```ruby
|
@@ -62,6 +74,18 @@ Vacuum
|
|
62
74
|
dt.vacuum(dry_run: false)
|
63
75
|
```
|
64
76
|
|
77
|
+
Perform small file compaction
|
78
|
+
|
79
|
+
```ruby
|
80
|
+
dt.optimize.compact
|
81
|
+
```
|
82
|
+
|
83
|
+
Colocate similar data in the same files
|
84
|
+
|
85
|
+
```ruby
|
86
|
+
dt.optimize.z_order(["a"])
|
87
|
+
```
|
88
|
+
|
65
89
|
Load a previous version of a table
|
66
90
|
|
67
91
|
```ruby
|
@@ -70,16 +94,22 @@ dt = DeltaLake::Table.new("./data/delta", version: 1)
|
|
70
94
|
dt.load_as_version(1)
|
71
95
|
```
|
72
96
|
|
97
|
+
Get the schema
|
98
|
+
|
99
|
+
```ruby
|
100
|
+
dt.schema
|
101
|
+
```
|
102
|
+
|
73
103
|
Get metadata
|
74
104
|
|
75
105
|
```ruby
|
76
106
|
dt.metadata
|
77
107
|
```
|
78
108
|
|
79
|
-
Get
|
109
|
+
Get history
|
80
110
|
|
81
111
|
```ruby
|
82
|
-
dt.
|
112
|
+
dt.history
|
83
113
|
```
|
84
114
|
|
85
115
|
## API
|
data/ext/deltalake/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "deltalake"
|
3
|
-
version = "0.1.0"
|
3
|
+
version = "0.1.2"
|
4
4
|
license = "Apache-2.0"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
@@ -11,11 +11,14 @@ publish = false
|
|
11
11
|
crate-type = ["cdylib"]
|
12
12
|
|
13
13
|
[dependencies]
|
14
|
-
arrow = { version = "
|
15
|
-
arrow-schema = { version = "
|
14
|
+
arrow = { version = "53", features = ["ffi"] }
|
15
|
+
arrow-schema = { version = "53", features = ["serde"] }
|
16
16
|
chrono = "0.4"
|
17
|
-
|
17
|
+
delta_kernel = "0.4"
|
18
|
+
deltalake = { version = "=0.22.2", features = ["azure", "datafusion", "gcs", "s3"] }
|
19
|
+
futures = "0.3"
|
18
20
|
magnus = "0.7"
|
21
|
+
num_cpus = "1"
|
19
22
|
serde = "1"
|
20
23
|
serde_json = "1"
|
21
24
|
tokio = { version = "1", features = ["rt-multi-thread"] }
|
data/ext/deltalake/src/error.rs
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
use arrow_schema::ArrowError;
|
2
|
+
use deltalake::protocol::ProtocolError;
|
2
3
|
use deltalake::{errors::DeltaTableError, ObjectStoreError};
|
3
|
-
use magnus::{exception, Error, Module, RModule, Ruby};
|
4
|
+
use magnus::{exception, Error as RbErr, Module, RModule, Ruby};
|
4
5
|
use std::borrow::Cow;
|
5
6
|
|
6
7
|
macro_rules! create_exception {
|
@@ -8,7 +9,7 @@ macro_rules! create_exception {
|
|
8
9
|
pub struct $type {}
|
9
10
|
|
10
11
|
impl $type {
|
11
|
-
pub fn new_err<T>(message: T) -> Error
|
12
|
+
pub fn new_err<T>(message: T) -> RbErr
|
12
13
|
where
|
13
14
|
T: Into<Cow<'static, str>>,
|
14
15
|
{
|
@@ -19,7 +20,7 @@ macro_rules! create_exception {
|
|
19
20
|
.unwrap()
|
20
21
|
.const_get($name)
|
21
22
|
.unwrap();
|
22
|
-
|
23
|
+
RbErr::new(class, message)
|
23
24
|
}
|
24
25
|
}
|
25
26
|
};
|
@@ -31,7 +32,7 @@ create_exception!(DeltaProtocolError, "DeltaProtocolError");
|
|
31
32
|
create_exception!(CommitFailedError, "CommitFailedError");
|
32
33
|
create_exception!(SchemaMismatchError, "SchemaMismatchError");
|
33
34
|
|
34
|
-
fn inner_to_rb_err(err: DeltaTableError) -> Error {
|
35
|
+
fn inner_to_rb_err(err: DeltaTableError) -> RbErr {
|
35
36
|
match err {
|
36
37
|
DeltaTableError::NotATable(msg) => TableNotFoundError::new_err(msg),
|
37
38
|
DeltaTableError::InvalidTableLocation(msg) => TableNotFoundError::new_err(msg),
|
@@ -48,7 +49,7 @@ fn inner_to_rb_err(err: DeltaTableError) -> Error {
|
|
48
49
|
|
49
50
|
// ruby exceptions
|
50
51
|
DeltaTableError::ObjectStore { source } => object_store_to_rb(source),
|
51
|
-
DeltaTableError::Io { source } =>
|
52
|
+
DeltaTableError::Io { source } => RbIOError::new_err(source.to_string()),
|
52
53
|
|
53
54
|
DeltaTableError::Arrow { source } => arrow_to_rb(source),
|
54
55
|
|
@@ -56,31 +57,50 @@ fn inner_to_rb_err(err: DeltaTableError) -> Error {
|
|
56
57
|
}
|
57
58
|
}
|
58
59
|
|
59
|
-
fn object_store_to_rb(err: ObjectStoreError) -> Error {
|
60
|
+
fn object_store_to_rb(err: ObjectStoreError) -> RbErr {
|
60
61
|
match err {
|
61
|
-
ObjectStoreError::NotFound { .. } =>
|
62
|
+
ObjectStoreError::NotFound { .. } => RbIOError::new_err(err.to_string()),
|
62
63
|
ObjectStoreError::Generic { source, .. }
|
63
64
|
if source.to_string().contains("AWS_S3_ALLOW_UNSAFE_RENAME") =>
|
64
65
|
{
|
65
66
|
DeltaProtocolError::new_err(source.to_string())
|
66
67
|
}
|
67
|
-
_ =>
|
68
|
+
_ => RbIOError::new_err(err.to_string()),
|
68
69
|
}
|
69
70
|
}
|
70
71
|
|
71
|
-
fn arrow_to_rb(err: ArrowError) -> Error {
|
72
|
+
fn arrow_to_rb(err: ArrowError) -> RbErr {
|
72
73
|
match err {
|
73
|
-
ArrowError::IoError(msg, _) =>
|
74
|
-
ArrowError::DivideByZero =>
|
75
|
-
ArrowError::InvalidArgumentError(msg) =>
|
76
|
-
ArrowError::NotYetImplemented(msg) =>
|
74
|
+
ArrowError::IoError(msg, _) => RbIOError::new_err(msg),
|
75
|
+
ArrowError::DivideByZero => RbValueError::new_err("division by zero"),
|
76
|
+
ArrowError::InvalidArgumentError(msg) => RbValueError::new_err(msg),
|
77
|
+
ArrowError::NotYetImplemented(msg) => RbNotImplementedError::new_err(msg),
|
77
78
|
ArrowError::SchemaError(msg) => SchemaMismatchError::new_err(msg),
|
78
|
-
other =>
|
79
|
+
other => RbException::new_err(other.to_string()),
|
80
|
+
}
|
81
|
+
}
|
82
|
+
|
83
|
+
fn checkpoint_to_rb(err: ProtocolError) -> RbErr {
|
84
|
+
match err {
|
85
|
+
ProtocolError::Arrow { source } => arrow_to_rb(source),
|
86
|
+
ProtocolError::ObjectStore { source } => object_store_to_rb(source),
|
87
|
+
ProtocolError::EndOfLog => DeltaProtocolError::new_err("End of log"),
|
88
|
+
ProtocolError::NoMetaData => DeltaProtocolError::new_err("Table metadata missing"),
|
89
|
+
ProtocolError::CheckpointNotFound => DeltaProtocolError::new_err(err.to_string()),
|
90
|
+
ProtocolError::InvalidField(err) => RbValueError::new_err(err),
|
91
|
+
ProtocolError::InvalidRow(err) => RbValueError::new_err(err),
|
92
|
+
ProtocolError::InvalidDeletionVectorStorageType(err) => RbValueError::new_err(err),
|
93
|
+
ProtocolError::SerializeOperation { source } => RbValueError::new_err(source.to_string()),
|
94
|
+
ProtocolError::ParquetParseError { source } => RbIOError::new_err(source.to_string()),
|
95
|
+
ProtocolError::IO { source } => RbIOError::new_err(source.to_string()),
|
96
|
+
ProtocolError::Generic(msg) => DeltaError::new_err(msg),
|
97
|
+
ProtocolError::Kernel { source } => DeltaError::new_err(source.to_string()),
|
79
98
|
}
|
80
99
|
}
|
81
100
|
|
82
101
|
pub enum RubyError {
|
83
102
|
DeltaTable(DeltaTableError),
|
103
|
+
Protocol(ProtocolError),
|
84
104
|
}
|
85
105
|
|
86
106
|
impl From<DeltaTableError> for RubyError {
|
@@ -89,10 +109,37 @@ impl From<DeltaTableError> for RubyError {
|
|
89
109
|
}
|
90
110
|
}
|
91
111
|
|
92
|
-
impl From<RubyError> for Error {
|
112
|
+
impl From<ProtocolError> for RubyError {
|
113
|
+
fn from(err: ProtocolError) -> Self {
|
114
|
+
RubyError::Protocol(err)
|
115
|
+
}
|
116
|
+
}
|
117
|
+
|
118
|
+
impl From<RubyError> for RbErr {
|
93
119
|
fn from(value: RubyError) -> Self {
|
94
120
|
match value {
|
95
121
|
RubyError::DeltaTable(err) => inner_to_rb_err(err),
|
122
|
+
RubyError::Protocol(err) => checkpoint_to_rb(err),
|
96
123
|
}
|
97
124
|
}
|
98
125
|
}
|
126
|
+
|
127
|
+
macro_rules! create_builtin_exception {
|
128
|
+
($type:ident, $class:expr) => {
|
129
|
+
pub struct $type {}
|
130
|
+
|
131
|
+
impl $type {
|
132
|
+
pub fn new_err<T>(message: T) -> RbErr
|
133
|
+
where
|
134
|
+
T: Into<Cow<'static, str>>,
|
135
|
+
{
|
136
|
+
RbErr::new($class, message)
|
137
|
+
}
|
138
|
+
}
|
139
|
+
};
|
140
|
+
}
|
141
|
+
|
142
|
+
create_builtin_exception!(RbException, exception::runtime_error());
|
143
|
+
create_builtin_exception!(RbIOError, exception::io_error());
|
144
|
+
create_builtin_exception!(RbNotImplementedError, exception::not_imp_error());
|
145
|
+
create_builtin_exception!(RbValueError, exception::arg_error());
|
data/ext/deltalake/src/features.rs
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
use crate::{RbResult, RbValueError};
|
2
|
+
use deltalake::kernel::TableFeatures as KernelTableFeatures;
|
3
|
+
use magnus::{prelude::*, TryConvert, Value};
|
4
|
+
|
5
|
+
/// High level table features
|
6
|
+
#[derive(Clone)]
|
7
|
+
pub enum TableFeatures {
|
8
|
+
/// Mapping of one column to another
|
9
|
+
ColumnMapping,
|
10
|
+
/// Deletion vectors for merge, update, delete
|
11
|
+
DeletionVectors,
|
12
|
+
/// timestamps without timezone support
|
13
|
+
TimestampWithoutTimezone,
|
14
|
+
/// version 2 of checkpointing
|
15
|
+
V2Checkpoint,
|
16
|
+
/// Append Only Tables
|
17
|
+
AppendOnly,
|
18
|
+
/// Table invariants
|
19
|
+
Invariants,
|
20
|
+
/// Check constraints on columns
|
21
|
+
CheckConstraints,
|
22
|
+
/// CDF on a table
|
23
|
+
ChangeDataFeed,
|
24
|
+
/// Columns with generated values
|
25
|
+
GeneratedColumns,
|
26
|
+
/// ID Columns
|
27
|
+
IdentityColumns,
|
28
|
+
/// Row tracking on tables
|
29
|
+
RowTracking,
|
30
|
+
/// domain specific metadata
|
31
|
+
DomainMetadata,
|
32
|
+
/// Iceberg compatibility support
|
33
|
+
IcebergCompatV1,
|
34
|
+
}
|
35
|
+
|
36
|
+
impl From<TableFeatures> for KernelTableFeatures {
|
37
|
+
fn from(value: TableFeatures) -> Self {
|
38
|
+
match value {
|
39
|
+
TableFeatures::ColumnMapping => KernelTableFeatures::ColumnMapping,
|
40
|
+
TableFeatures::DeletionVectors => KernelTableFeatures::DeletionVectors,
|
41
|
+
TableFeatures::TimestampWithoutTimezone => {
|
42
|
+
KernelTableFeatures::TimestampWithoutTimezone
|
43
|
+
}
|
44
|
+
TableFeatures::V2Checkpoint => KernelTableFeatures::V2Checkpoint,
|
45
|
+
TableFeatures::AppendOnly => KernelTableFeatures::AppendOnly,
|
46
|
+
TableFeatures::Invariants => KernelTableFeatures::Invariants,
|
47
|
+
TableFeatures::CheckConstraints => KernelTableFeatures::CheckConstraints,
|
48
|
+
TableFeatures::ChangeDataFeed => KernelTableFeatures::ChangeDataFeed,
|
49
|
+
TableFeatures::GeneratedColumns => KernelTableFeatures::GeneratedColumns,
|
50
|
+
TableFeatures::IdentityColumns => KernelTableFeatures::IdentityColumns,
|
51
|
+
TableFeatures::RowTracking => KernelTableFeatures::RowTracking,
|
52
|
+
TableFeatures::DomainMetadata => KernelTableFeatures::DomainMetadata,
|
53
|
+
TableFeatures::IcebergCompatV1 => KernelTableFeatures::IcebergCompatV1,
|
54
|
+
}
|
55
|
+
}
|
56
|
+
}
|
57
|
+
|
58
|
+
impl TryConvert for TableFeatures {
|
59
|
+
fn try_convert(val: Value) -> RbResult<Self> {
|
60
|
+
// TODO add more features
|
61
|
+
let feature = match unsafe { val.to_r_string()?.as_str()? } {
|
62
|
+
"append_only" => TableFeatures::AppendOnly,
|
63
|
+
_ => return Err(RbValueError::new_err("Invalid feature")),
|
64
|
+
};
|
65
|
+
Ok(feature)
|
66
|
+
}
|
67
|
+
}
|