deltalake-rb 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/Cargo.lock +506 -337
- data/README.md +33 -3
- data/ext/deltalake/Cargo.toml +7 -4
- data/ext/deltalake/src/error.rs +62 -15
- data/ext/deltalake/src/features.rs +67 -0
- data/ext/deltalake/src/lib.rs +1114 -48
- data/ext/deltalake/src/merge.rs +205 -0
- data/lib/deltalake/table.rb +170 -10
- data/lib/deltalake/table_alterer.rb +58 -0
- data/lib/deltalake/table_merger.rb +38 -0
- data/lib/deltalake/table_optimizer.rb +67 -0
- data/lib/deltalake/utils.rb +59 -0
- data/lib/deltalake/version.rb +1 -1
- data/lib/deltalake.rb +50 -12
- metadata +8 -2
data/README.md
CHANGED
@@ -14,7 +14,7 @@ Add this line to your application’s Gemfile:
|
|
14
14
|
gem "deltalake-rb"
|
15
15
|
```
|
16
16
|
|
17
|
-
It can take
|
17
|
+
It can take 5-10 minutes to compile the gem.
|
18
18
|
|
19
19
|
## Getting Started
|
20
20
|
|
@@ -50,6 +50,18 @@ Overwrite a table
|
|
50
50
|
DeltaLake.write("./data/delta", df, mode: "overwrite")
|
51
51
|
```
|
52
52
|
|
53
|
+
Add a constraint
|
54
|
+
|
55
|
+
```ruby
|
56
|
+
dt.alter.add_constraint({"a_gt_0" => "a > 0"})
|
57
|
+
```
|
58
|
+
|
59
|
+
Drop a constraint
|
60
|
+
|
61
|
+
```ruby
|
62
|
+
dt.alter.drop_constraint("a_gt_0")
|
63
|
+
```
|
64
|
+
|
53
65
|
Delete rows
|
54
66
|
|
55
67
|
```ruby
|
@@ -62,6 +74,18 @@ Vacuum
|
|
62
74
|
dt.vacuum(dry_run: false)
|
63
75
|
```
|
64
76
|
|
77
|
+
Perform small file compaction
|
78
|
+
|
79
|
+
```ruby
|
80
|
+
dt.optimize.compact
|
81
|
+
```
|
82
|
+
|
83
|
+
Colocate similar data in the same files
|
84
|
+
|
85
|
+
```ruby
|
86
|
+
dt.optimize.z_order(["a"])
|
87
|
+
```
|
88
|
+
|
65
89
|
Load a previous version of a table
|
66
90
|
|
67
91
|
```ruby
|
@@ -70,16 +94,22 @@ dt = DeltaLake::Table.new("./data/delta", version: 1)
|
|
70
94
|
dt.load_as_version(1)
|
71
95
|
```
|
72
96
|
|
97
|
+
Get the schema
|
98
|
+
|
99
|
+
```ruby
|
100
|
+
dt.schema
|
101
|
+
```
|
102
|
+
|
73
103
|
Get metadata
|
74
104
|
|
75
105
|
```ruby
|
76
106
|
dt.metadata
|
77
107
|
```
|
78
108
|
|
79
|
-
Get
|
109
|
+
Get history
|
80
110
|
|
81
111
|
```ruby
|
82
|
-
dt.
|
112
|
+
dt.history
|
83
113
|
```
|
84
114
|
|
85
115
|
## API
|
data/ext/deltalake/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "deltalake"
|
3
|
-
version = "0.1.0"
|
3
|
+
version = "0.1.2"
|
4
4
|
license = "Apache-2.0"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
@@ -11,11 +11,14 @@ publish = false
|
|
11
11
|
crate-type = ["cdylib"]
|
12
12
|
|
13
13
|
[dependencies]
|
14
|
-
arrow = { version = "
|
15
|
-
arrow-schema = { version = "
|
14
|
+
arrow = { version = "53", features = ["ffi"] }
|
15
|
+
arrow-schema = { version = "53", features = ["serde"] }
|
16
16
|
chrono = "0.4"
|
17
|
-
|
17
|
+
delta_kernel = "0.4"
|
18
|
+
deltalake = { version = "=0.22.2", features = ["azure", "datafusion", "gcs", "s3"] }
|
19
|
+
futures = "0.3"
|
18
20
|
magnus = "0.7"
|
21
|
+
num_cpus = "1"
|
19
22
|
serde = "1"
|
20
23
|
serde_json = "1"
|
21
24
|
tokio = { version = "1", features = ["rt-multi-thread"] }
|
data/ext/deltalake/src/error.rs
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
use arrow_schema::ArrowError;
|
2
|
+
use deltalake::protocol::ProtocolError;
|
2
3
|
use deltalake::{errors::DeltaTableError, ObjectStoreError};
|
3
|
-
use magnus::{exception, Error, Module, RModule, Ruby};
|
4
|
+
use magnus::{exception, Error as RbErr, Module, RModule, Ruby};
|
4
5
|
use std::borrow::Cow;
|
5
6
|
|
6
7
|
macro_rules! create_exception {
|
@@ -8,7 +9,7 @@ macro_rules! create_exception {
|
|
8
9
|
pub struct $type {}
|
9
10
|
|
10
11
|
impl $type {
|
11
|
-
pub fn new_err<T>(message: T) -> Error
|
12
|
+
pub fn new_err<T>(message: T) -> RbErr
|
12
13
|
where
|
13
14
|
T: Into<Cow<'static, str>>,
|
14
15
|
{
|
@@ -19,7 +20,7 @@ macro_rules! create_exception {
|
|
19
20
|
.unwrap()
|
20
21
|
.const_get($name)
|
21
22
|
.unwrap();
|
22
|
-
|
23
|
+
RbErr::new(class, message)
|
23
24
|
}
|
24
25
|
}
|
25
26
|
};
|
@@ -31,7 +32,7 @@ create_exception!(DeltaProtocolError, "DeltaProtocolError");
|
|
31
32
|
create_exception!(CommitFailedError, "CommitFailedError");
|
32
33
|
create_exception!(SchemaMismatchError, "SchemaMismatchError");
|
33
34
|
|
34
|
-
fn inner_to_rb_err(err: DeltaTableError) -> Error {
|
35
|
+
fn inner_to_rb_err(err: DeltaTableError) -> RbErr {
|
35
36
|
match err {
|
36
37
|
DeltaTableError::NotATable(msg) => TableNotFoundError::new_err(msg),
|
37
38
|
DeltaTableError::InvalidTableLocation(msg) => TableNotFoundError::new_err(msg),
|
@@ -48,7 +49,7 @@ fn inner_to_rb_err(err: DeltaTableError) -> Error {
|
|
48
49
|
|
49
50
|
// ruby exceptions
|
50
51
|
DeltaTableError::ObjectStore { source } => object_store_to_rb(source),
|
51
|
-
DeltaTableError::Io { source } =>
|
52
|
+
DeltaTableError::Io { source } => RbIOError::new_err(source.to_string()),
|
52
53
|
|
53
54
|
DeltaTableError::Arrow { source } => arrow_to_rb(source),
|
54
55
|
|
@@ -56,31 +57,50 @@ fn inner_to_rb_err(err: DeltaTableError) -> Error {
|
|
56
57
|
}
|
57
58
|
}
|
58
59
|
|
59
|
-
fn object_store_to_rb(err: ObjectStoreError) -> Error {
|
60
|
+
fn object_store_to_rb(err: ObjectStoreError) -> RbErr {
|
60
61
|
match err {
|
61
|
-
ObjectStoreError::NotFound { .. } =>
|
62
|
+
ObjectStoreError::NotFound { .. } => RbIOError::new_err(err.to_string()),
|
62
63
|
ObjectStoreError::Generic { source, .. }
|
63
64
|
if source.to_string().contains("AWS_S3_ALLOW_UNSAFE_RENAME") =>
|
64
65
|
{
|
65
66
|
DeltaProtocolError::new_err(source.to_string())
|
66
67
|
}
|
67
|
-
_ =>
|
68
|
+
_ => RbIOError::new_err(err.to_string()),
|
68
69
|
}
|
69
70
|
}
|
70
71
|
|
71
|
-
fn arrow_to_rb(err: ArrowError) -> Error {
|
72
|
+
fn arrow_to_rb(err: ArrowError) -> RbErr {
|
72
73
|
match err {
|
73
|
-
ArrowError::IoError(msg, _) =>
|
74
|
-
ArrowError::DivideByZero =>
|
75
|
-
ArrowError::InvalidArgumentError(msg) =>
|
76
|
-
ArrowError::NotYetImplemented(msg) =>
|
74
|
+
ArrowError::IoError(msg, _) => RbIOError::new_err(msg),
|
75
|
+
ArrowError::DivideByZero => RbValueError::new_err("division by zero"),
|
76
|
+
ArrowError::InvalidArgumentError(msg) => RbValueError::new_err(msg),
|
77
|
+
ArrowError::NotYetImplemented(msg) => RbNotImplementedError::new_err(msg),
|
77
78
|
ArrowError::SchemaError(msg) => SchemaMismatchError::new_err(msg),
|
78
|
-
other =>
|
79
|
+
other => RbException::new_err(other.to_string()),
|
80
|
+
}
|
81
|
+
}
|
82
|
+
|
83
|
+
fn checkpoint_to_rb(err: ProtocolError) -> RbErr {
|
84
|
+
match err {
|
85
|
+
ProtocolError::Arrow { source } => arrow_to_rb(source),
|
86
|
+
ProtocolError::ObjectStore { source } => object_store_to_rb(source),
|
87
|
+
ProtocolError::EndOfLog => DeltaProtocolError::new_err("End of log"),
|
88
|
+
ProtocolError::NoMetaData => DeltaProtocolError::new_err("Table metadata missing"),
|
89
|
+
ProtocolError::CheckpointNotFound => DeltaProtocolError::new_err(err.to_string()),
|
90
|
+
ProtocolError::InvalidField(err) => RbValueError::new_err(err),
|
91
|
+
ProtocolError::InvalidRow(err) => RbValueError::new_err(err),
|
92
|
+
ProtocolError::InvalidDeletionVectorStorageType(err) => RbValueError::new_err(err),
|
93
|
+
ProtocolError::SerializeOperation { source } => RbValueError::new_err(source.to_string()),
|
94
|
+
ProtocolError::ParquetParseError { source } => RbIOError::new_err(source.to_string()),
|
95
|
+
ProtocolError::IO { source } => RbIOError::new_err(source.to_string()),
|
96
|
+
ProtocolError::Generic(msg) => DeltaError::new_err(msg),
|
97
|
+
ProtocolError::Kernel { source } => DeltaError::new_err(source.to_string()),
|
79
98
|
}
|
80
99
|
}
|
81
100
|
|
82
101
|
pub enum RubyError {
|
83
102
|
DeltaTable(DeltaTableError),
|
103
|
+
Protocol(ProtocolError),
|
84
104
|
}
|
85
105
|
|
86
106
|
impl From<DeltaTableError> for RubyError {
|
@@ -89,10 +109,37 @@ impl From<DeltaTableError> for RubyError {
|
|
89
109
|
}
|
90
110
|
}
|
91
111
|
|
92
|
-
impl From<RubyError> for Error {
|
112
|
+
impl From<ProtocolError> for RubyError {
|
113
|
+
fn from(err: ProtocolError) -> Self {
|
114
|
+
RubyError::Protocol(err)
|
115
|
+
}
|
116
|
+
}
|
117
|
+
|
118
|
+
impl From<RubyError> for RbErr {
|
93
119
|
fn from(value: RubyError) -> Self {
|
94
120
|
match value {
|
95
121
|
RubyError::DeltaTable(err) => inner_to_rb_err(err),
|
122
|
+
RubyError::Protocol(err) => checkpoint_to_rb(err),
|
96
123
|
}
|
97
124
|
}
|
98
125
|
}
|
126
|
+
|
127
|
+
macro_rules! create_builtin_exception {
|
128
|
+
($type:ident, $class:expr) => {
|
129
|
+
pub struct $type {}
|
130
|
+
|
131
|
+
impl $type {
|
132
|
+
pub fn new_err<T>(message: T) -> RbErr
|
133
|
+
where
|
134
|
+
T: Into<Cow<'static, str>>,
|
135
|
+
{
|
136
|
+
RbErr::new($class, message)
|
137
|
+
}
|
138
|
+
}
|
139
|
+
};
|
140
|
+
}
|
141
|
+
|
142
|
+
create_builtin_exception!(RbException, exception::runtime_error());
|
143
|
+
create_builtin_exception!(RbIOError, exception::io_error());
|
144
|
+
create_builtin_exception!(RbNotImplementedError, exception::not_imp_error());
|
145
|
+
create_builtin_exception!(RbValueError, exception::arg_error());
|
data/ext/deltalake/src/features.rs
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
use crate::{RbResult, RbValueError};
|
2
|
+
use deltalake::kernel::TableFeatures as KernelTableFeatures;
|
3
|
+
use magnus::{prelude::*, TryConvert, Value};
|
4
|
+
|
5
|
+
/// High level table features
|
6
|
+
#[derive(Clone)]
|
7
|
+
pub enum TableFeatures {
|
8
|
+
/// Mapping of one column to another
|
9
|
+
ColumnMapping,
|
10
|
+
/// Deletion vectors for merge, update, delete
|
11
|
+
DeletionVectors,
|
12
|
+
/// timestamps without timezone support
|
13
|
+
TimestampWithoutTimezone,
|
14
|
+
/// version 2 of checkpointing
|
15
|
+
V2Checkpoint,
|
16
|
+
/// Append Only Tables
|
17
|
+
AppendOnly,
|
18
|
+
/// Table invariants
|
19
|
+
Invariants,
|
20
|
+
/// Check constraints on columns
|
21
|
+
CheckConstraints,
|
22
|
+
/// CDF on a table
|
23
|
+
ChangeDataFeed,
|
24
|
+
/// Columns with generated values
|
25
|
+
GeneratedColumns,
|
26
|
+
/// ID Columns
|
27
|
+
IdentityColumns,
|
28
|
+
/// Row tracking on tables
|
29
|
+
RowTracking,
|
30
|
+
/// domain specific metadata
|
31
|
+
DomainMetadata,
|
32
|
+
/// Iceberg compatibility support
|
33
|
+
IcebergCompatV1,
|
34
|
+
}
|
35
|
+
|
36
|
+
impl From<TableFeatures> for KernelTableFeatures {
|
37
|
+
fn from(value: TableFeatures) -> Self {
|
38
|
+
match value {
|
39
|
+
TableFeatures::ColumnMapping => KernelTableFeatures::ColumnMapping,
|
40
|
+
TableFeatures::DeletionVectors => KernelTableFeatures::DeletionVectors,
|
41
|
+
TableFeatures::TimestampWithoutTimezone => {
|
42
|
+
KernelTableFeatures::TimestampWithoutTimezone
|
43
|
+
}
|
44
|
+
TableFeatures::V2Checkpoint => KernelTableFeatures::V2Checkpoint,
|
45
|
+
TableFeatures::AppendOnly => KernelTableFeatures::AppendOnly,
|
46
|
+
TableFeatures::Invariants => KernelTableFeatures::Invariants,
|
47
|
+
TableFeatures::CheckConstraints => KernelTableFeatures::CheckConstraints,
|
48
|
+
TableFeatures::ChangeDataFeed => KernelTableFeatures::ChangeDataFeed,
|
49
|
+
TableFeatures::GeneratedColumns => KernelTableFeatures::GeneratedColumns,
|
50
|
+
TableFeatures::IdentityColumns => KernelTableFeatures::IdentityColumns,
|
51
|
+
TableFeatures::RowTracking => KernelTableFeatures::RowTracking,
|
52
|
+
TableFeatures::DomainMetadata => KernelTableFeatures::DomainMetadata,
|
53
|
+
TableFeatures::IcebergCompatV1 => KernelTableFeatures::IcebergCompatV1,
|
54
|
+
}
|
55
|
+
}
|
56
|
+
}
|
57
|
+
|
58
|
+
impl TryConvert for TableFeatures {
|
59
|
+
fn try_convert(val: Value) -> RbResult<Self> {
|
60
|
+
// TODO add more features
|
61
|
+
let feature = match unsafe { val.to_r_string()?.as_str()? } {
|
62
|
+
"append_only" => TableFeatures::AppendOnly,
|
63
|
+
_ => return Err(RbValueError::new_err("Invalid feature")),
|
64
|
+
};
|
65
|
+
Ok(feature)
|
66
|
+
}
|
67
|
+
}
|