deltalake-rb 0.1.6 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/Cargo.lock +897 -735
- data/ext/deltalake/Cargo.toml +6 -6
- data/ext/deltalake/src/error.rs +0 -27
- data/ext/deltalake/src/lib.rs +106 -103
- data/lib/deltalake/table.rb +9 -3
- data/lib/deltalake/version.rb +1 -1
- metadata +4 -4
data/ext/deltalake/Cargo.toml
CHANGED
@@ -1,21 +1,21 @@
|
|
1
1
|
[package]
|
2
2
|
name = "deltalake"
|
3
|
-
version = "0.
|
3
|
+
version = "0.2.0"
|
4
4
|
license = "Apache-2.0"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
7
|
-
rust-version = "1.
|
7
|
+
rust-version = "1.82"
|
8
8
|
publish = false
|
9
9
|
|
10
10
|
[lib]
|
11
11
|
crate-type = ["cdylib"]
|
12
12
|
|
13
13
|
[dependencies]
|
14
|
-
arrow = { version = "
|
15
|
-
arrow-schema = { version = "
|
14
|
+
arrow = { version = "55.2", features = ["ffi"] }
|
15
|
+
arrow-schema = { version = "55.2", features = ["serde"] }
|
16
16
|
chrono = "0.4"
|
17
|
-
delta_kernel = "
|
18
|
-
deltalake = { version = "=0.
|
17
|
+
delta_kernel = "0.13"
|
18
|
+
deltalake = { version = "=0.27.0", features = ["azure", "datafusion", "gcs", "s3"] }
|
19
19
|
futures = "0.3"
|
20
20
|
magnus = "0.7"
|
21
21
|
num_cpus = "1"
|
data/ext/deltalake/src/error.rs
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
use arrow_schema::ArrowError;
|
2
2
|
use deltalake::datafusion::error::DataFusionError;
|
3
|
-
use deltalake::protocol::ProtocolError;
|
4
3
|
use deltalake::{errors::DeltaTableError, ObjectStoreError};
|
5
4
|
use magnus::{exception, Error as RbErr, Module, RModule, Ruby};
|
6
5
|
use std::borrow::Cow;
|
@@ -81,31 +80,12 @@ fn arrow_to_rb(err: ArrowError) -> RbErr {
|
|
81
80
|
}
|
82
81
|
}
|
83
82
|
|
84
|
-
fn checkpoint_to_rb(err: ProtocolError) -> RbErr {
|
85
|
-
match err {
|
86
|
-
ProtocolError::Arrow { source } => arrow_to_rb(source),
|
87
|
-
ProtocolError::ObjectStore { source } => object_store_to_rb(source),
|
88
|
-
ProtocolError::EndOfLog => DeltaProtocolError::new_err("End of log"),
|
89
|
-
ProtocolError::NoMetaData => DeltaProtocolError::new_err("Table metadata missing"),
|
90
|
-
ProtocolError::CheckpointNotFound => DeltaProtocolError::new_err(err.to_string()),
|
91
|
-
ProtocolError::InvalidField(err) => RbValueError::new_err(err),
|
92
|
-
ProtocolError::InvalidRow(err) => RbValueError::new_err(err),
|
93
|
-
ProtocolError::InvalidDeletionVectorStorageType(err) => RbValueError::new_err(err),
|
94
|
-
ProtocolError::SerializeOperation { source } => RbValueError::new_err(source.to_string()),
|
95
|
-
ProtocolError::ParquetParseError { source } => RbIOError::new_err(source.to_string()),
|
96
|
-
ProtocolError::IO { source } => RbIOError::new_err(source.to_string()),
|
97
|
-
ProtocolError::Generic(msg) => DeltaError::new_err(msg),
|
98
|
-
ProtocolError::Kernel { source } => DeltaError::new_err(source.to_string()),
|
99
|
-
}
|
100
|
-
}
|
101
|
-
|
102
83
|
fn datafusion_to_rb(err: DataFusionError) -> RbErr {
|
103
84
|
DeltaError::new_err(err.to_string())
|
104
85
|
}
|
105
86
|
|
106
87
|
pub enum RubyError {
|
107
88
|
DeltaTable(DeltaTableError),
|
108
|
-
Protocol(ProtocolError),
|
109
89
|
DataFusion(DataFusionError),
|
110
90
|
}
|
111
91
|
|
@@ -115,12 +95,6 @@ impl From<DeltaTableError> for RubyError {
|
|
115
95
|
}
|
116
96
|
}
|
117
97
|
|
118
|
-
impl From<ProtocolError> for RubyError {
|
119
|
-
fn from(err: ProtocolError) -> Self {
|
120
|
-
RubyError::Protocol(err)
|
121
|
-
}
|
122
|
-
}
|
123
|
-
|
124
98
|
impl From<DataFusionError> for RubyError {
|
125
99
|
fn from(err: DataFusionError) -> Self {
|
126
100
|
RubyError::DataFusion(err)
|
@@ -131,7 +105,6 @@ impl From<RubyError> for RbErr {
|
|
131
105
|
fn from(value: RubyError) -> Self {
|
132
106
|
match value {
|
133
107
|
RubyError::DeltaTable(err) => inner_to_rb_err(err),
|
134
|
-
RubyError::Protocol(err) => checkpoint_to_rb(err),
|
135
108
|
RubyError::DataFusion(err) => datafusion_to_rb(err),
|
136
109
|
}
|
137
110
|
}
|
data/ext/deltalake/src/lib.rs
CHANGED
@@ -20,7 +20,10 @@ use deltalake::datafusion::catalog::TableProvider;
|
|
20
20
|
use deltalake::datafusion::prelude::SessionContext;
|
21
21
|
use deltalake::delta_datafusion::DeltaCdfTableProvider;
|
22
22
|
use deltalake::errors::DeltaTableError;
|
23
|
+
use deltalake::kernel::transaction::{CommitProperties, TableReference};
|
23
24
|
use deltalake::kernel::{scalars::ScalarExt, StructType, Transaction};
|
25
|
+
use deltalake::logstore::IORuntime;
|
26
|
+
use deltalake::logstore::LogStoreRef;
|
24
27
|
use deltalake::operations::add_column::AddColumnBuilder;
|
25
28
|
use deltalake::operations::add_feature::AddTableFeatureBuilder;
|
26
29
|
use deltalake::operations::collect_sendable_stream;
|
@@ -32,32 +35,28 @@ use deltalake::operations::load_cdf::CdfLoadBuilder;
|
|
32
35
|
use deltalake::operations::optimize::{OptimizeBuilder, OptimizeType};
|
33
36
|
use deltalake::operations::restore::RestoreBuilder;
|
34
37
|
use deltalake::operations::set_tbl_properties::SetTablePropertiesBuilder;
|
35
|
-
use deltalake::operations::transaction::{CommitProperties, TableReference};
|
36
38
|
use deltalake::operations::vacuum::VacuumBuilder;
|
37
39
|
use deltalake::parquet::basic::Compression;
|
38
40
|
use deltalake::parquet::errors::ParquetError;
|
39
41
|
use deltalake::parquet::file::properties::WriterProperties;
|
40
42
|
use deltalake::partitions::PartitionFilter;
|
41
|
-
use deltalake::storage::IORuntime;
|
42
43
|
use deltalake::{DeltaOps, DeltaResult};
|
43
44
|
use error::DeltaError;
|
44
45
|
use futures::future::join_all;
|
45
46
|
|
46
47
|
use magnus::{
|
47
|
-
function, method, prelude::*, typed_data::Obj, Error, Integer, Module, RArray,
|
48
|
+
function, method, prelude::*, typed_data::Obj, Error as RbErr, Integer, Module, RArray, Ruby,
|
48
49
|
TryConvert, Value,
|
49
50
|
};
|
50
51
|
use serde_json::Map;
|
51
52
|
|
52
|
-
use crate::error::DeltaProtocolError;
|
53
|
-
use crate::error::RbValueError;
|
54
|
-
use crate::error::RubyError;
|
53
|
+
use crate::error::{DeltaProtocolError, RbValueError, RubyError};
|
55
54
|
use crate::features::TableFeatures;
|
56
55
|
use crate::merge::RbMergeBuilder;
|
57
56
|
use crate::schema::{schema_to_rbobject, Field};
|
58
57
|
use crate::utils::rt;
|
59
58
|
|
60
|
-
type RbResult<T> = Result<T,
|
59
|
+
type RbResult<T> = Result<T, RbErr>;
|
61
60
|
|
62
61
|
enum PartitionFilterValue {
|
63
62
|
Single(String),
|
@@ -86,7 +85,7 @@ struct RawDeltaTableMetaData {
|
|
86
85
|
description: Option<String>,
|
87
86
|
partition_columns: Vec<String>,
|
88
87
|
created_time: Option<i64>,
|
89
|
-
configuration: HashMap<String,
|
88
|
+
configuration: HashMap<String, String>,
|
90
89
|
}
|
91
90
|
|
92
91
|
impl RawDeltaTableMetaData {
|
@@ -110,13 +109,23 @@ impl RawDeltaTableMetaData {
|
|
110
109
|
self.created_time
|
111
110
|
}
|
112
111
|
|
113
|
-
fn configuration(&self) -> HashMap<String,
|
112
|
+
fn configuration(&self) -> HashMap<String, String> {
|
114
113
|
self.configuration.clone()
|
115
114
|
}
|
116
115
|
}
|
117
116
|
|
118
117
|
type StringVec = Vec<String>;
|
119
118
|
|
119
|
+
impl RawDeltaTable {
|
120
|
+
fn with_table<T>(&self, func: impl Fn(&deltalake::DeltaTable) -> RbResult<T>) -> RbResult<T> {
|
121
|
+
func(&self._table.borrow())
|
122
|
+
}
|
123
|
+
|
124
|
+
fn log_store(&self) -> RbResult<LogStoreRef> {
|
125
|
+
self.with_table(|t| Ok(t.log_store().clone()))
|
126
|
+
}
|
127
|
+
}
|
128
|
+
|
120
129
|
impl RawDeltaTable {
|
121
130
|
pub fn new(
|
122
131
|
table_uri: String,
|
@@ -168,60 +177,62 @@ impl RawDeltaTable {
|
|
168
177
|
}
|
169
178
|
|
170
179
|
pub fn table_uri(&self) -> RbResult<String> {
|
171
|
-
|
180
|
+
self.with_table(|t| Ok(t.table_uri()))
|
172
181
|
}
|
173
182
|
|
174
|
-
pub fn version(&self) -> RbResult<i64
|
175
|
-
|
183
|
+
pub fn version(&self) -> RbResult<Option<i64>> {
|
184
|
+
self.with_table(|t| Ok(t.version()))
|
176
185
|
}
|
177
186
|
|
178
187
|
pub fn has_files(&self) -> RbResult<bool> {
|
179
|
-
|
188
|
+
self.with_table(|t| Ok(t.config.require_files))
|
180
189
|
}
|
181
190
|
|
182
191
|
pub fn metadata(&self) -> RbResult<RawDeltaTableMetaData> {
|
183
|
-
let
|
184
|
-
|
192
|
+
let metadata = self.with_table(|t| {
|
193
|
+
t.metadata()
|
194
|
+
.cloned()
|
195
|
+
.map_err(RubyError::from)
|
196
|
+
.map_err(RbErr::from)
|
197
|
+
})?;
|
185
198
|
Ok(RawDeltaTableMetaData {
|
186
|
-
id: metadata.id.
|
187
|
-
name: metadata.name.
|
188
|
-
description: metadata.description.
|
189
|
-
partition_columns: metadata.partition_columns.clone(),
|
190
|
-
created_time: metadata.created_time,
|
191
|
-
configuration: metadata.configuration.clone(),
|
199
|
+
id: metadata.id().to_string(),
|
200
|
+
name: metadata.name().map(String::from),
|
201
|
+
description: metadata.description().map(String::from),
|
202
|
+
partition_columns: metadata.partition_columns().clone(),
|
203
|
+
created_time: metadata.created_time(),
|
204
|
+
configuration: metadata.configuration().clone(),
|
192
205
|
})
|
193
206
|
}
|
194
207
|
|
195
208
|
pub fn protocol_versions(&self) -> RbResult<(i32, i32, Option<StringVec>, Option<StringVec>)> {
|
196
|
-
let
|
197
|
-
|
209
|
+
let table_protocol = self.with_table(|t| {
|
210
|
+
t.protocol()
|
211
|
+
.cloned()
|
212
|
+
.map_err(RubyError::from)
|
213
|
+
.map_err(RbErr::from)
|
214
|
+
})?;
|
198
215
|
Ok((
|
199
|
-
table_protocol.min_reader_version,
|
200
|
-
table_protocol.min_writer_version,
|
201
|
-
table_protocol
|
202
|
-
.
|
203
|
-
.
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
features
|
220
|
-
.iter()
|
221
|
-
.map(|v| v.to_string())
|
222
|
-
.collect::<Vec<String>>()
|
223
|
-
})
|
224
|
-
}),
|
216
|
+
table_protocol.min_reader_version(),
|
217
|
+
table_protocol.min_writer_version(),
|
218
|
+
table_protocol.writer_features().and_then(|features| {
|
219
|
+
let empty_set = !features.is_empty();
|
220
|
+
empty_set.then(|| {
|
221
|
+
features
|
222
|
+
.iter()
|
223
|
+
.map(|v| v.to_string())
|
224
|
+
.collect::<Vec<String>>()
|
225
|
+
})
|
226
|
+
}),
|
227
|
+
table_protocol.reader_features().and_then(|features| {
|
228
|
+
let empty_set = !features.is_empty();
|
229
|
+
empty_set.then(|| {
|
230
|
+
features
|
231
|
+
.iter()
|
232
|
+
.map(|v| v.to_string())
|
233
|
+
.collect::<Vec<String>>()
|
234
|
+
})
|
235
|
+
}),
|
225
236
|
))
|
226
237
|
}
|
227
238
|
|
@@ -237,31 +248,23 @@ impl RawDeltaTable {
|
|
237
248
|
.map_err(RubyError::from)?)
|
238
249
|
}
|
239
250
|
|
240
|
-
pub fn get_earliest_version(&self) -> RbResult<i64> {
|
241
|
-
Ok(rt()
|
242
|
-
.block_on(self._table.borrow().get_earliest_version())
|
243
|
-
.map_err(RubyError::from)?)
|
244
|
-
}
|
245
|
-
|
246
251
|
pub fn get_num_index_cols(&self) -> RbResult<i32> {
|
247
|
-
|
248
|
-
.
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
.num_indexed_cols())
|
252
|
+
self.with_table(|t| {
|
253
|
+
Ok(t.snapshot()
|
254
|
+
.map_err(RubyError::from)?
|
255
|
+
.config()
|
256
|
+
.num_indexed_cols())
|
257
|
+
})
|
254
258
|
}
|
255
259
|
|
256
260
|
pub fn get_stats_columns(&self) -> RbResult<Option<Vec<String>>> {
|
257
|
-
|
258
|
-
.
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
.map(|v| v.iter().map(|v| v.to_string()).collect::<Vec<String>>()))
|
261
|
+
self.with_table(|t| {
|
262
|
+
Ok(t.snapshot()
|
263
|
+
.map_err(RubyError::from)?
|
264
|
+
.config()
|
265
|
+
.stats_columns()
|
266
|
+
.map(|v| v.iter().map(|s| s.to_string()).collect::<Vec<String>>()))
|
267
|
+
})
|
265
268
|
}
|
266
269
|
|
267
270
|
pub fn load_with_datetime(&self, ds: String) -> RbResult<()> {
|
@@ -285,10 +288,11 @@ impl RawDeltaTable {
|
|
285
288
|
if let Some(filters) = partition_filters {
|
286
289
|
let filters = convert_partition_filters(filters).map_err(RubyError::from)?;
|
287
290
|
Ok(self
|
288
|
-
.
|
289
|
-
|
290
|
-
|
291
|
-
|
291
|
+
.with_table(|t| {
|
292
|
+
t.get_files_by_partitions(&filters)
|
293
|
+
.map_err(RubyError::from)
|
294
|
+
.map_err(RbErr::from)
|
295
|
+
})?
|
292
296
|
.into_iter()
|
293
297
|
.map(|p| p.to_string())
|
294
298
|
.collect())
|
@@ -307,30 +311,34 @@ impl RawDeltaTable {
|
|
307
311
|
&self,
|
308
312
|
partition_filters: Option<Vec<(String, String, PartitionFilterValue)>>,
|
309
313
|
) -> RbResult<Vec<String>> {
|
310
|
-
if !self.
|
314
|
+
if !self.with_table(|t| Ok(t.config.require_files))? {
|
311
315
|
return Err(DeltaError::new_err("Table is initiated without files."));
|
312
316
|
}
|
313
317
|
|
314
318
|
if let Some(filters) = partition_filters {
|
315
319
|
let filters = convert_partition_filters(filters).map_err(RubyError::from)?;
|
316
|
-
|
317
|
-
.
|
318
|
-
|
319
|
-
|
320
|
-
|
320
|
+
self.with_table(|t| {
|
321
|
+
t.get_file_uris_by_partitions(&filters)
|
322
|
+
.map_err(RubyError::from)
|
323
|
+
.map_err(RbErr::from)
|
324
|
+
})
|
321
325
|
} else {
|
322
|
-
|
323
|
-
.
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
326
|
+
self.with_table(|t| {
|
327
|
+
Ok(t.get_file_uris()
|
328
|
+
.map_err(RubyError::from)
|
329
|
+
.map_err(RbErr::from)?
|
330
|
+
.collect::<Vec<String>>())
|
331
|
+
})
|
328
332
|
}
|
329
333
|
}
|
330
334
|
|
331
335
|
pub fn schema(&self) -> RbResult<Value> {
|
332
|
-
let
|
333
|
-
|
336
|
+
let schema: StructType = self.with_table(|t| {
|
337
|
+
t.get_schema()
|
338
|
+
.map_err(RubyError::from)
|
339
|
+
.map_err(RbErr::from)
|
340
|
+
.map(|s| s.to_owned())
|
341
|
+
})?;
|
334
342
|
schema_to_rbobject(schema.to_owned())
|
335
343
|
}
|
336
344
|
|
@@ -727,7 +735,7 @@ impl RawDeltaTable {
|
|
727
735
|
let partition_columns: HashSet<&str> = binding
|
728
736
|
.metadata()
|
729
737
|
.map_err(RubyError::from)?
|
730
|
-
.partition_columns
|
738
|
+
.partition_columns()
|
731
739
|
.iter()
|
732
740
|
.map(|col| col.as_str())
|
733
741
|
.collect();
|
@@ -874,14 +882,13 @@ impl RawDeltaTable {
|
|
874
882
|
Ok(serde_json::to_string(&metrics).unwrap())
|
875
883
|
}
|
876
884
|
|
877
|
-
pub fn
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
|
883
|
-
|
884
|
-
)
|
885
|
+
pub fn transaction_version(&self, app_id: String) -> RbResult<Option<i64>> {
|
886
|
+
// NOTE: this will simplify once we have moved logstore onto state.
|
887
|
+
let log_store = self.log_store()?;
|
888
|
+
let snapshot = self.with_table(|t| Ok(t.snapshot().map_err(RubyError::from)?.clone()))?;
|
889
|
+
Ok(rt()
|
890
|
+
.block_on(snapshot.transaction_version(log_store.as_ref(), app_id))
|
891
|
+
.map_err(RubyError::from)?)
|
885
892
|
}
|
886
893
|
}
|
887
894
|
|
@@ -1293,10 +1300,6 @@ fn init(ruby: &Ruby) -> RbResult<()> {
|
|
1293
1300
|
"get_latest_version",
|
1294
1301
|
method!(RawDeltaTable::get_latest_version, 0),
|
1295
1302
|
)?;
|
1296
|
-
class.define_method(
|
1297
|
-
"get_earliest_version",
|
1298
|
-
method!(RawDeltaTable::get_earliest_version, 0),
|
1299
|
-
)?;
|
1300
1303
|
class.define_method(
|
1301
1304
|
"get_num_index_cols",
|
1302
1305
|
method!(RawDeltaTable::get_num_index_cols, 0),
|
@@ -1366,8 +1369,8 @@ fn init(ruby: &Ruby) -> RbResult<()> {
|
|
1366
1369
|
)?;
|
1367
1370
|
class.define_method("repair", method!(RawDeltaTable::repair, 3))?;
|
1368
1371
|
class.define_method(
|
1369
|
-
"
|
1370
|
-
method!(RawDeltaTable::
|
1372
|
+
"transaction_version",
|
1373
|
+
method!(RawDeltaTable::transaction_version, 1),
|
1371
1374
|
)?;
|
1372
1375
|
|
1373
1376
|
let class = module.define_class("RawDeltaTableMetaData", ruby.class_object())?;
|
data/lib/deltalake/table.rb
CHANGED
@@ -204,7 +204,13 @@ module DeltaLake
|
|
204
204
|
"DELTA_DYNAMO_TABLE_NAME"
|
205
205
|
]
|
206
206
|
storage_options = @storage_options&.reject { |k, _| delta_keys.include?(k.to_s.upcase) }
|
207
|
-
lf =
|
207
|
+
lf =
|
208
|
+
Polars.scan_parquet(
|
209
|
+
sources,
|
210
|
+
hive_partitioning: true,
|
211
|
+
storage_options: storage_options,
|
212
|
+
rechunk: rechunk
|
213
|
+
)
|
208
214
|
|
209
215
|
if columns
|
210
216
|
# by_name requires polars-df > 0.15.0
|
@@ -253,8 +259,8 @@ module DeltaLake
|
|
253
259
|
deserialized_metrics.transform_keys(&:to_sym)
|
254
260
|
end
|
255
261
|
|
256
|
-
def
|
257
|
-
@table.
|
262
|
+
def transaction_version(app_id)
|
263
|
+
@table.transaction_version(app_id)
|
258
264
|
end
|
259
265
|
|
260
266
|
# private
|
data/lib/deltalake/version.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: deltalake-rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
bindir: bin
|
9
9
|
cert_chain: []
|
10
|
-
date:
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
11
11
|
dependencies:
|
12
12
|
- !ruby/object:Gem::Dependency
|
13
13
|
name: rb_sys
|
@@ -64,14 +64,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
64
64
|
requirements:
|
65
65
|
- - ">="
|
66
66
|
- !ruby/object:Gem::Version
|
67
|
-
version: '3.
|
67
|
+
version: '3.2'
|
68
68
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
69
69
|
requirements:
|
70
70
|
- - ">="
|
71
71
|
- !ruby/object:Gem::Version
|
72
72
|
version: '0'
|
73
73
|
requirements: []
|
74
|
-
rubygems_version: 3.6.
|
74
|
+
rubygems_version: 3.6.7
|
75
75
|
specification_version: 4
|
76
76
|
summary: Delta Lake for Ruby
|
77
77
|
test_files: []
|