deltalake-rb 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/Cargo.lock +775 -612
- data/ext/deltalake/Cargo.toml +5 -5
- data/ext/deltalake/src/error.rs +13 -0
- data/ext/deltalake/src/lib.rs +21 -14
- data/ext/deltalake/src/merge.rs +1 -0
- data/lib/deltalake/table.rb +7 -1
- data/lib/deltalake/version.rb +1 -1
- metadata +2 -2
data/ext/deltalake/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "deltalake"
|
3
|
-
version = "0.1.5"
|
3
|
+
version = "0.1.6"
|
4
4
|
license = "Apache-2.0"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
@@ -11,11 +11,11 @@ publish = false
|
|
11
11
|
crate-type = ["cdylib"]
|
12
12
|
|
13
13
|
[dependencies]
|
14
|
-
arrow = { version = "
|
15
|
-
arrow-schema = { version = "
|
14
|
+
arrow = { version = "54", features = ["ffi"] }
|
15
|
+
arrow-schema = { version = "54", features = ["serde"] }
|
16
16
|
chrono = "0.4"
|
17
|
-
delta_kernel = "=0.
|
18
|
-
deltalake = { version = "=0.
|
17
|
+
delta_kernel = "=0.8.0"
|
18
|
+
deltalake = { version = "=0.25.0", features = ["azure", "datafusion", "gcs", "s3"] }
|
19
19
|
futures = "0.3"
|
20
20
|
magnus = "0.7"
|
21
21
|
num_cpus = "1"
|
data/ext/deltalake/src/error.rs
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
use arrow_schema::ArrowError;
|
2
|
+
use deltalake::datafusion::error::DataFusionError;
|
2
3
|
use deltalake::protocol::ProtocolError;
|
3
4
|
use deltalake::{errors::DeltaTableError, ObjectStoreError};
|
4
5
|
use magnus::{exception, Error as RbErr, Module, RModule, Ruby};
|
@@ -98,9 +99,14 @@ fn checkpoint_to_rb(err: ProtocolError) -> RbErr {
|
|
98
99
|
}
|
99
100
|
}
|
100
101
|
|
102
|
+
fn datafusion_to_rb(err: DataFusionError) -> RbErr {
|
103
|
+
DeltaError::new_err(err.to_string())
|
104
|
+
}
|
105
|
+
|
101
106
|
pub enum RubyError {
|
102
107
|
DeltaTable(DeltaTableError),
|
103
108
|
Protocol(ProtocolError),
|
109
|
+
DataFusion(DataFusionError),
|
104
110
|
}
|
105
111
|
|
106
112
|
impl From<DeltaTableError> for RubyError {
|
@@ -115,11 +121,18 @@ impl From<ProtocolError> for RubyError {
|
|
115
121
|
}
|
116
122
|
}
|
117
123
|
|
124
|
+
impl From<DataFusionError> for RubyError {
|
125
|
+
fn from(err: DataFusionError) -> Self {
|
126
|
+
RubyError::DataFusion(err)
|
127
|
+
}
|
128
|
+
}
|
129
|
+
|
118
130
|
impl From<RubyError> for RbErr {
|
119
131
|
fn from(value: RubyError) -> Self {
|
120
132
|
match value {
|
121
133
|
RubyError::DeltaTable(err) => inner_to_rb_err(err),
|
122
134
|
RubyError::Protocol(err) => checkpoint_to_rb(err),
|
135
|
+
RubyError::DataFusion(err) => datafusion_to_rb(err),
|
123
136
|
}
|
124
137
|
}
|
125
138
|
}
|
data/ext/deltalake/src/lib.rs
CHANGED
@@ -8,6 +8,7 @@ use std::cell::RefCell;
|
|
8
8
|
use std::collections::{HashMap, HashSet};
|
9
9
|
use std::future::IntoFuture;
|
10
10
|
use std::str::FromStr;
|
11
|
+
use std::sync::Arc;
|
11
12
|
use std::time;
|
12
13
|
|
13
14
|
use chrono::{DateTime, Duration, FixedOffset, Utc};
|
@@ -15,8 +16,9 @@ use delta_kernel::schema::StructField;
|
|
15
16
|
use deltalake::arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream};
|
16
17
|
use deltalake::arrow::record_batch::RecordBatchIterator;
|
17
18
|
use deltalake::checkpoints::{cleanup_metadata, create_checkpoint};
|
18
|
-
use deltalake::datafusion::
|
19
|
+
use deltalake::datafusion::catalog::TableProvider;
|
19
20
|
use deltalake::datafusion::prelude::SessionContext;
|
21
|
+
use deltalake::delta_datafusion::DeltaCdfTableProvider;
|
20
22
|
use deltalake::errors::DeltaTableError;
|
21
23
|
use deltalake::kernel::{scalars::ScalarExt, StructType, Transaction};
|
22
24
|
use deltalake::operations::add_column::AddColumnBuilder;
|
@@ -364,6 +366,7 @@ impl RawDeltaTable {
|
|
364
366
|
Ok(metrics.files_deleted)
|
365
367
|
}
|
366
368
|
|
369
|
+
#[allow(clippy::too_many_arguments)]
|
367
370
|
pub fn compact_optimize(
|
368
371
|
&self,
|
369
372
|
partition_filters: Option<Vec<(String, String, PartitionFilterValue)>>,
|
@@ -411,6 +414,7 @@ impl RawDeltaTable {
|
|
411
414
|
Ok(serde_json::to_string(&metrics).unwrap())
|
412
415
|
}
|
413
416
|
|
417
|
+
#[allow(clippy::too_many_arguments)]
|
414
418
|
pub fn z_order_optimize(
|
415
419
|
&self,
|
416
420
|
z_order_columns: Vec<String>,
|
@@ -492,7 +496,7 @@ impl RawDeltaTable {
|
|
492
496
|
) -> RbResult<()> {
|
493
497
|
let feature = feature
|
494
498
|
.into_iter()
|
495
|
-
.map(
|
499
|
+
.map(TableFeatures::try_convert)
|
496
500
|
.collect::<RbResult<Vec<_>>>()?;
|
497
501
|
let cmd = AddTableFeatureBuilder::new(
|
498
502
|
self._table.borrow().log_store(),
|
@@ -581,13 +585,19 @@ impl RawDeltaTable {
|
|
581
585
|
cdf_read = cdf_read.with_starting_timestamp(ending_ts);
|
582
586
|
}
|
583
587
|
|
584
|
-
|
585
|
-
cdf_read
|
586
|
-
}
|
587
|
-
|
588
|
-
cdf_read = cdf_read.with_session_ctx(ctx.clone());
|
588
|
+
let table_provider: Arc<dyn TableProvider> =
|
589
|
+
Arc::new(DeltaCdfTableProvider::try_new(cdf_read).map_err(RubyError::from)?);
|
589
590
|
|
590
|
-
let plan = rt()
|
591
|
+
let plan = rt()
|
592
|
+
.block_on(async {
|
593
|
+
let mut df = ctx.read_table(table_provider)?;
|
594
|
+
if let Some(columns) = columns {
|
595
|
+
let cols: Vec<_> = columns.iter().map(|c| c.as_ref()).collect();
|
596
|
+
df = df.select_columns(&cols)?;
|
597
|
+
}
|
598
|
+
df.create_physical_plan().await
|
599
|
+
})
|
600
|
+
.map_err(RubyError::from)?;
|
591
601
|
|
592
602
|
let mut tasks = vec![];
|
593
603
|
for p in 0..plan.properties().output_partitioning().partition_count() {
|
@@ -612,6 +622,7 @@ impl RawDeltaTable {
|
|
612
622
|
Ok(ArrowArrayStream { stream: ffi_stream })
|
613
623
|
}
|
614
624
|
|
625
|
+
#[allow(clippy::too_many_arguments)]
|
615
626
|
pub fn create_merge_builder(
|
616
627
|
&self,
|
617
628
|
source: RbArrowType<ArrowArrayStreamReader>,
|
@@ -926,9 +937,6 @@ fn set_writer_properties(writer_properties: RbWriterProperties) -> DeltaResult<W
|
|
926
937
|
if let Some(dictionary_enabled) = default_column_properties.dictionary_enabled {
|
927
938
|
properties = properties.set_dictionary_enabled(dictionary_enabled);
|
928
939
|
}
|
929
|
-
if let Some(max_statistics_size) = default_column_properties.max_statistics_size {
|
930
|
-
properties = properties.set_max_statistics_size(max_statistics_size);
|
931
|
-
}
|
932
940
|
if let Some(bloom_filter_properties) = default_column_properties.bloom_filter_properties {
|
933
941
|
if let Some(set_bloom_filter_enabled) = bloom_filter_properties.set_bloom_filter_enabled
|
934
942
|
{
|
@@ -1195,9 +1203,8 @@ fn write_to_deltalake(
|
|
1195
1203
|
}
|
1196
1204
|
|
1197
1205
|
if let Some(writer_props) = writer_properties {
|
1198
|
-
builder = builder
|
1199
|
-
set_writer_properties(writer_props).map_err(RubyError::from)
|
1200
|
-
);
|
1206
|
+
builder = builder
|
1207
|
+
.with_writer_properties(set_writer_properties(writer_props).map_err(RubyError::from)?);
|
1201
1208
|
}
|
1202
1209
|
|
1203
1210
|
if let Some(name) = &name {
|
data/ext/deltalake/src/merge.rs
CHANGED
data/lib/deltalake/table.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
module DeltaLake
|
2
2
|
class Table
|
3
|
+
FSCK_METRICS_FILES_REMOVED_LABEL = "files_removed"
|
4
|
+
|
3
5
|
def initialize(
|
4
6
|
table_uri,
|
5
7
|
version: nil,
|
@@ -244,7 +246,11 @@ module DeltaLake
|
|
244
246
|
commit_properties,
|
245
247
|
post_commithook_properties
|
246
248
|
)
|
247
|
-
JSON.parse(metrics)
|
249
|
+
deserialized_metrics = JSON.parse(metrics)
|
250
|
+
deserialized_metrics[FSCK_METRICS_FILES_REMOVED_LABEL] = JSON.parse(
|
251
|
+
deserialized_metrics[FSCK_METRICS_FILES_REMOVED_LABEL]
|
252
|
+
)
|
253
|
+
deserialized_metrics.transform_keys(&:to_sym)
|
248
254
|
end
|
249
255
|
|
250
256
|
def transaction_versions
|
data/lib/deltalake/version.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: deltalake-rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.5
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
bindir: bin
|
9
9
|
cert_chain: []
|
10
|
-
date: 2025-
|
10
|
+
date: 2025-03-13 00:00:00.000000000 Z
|
11
11
|
dependencies:
|
12
12
|
- !ruby/object:Gem::Dependency
|
13
13
|
name: rb_sys
|