deltalake-rb 0.1.5 → 0.1.6

This diff shows the changes between publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
 [package]
 name = "deltalake"
-version = "0.1.5"
+version = "0.1.6"
 license = "Apache-2.0"
 authors = ["Andrew Kane <andrew@ankane.org>"]
 edition = "2021"
@@ -11,11 +11,11 @@ publish = false
 crate-type = ["cdylib"]
 
 [dependencies]
-arrow = { version = "53", features = ["ffi"] }
-arrow-schema = { version = "53", features = ["serde"] }
+arrow = { version = "54", features = ["ffi"] }
+arrow-schema = { version = "54", features = ["serde"] }
 chrono = "0.4"
-delta_kernel = "=0.6.0"
-deltalake = { version = "=0.24.0", features = ["azure", "datafusion", "gcs", "s3"] }
+delta_kernel = "=0.8.0"
+deltalake = { version = "=0.25.0", features = ["azure", "datafusion", "gcs", "s3"] }
 futures = "0.3"
 magnus = "0.7"
 num_cpus = "1"
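
Note that the `delta_kernel` and `deltalake` requirements use Cargo's `=` operator, which pins one exact version instead of the default caret range. A quick illustration using the `semver` crate (the library that implements Cargo's requirement grammar); the versions shown are just examples:

    use semver::{Version, VersionReq};

    fn main() {
        // "=0.8.0" accepts only 0.8.0 itself...
        let exact = VersionReq::parse("=0.8.0").unwrap();
        assert!(exact.matches(&Version::parse("0.8.0").unwrap()));
        assert!(!exact.matches(&Version::parse("0.8.1").unwrap()));

        // ...while a bare "0.8.0" is a caret requirement that also accepts 0.8.x.
        let caret = VersionReq::parse("0.8.0").unwrap();
        assert!(caret.matches(&Version::parse("0.8.1").unwrap()));
    }

Pinning is a reasonable choice here: the gem wraps these crates' still-evolving APIs, so even a compatible-range bump upstream could break the build.
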
@@ -1,4 +1,5 @@
 use arrow_schema::ArrowError;
+use deltalake::datafusion::error::DataFusionError;
 use deltalake::protocol::ProtocolError;
 use deltalake::{errors::DeltaTableError, ObjectStoreError};
 use magnus::{exception, Error as RbErr, Module, RModule, Ruby};
@@ -98,9 +99,14 @@ fn checkpoint_to_rb(err: ProtocolError) -> RbErr {
     }
 }
 
+fn datafusion_to_rb(err: DataFusionError) -> RbErr {
+    DeltaError::new_err(err.to_string())
+}
+
 pub enum RubyError {
     DeltaTable(DeltaTableError),
     Protocol(ProtocolError),
+    DataFusion(DataFusionError),
 }
 
 impl From<DeltaTableError> for RubyError {
@@ -115,11 +121,18 @@ impl From<ProtocolError> for RubyError {
     }
 }
 
+impl From<DataFusionError> for RubyError {
+    fn from(err: DataFusionError) -> Self {
+        RubyError::DataFusion(err)
+    }
+}
+
 impl From<RubyError> for RbErr {
     fn from(value: RubyError) -> Self {
         match value {
             RubyError::DeltaTable(err) => inner_to_rb_err(err),
             RubyError::Protocol(err) => checkpoint_to_rb(err),
+            RubyError::DataFusion(err) => datafusion_to_rb(err),
         }
     }
 }
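
The new `DataFusion` variant follows the crate's existing two-step pattern: library errors are first lifted into the `RubyError` enum (so `?` works inside Rust code), then converted into a Ruby exception at the FFI boundary. A self-contained sketch of that pattern, with a plain `String` standing in for magnus's `RbErr` since the real type needs a live Ruby VM:

    use std::fmt;

    // Stand-ins for the upstream error and the Ruby-side error type.
    #[derive(Debug)]
    struct DataFusionError(String);
    impl fmt::Display for DataFusionError {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            write!(f, "DataFusion error: {}", self.0)
        }
    }
    type RbErr = String;

    enum RubyError {
        DataFusion(DataFusionError),
    }

    // Step 1: lift the library error into the crate-wide enum...
    impl From<DataFusionError> for RubyError {
        fn from(err: DataFusionError) -> Self {
            RubyError::DataFusion(err)
        }
    }

    // Step 2: ...and convert to the Ruby exception at the boundary.
    impl From<RubyError> for RbErr {
        fn from(value: RubyError) -> Self {
            match value {
                RubyError::DataFusion(err) => err.to_string(),
            }
        }
    }

    fn plan() -> Result<(), DataFusionError> {
        Err(DataFusionError("column not found".into()))
    }

    fn exposed_to_ruby() -> Result<(), RbErr> {
        plan().map_err(RubyError::from)?; // same shape as the diff's map_err(RubyError::from)
        Ok(())
    }

    fn main() {
        println!("{:?}", exposed_to_ruby());
    }
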
@@ -8,6 +8,7 @@ use std::cell::RefCell;
 use std::collections::{HashMap, HashSet};
 use std::future::IntoFuture;
 use std::str::FromStr;
+use std::sync::Arc;
 use std::time;
 
 use chrono::{DateTime, Duration, FixedOffset, Utc};
@@ -15,8 +16,9 @@ use delta_kernel::schema::StructField;
 use deltalake::arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream};
 use deltalake::arrow::record_batch::RecordBatchIterator;
 use deltalake::checkpoints::{cleanup_metadata, create_checkpoint};
-use deltalake::datafusion::physical_plan::ExecutionPlan;
+use deltalake::datafusion::catalog::TableProvider;
 use deltalake::datafusion::prelude::SessionContext;
+use deltalake::delta_datafusion::DeltaCdfTableProvider;
 use deltalake::errors::DeltaTableError;
 use deltalake::kernel::{scalars::ScalarExt, StructType, Transaction};
 use deltalake::operations::add_column::AddColumnBuilder;
@@ -364,6 +366,7 @@ impl RawDeltaTable {
         Ok(metrics.files_deleted)
     }
 
+    #[allow(clippy::too_many_arguments)]
     pub fn compact_optimize(
         &self,
         partition_filters: Option<Vec<(String, String, PartitionFilterValue)>>,
@@ -411,6 +414,7 @@ impl RawDeltaTable {
         Ok(serde_json::to_string(&metrics).unwrap())
     }
 
+    #[allow(clippy::too_many_arguments)]
     pub fn z_order_optimize(
         &self,
         z_order_columns: Vec<String>,
@@ -492,7 +496,7 @@ impl RawDeltaTable {
     ) -> RbResult<()> {
         let feature = feature
             .into_iter()
-            .map(|v| TableFeatures::try_convert(v))
+            .map(TableFeatures::try_convert)
             .collect::<RbResult<Vec<_>>>()?;
         let cmd = AddTableFeatureBuilder::new(
             self._table.borrow().log_store(),
@@ -581,13 +585,19 @@ impl RawDeltaTable {
             cdf_read = cdf_read.with_starting_timestamp(ending_ts);
         }
 
-        if let Some(columns) = columns {
-            cdf_read = cdf_read.with_columns(columns);
-        }
-
-        cdf_read = cdf_read.with_session_ctx(ctx.clone());
+        let table_provider: Arc<dyn TableProvider> =
+            Arc::new(DeltaCdfTableProvider::try_new(cdf_read).map_err(RubyError::from)?);
 
-        let plan = rt().block_on(cdf_read.build()).map_err(RubyError::from)?;
+        let plan = rt()
+            .block_on(async {
+                let mut df = ctx.read_table(table_provider)?;
+                if let Some(columns) = columns {
+                    let cols: Vec<_> = columns.iter().map(|c| c.as_ref()).collect();
+                    df = df.select_columns(&cols)?;
+                }
+                df.create_physical_plan().await
+            })
+            .map_err(RubyError::from)?;
 
         let mut tasks = vec![];
         for p in 0..plan.properties().output_partitioning().partition_count() {
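
This hunk reworks the change-data-feed read to go through DataFusion's generic `TableProvider` interface instead of the builder's `with_session_ctx`/`build` calls: the CDF scan is wrapped in a provider, handed to the session, the optional column projection is applied on the resulting `DataFrame`, and the frame is lowered to a physical plan. A minimal standalone sketch of that same read-table/select/plan sequence, assuming the `datafusion` and `tokio` crates and using an in-memory `MemTable` with made-up columns in place of `DeltaCdfTableProvider`:

    use std::sync::Arc;

    use datafusion::arrow::array::{Int32Array, StringArray};
    use datafusion::arrow::datatypes::{DataType, Field, Schema};
    use datafusion::arrow::record_batch::RecordBatch;
    use datafusion::datasource::{MemTable, TableProvider};
    use datafusion::error::Result;
    use datafusion::prelude::SessionContext;

    #[tokio::main]
    async fn main() -> Result<()> {
        // A tiny two-column table standing in for the CDF provider.
        let schema = Arc::new(Schema::new(vec![
            Field::new("id", DataType::Int32, false),
            Field::new("op", DataType::Utf8, false),
        ]));
        let batch = RecordBatch::try_new(
            schema.clone(),
            vec![
                Arc::new(Int32Array::from(vec![1, 2])),
                Arc::new(StringArray::from(vec!["insert", "delete"])),
            ],
        )?;
        let provider: Arc<dyn TableProvider> =
            Arc::new(MemTable::try_new(schema, vec![vec![batch]])?);

        // Same sequence as the diff: read_table -> optional select_columns -> physical plan.
        let ctx = SessionContext::new();
        let mut df = ctx.read_table(provider)?;
        df = df.select_columns(&["id"])?;
        let plan = df.create_physical_plan().await?;
        println!(
            "partitions: {}",
            plan.properties().output_partitioning().partition_count()
        );
        Ok(())
    }
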
@@ -612,6 +622,7 @@ impl RawDeltaTable {
         Ok(ArrowArrayStream { stream: ffi_stream })
     }
 
+    #[allow(clippy::too_many_arguments)]
     pub fn create_merge_builder(
         &self,
         source: RbArrowType<ArrowArrayStreamReader>,
@@ -926,9 +937,6 @@ fn set_writer_properties(writer_properties: RbWriterProperties) -> DeltaResult<W
     if let Some(dictionary_enabled) = default_column_properties.dictionary_enabled {
         properties = properties.set_dictionary_enabled(dictionary_enabled);
     }
-    if let Some(max_statistics_size) = default_column_properties.max_statistics_size {
-        properties = properties.set_max_statistics_size(max_statistics_size);
-    }
     if let Some(bloom_filter_properties) = default_column_properties.bloom_filter_properties {
         if let Some(set_bloom_filter_enabled) = bloom_filter_properties.set_bloom_filter_enabled
         {
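
The dropped `max_statistics_size` branch appears to track the upstream parquet crate, which deprecated that writer option in the release line this version moves to. The surviving options go through parquet's standard `WriterProperties` builder; a small sketch of that builder (the settings shown are illustrative, not the gem's defaults):

    use parquet::basic::Compression;
    use parquet::file::properties::WriterProperties;
    use parquet::schema::types::ColumnPath;

    fn main() {
        // Illustrative settings; the gem derives these from Ruby-supplied options.
        let props = WriterProperties::builder()
            .set_dictionary_enabled(true)
            .set_compression(Compression::SNAPPY)
            .build();
        let col = ColumnPath::new(vec!["col".to_string()]);
        println!("dictionary enabled: {}", props.dictionary_enabled(&col));
    }
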
@@ -1195,9 +1203,8 @@ fn write_to_deltalake(
     }
 
     if let Some(writer_props) = writer_properties {
-        builder = builder.with_writer_properties(
-            set_writer_properties(writer_props).map_err(RubyError::from)?,
-        );
+        builder = builder
+            .with_writer_properties(set_writer_properties(writer_props).map_err(RubyError::from)?);
     }
 
     if let Some(name) = &name {
@@ -42,6 +42,7 @@ impl RbMergeBuilder {
 }
 
 impl RbMergeBuilder {
+    #[allow(clippy::too_many_arguments)]
     pub fn new(
         log_store: LogStoreRef,
         snapshot: DeltaTableState,
@@ -1,5 +1,7 @@
 module DeltaLake
   class Table
+    FSCK_METRICS_FILES_REMOVED_LABEL = "files_removed"
+
     def initialize(
       table_uri,
       version: nil,
@@ -244,7 +246,11 @@ module DeltaLake
         commit_properties,
         post_commithook_properties
       )
-      JSON.parse(metrics).transform_keys(&:to_sym)
+      deserialized_metrics = JSON.parse(metrics)
+      deserialized_metrics[FSCK_METRICS_FILES_REMOVED_LABEL] = JSON.parse(
+        deserialized_metrics[FSCK_METRICS_FILES_REMOVED_LABEL]
+      )
+      deserialized_metrics.transform_keys(&:to_sym)
     end
 
     def transaction_versions
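
The extra `JSON.parse` is needed because, in the metrics blob returned from the Rust side, `files_removed` now arrives as a JSON-encoded string nested inside the JSON object, so it must be decoded twice. Keeping the examples in one language, here is the same double decode in Rust with `serde_json` (the payload shape is an assumption inferred from the Ruby change):

    use serde_json::Value;

    fn main() -> serde_json::Result<()> {
        // Assumed shape: "files_removed" is itself a JSON-encoded string.
        let metrics = r#"{"dry_run":false,"files_removed":"[\"part-0001.parquet\"]"}"#;

        let mut parsed: Value = serde_json::from_str(metrics)?;
        // Second decode turns the embedded string into a real JSON array.
        if let Some(Value::String(inner)) = parsed.get("files_removed").cloned() {
            parsed["files_removed"] = serde_json::from_str(&inner)?;
        }
        println!("{parsed}");
        Ok(())
    }
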
@@ -1,3 +1,3 @@
 module DeltaLake
-  VERSION = "0.1.5"
+  VERSION = "0.1.6"
 end
metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: deltalake-rb
 version: !ruby/object:Gem::Version
-  version: 0.1.5
+  version: 0.1.6
 platform: ruby
 authors:
 - Andrew Kane
 bindir: bin
 cert_chain: []
-date: 2025-01-28 00:00:00.000000000 Z
+date: 2025-03-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys