deltalake-rb 0.2.0 → 0.2.2

This diff shows the changes between publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
 [package]
 name = "deltalake"
-version = "0.2.0"
+version = "0.2.2"
 license = "Apache-2.0"
 authors = ["Andrew Kane <andrew@ankane.org>"]
 edition = "2021"
@@ -14,11 +14,19 @@ crate-type = ["cdylib"]
 arrow = { version = "55.2", features = ["ffi"] }
 arrow-schema = { version = "55.2", features = ["serde"] }
 chrono = "0.4"
-delta_kernel = "0.13"
-deltalake = { version = "=0.27.0", features = ["azure", "datafusion", "gcs", "s3"] }
+delta_kernel = { version = "0.15", features = ["arrow-55", "default-engine-rustls"] }
 futures = "0.3"
-magnus = "0.7"
+magnus = "0.8"
 num_cpus = "1"
 serde = "1"
 serde_json = "1"
 tokio = { version = "1", features = ["rt-multi-thread"] }
+
+[dependencies.deltalake]
+version = "=0.28.1"
+features = [
+  "azure",
+  "datafusion",
+  "gcs",
+  "s3"
+]
@@ -1,7 +1,7 @@
 use arrow_schema::ArrowError;
 use deltalake::datafusion::error::DataFusionError;
 use deltalake::{errors::DeltaTableError, ObjectStoreError};
-use magnus::{exception, Error as RbErr, Module, RModule, Ruby};
+use magnus::{Error as RbErr, Module, RModule, Ruby};
 use std::borrow::Cow;
 
 macro_rules! create_exception {
@@ -41,7 +41,7 @@ fn inner_to_rb_err(err: DeltaTableError) -> RbErr {
         DeltaTableError::InvalidJsonLog { .. } => DeltaProtocolError::new_err(err.to_string()),
         DeltaTableError::InvalidStatsJson { .. } => DeltaProtocolError::new_err(err.to_string()),
         DeltaTableError::InvalidData { violations } => {
-            DeltaProtocolError::new_err(format!("Invariant violations: {:?}", violations))
+            DeltaProtocolError::new_err(format!("Invariant violations: {violations:?}"))
         }
 
         // commit errors
@@ -111,7 +111,7 @@ impl From<RubyError> for RbErr {
 }
 
 macro_rules! create_builtin_exception {
-    ($type:ident, $class:expr) => {
+    ($type:ident, $method:ident) => {
         pub struct $type {}
 
         impl $type {
@@ -119,13 +119,14 @@ macro_rules! create_builtin_exception {
             where
                 T: Into<Cow<'static, str>>,
             {
-                RbErr::new($class, message)
+                let ruby = Ruby::get().unwrap();
+                RbErr::new(ruby.$method(), message)
             }
         }
     };
 }
 
-create_builtin_exception!(RbException, exception::runtime_error());
-create_builtin_exception!(RbIOError, exception::io_error());
-create_builtin_exception!(RbNotImplementedError, exception::not_imp_error());
-create_builtin_exception!(RbValueError, exception::arg_error());
+create_builtin_exception!(RbException, exception_runtime_error);
+create_builtin_exception!(RbIOError, exception_io_error);
+create_builtin_exception!(RbNotImplementedError, exception_not_imp_error);
+create_builtin_exception!(RbValueError, exception_arg_error);
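
Note on the hunks above: they track the magnus 0.7 → 0.8 migration, where the free functions in magnus::exception were replaced by accessor methods on a Ruby handle. A minimal standalone sketch of the new pattern (the function name here is illustrative, not from the gem):

use magnus::{Error, Ruby};

// Build a RuntimeError the magnus 0.8 way. Ruby::get() only fails on a
// thread where the Ruby VM is unavailable, so unwrap() mirrors the macro above.
fn runtime_error(message: &str) -> Error {
    let ruby = Ruby::get().unwrap();
    Error::new(ruby.exception_runtime_error(), message.to_string())
}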
@@ -13,6 +13,7 @@ use std::time;
 
 use chrono::{DateTime, Duration, FixedOffset, Utc};
 use delta_kernel::schema::StructField;
+use delta_kernel::table_properties::DataSkippingNumIndexedCols;
 use deltalake::arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream};
 use deltalake::arrow::record_batch::RecordBatchIterator;
 use deltalake::checkpoints::{cleanup_metadata, create_checkpoint};
@@ -21,6 +22,7 @@ use deltalake::datafusion::prelude::SessionContext;
 use deltalake::delta_datafusion::DeltaCdfTableProvider;
 use deltalake::errors::DeltaTableError;
 use deltalake::kernel::transaction::{CommitProperties, TableReference};
+use deltalake::kernel::StructDataExt;
 use deltalake::kernel::{scalars::ScalarExt, StructType, Transaction};
 use deltalake::logstore::IORuntime;
 use deltalake::logstore::LogStoreRef;
@@ -40,17 +42,20 @@ use deltalake::parquet::basic::Compression;
 use deltalake::parquet::errors::ParquetError;
 use deltalake::parquet::file::properties::WriterProperties;
 use deltalake::partitions::PartitionFilter;
+use deltalake::table::config::TablePropertiesExt;
+use deltalake::table::state::DeltaTableState;
 use deltalake::{DeltaOps, DeltaResult};
 use error::DeltaError;
 use futures::future::join_all;
+use futures::TryStreamExt;
 
 use magnus::{
-    function, method, prelude::*, typed_data::Obj, Error as RbErr, Integer, Module, RArray, Ruby,
-    TryConvert, Value,
+    function, method, prelude::*, try_convert::TryConvertOwned, typed_data::Obj, Error as RbErr,
+    Integer, Module, RArray, Ruby, TryConvert, Value,
 };
 use serde_json::Map;
 
-use crate::error::{DeltaProtocolError, RbValueError, RubyError};
+use crate::error::{RbValueError, RubyError};
 use crate::features::TableFeatures;
 use crate::merge::RbMergeBuilder;
 use crate::schema::{schema_to_rbobject, Field};
@@ -73,6 +78,8 @@ impl TryConvert for PartitionFilterValue {
     }
 }
 
+unsafe impl TryConvertOwned for PartitionFilterValue {}
+
 #[magnus::wrap(class = "DeltaLake::RawDeltaTable")]
 struct RawDeltaTable {
     _table: RefCell<deltalake::DeltaTable>,
@@ -121,6 +128,15 @@ impl RawDeltaTable {
         func(&self._table.borrow())
     }
 
+    fn cloned_state(&self) -> RbResult<DeltaTableState> {
+        self.with_table(|t| {
+            t.snapshot()
+                .cloned()
+                .map_err(RubyError::from)
+                .map_err(RbErr::from)
+        })
+    }
+
     fn log_store(&self) -> RbResult<LogStoreRef> {
         self.with_table(|t| Ok(t.log_store().clone()))
     }
@@ -190,10 +206,8 @@ impl RawDeltaTable {
 
     pub fn metadata(&self) -> RbResult<RawDeltaTableMetaData> {
         let metadata = self.with_table(|t| {
-            t.metadata()
-                .cloned()
-                .map_err(RubyError::from)
-                .map_err(RbErr::from)
+            let snapshot = t.snapshot().map_err(RubyError::from).map_err(RbErr::from)?;
+            Ok(snapshot.metadata().clone())
         })?;
         Ok(RawDeltaTableMetaData {
             id: metadata.id().to_string(),
@@ -207,10 +221,8 @@ impl RawDeltaTable {
 
     pub fn protocol_versions(&self) -> RbResult<(i32, i32, Option<StringVec>, Option<StringVec>)> {
         let table_protocol = self.with_table(|t| {
-            t.protocol()
-                .cloned()
-                .map_err(RubyError::from)
-                .map_err(RbErr::from)
+            let snapshot = t.snapshot().map_err(RubyError::from).map_err(RbErr::from)?;
+            Ok(snapshot.protocol().clone())
        })?;
         Ok((
             table_protocol.min_reader_version(),
@@ -250,10 +262,15 @@ impl RawDeltaTable {
 
     pub fn get_num_index_cols(&self) -> RbResult<i32> {
         self.with_table(|t| {
-            Ok(t.snapshot()
+            let n_cols = t
+                .snapshot()
                 .map_err(RubyError::from)?
                 .config()
-                .num_indexed_cols())
+                .num_indexed_cols();
+            Ok(match n_cols {
+                DataSkippingNumIndexedCols::NumColumns(n_cols) => n_cols as i32,
+                DataSkippingNumIndexedCols::AllColumns => -1,
+            })
         })
     }
 
@@ -262,7 +279,8 @@ impl RawDeltaTable {
             Ok(t.snapshot()
                 .map_err(RubyError::from)?
                 .config()
-                .stats_columns()
+                .data_skipping_stats_columns
+                .as_ref()
                 .map(|v| v.iter().map(|s| s.to_string()).collect::<Vec<String>>()))
         })
     }
@@ -289,9 +307,12 @@ impl RawDeltaTable {
         let filters = convert_partition_filters(filters).map_err(RubyError::from)?;
         Ok(self
             .with_table(|t| {
-                t.get_files_by_partitions(&filters)
-                    .map_err(RubyError::from)
-                    .map_err(RbErr::from)
+                rt().block_on(async {
+                    t.get_files_by_partitions(&filters)
+                        .await
+                        .map_err(RubyError::from)
+                        .map_err(RbErr::from)
+                })
             })?
             .into_iter()
             .map(|p| p.to_string())
@@ -300,8 +321,9 @@ impl RawDeltaTable {
         Ok(self
             ._table
             .borrow()
-            .get_files_iter()
+            .snapshot()
             .map_err(RubyError::from)?
+            .file_paths_iter()
             .map(|f| f.to_string())
             .collect())
     }
@@ -318,9 +340,12 @@ impl RawDeltaTable {
         if let Some(filters) = partition_filters {
             let filters = convert_partition_filters(filters).map_err(RubyError::from)?;
             self.with_table(|t| {
-                t.get_file_uris_by_partitions(&filters)
-                    .map_err(RubyError::from)
-                    .map_err(RbErr::from)
+                rt().block_on(async {
+                    t.get_file_uris_by_partitions(&filters)
+                        .await
+                        .map_err(RubyError::from)
+                        .map_err(RbErr::from)
+                })
             })
         } else {
             self.with_table(|t| {
@@ -332,14 +357,12 @@ impl RawDeltaTable {
         }
     }
 
-    pub fn schema(&self) -> RbResult<Value> {
-        let schema: StructType = self.with_table(|t| {
-            t.get_schema()
-                .map_err(RubyError::from)
-                .map_err(RbErr::from)
-                .map(|s| s.to_owned())
+    pub fn schema(ruby: &Ruby, rb_self: &Self) -> RbResult<Value> {
+        let schema: StructType = rb_self.with_table(|t| {
+            let snapshot = t.snapshot().map_err(RubyError::from).map_err(RbErr::from)?;
+            Ok(snapshot.schema().clone())
         })?;
-        schema_to_rbobject(schema.to_owned())
+        schema_to_rbobject(schema.to_owned(), ruby)
     }
 
     pub fn vacuum(
@@ -378,7 +401,7 @@ impl RawDeltaTable {
     pub fn compact_optimize(
         &self,
         partition_filters: Option<Vec<(String, String, PartitionFilterValue)>>,
-        target_size: Option<i64>,
+        target_size: Option<u64>,
         max_concurrent_tasks: Option<usize>,
         min_commit_interval: Option<u64>,
         writer_properties: Option<RbWriterProperties>,
@@ -427,7 +450,7 @@ impl RawDeltaTable {
         &self,
         z_order_columns: Vec<String>,
         partition_filters: Option<Vec<(String, String, PartitionFilterValue)>>,
-        target_size: Option<i64>,
+        target_size: Option<u64>,
         max_concurrent_tasks: Option<usize>,
         max_spill_size: usize,
         min_commit_interval: Option<u64>,
@@ -724,17 +747,18 @@ impl RawDeltaTable {
             .map_err(RubyError::from)?)
     }
 
-    fn get_active_partitions(&self) -> RbResult<RArray> {
-        let binding = self._table.borrow();
-        let _column_names: HashSet<&str> = binding
-            .get_schema()
-            .map_err(|_| DeltaProtocolError::new_err("table does not yet have a schema"))?
-            .fields()
-            .map(|field| field.name().as_str())
-            .collect();
-        let partition_columns: HashSet<&str> = binding
-            .metadata()
-            .map_err(RubyError::from)?
+    fn get_active_partitions(ruby: &Ruby, rb_self: &Self) -> RbResult<RArray> {
+        let schema = rb_self.with_table(|t| {
+            let snapshot = t.snapshot().map_err(RubyError::from).map_err(RbErr::from)?;
+            Ok(snapshot.schema().clone())
+        })?;
+        let metadata = rb_self.with_table(|t| {
+            let snapshot = t.snapshot().map_err(RubyError::from).map_err(RbErr::from)?;
+            Ok(snapshot.metadata().clone())
+        })?;
+        let _column_names: HashSet<&str> =
+            schema.fields().map(|field| field.name().as_str()).collect();
+        let partition_columns: HashSet<&str> = metadata
             .partition_columns()
             .iter()
             .map(|col| col.as_str())
@@ -744,12 +768,15 @@ impl RawDeltaTable {
 
         let partition_columns: Vec<&str> = partition_columns.into_iter().collect();
 
-        let adds = binding
-            .snapshot()
-            .map_err(RubyError::from)?
-            .get_active_add_actions_by_partitions(&converted_filters)
-            .map_err(RubyError::from)?
-            .collect::<Result<Vec<_>, _>>()
+        let state = rb_self.cloned_state()?;
+        let log_store = rb_self.log_store()?;
+        let adds: Vec<_> = rt()
+            .block_on(async {
+                state
+                    .get_active_add_actions_by_partitions(&log_store, &converted_filters)
+                    .try_collect()
+                    .await
+            })
            .map_err(RubyError::from)?;
         let active_partitions: HashSet<Vec<(&str, Option<String>)>> = adds
             .iter()
@@ -757,21 +784,22 @@ impl RawDeltaTable {
                 Ok::<_, RubyError>(
                     partition_columns
                         .iter()
-                        .flat_map(|col| {
-                            Ok::<_, RubyError>((
+                        .map(|col| {
+                            (
                                 *col,
                                 add.partition_values()
-                                    .map_err(RubyError::from)?
-                                    .get(*col)
+                                    .and_then(|v| {
+                                        v.index_of(col).and_then(|idx| v.value(idx).cloned())
+                                    })
                                     .map(|v| v.serialize()),
-                            ))
+                            )
                         })
                         .collect(),
                 )
             })
             .collect();
 
-        Ok(RArray::from_iter(active_partitions))
+        Ok(ruby.ary_from_iter(active_partitions))
     }
 
     pub fn create_checkpoint(&self) -> RbResult<()> {
@@ -789,15 +817,20 @@ impl RawDeltaTable {
     }
 
     pub fn get_add_file_sizes(&self) -> RbResult<HashMap<String, i64>> {
-        Ok(self
-            ._table
-            .borrow()
-            .snapshot()
-            .map_err(RubyError::from)?
-            .eager_snapshot()
-            .files()
-            .map(|f| (f.path().to_string(), f.size()))
-            .collect::<HashMap<String, i64>>())
+        self.with_table(|t| {
+            let log_store = t.log_store();
+            let sizes: HashMap<String, i64> = rt()
+                .block_on(async {
+                    t.snapshot()?
+                        .snapshot()
+                        .files(&log_store, None)
+                        .map_ok(|f| (f.path().to_string(), f.size()))
+                        .try_collect()
+                        .await
+                })
+                .map_err(RubyError::from)?;
+            Ok(sizes)
+        })
    }
 
     pub fn delete(
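
Note on the table hunks above: deltalake 0.28 made several snapshot APIs async (get_files_by_partitions, get_file_uris_by_partitions, and the file listing behind get_add_file_sizes), so the bindings now wrap them in rt().block_on and collect fallible streams with futures::TryStreamExt. A self-contained sketch of that blocking-collect pattern, with a hypothetical in-memory stream standing in for the snapshot's file stream:

use std::collections::HashMap;

use futures::stream::{self, TryStreamExt};
use tokio::runtime::Runtime;

fn main() {
    let rt = Runtime::new().expect("Failed to create a tokio runtime.");

    // Stand-in for the fallible stream of file metadata; each item is a
    // Result, so one bad entry aborts the whole collect.
    let files = stream::iter([
        Ok::<_, std::io::Error>(("part-0000.parquet", 1024_i64)),
        Ok(("part-0001.parquet", 2048)),
    ]);

    // map_ok transforms only Ok items; try_collect drives the stream to
    // completion and stops at the first Err.
    let sizes: Result<HashMap<String, i64>, _> = rt.block_on(
        files
            .map_ok(|(path, size)| (path.to_string(), size))
            .try_collect(),
    );
    println!("{sizes:?}");
}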
@@ -1397,6 +1430,7 @@ fn init(ruby: &Ruby) -> RbResult<()> {
     class.define_method("to_i", method!(ArrowArrayStream::to_i, 0))?;
 
     let class = module.define_class("Field", ruby.class_object())?;
+    class.define_singleton_method("new", function!(Field::new, 2))?;
     class.define_method("name", method!(Field::name, 0))?;
     class.define_method("type", method!(Field::get_type, 0))?;
     class.define_method("nullable", method!(Field::nullable, 0))?;
@@ -1,20 +1,53 @@
-use deltalake::kernel::{StructField, StructType as DeltaStructType};
-use magnus::{value::ReprValue, Module, RArray, RModule, Ruby, Value};
+use deltalake::kernel::{
+    DataType, PrimitiveType as DeltaPrimitive, StructField, StructType as DeltaStructType,
+};
+use magnus::{value::ReprValue, Module, RModule, Ruby, TryConvert, Value};
 
-use crate::RbResult;
+use crate::{RbResult, RbValueError};
 
-pub fn schema_to_rbobject(schema: DeltaStructType) -> RbResult<Value> {
+pub fn schema_to_rbobject(schema: DeltaStructType, ruby: &Ruby) -> RbResult<Value> {
     let fields = schema.fields().map(|field| Field {
         inner: field.clone(),
     });
 
-    let rb_schema: Value = Ruby::get()
-        .unwrap()
+    let rb_schema: Value = ruby
         .class_object()
         .const_get::<_, RModule>("DeltaLake")?
         .const_get("Schema")?;
 
-    rb_schema.funcall("new", (RArray::from_iter(fields),))
+    rb_schema.funcall("new", (ruby.ary_from_iter(fields),))
+}
+
+fn ruby_type_to_schema(ob: Value) -> RbResult<DataType> {
+    if let Ok(raw_primitive) = String::try_convert(ob) {
+        // Pass through PrimitiveType::new() to do validation
+        return PrimitiveType::new(raw_primitive)
+            .map(|data_type| DataType::Primitive(data_type.inner_type));
+    }
+    Err(RbValueError::new_err("Invalid data type"))
+}
+
+pub struct PrimitiveType {
+    inner_type: DeltaPrimitive,
+}
+
+impl PrimitiveType {
+    fn new(data_type: String) -> RbResult<Self> {
+        let data_type: DeltaPrimitive =
+            serde_json::from_str(&format!("\"{data_type}\"")).map_err(|_| {
+                if data_type.starts_with("decimal") {
+                    RbValueError::new_err(format!(
+                        "invalid type string: {data_type}, precision/scale can't be larger than 38"
+                    ))
+                } else {
+                    RbValueError::new_err(format!("invalid type string: {data_type}"))
+                }
+            })?;
+
+        Ok(Self {
+            inner_type: data_type,
+        })
+    }
 }
 
 #[magnus::wrap(class = "DeltaLake::Field")]
@@ -23,6 +56,13 @@ pub struct Field {
 }
 
 impl Field {
+    pub fn new(name: String, r#type: Value) -> RbResult<Self> {
+        let ty = ruby_type_to_schema(r#type)?;
+        Ok(Self {
+            inner: StructField::new(name, ty, true),
+        })
+    }
+
     pub fn name(&self) -> String {
         self.inner.name().to_string()
     }
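
Note on the new PrimitiveType above: it validates a raw type name by wrapping it in quotes and letting serde_json deserialize it, so the kernel's own Deserialize impl decides which names are legal. A minimal sketch of the same trick with a hypothetical three-variant enum (not delta_kernel's type):

use serde::Deserialize;

// Hypothetical stand-in for delta_kernel's PrimitiveType.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "lowercase")]
enum Primitive {
    String,
    Integer,
    Boolean,
}

// Quoting the raw name turns it into a JSON string literal, so the derived
// Deserialize impl performs the validation.
fn parse_primitive(raw: &str) -> Result<Primitive, String> {
    serde_json::from_str(&format!("\"{raw}\"")).map_err(|_| format!("invalid type string: {raw}"))
}

fn main() {
    assert!(parse_primitive("integer").is_ok());
    assert!(parse_primitive("uuid").is_err());
}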
@@ -10,11 +10,10 @@ pub fn rt() -> &'static Runtime {
     let runtime_pid = *PID.get_or_init(|| pid);
     if pid != runtime_pid {
         panic!(
-            "Forked process detected - current PID is {} but the tokio runtime was created by {}. The tokio \
+            "Forked process detected - current PID is {pid} but the tokio runtime was created by {runtime_pid}. The tokio \
             runtime does not support forked processes https://github.com/tokio-rs/tokio/issues/4301. If you are \
             seeing this message while using Ruby multithreading make sure to use the `spawn` or `forkserver` \
-            mode.",
-            pid, runtime_pid
+            mode."
         );
     }
     TOKIO_RT.get_or_init(|| Runtime::new().expect("Failed to create a tokio runtime."))
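
For context, rt() guards a process-wide tokio runtime against reuse after fork(); this hunk only inlines the format arguments in the panic message. A sketch of the whole pattern, assuming OnceLock statics that the hunk does not show:

use std::sync::OnceLock;

use tokio::runtime::Runtime;

// The statics are assumed; the hunk above shows only the body of rt().
static TOKIO_RT: OnceLock<Runtime> = OnceLock::new();
static PID: OnceLock<u32> = OnceLock::new();

pub fn rt() -> &'static Runtime {
    let pid = std::process::id();
    // Remember which process created the runtime; a forked child sees a
    // different PID and must not reuse the parent's runtime.
    let runtime_pid = *PID.get_or_init(|| pid);
    if pid != runtime_pid {
        panic!("Forked process detected - current PID is {pid} but the tokio runtime was created by {runtime_pid}.");
    }
    TOKIO_RT.get_or_init(|| Runtime::new().expect("Failed to create a tokio runtime."))
}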
@@ -195,6 +195,7 @@ module DeltaLake
 
       sources = file_uris
       if sources.empty?
+        # TODO pass schema
         lf = Polars::LazyFrame.new
       else
         delta_keys = [
@@ -209,7 +210,8 @@ module DeltaLake
           sources,
           hive_partitioning: true,
           storage_options: storage_options,
-          rechunk: rechunk
+          rechunk: rechunk,
+          allow_missing_columns: true
         )
 
         if columns
@@ -1,3 +1,3 @@
 module DeltaLake
-  VERSION = "0.2.0"
+  VERSION = "0.2.2"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: deltalake-rb
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.2.2
 platform: ruby
 authors:
 - Andrew Kane
@@ -71,7 +71,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.6.7
+rubygems_version: 3.6.9
 specification_version: 4
 summary: Delta Lake for Ruby
 test_files: []