deltalake-rb 0.1.7 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,21 +1,21 @@
1
1
  [package]
2
2
  name = "deltalake"
3
- version = "0.1.7"
3
+ version = "0.2.0"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
7
- rust-version = "1.82.0"
7
+ rust-version = "1.82"
8
8
  publish = false
9
9
 
10
10
  [lib]
11
11
  crate-type = ["cdylib"]
12
12
 
13
13
  [dependencies]
14
- arrow = { version = "55", features = ["ffi"] }
15
- arrow-schema = { version = "55", features = ["serde"] }
14
+ arrow = { version = "55.2", features = ["ffi"] }
15
+ arrow-schema = { version = "55.2", features = ["serde"] }
16
16
  chrono = "0.4"
17
- delta_kernel = "=0.10.0"
18
- deltalake = { version = "=0.26.0", features = ["azure", "datafusion", "gcs", "s3"] }
17
+ delta_kernel = "0.13"
18
+ deltalake = { version = "=0.27.0", features = ["azure", "datafusion", "gcs", "s3"] }
19
19
  futures = "0.3"
20
20
  magnus = "0.7"
21
21
  num_cpus = "1"
@@ -1,6 +1,5 @@
1
1
  use arrow_schema::ArrowError;
2
2
  use deltalake::datafusion::error::DataFusionError;
3
- use deltalake::protocol::ProtocolError;
4
3
  use deltalake::{errors::DeltaTableError, ObjectStoreError};
5
4
  use magnus::{exception, Error as RbErr, Module, RModule, Ruby};
6
5
  use std::borrow::Cow;
@@ -81,31 +80,12 @@ fn arrow_to_rb(err: ArrowError) -> RbErr {
81
80
  }
82
81
  }
83
82
 
84
- fn checkpoint_to_rb(err: ProtocolError) -> RbErr {
85
- match err {
86
- ProtocolError::Arrow { source } => arrow_to_rb(source),
87
- ProtocolError::ObjectStore { source } => object_store_to_rb(source),
88
- ProtocolError::EndOfLog => DeltaProtocolError::new_err("End of log"),
89
- ProtocolError::NoMetaData => DeltaProtocolError::new_err("Table metadata missing"),
90
- ProtocolError::CheckpointNotFound => DeltaProtocolError::new_err(err.to_string()),
91
- ProtocolError::InvalidField(err) => RbValueError::new_err(err),
92
- ProtocolError::InvalidRow(err) => RbValueError::new_err(err),
93
- ProtocolError::InvalidDeletionVectorStorageType(err) => RbValueError::new_err(err),
94
- ProtocolError::SerializeOperation { source } => RbValueError::new_err(source.to_string()),
95
- ProtocolError::ParquetParseError { source } => RbIOError::new_err(source.to_string()),
96
- ProtocolError::IO { source } => RbIOError::new_err(source.to_string()),
97
- ProtocolError::Generic(msg) => DeltaError::new_err(msg),
98
- ProtocolError::Kernel { source } => DeltaError::new_err(source.to_string()),
99
- }
100
- }
101
-
102
83
  fn datafusion_to_rb(err: DataFusionError) -> RbErr {
103
84
  DeltaError::new_err(err.to_string())
104
85
  }
105
86
 
106
87
  pub enum RubyError {
107
88
  DeltaTable(DeltaTableError),
108
- Protocol(ProtocolError),
109
89
  DataFusion(DataFusionError),
110
90
  }
111
91
 
@@ -115,12 +95,6 @@ impl From<DeltaTableError> for RubyError {
115
95
  }
116
96
  }
117
97
 
118
- impl From<ProtocolError> for RubyError {
119
- fn from(err: ProtocolError) -> Self {
120
- RubyError::Protocol(err)
121
- }
122
- }
123
-
124
98
  impl From<DataFusionError> for RubyError {
125
99
  fn from(err: DataFusionError) -> Self {
126
100
  RubyError::DataFusion(err)
@@ -131,7 +105,6 @@ impl From<RubyError> for RbErr {
131
105
  fn from(value: RubyError) -> Self {
132
106
  match value {
133
107
  RubyError::DeltaTable(err) => inner_to_rb_err(err),
134
- RubyError::Protocol(err) => checkpoint_to_rb(err),
135
108
  RubyError::DataFusion(err) => datafusion_to_rb(err),
136
109
  }
137
110
  }
@@ -23,6 +23,7 @@ use deltalake::errors::DeltaTableError;
23
23
  use deltalake::kernel::transaction::{CommitProperties, TableReference};
24
24
  use deltalake::kernel::{scalars::ScalarExt, StructType, Transaction};
25
25
  use deltalake::logstore::IORuntime;
26
+ use deltalake::logstore::LogStoreRef;
26
27
  use deltalake::operations::add_column::AddColumnBuilder;
27
28
  use deltalake::operations::add_feature::AddTableFeatureBuilder;
28
29
  use deltalake::operations::collect_sendable_stream;
@@ -44,20 +45,18 @@ use error::DeltaError;
44
45
  use futures::future::join_all;
45
46
 
46
47
  use magnus::{
47
- function, method, prelude::*, typed_data::Obj, Error, Integer, Module, RArray, RHash, Ruby,
48
+ function, method, prelude::*, typed_data::Obj, Error as RbErr, Integer, Module, RArray, Ruby,
48
49
  TryConvert, Value,
49
50
  };
50
51
  use serde_json::Map;
51
52
 
52
- use crate::error::DeltaProtocolError;
53
- use crate::error::RbValueError;
54
- use crate::error::RubyError;
53
+ use crate::error::{DeltaProtocolError, RbValueError, RubyError};
55
54
  use crate::features::TableFeatures;
56
55
  use crate::merge::RbMergeBuilder;
57
56
  use crate::schema::{schema_to_rbobject, Field};
58
57
  use crate::utils::rt;
59
58
 
60
- type RbResult<T> = Result<T, Error>;
59
+ type RbResult<T> = Result<T, RbErr>;
61
60
 
62
61
  enum PartitionFilterValue {
63
62
  Single(String),
@@ -86,7 +85,7 @@ struct RawDeltaTableMetaData {
86
85
  description: Option<String>,
87
86
  partition_columns: Vec<String>,
88
87
  created_time: Option<i64>,
89
- configuration: HashMap<String, Option<String>>,
88
+ configuration: HashMap<String, String>,
90
89
  }
91
90
 
92
91
  impl RawDeltaTableMetaData {
@@ -110,13 +109,23 @@ impl RawDeltaTableMetaData {
110
109
  self.created_time
111
110
  }
112
111
 
113
- fn configuration(&self) -> HashMap<String, Option<String>> {
112
+ fn configuration(&self) -> HashMap<String, String> {
114
113
  self.configuration.clone()
115
114
  }
116
115
  }
117
116
 
118
117
  type StringVec = Vec<String>;
119
118
 
119
+ impl RawDeltaTable {
120
+ fn with_table<T>(&self, func: impl Fn(&deltalake::DeltaTable) -> RbResult<T>) -> RbResult<T> {
121
+ func(&self._table.borrow())
122
+ }
123
+
124
+ fn log_store(&self) -> RbResult<LogStoreRef> {
125
+ self.with_table(|t| Ok(t.log_store().clone()))
126
+ }
127
+ }
128
+
120
129
  impl RawDeltaTable {
121
130
  pub fn new(
122
131
  table_uri: String,
@@ -168,60 +177,62 @@ impl RawDeltaTable {
168
177
  }
169
178
 
170
179
  pub fn table_uri(&self) -> RbResult<String> {
171
- Ok(self._table.borrow().table_uri())
180
+ self.with_table(|t| Ok(t.table_uri()))
172
181
  }
173
182
 
174
- pub fn version(&self) -> RbResult<i64> {
175
- Ok(self._table.borrow().version())
183
+ pub fn version(&self) -> RbResult<Option<i64>> {
184
+ self.with_table(|t| Ok(t.version()))
176
185
  }
177
186
 
178
187
  pub fn has_files(&self) -> RbResult<bool> {
179
- Ok(self._table.borrow().config.require_files)
188
+ self.with_table(|t| Ok(t.config.require_files))
180
189
  }
181
190
 
182
191
  pub fn metadata(&self) -> RbResult<RawDeltaTableMetaData> {
183
- let binding = self._table.borrow();
184
- let metadata = binding.metadata().map_err(RubyError::from)?;
192
+ let metadata = self.with_table(|t| {
193
+ t.metadata()
194
+ .cloned()
195
+ .map_err(RubyError::from)
196
+ .map_err(RbErr::from)
197
+ })?;
185
198
  Ok(RawDeltaTableMetaData {
186
- id: metadata.id.clone(),
187
- name: metadata.name.clone(),
188
- description: metadata.description.clone(),
189
- partition_columns: metadata.partition_columns.clone(),
190
- created_time: metadata.created_time,
191
- configuration: metadata.configuration.clone(),
199
+ id: metadata.id().to_string(),
200
+ name: metadata.name().map(String::from),
201
+ description: metadata.description().map(String::from),
202
+ partition_columns: metadata.partition_columns().clone(),
203
+ created_time: metadata.created_time(),
204
+ configuration: metadata.configuration().clone(),
192
205
  })
193
206
  }
194
207
 
195
208
  pub fn protocol_versions(&self) -> RbResult<(i32, i32, Option<StringVec>, Option<StringVec>)> {
196
- let binding = self._table.borrow();
197
- let table_protocol = binding.protocol().map_err(RubyError::from)?;
209
+ let table_protocol = self.with_table(|t| {
210
+ t.protocol()
211
+ .cloned()
212
+ .map_err(RubyError::from)
213
+ .map_err(RbErr::from)
214
+ })?;
198
215
  Ok((
199
- table_protocol.min_reader_version,
200
- table_protocol.min_writer_version,
201
- table_protocol
202
- .writer_features
203
- .as_ref()
204
- .and_then(|features| {
205
- let empty_set = !features.is_empty();
206
- empty_set.then(|| {
207
- features
208
- .iter()
209
- .map(|v| v.to_string())
210
- .collect::<Vec<String>>()
211
- })
212
- }),
213
- table_protocol
214
- .reader_features
215
- .as_ref()
216
- .and_then(|features| {
217
- let empty_set = !features.is_empty();
218
- empty_set.then(|| {
219
- features
220
- .iter()
221
- .map(|v| v.to_string())
222
- .collect::<Vec<String>>()
223
- })
224
- }),
216
+ table_protocol.min_reader_version(),
217
+ table_protocol.min_writer_version(),
218
+ table_protocol.writer_features().and_then(|features| {
219
+ let empty_set = !features.is_empty();
220
+ empty_set.then(|| {
221
+ features
222
+ .iter()
223
+ .map(|v| v.to_string())
224
+ .collect::<Vec<String>>()
225
+ })
226
+ }),
227
+ table_protocol.reader_features().and_then(|features| {
228
+ let empty_set = !features.is_empty();
229
+ empty_set.then(|| {
230
+ features
231
+ .iter()
232
+ .map(|v| v.to_string())
233
+ .collect::<Vec<String>>()
234
+ })
235
+ }),
225
236
  ))
226
237
  }
227
238
 
@@ -237,31 +248,23 @@ impl RawDeltaTable {
237
248
  .map_err(RubyError::from)?)
238
249
  }
239
250
 
240
- pub fn get_earliest_version(&self) -> RbResult<i64> {
241
- Ok(rt()
242
- .block_on(self._table.borrow().get_earliest_version())
243
- .map_err(RubyError::from)?)
244
- }
245
-
246
251
  pub fn get_num_index_cols(&self) -> RbResult<i32> {
247
- Ok(self
248
- ._table
249
- .borrow()
250
- .snapshot()
251
- .map_err(RubyError::from)?
252
- .config()
253
- .num_indexed_cols())
252
+ self.with_table(|t| {
253
+ Ok(t.snapshot()
254
+ .map_err(RubyError::from)?
255
+ .config()
256
+ .num_indexed_cols())
257
+ })
254
258
  }
255
259
 
256
260
  pub fn get_stats_columns(&self) -> RbResult<Option<Vec<String>>> {
257
- Ok(self
258
- ._table
259
- .borrow()
260
- .snapshot()
261
- .map_err(RubyError::from)?
262
- .config()
263
- .stats_columns()
264
- .map(|v| v.iter().map(|v| v.to_string()).collect::<Vec<String>>()))
261
+ self.with_table(|t| {
262
+ Ok(t.snapshot()
263
+ .map_err(RubyError::from)?
264
+ .config()
265
+ .stats_columns()
266
+ .map(|v| v.iter().map(|s| s.to_string()).collect::<Vec<String>>()))
267
+ })
265
268
  }
266
269
 
267
270
  pub fn load_with_datetime(&self, ds: String) -> RbResult<()> {
@@ -285,10 +288,11 @@ impl RawDeltaTable {
285
288
  if let Some(filters) = partition_filters {
286
289
  let filters = convert_partition_filters(filters).map_err(RubyError::from)?;
287
290
  Ok(self
288
- ._table
289
- .borrow()
290
- .get_files_by_partitions(&filters)
291
- .map_err(RubyError::from)?
291
+ .with_table(|t| {
292
+ t.get_files_by_partitions(&filters)
293
+ .map_err(RubyError::from)
294
+ .map_err(RbErr::from)
295
+ })?
292
296
  .into_iter()
293
297
  .map(|p| p.to_string())
294
298
  .collect())
@@ -307,30 +311,34 @@ impl RawDeltaTable {
307
311
  &self,
308
312
  partition_filters: Option<Vec<(String, String, PartitionFilterValue)>>,
309
313
  ) -> RbResult<Vec<String>> {
310
- if !self._table.borrow().config.require_files {
314
+ if !self.with_table(|t| Ok(t.config.require_files))? {
311
315
  return Err(DeltaError::new_err("Table is initiated without files."));
312
316
  }
313
317
 
314
318
  if let Some(filters) = partition_filters {
315
319
  let filters = convert_partition_filters(filters).map_err(RubyError::from)?;
316
- Ok(self
317
- ._table
318
- .borrow()
319
- .get_file_uris_by_partitions(&filters)
320
- .map_err(RubyError::from)?)
320
+ self.with_table(|t| {
321
+ t.get_file_uris_by_partitions(&filters)
322
+ .map_err(RubyError::from)
323
+ .map_err(RbErr::from)
324
+ })
321
325
  } else {
322
- Ok(self
323
- ._table
324
- .borrow()
325
- .get_file_uris()
326
- .map_err(RubyError::from)?
327
- .collect())
326
+ self.with_table(|t| {
327
+ Ok(t.get_file_uris()
328
+ .map_err(RubyError::from)
329
+ .map_err(RbErr::from)?
330
+ .collect::<Vec<String>>())
331
+ })
328
332
  }
329
333
  }
330
334
 
331
335
  pub fn schema(&self) -> RbResult<Value> {
332
- let binding = self._table.borrow();
333
- let schema: &StructType = binding.get_schema().map_err(RubyError::from)?;
336
+ let schema: StructType = self.with_table(|t| {
337
+ t.get_schema()
338
+ .map_err(RubyError::from)
339
+ .map_err(RbErr::from)
340
+ .map(|s| s.to_owned())
341
+ })?;
334
342
  schema_to_rbobject(schema.to_owned())
335
343
  }
336
344
 
@@ -727,7 +735,7 @@ impl RawDeltaTable {
727
735
  let partition_columns: HashSet<&str> = binding
728
736
  .metadata()
729
737
  .map_err(RubyError::from)?
730
- .partition_columns
738
+ .partition_columns()
731
739
  .iter()
732
740
  .map(|col| col.as_str())
733
741
  .collect();
@@ -874,14 +882,13 @@ impl RawDeltaTable {
874
882
  Ok(serde_json::to_string(&metrics).unwrap())
875
883
  }
876
884
 
877
- pub fn transaction_versions(&self) -> RHash {
878
- RHash::from_iter(
879
- self._table
880
- .borrow()
881
- .get_app_transaction_version()
882
- .into_iter()
883
- .map(|(app_id, transaction)| (app_id, RbTransaction::from(transaction))),
884
- )
885
+ pub fn transaction_version(&self, app_id: String) -> RbResult<Option<i64>> {
886
+ // NOTE: this will simplify once we have moved logstore onto state.
887
+ let log_store = self.log_store()?;
888
+ let snapshot = self.with_table(|t| Ok(t.snapshot().map_err(RubyError::from)?.clone()))?;
889
+ Ok(rt()
890
+ .block_on(snapshot.transaction_version(log_store.as_ref(), app_id))
891
+ .map_err(RubyError::from)?)
885
892
  }
886
893
  }
887
894
 
@@ -1293,10 +1300,6 @@ fn init(ruby: &Ruby) -> RbResult<()> {
1293
1300
  "get_latest_version",
1294
1301
  method!(RawDeltaTable::get_latest_version, 0),
1295
1302
  )?;
1296
- class.define_method(
1297
- "get_earliest_version",
1298
- method!(RawDeltaTable::get_earliest_version, 0),
1299
- )?;
1300
1303
  class.define_method(
1301
1304
  "get_num_index_cols",
1302
1305
  method!(RawDeltaTable::get_num_index_cols, 0),
@@ -1366,8 +1369,8 @@ fn init(ruby: &Ruby) -> RbResult<()> {
1366
1369
  )?;
1367
1370
  class.define_method("repair", method!(RawDeltaTable::repair, 3))?;
1368
1371
  class.define_method(
1369
- "transaction_versions",
1370
- method!(RawDeltaTable::transaction_versions, 0),
1372
+ "transaction_version",
1373
+ method!(RawDeltaTable::transaction_version, 1),
1371
1374
  )?;
1372
1375
 
1373
1376
  let class = module.define_class("RawDeltaTableMetaData", ruby.class_object())?;
@@ -204,7 +204,13 @@ module DeltaLake
204
204
  "DELTA_DYNAMO_TABLE_NAME"
205
205
  ]
206
206
  storage_options = @storage_options&.reject { |k, _| delta_keys.include?(k.to_s.upcase) }
207
- lf = Polars.scan_parquet(sources, storage_options: storage_options, rechunk: rechunk)
207
+ lf =
208
+ Polars.scan_parquet(
209
+ sources,
210
+ hive_partitioning: true,
211
+ storage_options: storage_options,
212
+ rechunk: rechunk
213
+ )
208
214
 
209
215
  if columns
210
216
  # by_name requires polars-df > 0.15.0
@@ -253,8 +259,8 @@ module DeltaLake
253
259
  deserialized_metrics.transform_keys(&:to_sym)
254
260
  end
255
261
 
256
- def transaction_versions
257
- @table.transaction_versions
262
+ def transaction_version(app_id)
263
+ @table.transaction_version(app_id)
258
264
  end
259
265
 
260
266
  # private
@@ -1,3 +1,3 @@
1
1
  module DeltaLake
2
- VERSION = "0.1.7"
2
+ VERSION = "0.2.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: deltalake-rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
@@ -64,7 +64,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
64
64
  requirements:
65
65
  - - ">="
66
66
  - !ruby/object:Gem::Version
67
- version: '3.1'
67
+ version: '3.2'
68
68
  required_rubygems_version: !ruby/object:Gem::Requirement
69
69
  requirements:
70
70
  - - ">="