deltalake-rb 0.1.0 → 0.1.2

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -0,0 +1,205 @@
+use deltalake::arrow::array::RecordBatchReader;
+use deltalake::arrow::datatypes::Schema as ArrowSchema;
+use deltalake::arrow::ffi_stream::ArrowArrayStreamReader;
+use deltalake::datafusion::catalog::TableProvider;
+use deltalake::datafusion::datasource::MemTable;
+use deltalake::datafusion::prelude::SessionContext;
+use deltalake::logstore::LogStoreRef;
+use deltalake::operations::merge::MergeBuilder;
+use deltalake::table::state::DeltaTableState;
+use deltalake::{DeltaResult, DeltaTable};
+use std::cell::RefCell;
+use std::collections::HashMap;
+use std::future::IntoFuture;
+use std::sync::Arc;
+
+use crate::error::RubyError;
+use crate::utils::rt;
+use crate::RbResult;
+use crate::{
+    maybe_create_commit_properties, set_writer_properties, RbCommitProperties,
+    RbPostCommitHookProperties, RbWriterProperties,
+};
+
+#[magnus::wrap(class = "DeltaLake::RbMergeBuilder")]
+pub(crate) struct RbMergeBuilder {
+    _builder: RefCell<Option<MergeBuilder>>,
+    source_alias: Option<String>,
+    target_alias: Option<String>,
+    #[allow(dead_code)]
+    arrow_schema: Arc<ArrowSchema>,
+}
+
+// getters
+impl RbMergeBuilder {
+    pub fn source_alias(&self) -> Option<String> {
+        self.source_alias.clone()
+    }
+
+    pub fn target_alias(&self) -> Option<String> {
+        self.target_alias.clone()
+    }
+}
+
+impl RbMergeBuilder {
+    pub fn new(
+        log_store: LogStoreRef,
+        snapshot: DeltaTableState,
+        source: ArrowArrayStreamReader,
+        predicate: String,
+        source_alias: Option<String>,
+        target_alias: Option<String>,
+        safe_cast: bool,
+        writer_properties: Option<RbWriterProperties>,
+        post_commithook_properties: Option<RbPostCommitHookProperties>,
+        commit_properties: Option<RbCommitProperties>,
+    ) -> DeltaResult<Self> {
+        let ctx = SessionContext::new();
+        let schema = source.schema();
+        let batches = vec![source.map(|batch| batch.unwrap()).collect::<Vec<_>>()];
+        let table_provider: Arc<dyn TableProvider> =
+            Arc::new(MemTable::try_new(schema.clone(), batches).unwrap());
+        let source_df = ctx.read_table(table_provider).unwrap();
+
+        let mut cmd =
+            MergeBuilder::new(log_store, snapshot, predicate, source_df).with_safe_cast(safe_cast);
+
+        if let Some(src_alias) = &source_alias {
+            cmd = cmd.with_source_alias(src_alias);
+        }
+
+        if let Some(trgt_alias) = &target_alias {
+            cmd = cmd.with_target_alias(trgt_alias);
+        }
+
+        if let Some(writer_props) = writer_properties {
+            cmd = cmd.with_writer_properties(set_writer_properties(writer_props)?);
+        }
+
+        if let Some(commit_properties) =
+            maybe_create_commit_properties(commit_properties, post_commithook_properties)
+        {
+            cmd = cmd.with_commit_properties(commit_properties);
+        }
+
+        Ok(Self {
+            _builder: RefCell::new(Some(cmd)),
+            source_alias,
+            target_alias,
+            arrow_schema: schema,
+        })
+    }
+
+    pub fn execute(&self) -> DeltaResult<(DeltaTable, String)> {
+        let (table, metrics) = rt().block_on(self._builder.take().unwrap().into_future())?;
+        Ok((table, serde_json::to_string(&metrics).unwrap()))
+    }
+}
+
+impl RbMergeBuilder {
+    pub fn when_matched_update(
+        &self,
+        updates: HashMap<String, String>,
+        predicate: Option<String>,
+    ) -> RbResult<()> {
+        let mut binding = self._builder.borrow_mut();
+        *binding = match binding.take() {
+            Some(cmd) => Some(
+                cmd.when_matched_update(|mut update| {
+                    for (column, expression) in updates {
+                        update = update.update(column, expression)
+                    }
+                    if let Some(predicate) = predicate {
+                        update = update.predicate(predicate)
+                    };
+                    update
+                })
+                .map_err(RubyError::from)?,
+            ),
+            None => unreachable!(),
+        };
+        Ok(())
+    }
+
+    pub fn when_matched_delete(&self, predicate: Option<String>) -> RbResult<()> {
+        let mut binding = self._builder.borrow_mut();
+        *binding = match binding.take() {
+            Some(cmd) => Some(
+                cmd.when_matched_delete(|mut delete| {
+                    if let Some(predicate) = predicate {
+                        delete = delete.predicate(predicate)
+                    };
+                    delete
+                })
+                .map_err(RubyError::from)?,
+            ),
+            None => unreachable!(),
+        };
+        Ok(())
+    }
+
+    pub fn when_not_matched_insert(
+        &self,
+        updates: HashMap<String, String>,
+        predicate: Option<String>,
+    ) -> RbResult<()> {
+        let mut binding = self._builder.borrow_mut();
+        *binding = match binding.take() {
+            Some(cmd) => Some(
+                cmd.when_not_matched_insert(|mut insert| {
+                    for (column, expression) in updates {
+                        insert = insert.set(column, expression)
+                    }
+                    if let Some(predicate) = predicate {
+                        insert = insert.predicate(predicate)
+                    };
+                    insert
+                })
+                .map_err(RubyError::from)?,
+            ),
+            None => unreachable!(),
+        };
+        Ok(())
+    }
+
+    pub fn when_not_matched_by_source_update(
+        &self,
+        updates: HashMap<String, String>,
+        predicate: Option<String>,
+    ) -> RbResult<()> {
+        let mut binding = self._builder.borrow_mut();
+        *binding = match binding.take() {
+            Some(cmd) => Some(
+                cmd.when_not_matched_by_source_update(|mut update| {
+                    for (column, expression) in updates {
+                        update = update.update(column, expression)
+                    }
+                    if let Some(predicate) = predicate {
+                        update = update.predicate(predicate)
+                    };
+                    update
+                })
+                .map_err(RubyError::from)?,
+            ),
+            None => unreachable!(),
+        };
+        Ok(())
+    }
+
+    pub fn when_not_matched_by_source_delete(&self, predicate: Option<String>) -> RbResult<()> {
+        let mut binding = self._builder.borrow_mut();
+        *binding = match binding.take() {
+            Some(cmd) => Some(
+                cmd.when_not_matched_by_source_delete(|mut delete| {
+                    if let Some(predicate) = predicate {
+                        delete = delete.predicate(predicate)
+                    };
+                    delete
+                })
+                .map_err(RubyError::from)?,
+            ),
+            None => unreachable!(),
+        };
+        Ok(())
+    }
+}
@@ -26,22 +26,51 @@ module DeltaLake
       @table.version
     end
 
-    def files
-      @table.files
+    def partitions
+      partitions = []
+      @table.get_active_partitions.each do |partition|
+        next unless partition
+        partitions << partition.to_h
+      end
+      partitions
+    end
+
+    def files(partition_filters: nil)
+      @table.files(_stringify_partition_values(partition_filters))
     end
 
-    def file_uris
-      @table.file_uris
+    def file_uris(partition_filters: nil)
+      @table.file_uris(_stringify_partition_values(partition_filters))
     end
 
     def load_as_version(version)
       if version.is_a?(Integer)
         @table.load_version(version)
+      elsif version.is_a?(Time)
+        @table.load_with_datetime(version.utc.iso8601(9))
+      elsif version.is_a?(String)
+        @table.load_with_datetime(version)
       else
-        raise TypeError, "Invalid datatype provided for version, only Integer is accepted."
+        raise TypeError, "Invalid datatype provided for version, only Integer, String, and Time are accepted."
       end
     end
 
+    def load_cdf(
+      starting_version: 0,
+      ending_version: nil,
+      starting_timestamp: nil,
+      ending_timestamp: nil,
+      columns: nil
+    )
+      @table.load_cdf(
+        starting_version,
+        ending_version,
+        starting_timestamp,
+        ending_timestamp,
+        columns
+      )
+    end
+
     def table_uri
       @table.table_uri
     end
@@ -54,10 +83,35 @@ module DeltaLake
       Metadata.new(@table)
     end
 
+    def protocol
+      ProtocolVersions.new(*@table.protocol_versions)
+    end
+
+    def history(limit: nil)
+      backwards_enumerate = lambda do |iterable, start_end, &block|
+        n = start_end
+        iterable.each do |elem|
+          block.call(n, elem)
+          n -= 1
+        end
+      end
+
+      commits = @table.history(limit)
+      history = []
+      backwards_enumerate.(commits, @table.get_latest_version) do |version, commit_info_raw|
+        commit = JSON.parse(commit_info_raw)
+        commit["version"] = version
+        history << commit
+      end
+      history
+    end
+
     def vacuum(
       retention_hours: nil,
       dry_run: true,
-      enforce_retention_duration: true
+      enforce_retention_duration: true,
+      post_commithook_properties: nil,
+      commit_properties: nil
     )
       if retention_hours
         if retention_hours < 0
@@ -68,10 +122,72 @@ module DeltaLake
       @table.vacuum(
         dry_run,
         retention_hours,
-        enforce_retention_duration
+        enforce_retention_duration,
+        commit_properties,
+        post_commithook_properties
       )
     end
 
+    def optimize
+      TableOptimizer.new(self)
+    end
+
+    def alter
+      TableAlterer.new(self)
+    end
+
+    def merge(
+      source,
+      predicate,
+      source_alias: nil,
+      target_alias: nil,
+      error_on_type_mismatch: true,
+      writer_properties: nil,
+      post_commithook_properties: nil,
+      commit_properties: nil
+    )
+      source = Utils.convert_data(source)
+
+      rb_merge_builder =
+        @table.create_merge_builder(
+          source,
+          predicate,
+          source_alias,
+          target_alias,
+          !error_on_type_mismatch,
+          writer_properties,
+          post_commithook_properties,
+          commit_properties
+        )
+      TableMerger.new(rb_merge_builder, @table)
+    end
+
+    def restore(
+      target,
+      ignore_missing_files: false,
+      protocol_downgrade_allowed: false,
+      commit_properties: nil
+    )
+      if target.is_a?(Time)
+        metrics =
+          @table.restore(
+            target.utc.iso8601(9),
+            ignore_missing_files,
+            protocol_downgrade_allowed,
+            commit_properties
+          )
+      else
+        metrics =
+          @table.restore(
+            target,
+            ignore_missing_files,
+            protocol_downgrade_allowed,
+            commit_properties
+          )
+      end
+      JSON.parse(metrics)
+    end
+
     def to_polars(eager: true)
       require "polars-df"
 
@@ -80,7 +196,13 @@ module DeltaLake
         if sources.empty?
           Polars::LazyFrame.new
         else
-          storage_options = @storage_options&.except("AWS_S3_ALLOW_UNSAFE_RENAME")
+          delta_keys = [
+            "AWS_S3_ALLOW_UNSAFE_RENAME",
+            "AWS_S3_LOCKING_PROVIDER",
+            "CONDITIONAL_PUT",
+            "DELTA_DYNAMO_TABLE_NAME"
+          ]
+          storage_options = @storage_options&.reject { |k, _| delta_keys.include?(k.to_s.upcase) }
           Polars.scan_parquet(sources, storage_options: storage_options)
         end
       eager ? lf.collect : lf
@@ -90,14 +212,52 @@ module DeltaLake
       @table.update_incremental
     end
 
-    def delete(predicate = nil)
-      metrics = @table.delete(predicate)
+    def delete(
+      predicate = nil,
+      writer_properties: nil,
+      post_commithook_properties: nil,
+      commit_properties: nil
+    )
+      metrics =
+        @table.delete(
+          predicate,
+          writer_properties,
+          post_commithook_properties,
+          commit_properties
+        )
       JSON.parse(metrics).transform_keys(&:to_sym)
     end
 
+    def repair(
+      dry_run: false,
+      post_commithook_properties: nil,
+      commit_properties: nil
+    )
+      metrics =
+        @table.repair(
+          dry_run,
+          commit_properties,
+          post_commithook_properties
+        )
+      JSON.parse(metrics).transform_keys(&:to_sym)
+    end
+
+    def transaction_versions
+      @table.transaction_versions
+    end
+
     # private
     def _table
       @table
     end
+
+    # private
+    def _stringify_partition_values(partition_filters)
+      if partition_filters.nil?
+        return partition_filters
+      end
+
+      raise Todo
+    end
   end
 end
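
The hunks above extend the Ruby table class with partition listing, time-based time travel, CDF reads, commit history, and extra options on vacuum, delete, and repair. A minimal usage sketch of the time-travel and maintenance additions follows; the table path and the version/retention values are illustrative assumptions, not taken from the diff:

    # assumes a Delta table already exists at this path
    dt = DeltaLake::Table.new("./data/events")
    dt.load_as_version(1)                 # integer version, as before
    dt.load_as_version(Time.now - 86400)  # Time values are newly accepted in this release
    dt.history(limit: 5)                  # parsed commit info with "version" filled in
    dt.vacuum(retention_hours: 168, dry_run: true)
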
@@ -0,0 +1,58 @@
+module DeltaLake
+  class TableAlterer
+    def initialize(table)
+      @table = table
+    end
+
+    def add_feature(
+      feature,
+      allow_protocol_versions_increase: false
+    )
+      if !feature.is_a?(Array)
+        feature = [feature]
+      end
+      @table._table.add_feature(
+        feature,
+        allow_protocol_versions_increase
+      )
+    end
+
+    def add_columns(fields)
+      if fields.is_a?(DeltaLake::Field)
+        fields = [fields]
+      end
+
+      @table._table.add_columns(
+        fields
+      )
+    end
+
+    def add_constraint(constraints)
+      if constraints.length > 1
+        raise ArgumentError,
+              "add_constraints is limited to a single constraint addition at once for now."
+      end
+
+      @table._table.add_constraints(
+        constraints
+      )
+    end
+
+    def drop_constraint(name, raise_if_not_exists: true)
+      @table._table.drop_constraints(
+        name,
+        raise_if_not_exists
+      )
+    end
+
+    def set_table_properties(
+      properties,
+      raise_if_not_exists: true
+    )
+      @table._table.set_table_properties(
+        properties,
+        raise_if_not_exists
+      )
+    end
+  end
+end
@@ -0,0 +1,38 @@
+module DeltaLake
+  class TableMerger
+    def initialize(builder, table)
+      @builder = builder
+      @table = table
+    end
+
+    def when_matched_update(updates, predicate: nil)
+      @builder.when_matched_update(updates, predicate)
+      self
+    end
+
+    def when_not_matched_insert(updates, predicate: nil)
+      @builder.when_not_matched_insert(updates, predicate)
+      self
+    end
+
+    def when_matched_delete(predicate: nil)
+      @builder.when_matched_delete(predicate)
+      self
+    end
+
+    def when_not_matched_by_source_update(updates, predicate: nil)
+      @builder.when_not_matched_by_source_update(updates, predicate)
+      self
+    end
+
+    def when_not_matched_by_source_delete(predicate: nil)
+      @builder.when_not_matched_by_source_delete(predicate)
+      self
+    end
+
+    def execute
+      metrics = @table.merge_execute(@builder)
+      JSON.parse(metrics).transform_keys(&:to_sym)
+    end
+  end
+end
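
The merge method added to the table class returns this TableMerger, which simply forwards each clause to the Rust builder and executes it at the end. A hedged sketch of how a merge would be driven; the source data, predicate, and column names here are assumptions for illustration only:

    # assumes dt is a DeltaLake::Table and the table has id/value columns
    source = Polars::DataFrame.new({"id" => [1, 2], "value" => ["a", "b"]})
    dt.merge(source, "target.id = source.id", source_alias: "source", target_alias: "target")
      .when_matched_update({"value" => "source.value"})
      .when_not_matched_insert({"id" => "source.id", "value" => "source.value"})
      .execute
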
@@ -0,0 +1,67 @@
+module DeltaLake
+  class TableOptimizer
+    def initialize(table)
+      @table = table
+    end
+
+    def compact(
+      partition_filters: nil,
+      target_size: nil,
+      max_concurrent_tasks: nil,
+      min_commit_interval: nil,
+      writer_properties: nil,
+      post_commithook_properties: nil,
+      commit_properties: nil
+    )
+      metrics =
+        @table._table.compact_optimize(
+          @table._stringify_partition_values(partition_filters),
+          target_size,
+          max_concurrent_tasks,
+          min_commit_interval,
+          writer_properties,
+          post_commithook_properties,
+          commit_properties
+        )
+      @table.update_incremental
+      result = JSON.parse(metrics)
+      ["filesAdded", "filesRemoved"].each do |key|
+        result[key] = JSON.parse(result[key]) if result[key].is_a?(String)
+      end
+      # TODO return underscore symbols like delete
+      result
+    end
+
+    def z_order(
+      columns,
+      partition_filters: nil,
+      target_size: nil,
+      max_concurrent_tasks: nil,
+      max_spill_size: 20 * 1024 * 1024 * 1024,
+      min_commit_interval: nil,
+      writer_properties: nil,
+      post_commithook_properties: nil,
+      commit_properties: nil
+    )
+      metrics =
+        @table._table.z_order_optimize(
+          Array(columns),
+          @table._stringify_partition_values(partition_filters),
+          target_size,
+          max_concurrent_tasks,
+          max_spill_size,
+          min_commit_interval,
+          writer_properties,
+          post_commithook_properties,
+          commit_properties
+        )
+      @table.update_incremental
+      result = JSON.parse(metrics)
+      ["filesAdded", "filesRemoved"].each do |key|
+        result[key] = JSON.parse(result[key]) if result[key].is_a?(String)
+      end
+      # TODO return underscore symbols like delete
+      result
+    end
+  end
+end
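
The optimize method on the table class wraps this TableOptimizer, so compaction and Z-ordering become short chained calls. A brief sketch; the column name is an assumed example:

    # assumes dt is a DeltaLake::Table
    dt.optimize.compact
    dt.optimize.z_order(["id"])
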
@@ -0,0 +1,59 @@
+module DeltaLake
+  module Utils
+    def self.convert_data(data)
+      if data.respond_to?(:arrow_c_stream)
+        # TODO convert other object types
+        # should probably move logic to Rust
+        if defined?(Polars::DataFrame) && data.is_a?(Polars::DataFrame)
+          data = convert_polars_data(data)
+        end
+
+        data.arrow_c_stream
+      else
+        raise TypeError, "Only objects implementing the Arrow C stream interface are valid inputs for source."
+      end
+    end
+
+    # unsigned integers are not part of the protocol
+    # https://github.com/delta-io/delta/blob/master/PROTOCOL.md#primitive-types
+    def self.convert_polars_data(data)
+      new_schema = {}
+      data.schema.each do |k, v|
+        new_type = convert_polars_type(v)
+        new_schema[k] = new_type if new_type
+      end
+
+      if new_schema.any?
+        data.cast(new_schema)
+      else
+        data
+      end
+    end
+
+    def self.convert_polars_type(t)
+      case t
+      when Polars::UInt8
+        Polars::Int8
+      when Polars::UInt16
+        Polars::Int16
+      when Polars::UInt32
+        Polars::Int32
+      when Polars::UInt64
+        Polars::Int64
+      when Polars::Datetime
+        Polars::Datetime.new("us", t.time_zone) if t.time_unit != "us"
+      when Polars::List
+        inner = convert_polars_type(t.inner)
+        Polars::List.new(inner) if inner
+      when Polars::Array
+        inner = convert_polars_type(t.inner)
+        Polars::Array.new(t.inner, t.width) if inner
+      when Polars::Struct
+        if t.fields.any? { |f| convert_polars_type(f.dtype) }
+          fields = t.fields.map { |f| Polars::Field.new(f.name, convert_polars_type(f.dtype) || f.dtype) }
+          Polars::Struct.new(fields)
+        end
+      end
+    end
+  end
+end
@@ -1,3 +1,3 @@
 module DeltaLake
-  VERSION = "0.1.0"
+  VERSION = "0.1.2"
 end