deltalake-rb 0.1.1 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,205 @@
1
+ use deltalake::arrow::array::RecordBatchReader;
2
+ use deltalake::arrow::datatypes::Schema as ArrowSchema;
3
+ use deltalake::arrow::ffi_stream::ArrowArrayStreamReader;
4
+ use deltalake::datafusion::catalog::TableProvider;
5
+ use deltalake::datafusion::datasource::MemTable;
6
+ use deltalake::datafusion::prelude::SessionContext;
7
+ use deltalake::logstore::LogStoreRef;
8
+ use deltalake::operations::merge::MergeBuilder;
9
+ use deltalake::table::state::DeltaTableState;
10
+ use deltalake::{DeltaResult, DeltaTable};
11
+ use std::cell::RefCell;
12
+ use std::collections::HashMap;
13
+ use std::future::IntoFuture;
14
+ use std::sync::Arc;
15
+
16
+ use crate::error::RubyError;
17
+ use crate::utils::rt;
18
+ use crate::RbResult;
19
+ use crate::{
20
+ maybe_create_commit_properties, set_writer_properties, RbCommitProperties,
21
+ RbPostCommitHookProperties, RbWriterProperties,
22
+ };
23
+
24
+ #[magnus::wrap(class = "DeltaLake::RbMergeBuilder")]
25
+ pub(crate) struct RbMergeBuilder {
26
+ _builder: RefCell<Option<MergeBuilder>>,
27
+ source_alias: Option<String>,
28
+ target_alias: Option<String>,
29
+ #[allow(dead_code)]
30
+ arrow_schema: Arc<ArrowSchema>,
31
+ }
32
+
33
+ // getters
34
+ impl RbMergeBuilder {
35
+ pub fn source_alias(&self) -> Option<String> {
36
+ self.source_alias.clone()
37
+ }
38
+
39
+ pub fn target_alias(&self) -> Option<String> {
40
+ self.target_alias.clone()
41
+ }
42
+ }
43
+
44
+ impl RbMergeBuilder {
45
+ pub fn new(
46
+ log_store: LogStoreRef,
47
+ snapshot: DeltaTableState,
48
+ source: ArrowArrayStreamReader,
49
+ predicate: String,
50
+ source_alias: Option<String>,
51
+ target_alias: Option<String>,
52
+ safe_cast: bool,
53
+ writer_properties: Option<RbWriterProperties>,
54
+ post_commithook_properties: Option<RbPostCommitHookProperties>,
55
+ commit_properties: Option<RbCommitProperties>,
56
+ ) -> DeltaResult<Self> {
57
+ let ctx = SessionContext::new();
58
+ let schema = source.schema();
59
+ let batches = vec![source.map(|batch| batch.unwrap()).collect::<Vec<_>>()];
60
+ let table_provider: Arc<dyn TableProvider> =
61
+ Arc::new(MemTable::try_new(schema.clone(), batches).unwrap());
62
+ let source_df = ctx.read_table(table_provider).unwrap();
63
+
64
+ let mut cmd =
65
+ MergeBuilder::new(log_store, snapshot, predicate, source_df).with_safe_cast(safe_cast);
66
+
67
+ if let Some(src_alias) = &source_alias {
68
+ cmd = cmd.with_source_alias(src_alias);
69
+ }
70
+
71
+ if let Some(trgt_alias) = &target_alias {
72
+ cmd = cmd.with_target_alias(trgt_alias);
73
+ }
74
+
75
+ if let Some(writer_props) = writer_properties {
76
+ cmd = cmd.with_writer_properties(set_writer_properties(writer_props)?);
77
+ }
78
+
79
+ if let Some(commit_properties) =
80
+ maybe_create_commit_properties(commit_properties, post_commithook_properties)
81
+ {
82
+ cmd = cmd.with_commit_properties(commit_properties);
83
+ }
84
+
85
+ Ok(Self {
86
+ _builder: RefCell::new(Some(cmd)),
87
+ source_alias,
88
+ target_alias,
89
+ arrow_schema: schema,
90
+ })
91
+ }
92
+
93
+ pub fn execute(&self) -> DeltaResult<(DeltaTable, String)> {
94
+ let (table, metrics) = rt().block_on(self._builder.take().unwrap().into_future())?;
95
+ Ok((table, serde_json::to_string(&metrics).unwrap()))
96
+ }
97
+ }
98
+
99
+ impl RbMergeBuilder {
100
+ pub fn when_matched_update(
101
+ &self,
102
+ updates: HashMap<String, String>,
103
+ predicate: Option<String>,
104
+ ) -> RbResult<()> {
105
+ let mut binding = self._builder.borrow_mut();
106
+ *binding = match binding.take() {
107
+ Some(cmd) => Some(
108
+ cmd.when_matched_update(|mut update| {
109
+ for (column, expression) in updates {
110
+ update = update.update(column, expression)
111
+ }
112
+ if let Some(predicate) = predicate {
113
+ update = update.predicate(predicate)
114
+ };
115
+ update
116
+ })
117
+ .map_err(RubyError::from)?,
118
+ ),
119
+ None => unreachable!(),
120
+ };
121
+ Ok(())
122
+ }
123
+
124
+ pub fn when_matched_delete(&self, predicate: Option<String>) -> RbResult<()> {
125
+ let mut binding = self._builder.borrow_mut();
126
+ *binding = match binding.take() {
127
+ Some(cmd) => Some(
128
+ cmd.when_matched_delete(|mut delete| {
129
+ if let Some(predicate) = predicate {
130
+ delete = delete.predicate(predicate)
131
+ };
132
+ delete
133
+ })
134
+ .map_err(RubyError::from)?,
135
+ ),
136
+ None => unreachable!(),
137
+ };
138
+ Ok(())
139
+ }
140
+
141
+ pub fn when_not_matched_insert(
142
+ &self,
143
+ updates: HashMap<String, String>,
144
+ predicate: Option<String>,
145
+ ) -> RbResult<()> {
146
+ let mut binding = self._builder.borrow_mut();
147
+ *binding = match binding.take() {
148
+ Some(cmd) => Some(
149
+ cmd.when_not_matched_insert(|mut insert| {
150
+ for (column, expression) in updates {
151
+ insert = insert.set(column, expression)
152
+ }
153
+ if let Some(predicate) = predicate {
154
+ insert = insert.predicate(predicate)
155
+ };
156
+ insert
157
+ })
158
+ .map_err(RubyError::from)?,
159
+ ),
160
+ None => unreachable!(),
161
+ };
162
+ Ok(())
163
+ }
164
+
165
+ pub fn when_not_matched_by_source_update(
166
+ &self,
167
+ updates: HashMap<String, String>,
168
+ predicate: Option<String>,
169
+ ) -> RbResult<()> {
170
+ let mut binding = self._builder.borrow_mut();
171
+ *binding = match binding.take() {
172
+ Some(cmd) => Some(
173
+ cmd.when_not_matched_by_source_update(|mut update| {
174
+ for (column, expression) in updates {
175
+ update = update.update(column, expression)
176
+ }
177
+ if let Some(predicate) = predicate {
178
+ update = update.predicate(predicate)
179
+ };
180
+ update
181
+ })
182
+ .map_err(RubyError::from)?,
183
+ ),
184
+ None => unreachable!(),
185
+ };
186
+ Ok(())
187
+ }
188
+
189
+ pub fn when_not_matched_by_source_delete(&self, predicate: Option<String>) -> RbResult<()> {
190
+ let mut binding = self._builder.borrow_mut();
191
+ *binding = match binding.take() {
192
+ Some(cmd) => Some(
193
+ cmd.when_not_matched_by_source_delete(|mut delete| {
194
+ if let Some(predicate) = predicate {
195
+ delete = delete.predicate(predicate)
196
+ };
197
+ delete
198
+ })
199
+ .map_err(RubyError::from)?,
200
+ ),
201
+ None => unreachable!(),
202
+ };
203
+ Ok(())
204
+ }
205
+ }
@@ -47,9 +47,6 @@ module DeltaLake
47
47
  if version.is_a?(Integer)
48
48
  @table.load_version(version)
49
49
  elsif version.is_a?(Time)
50
- # needed for iso8601
51
- require "time"
52
-
53
50
  @table.load_with_datetime(version.utc.iso8601(9))
54
51
  elsif version.is_a?(String)
55
52
  @table.load_with_datetime(version)
@@ -112,7 +109,9 @@ module DeltaLake
112
109
  def vacuum(
113
110
  retention_hours: nil,
114
111
  dry_run: true,
115
- enforce_retention_duration: true
112
+ enforce_retention_duration: true,
113
+ post_commithook_properties: nil,
114
+ commit_properties: nil
116
115
  )
117
116
  if retention_hours
118
117
  if retention_hours < 0
@@ -123,7 +122,9 @@ module DeltaLake
123
122
  @table.vacuum(
124
123
  dry_run,
125
124
  retention_hours,
126
- enforce_retention_duration
125
+ enforce_retention_duration,
126
+ commit_properties,
127
+ post_commithook_properties
127
128
  )
128
129
  end
129
130
 
@@ -135,49 +136,80 @@ module DeltaLake
135
136
  TableAlterer.new(self)
136
137
  end
137
138
 
139
# Prepares a merge of +source+ into this table and returns a TableMerger
# wrapping the native merge builder.
#
# +source+ is first passed through Utils.convert_data. The remaining arguments
# are forwarded positionally to the native create_merge_builder; note that
# +error_on_type_mismatch+ is forwarded negated.
def merge(
  source,
  predicate,
  source_alias: nil,
  target_alias: nil,
  error_on_type_mismatch: true,
  writer_properties: nil,
  post_commithook_properties: nil,
  commit_properties: nil
)
  converted_source = Utils.convert_data(source)

  # the native side receives the inverse of error_on_type_mismatch
  # (presumably its "safe cast" flag -- confirm against create_merge_builder)
  builder_args = [
    converted_source,
    predicate,
    source_alias,
    target_alias,
    !error_on_type_mismatch,
    writer_properties,
    post_commithook_properties,
    commit_properties
  ]

  TableMerger.new(@table.create_merge_builder(*builder_args), @table)
end
164
+
138
165
  def restore(
139
166
  target,
140
167
  ignore_missing_files: false,
141
- protocol_downgrade_allowed: false
168
+ protocol_downgrade_allowed: false,
169
+ commit_properties: nil
142
170
  )
143
171
  if target.is_a?(Time)
144
- # needed for iso8601
145
- require "time"
146
-
147
172
  metrics =
148
173
  @table.restore(
149
174
  target.utc.iso8601(9),
150
175
  ignore_missing_files,
151
- protocol_downgrade_allowed
176
+ protocol_downgrade_allowed,
177
+ commit_properties
152
178
  )
153
179
  else
154
180
  metrics =
155
181
  @table.restore(
156
182
  target,
157
183
  ignore_missing_files,
158
- protocol_downgrade_allowed
184
+ protocol_downgrade_allowed,
185
+ commit_properties
159
186
  )
160
187
  end
161
188
  JSON.parse(metrics)
162
189
  end
163
190
 
164
- def to_polars(eager: true)
191
+ def to_polars(eager: true, rechunk: false, columns: nil)
165
192
  require "polars-df"
166
193
 
167
194
  sources = file_uris
168
- lf =
169
- if sources.empty?
170
- Polars::LazyFrame.new
171
- else
172
- delta_keys = [
173
- "AWS_S3_ALLOW_UNSAFE_RENAME",
174
- "AWS_S3_LOCKING_PROVIDER",
175
- "CONDITIONAL_PUT",
176
- "DELTA_DYNAMO_TABLE_NAME"
177
- ]
178
- storage_options = @storage_options&.reject { |k, _| delta_keys.include?(k.to_s.upcase) }
179
- Polars.scan_parquet(sources, storage_options: storage_options)
195
+ if sources.empty?
196
+ lf = Polars::LazyFrame.new
197
+ else
198
+ delta_keys = [
199
+ "AWS_S3_ALLOW_UNSAFE_RENAME",
200
+ "AWS_S3_LOCKING_PROVIDER",
201
+ "CONDITIONAL_PUT",
202
+ "DELTA_DYNAMO_TABLE_NAME"
203
+ ]
204
+ storage_options = @storage_options&.reject { |k, _| delta_keys.include?(k.to_s.upcase) }
205
+ lf = Polars.scan_parquet(sources, storage_options: storage_options, rechunk: rechunk)
206
+
207
+ if columns
208
+ # by_name requires polars-df > 0.15.0
209
+ lf = lf.select(Polars.cs.by_name(*columns))
180
210
  end
211
+ end
212
+
181
213
  eager ? lf.collect : lf
182
214
  end
183
215
 
@@ -185,15 +217,32 @@ module DeltaLake
185
217
  @table.update_incremental
186
218
  end
187
219
 
188
- def delete(predicate = nil)
189
- metrics = @table.delete(predicate)
220
+ def delete(
221
+ predicate = nil,
222
+ writer_properties: nil,
223
+ post_commithook_properties: nil,
224
+ commit_properties: nil
225
+ )
226
+ metrics =
227
+ @table.delete(
228
+ predicate,
229
+ writer_properties,
230
+ post_commithook_properties,
231
+ commit_properties
232
+ )
190
233
  JSON.parse(metrics).transform_keys(&:to_sym)
191
234
  end
192
235
 
193
- def repair(dry_run: false)
236
+ def repair(
237
+ dry_run: false,
238
+ post_commithook_properties: nil,
239
+ commit_properties: nil
240
+ )
194
241
  metrics =
195
242
  @table.repair(
196
- dry_run
243
+ dry_run,
244
+ commit_properties,
245
+ post_commithook_properties
197
246
  )
198
247
  JSON.parse(metrics).transform_keys(&:to_sym)
199
248
  end
@@ -4,6 +4,29 @@ module DeltaLake
4
4
  @table = table
5
5
  end
6
6
 
7
# Forwards +feature+ to the native table's add_feature.
#
# A single (non-Array) value is wrapped in a one-element Array first, so the
# native call always receives an Array.
def add_feature(
  feature,
  allow_protocol_versions_increase: false
)
  feature = [feature] unless feature.is_a?(Array)

  @table._table.add_feature(feature, allow_protocol_versions_increase)
end
19
+
20
# Forwards +fields+ to the native table's add_columns.
#
# A lone DeltaLake::Field is wrapped in a one-element Array; any other value
# is passed through unchanged.
def add_columns(fields)
  fields = [fields] if fields.is_a?(DeltaLake::Field)

  @table._table.add_columns(fields)
end
29
+
7
30
  def add_constraint(constraints)
8
31
  if constraints.length > 1
9
32
  raise ArgumentError,
@@ -21,5 +44,15 @@ module DeltaLake
21
44
  raise_if_not_exists
22
45
  )
23
46
  end
47
+
48
# Forwards +properties+ and the +raise_if_not_exists+ flag (default true),
# in that order, to the native table's set_table_properties and returns
# whatever that call returns.
def set_table_properties(
  properties,
  raise_if_not_exists: true
)
  native_table = @table._table
  native_table.set_table_properties(properties, raise_if_not_exists)
end
24
57
  end
25
58
  end
@@ -0,0 +1,38 @@
1
module DeltaLake
  # Chainable front end for a native merge builder.
  #
  # Each when_* method forwards its arguments to the builder method of the
  # same name and returns the merger itself so calls can be chained;
  # #execute hands the builder to the table and returns the parsed metrics.
  class TableMerger
    # builder:: object that receives the when_* calls
    # table::   object that receives merge_execute(builder)
    def initialize(builder, table)
      @builder = builder
      @table = table
    end

    # Forwards +updates+ and +predicate+ to the builder; returns self.
    def when_matched_update(updates, predicate: nil)
      tap { @builder.when_matched_update(updates, predicate) }
    end

    # Forwards +updates+ and +predicate+ to the builder; returns self.
    def when_not_matched_insert(updates, predicate: nil)
      tap { @builder.when_not_matched_insert(updates, predicate) }
    end

    # Forwards +predicate+ to the builder; returns self.
    def when_matched_delete(predicate: nil)
      tap { @builder.when_matched_delete(predicate) }
    end

    # Forwards +updates+ and +predicate+ to the builder; returns self.
    def when_not_matched_by_source_update(updates, predicate: nil)
      tap { @builder.when_not_matched_by_source_update(updates, predicate) }
    end

    # Forwards +predicate+ to the builder; returns self.
    def when_not_matched_by_source_delete(predicate: nil)
      tap { @builder.when_not_matched_by_source_delete(predicate) }
    end

    # Runs the merge through the table and returns the metrics JSON parsed
    # into a Hash whose top-level keys are symbols.
    def execute
      raw_metrics = @table.merge_execute(@builder)
      JSON.parse(raw_metrics).transform_keys(&:to_sym)
    end
  end
end
@@ -5,15 +5,23 @@ module DeltaLake
5
5
  end
6
6
 
7
7
  def compact(
8
+ partition_filters: nil,
8
9
  target_size: nil,
9
10
  max_concurrent_tasks: nil,
10
- min_commit_interval: nil
11
+ min_commit_interval: nil,
12
+ writer_properties: nil,
13
+ post_commithook_properties: nil,
14
+ commit_properties: nil
11
15
  )
12
16
  metrics =
13
17
  @table._table.compact_optimize(
18
+ @table._stringify_partition_values(partition_filters),
14
19
  target_size,
15
20
  max_concurrent_tasks,
16
- min_commit_interval
21
+ min_commit_interval,
22
+ writer_properties,
23
+ post_commithook_properties,
24
+ commit_properties
17
25
  )
18
26
  @table.update_incremental
19
27
  result = JSON.parse(metrics)
@@ -26,18 +34,26 @@ module DeltaLake
26
34
 
27
35
  def z_order(
28
36
  columns,
37
+ partition_filters: nil,
29
38
  target_size: nil,
30
39
  max_concurrent_tasks: nil,
31
40
  max_spill_size: 20 * 1024 * 1024 * 1024,
32
- min_commit_interval: nil
41
+ min_commit_interval: nil,
42
+ writer_properties: nil,
43
+ post_commithook_properties: nil,
44
+ commit_properties: nil
33
45
  )
34
46
  metrics =
35
47
  @table._table.z_order_optimize(
36
48
  Array(columns),
49
+ @table._stringify_partition_values(partition_filters),
37
50
  target_size,
38
51
  max_concurrent_tasks,
39
52
  max_spill_size,
40
- min_commit_interval
53
+ min_commit_interval,
54
+ writer_properties,
55
+ post_commithook_properties,
56
+ commit_properties
41
57
  )
42
58
  @table.update_incremental
43
59
  result = JSON.parse(metrics)
@@ -0,0 +1,59 @@
1
module DeltaLake
  # Helpers for turning user-supplied data into an Arrow C stream.
  module Utils
    # Returns +data.arrow_c_stream+.
    #
    # When Polars is loaded and +data+ is a Polars::DataFrame, the frame is
    # first cast with convert_polars_data. Raises TypeError when +data+ does
    # not respond to +arrow_c_stream+.
    def self.convert_data(data)
      unless data.respond_to?(:arrow_c_stream)
        raise TypeError, "Only objects implementing the Arrow C stream interface are valid inputs for source."
      end

      # TODO convert other object types
      # should probably move logic to Rust
      if defined?(Polars::DataFrame) && data.is_a?(Polars::DataFrame)
        data = convert_polars_data(data)
      end

      data.arrow_c_stream
    end

    # unsigned integers are not part of the protocol
    # https://github.com/delta-io/delta/blob/master/PROTOCOL.md#primitive-types
    #
    # Casts every column whose type convert_polars_type maps to a replacement;
    # returns +data+ itself when no column needs casting.
    def self.convert_polars_data(data)
      new_schema = {}
      data.schema.each do |name, dtype|
        replacement = convert_polars_type(dtype)
        new_schema[name] = replacement if replacement
      end

      new_schema.any? ? data.cast(new_schema) : data
    end

    # Returns the replacement type for +t+, or nil when +t+ needs no change.
    #
    # * unsigned integers map to the signed integer of the same width
    # * Datetime with a time unit other than "us" maps to a "us" Datetime
    #   with the same time zone
    # * List, Array and Struct are rebuilt only when a nested type changes
    def self.convert_polars_type(t)
      case t
      when Polars::UInt8
        Polars::Int8
      when Polars::UInt16
        Polars::Int16
      when Polars::UInt32
        Polars::Int32
      when Polars::UInt64
        Polars::Int64
      when Polars::Datetime
        Polars::Datetime.new("us", t.time_zone) if t.time_unit != "us"
      when Polars::List
        inner = convert_polars_type(t.inner)
        Polars::List.new(inner) if inner
      when Polars::Array
        inner = convert_polars_type(t.inner)
        # build from the converted inner type (previously the unconverted
        # t.inner was reused, so the conversion had no effect)
        Polars::Array.new(inner, t.width) if inner
      when Polars::Struct
        # convert each field once; nil marks a field that stays as it is
        converted = t.fields.map { |f| convert_polars_type(f.dtype) }
        if converted.any?
          fields = t.fields.zip(converted).map { |f, dtype| Polars::Field.new(f.name, dtype || f.dtype) }
          Polars::Struct.new(fields)
        end
      end
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module DeltaLake
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.3"
3
3
  end