deltalake-rb 0.1.1 → 0.1.2

@@ -0,0 +1,205 @@
+use deltalake::arrow::array::RecordBatchReader;
+use deltalake::arrow::datatypes::Schema as ArrowSchema;
+use deltalake::arrow::ffi_stream::ArrowArrayStreamReader;
+use deltalake::datafusion::catalog::TableProvider;
+use deltalake::datafusion::datasource::MemTable;
+use deltalake::datafusion::prelude::SessionContext;
+use deltalake::logstore::LogStoreRef;
+use deltalake::operations::merge::MergeBuilder;
+use deltalake::table::state::DeltaTableState;
+use deltalake::{DeltaResult, DeltaTable};
+use std::cell::RefCell;
+use std::collections::HashMap;
+use std::future::IntoFuture;
+use std::sync::Arc;
+
+use crate::error::RubyError;
+use crate::utils::rt;
+use crate::RbResult;
+use crate::{
+    maybe_create_commit_properties, set_writer_properties, RbCommitProperties,
+    RbPostCommitHookProperties, RbWriterProperties,
+};
+
+#[magnus::wrap(class = "DeltaLake::RbMergeBuilder")]
+pub(crate) struct RbMergeBuilder {
+    _builder: RefCell<Option<MergeBuilder>>,
+    source_alias: Option<String>,
+    target_alias: Option<String>,
+    #[allow(dead_code)]
+    arrow_schema: Arc<ArrowSchema>,
+}
+
+// getters
+impl RbMergeBuilder {
+    pub fn source_alias(&self) -> Option<String> {
+        self.source_alias.clone()
+    }
+
+    pub fn target_alias(&self) -> Option<String> {
+        self.target_alias.clone()
+    }
+}
+
+impl RbMergeBuilder {
+    pub fn new(
+        log_store: LogStoreRef,
+        snapshot: DeltaTableState,
+        source: ArrowArrayStreamReader,
+        predicate: String,
+        source_alias: Option<String>,
+        target_alias: Option<String>,
+        safe_cast: bool,
+        writer_properties: Option<RbWriterProperties>,
+        post_commithook_properties: Option<RbPostCommitHookProperties>,
+        commit_properties: Option<RbCommitProperties>,
+    ) -> DeltaResult<Self> {
+        let ctx = SessionContext::new();
+        let schema = source.schema();
+        let batches = vec![source.map(|batch| batch.unwrap()).collect::<Vec<_>>()];
+        let table_provider: Arc<dyn TableProvider> =
+            Arc::new(MemTable::try_new(schema.clone(), batches).unwrap());
+        let source_df = ctx.read_table(table_provider).unwrap();
+
+        let mut cmd =
+            MergeBuilder::new(log_store, snapshot, predicate, source_df).with_safe_cast(safe_cast);
+
+        if let Some(src_alias) = &source_alias {
+            cmd = cmd.with_source_alias(src_alias);
+        }
+
+        if let Some(trgt_alias) = &target_alias {
+            cmd = cmd.with_target_alias(trgt_alias);
+        }
+
+        if let Some(writer_props) = writer_properties {
+            cmd = cmd.with_writer_properties(set_writer_properties(writer_props)?);
+        }
+
+        if let Some(commit_properties) =
+            maybe_create_commit_properties(commit_properties, post_commithook_properties)
+        {
+            cmd = cmd.with_commit_properties(commit_properties);
+        }
+
+        Ok(Self {
+            _builder: RefCell::new(Some(cmd)),
+            source_alias,
+            target_alias,
+            arrow_schema: schema,
+        })
+    }
+
+    pub fn execute(&self) -> DeltaResult<(DeltaTable, String)> {
+        let (table, metrics) = rt().block_on(self._builder.take().unwrap().into_future())?;
+        Ok((table, serde_json::to_string(&metrics).unwrap()))
+    }
+}
+
+impl RbMergeBuilder {
+    pub fn when_matched_update(
+        &self,
+        updates: HashMap<String, String>,
+        predicate: Option<String>,
+    ) -> RbResult<()> {
+        let mut binding = self._builder.borrow_mut();
+        *binding = match binding.take() {
+            Some(cmd) => Some(
+                cmd.when_matched_update(|mut update| {
+                    for (column, expression) in updates {
+                        update = update.update(column, expression)
+                    }
+                    if let Some(predicate) = predicate {
+                        update = update.predicate(predicate)
+                    };
+                    update
+                })
+                .map_err(RubyError::from)?,
+            ),
+            None => unreachable!(),
+        };
+        Ok(())
+    }
+
+    pub fn when_matched_delete(&self, predicate: Option<String>) -> RbResult<()> {
+        let mut binding = self._builder.borrow_mut();
+        *binding = match binding.take() {
+            Some(cmd) => Some(
+                cmd.when_matched_delete(|mut delete| {
+                    if let Some(predicate) = predicate {
+                        delete = delete.predicate(predicate)
+                    };
+                    delete
+                })
+                .map_err(RubyError::from)?,
+            ),
+            None => unreachable!(),
+        };
+        Ok(())
+    }
+
+    pub fn when_not_matched_insert(
+        &self,
+        updates: HashMap<String, String>,
+        predicate: Option<String>,
+    ) -> RbResult<()> {
+        let mut binding = self._builder.borrow_mut();
+        *binding = match binding.take() {
+            Some(cmd) => Some(
+                cmd.when_not_matched_insert(|mut insert| {
+                    for (column, expression) in updates {
+                        insert = insert.set(column, expression)
+                    }
+                    if let Some(predicate) = predicate {
+                        insert = insert.predicate(predicate)
+                    };
+                    insert
+                })
+                .map_err(RubyError::from)?,
+            ),
+            None => unreachable!(),
+        };
+        Ok(())
+    }
+
+    pub fn when_not_matched_by_source_update(
+        &self,
+        updates: HashMap<String, String>,
+        predicate: Option<String>,
+    ) -> RbResult<()> {
+        let mut binding = self._builder.borrow_mut();
+        *binding = match binding.take() {
+            Some(cmd) => Some(
+                cmd.when_not_matched_by_source_update(|mut update| {
+                    for (column, expression) in updates {
+                        update = update.update(column, expression)
+                    }
+                    if let Some(predicate) = predicate {
+                        update = update.predicate(predicate)
+                    };
+                    update
+                })
+                .map_err(RubyError::from)?,
+            ),
+            None => unreachable!(),
+        };
+        Ok(())
+    }
+
+    pub fn when_not_matched_by_source_delete(&self, predicate: Option<String>) -> RbResult<()> {
+        let mut binding = self._builder.borrow_mut();
+        *binding = match binding.take() {
+            Some(cmd) => Some(
+                cmd.when_not_matched_by_source_delete(|mut delete| {
+                    if let Some(predicate) = predicate {
+                        delete = delete.predicate(predicate)
+                    };
+                    delete
+                })
+                .map_err(RubyError::from)?,
+            ),
+            None => unreachable!(),
+        };
+        Ok(())
+    }
+}
@@ -47,9 +47,6 @@ module DeltaLake
       if version.is_a?(Integer)
         @table.load_version(version)
       elsif version.is_a?(Time)
-        # needed for iso8601
-        require "time"
-
         @table.load_with_datetime(version.utc.iso8601(9))
       elsif version.is_a?(String)
         @table.load_with_datetime(version)
@@ -112,7 +109,9 @@ module DeltaLake
     def vacuum(
       retention_hours: nil,
       dry_run: true,
-      enforce_retention_duration: true
+      enforce_retention_duration: true,
+      post_commithook_properties: nil,
+      commit_properties: nil
     )
       if retention_hours
         if retention_hours < 0
@@ -123,7 +122,9 @@ module DeltaLake
       @table.vacuum(
         dry_run,
         retention_hours,
-        enforce_retention_duration
+        enforce_retention_duration,
+        commit_properties,
+        post_commithook_properties
       )
     end

@@ -135,27 +136,53 @@ module DeltaLake
       TableAlterer.new(self)
     end

+    def merge(
+      source,
+      predicate,
+      source_alias: nil,
+      target_alias: nil,
+      error_on_type_mismatch: true,
+      writer_properties: nil,
+      post_commithook_properties: nil,
+      commit_properties: nil
+    )
+      source = Utils.convert_data(source)
+
+      rb_merge_builder =
+        @table.create_merge_builder(
+          source,
+          predicate,
+          source_alias,
+          target_alias,
+          !error_on_type_mismatch,
+          writer_properties,
+          post_commithook_properties,
+          commit_properties
+        )
+      TableMerger.new(rb_merge_builder, @table)
+    end
+
     def restore(
       target,
       ignore_missing_files: false,
-      protocol_downgrade_allowed: false
+      protocol_downgrade_allowed: false,
+      commit_properties: nil
    )
       if target.is_a?(Time)
-        # needed for iso8601
-        require "time"
-
         metrics =
           @table.restore(
             target.utc.iso8601(9),
             ignore_missing_files,
-            protocol_downgrade_allowed
+            protocol_downgrade_allowed,
+            commit_properties
           )
       else
         metrics =
           @table.restore(
             target,
             ignore_missing_files,
-            protocol_downgrade_allowed
+            protocol_downgrade_allowed,
+            commit_properties
           )
       end
       JSON.parse(metrics)
@@ -185,15 +212,32 @@ module DeltaLake
       @table.update_incremental
     end

-    def delete(predicate = nil)
-      metrics = @table.delete(predicate)
+    def delete(
+      predicate = nil,
+      writer_properties: nil,
+      post_commithook_properties: nil,
+      commit_properties: nil
+    )
+      metrics =
+        @table.delete(
+          predicate,
+          writer_properties,
+          post_commithook_properties,
+          commit_properties
+        )
       JSON.parse(metrics).transform_keys(&:to_sym)
     end

-    def repair(dry_run: false)
+    def repair(
+      dry_run: false,
+      post_commithook_properties: nil,
+      commit_properties: nil
+    )
       metrics =
         @table.repair(
-          dry_run
+          dry_run,
+          commit_properties,
+          post_commithook_properties
        )
       JSON.parse(metrics).transform_keys(&:to_sym)
     end
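
The vacuum, delete, and repair changes above all thread the new commit options through to the Rust extension. A minimal sketch of the resulting call sites, assuming Table.new accepts a table URI and that custom_metadata takes a string hash (both assumptions, not taken from the gem's docs):

# hypothetical usage of the new keyword arguments
dt = DeltaLake::Table.new("./tmp/events")   # assumed constructor
props = DeltaLake::CommitProperties.new(custom_metadata: {"job" => "nightly"}, max_commit_retries: 3)
dt.delete("id > 100", commit_properties: props)   # returns a metrics hash with symbol keys
dt.vacuum(retention_hours: 168, dry_run: false, commit_properties: props)
dt.repair(dry_run: true, commit_properties: props)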
@@ -4,6 +4,29 @@ module DeltaLake
       @table = table
     end

+    def add_feature(
+      feature,
+      allow_protocol_versions_increase: false
+    )
+      if !feature.is_a?(Array)
+        feature = [feature]
+      end
+      @table._table.add_feature(
+        feature,
+        allow_protocol_versions_increase
+      )
+    end
+
+    def add_columns(fields)
+      if fields.is_a?(DeltaLake::Field)
+        fields = [fields]
+      end
+
+      @table._table.add_columns(
+        fields
+      )
+    end
+
     def add_constraint(constraints)
       if constraints.length > 1
         raise ArgumentError,
@@ -21,5 +44,15 @@ module DeltaLake
         raise_if_not_exists
       )
     end
+
+    def set_table_properties(
+      properties,
+      raise_if_not_exists: true
+    )
+      @table._table.set_table_properties(
+        properties,
+        raise_if_not_exists
+      )
+    end
   end
 end
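
TableAlterer picks up add_feature, add_columns, and set_table_properties. A rough sketch of set_table_properties, assuming the alterer is reached through Table#alter and using an illustrative Delta table property:

# hypothetical call; dt.alter returning a TableAlterer is an assumption
dt.alter.set_table_properties(
  {"delta.logRetentionDuration" => "interval 30 days"},   # illustrative property
  raise_if_not_exists: true
)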
@@ -0,0 +1,38 @@
+module DeltaLake
+  class TableMerger
+    def initialize(builder, table)
+      @builder = builder
+      @table = table
+    end
+
+    def when_matched_update(updates, predicate: nil)
+      @builder.when_matched_update(updates, predicate)
+      self
+    end
+
+    def when_not_matched_insert(updates, predicate: nil)
+      @builder.when_not_matched_insert(updates, predicate)
+      self
+    end
+
+    def when_matched_delete(predicate: nil)
+      @builder.when_matched_delete(predicate)
+      self
+    end
+
+    def when_not_matched_by_source_update(updates, predicate: nil)
+      @builder.when_not_matched_by_source_update(updates, predicate)
+      self
+    end
+
+    def when_not_matched_by_source_delete(predicate: nil)
+      @builder.when_not_matched_by_source_delete(predicate)
+      self
+    end
+
+    def execute
+      metrics = @table.merge_execute(@builder)
+      JSON.parse(metrics).transform_keys(&:to_sym)
+    end
+  end
+end
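
Each clause method on TableMerger forwards to the Rust builder and returns self, so clauses chain and execute returns the parsed metrics. A sketch of the intended flow, assuming new_data responds to arrow_c_stream (for example a Polars::DataFrame) and using illustrative column names:

# hypothetical merge built on the dt table from the earlier sketch
metrics =
  dt.merge(new_data, "target.id = source.id", source_alias: "source", target_alias: "target")
    .when_matched_update({"value" => "source.value"})
    .when_not_matched_insert({"id" => "source.id", "value" => "source.value"})
    .execute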
@@ -5,15 +5,23 @@ module DeltaLake
     end

     def compact(
+      partition_filters: nil,
       target_size: nil,
       max_concurrent_tasks: nil,
-      min_commit_interval: nil
+      min_commit_interval: nil,
+      writer_properties: nil,
+      post_commithook_properties: nil,
+      commit_properties: nil
     )
       metrics =
         @table._table.compact_optimize(
+          @table._stringify_partition_values(partition_filters),
           target_size,
           max_concurrent_tasks,
-          min_commit_interval
+          min_commit_interval,
+          writer_properties,
+          post_commithook_properties,
+          commit_properties
         )
       @table.update_incremental
       result = JSON.parse(metrics)
@@ -26,18 +34,26 @@

     def z_order(
       columns,
+      partition_filters: nil,
       target_size: nil,
       max_concurrent_tasks: nil,
       max_spill_size: 20 * 1024 * 1024 * 1024,
-      min_commit_interval: nil
+      min_commit_interval: nil,
+      writer_properties: nil,
+      post_commithook_properties: nil,
+      commit_properties: nil
     )
       metrics =
         @table._table.z_order_optimize(
           Array(columns),
+          @table._stringify_partition_values(partition_filters),
           target_size,
           max_concurrent_tasks,
           max_spill_size,
-          min_commit_interval
+          min_commit_interval,
+          writer_properties,
+          post_commithook_properties,
+          commit_properties
         )
       @table.update_incremental
       result = JSON.parse(metrics)
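
compact and z_order now take partition_filters, which are stringified before being passed to Rust. A sketch, assuming the optimizer is reached through Table#optimize and that filters use [column, op, value] triplets like the underlying delta-rs API (both assumptions):

# hypothetical optimize call; partition column and size are illustrative
dt.optimize.compact(
  partition_filters: [["date", "=", "2025-01-01"]],
  target_size: 128 * 1024 * 1024
)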
@@ -0,0 +1,59 @@
+module DeltaLake
+  module Utils
+    def self.convert_data(data)
+      if data.respond_to?(:arrow_c_stream)
+        # TODO convert other object types
+        # should probably move logic to Rust
+        if defined?(Polars::DataFrame) && data.is_a?(Polars::DataFrame)
+          data = convert_polars_data(data)
+        end
+
+        data.arrow_c_stream
+      else
+        raise TypeError, "Only objects implementing the Arrow C stream interface are valid inputs for source."
+      end
+    end
+
+    # unsigned integers are not part of the protocol
+    # https://github.com/delta-io/delta/blob/master/PROTOCOL.md#primitive-types
+    def self.convert_polars_data(data)
+      new_schema = {}
+      data.schema.each do |k, v|
+        new_type = convert_polars_type(v)
+        new_schema[k] = new_type if new_type
+      end
+
+      if new_schema.any?
+        data.cast(new_schema)
+      else
+        data
+      end
+    end
+
+    def self.convert_polars_type(t)
+      case t
+      when Polars::UInt8
+        Polars::Int8
+      when Polars::UInt16
+        Polars::Int16
+      when Polars::UInt32
+        Polars::Int32
+      when Polars::UInt64
+        Polars::Int64
+      when Polars::Datetime
+        Polars::Datetime.new("us", t.time_zone) if t.time_unit != "us"
+      when Polars::List
+        inner = convert_polars_type(t.inner)
+        Polars::List.new(inner) if inner
+      when Polars::Array
+        inner = convert_polars_type(t.inner)
+        Polars::Array.new(t.inner, t.width) if inner
+      when Polars::Struct
+        if t.fields.any? { |f| convert_polars_type(f.dtype) }
+          fields = t.fields.map { |f| Polars::Field.new(f.name, convert_polars_type(f.dtype) || f.dtype) }
+          Polars::Struct.new(fields)
+        end
+      end
+    end
+  end
+end
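
The new Utils module centralizes the Arrow conversion that previously lived on the top-level module. The unsigned-to-signed mapping can be exercised directly; a small sketch:

# convert_polars_type returns a replacement dtype, or nil when no cast is needed
DeltaLake::Utils.convert_polars_type(Polars::UInt32)  # => Polars::Int32
DeltaLake::Utils.convert_polars_type(Polars::Int32)   # => nil
# non-microsecond Datetime columns are rewritten to Polars::Datetime.new("us", time_zone)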
@@ -1,3 +1,3 @@
 module DeltaLake
-  VERSION = "0.1.1"
+  VERSION = "0.1.2"
 end
data/lib/deltalake.rb CHANGED
@@ -7,6 +7,7 @@ end

 # stdlib
 require "json"
+require "time"

 # modules
 require_relative "deltalake/field"
@@ -14,7 +15,9 @@ require_relative "deltalake/metadata"
 require_relative "deltalake/schema"
 require_relative "deltalake/table"
 require_relative "deltalake/table_alterer"
+require_relative "deltalake/table_merger"
 require_relative "deltalake/table_optimizer"
+require_relative "deltalake/utils"
 require_relative "deltalake/version"

 module DeltaLake
@@ -38,6 +41,28 @@ module DeltaLake
       :reader_features
     )

+  CommitProperties =
+    Struct.new(
+      :custom_metadata,
+      :max_commit_retries,
+      # TODO
+      # :app_transactions,
+      keyword_init: true
+    )
+
+  PostCommitHookProperties =
+    Struct.new(
+      :create_checkpoint,
+      :cleanup_expired_logs,
+      keyword_init: true
+    )
+
+  class ArrowArrayStream
+    def arrow_c_stream
+      self
+    end
+  end
+
   class << self
     def write(
       table_or_uri,
@@ -50,7 +75,10 @@ module DeltaLake
       schema_mode: nil,
       storage_options: nil,
       predicate: nil,
-      target_file_size: nil
+      target_file_size: nil,
+      writer_properties: nil,
+      commit_properties: nil,
+      post_commithook_properties: nil
     )
       table, table_uri = try_get_table_and_table_uri(table_or_uri, storage_options)

@@ -62,7 +90,7 @@ module DeltaLake
         return
       end

-      data = convert_data(data)
+      data = Utils.convert_data(data)

       write_deltalake_rust(
         table_uri,
@@ -76,7 +104,10 @@ module DeltaLake
         name,
         description,
         configuration,
-        storage_options
+        storage_options,
+        writer_properties,
+        commit_properties,
+        post_commithook_properties
       )

       if table
@@ -107,61 +138,5 @@ module DeltaLake
     rescue TableNotFoundError
       nil
     end
-
-    def convert_data(data)
-      if data.respond_to?(:arrow_c_stream)
-        # TODO convert other object types
-        # should probably move logic to Rust
-        if defined?(Polars::DataFrame) && data.is_a?(Polars::DataFrame)
-          data = convert_polars_data(data)
-        end
-
-        data.arrow_c_stream
-      else
-        raise TypeError, "Only objects implementing the Arrow C stream interface are valid inputs for source."
-      end
-    end
-
-    # unsigned integers are not part of the protocol
-    # https://github.com/delta-io/delta/blob/master/PROTOCOL.md#primitive-types
-    def convert_polars_data(data)
-      new_schema = {}
-      data.schema.each do |k, v|
-        new_type = convert_polars_type(v)
-        new_schema[k] = new_type if new_type
-      end
-
-      if new_schema.any?
-        data.cast(new_schema)
-      else
-        data
-      end
-    end
-
-    def convert_polars_type(t)
-      case t
-      when Polars::UInt8
-        Polars::Int8
-      when Polars::UInt16
-        Polars::Int16
-      when Polars::UInt32
-        Polars::Int32
-      when Polars::UInt64
-        Polars::Int64
-      when Polars::Datetime
-        Polars::Datetime.new("us", t.time_zone) if t.time_unit != "us"
-      when Polars::List
-        inner = convert_polars_type(t.inner)
-        Polars::List.new(inner) if inner
-      when Polars::Array
-        inner = convert_polars_type(t.inner)
-        Polars::Array.new(t.inner, t.width) if inner
-      when Polars::Struct
-        if t.fields.any? { |f| convert_polars_type(f.dtype) }
-          fields = t.fields.map { |f| Polars::Field.new(f.name, convert_polars_type(f.dtype) || f.dtype) }
-          Polars::Struct.new(fields)
-        end
-      end
-    end
   end
 end
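
DeltaLake.write now also forwards writer, commit, and post-commit-hook properties to the Rust writer. A sketch of a write call with the new structs, assuming df responds to arrow_c_stream and using a placeholder path and metadata:

# hypothetical write using the new keyword arguments
DeltaLake.write(
  "./tmp/events",
  df,
  commit_properties: DeltaLake::CommitProperties.new(custom_metadata: {"source" => "import"}),
  post_commithook_properties: DeltaLake::PostCommitHookProperties.new(create_checkpoint: true)
)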