deltalake-rb 0.1.0 → 0.1.2

@@ -0,0 +1,205 @@
+ use deltalake::arrow::array::RecordBatchReader;
+ use deltalake::arrow::datatypes::Schema as ArrowSchema;
+ use deltalake::arrow::ffi_stream::ArrowArrayStreamReader;
+ use deltalake::datafusion::catalog::TableProvider;
+ use deltalake::datafusion::datasource::MemTable;
+ use deltalake::datafusion::prelude::SessionContext;
+ use deltalake::logstore::LogStoreRef;
+ use deltalake::operations::merge::MergeBuilder;
+ use deltalake::table::state::DeltaTableState;
+ use deltalake::{DeltaResult, DeltaTable};
+ use std::cell::RefCell;
+ use std::collections::HashMap;
+ use std::future::IntoFuture;
+ use std::sync::Arc;
+
+ use crate::error::RubyError;
+ use crate::utils::rt;
+ use crate::RbResult;
+ use crate::{
+     maybe_create_commit_properties, set_writer_properties, RbCommitProperties,
+     RbPostCommitHookProperties, RbWriterProperties,
+ };
+
+ #[magnus::wrap(class = "DeltaLake::RbMergeBuilder")]
+ pub(crate) struct RbMergeBuilder {
+     _builder: RefCell<Option<MergeBuilder>>,
+     source_alias: Option<String>,
+     target_alias: Option<String>,
+     #[allow(dead_code)]
+     arrow_schema: Arc<ArrowSchema>,
+ }
+
+ // getters
+ impl RbMergeBuilder {
+     pub fn source_alias(&self) -> Option<String> {
+         self.source_alias.clone()
+     }
+
+     pub fn target_alias(&self) -> Option<String> {
+         self.target_alias.clone()
+     }
+ }
+
+ impl RbMergeBuilder {
+     pub fn new(
+         log_store: LogStoreRef,
+         snapshot: DeltaTableState,
+         source: ArrowArrayStreamReader,
+         predicate: String,
+         source_alias: Option<String>,
+         target_alias: Option<String>,
+         safe_cast: bool,
+         writer_properties: Option<RbWriterProperties>,
+         post_commithook_properties: Option<RbPostCommitHookProperties>,
+         commit_properties: Option<RbCommitProperties>,
+     ) -> DeltaResult<Self> {
+         let ctx = SessionContext::new();
+         let schema = source.schema();
+         let batches = vec![source.map(|batch| batch.unwrap()).collect::<Vec<_>>()];
+         let table_provider: Arc<dyn TableProvider> =
+             Arc::new(MemTable::try_new(schema.clone(), batches).unwrap());
+         let source_df = ctx.read_table(table_provider).unwrap();
+
+         let mut cmd =
+             MergeBuilder::new(log_store, snapshot, predicate, source_df).with_safe_cast(safe_cast);
+
+         if let Some(src_alias) = &source_alias {
+             cmd = cmd.with_source_alias(src_alias);
+         }
+
+         if let Some(trgt_alias) = &target_alias {
+             cmd = cmd.with_target_alias(trgt_alias);
+         }
+
+         if let Some(writer_props) = writer_properties {
+             cmd = cmd.with_writer_properties(set_writer_properties(writer_props)?);
+         }
+
+         if let Some(commit_properties) =
+             maybe_create_commit_properties(commit_properties, post_commithook_properties)
+         {
+             cmd = cmd.with_commit_properties(commit_properties);
+         }
+
+         Ok(Self {
+             _builder: RefCell::new(Some(cmd)),
+             source_alias,
+             target_alias,
+             arrow_schema: schema,
+         })
+     }
+
+     pub fn execute(&self) -> DeltaResult<(DeltaTable, String)> {
+         let (table, metrics) = rt().block_on(self._builder.take().unwrap().into_future())?;
+         Ok((table, serde_json::to_string(&metrics).unwrap()))
+     }
+ }
+
+ impl RbMergeBuilder {
+     pub fn when_matched_update(
+         &self,
+         updates: HashMap<String, String>,
+         predicate: Option<String>,
+     ) -> RbResult<()> {
+         let mut binding = self._builder.borrow_mut();
+         *binding = match binding.take() {
+             Some(cmd) => Some(
+                 cmd.when_matched_update(|mut update| {
+                     for (column, expression) in updates {
+                         update = update.update(column, expression)
+                     }
+                     if let Some(predicate) = predicate {
+                         update = update.predicate(predicate)
+                     };
+                     update
+                 })
+                 .map_err(RubyError::from)?,
+             ),
+             None => unreachable!(),
+         };
+         Ok(())
+     }
+
+     pub fn when_matched_delete(&self, predicate: Option<String>) -> RbResult<()> {
+         let mut binding = self._builder.borrow_mut();
+         *binding = match binding.take() {
+             Some(cmd) => Some(
+                 cmd.when_matched_delete(|mut delete| {
+                     if let Some(predicate) = predicate {
+                         delete = delete.predicate(predicate)
+                     };
+                     delete
+                 })
+                 .map_err(RubyError::from)?,
+             ),
+             None => unreachable!(),
+         };
+         Ok(())
+     }
+
+     pub fn when_not_matched_insert(
+         &self,
+         updates: HashMap<String, String>,
+         predicate: Option<String>,
+     ) -> RbResult<()> {
+         let mut binding = self._builder.borrow_mut();
+         *binding = match binding.take() {
+             Some(cmd) => Some(
+                 cmd.when_not_matched_insert(|mut insert| {
+                     for (column, expression) in updates {
+                         insert = insert.set(column, expression)
+                     }
+                     if let Some(predicate) = predicate {
+                         insert = insert.predicate(predicate)
+                     };
+                     insert
+                 })
+                 .map_err(RubyError::from)?,
+             ),
+             None => unreachable!(),
+         };
+         Ok(())
+     }
+
+     pub fn when_not_matched_by_source_update(
+         &self,
+         updates: HashMap<String, String>,
+         predicate: Option<String>,
+     ) -> RbResult<()> {
+         let mut binding = self._builder.borrow_mut();
+         *binding = match binding.take() {
+             Some(cmd) => Some(
+                 cmd.when_not_matched_by_source_update(|mut update| {
+                     for (column, expression) in updates {
+                         update = update.update(column, expression)
+                     }
+                     if let Some(predicate) = predicate {
+                         update = update.predicate(predicate)
+                     };
+                     update
+                 })
+                 .map_err(RubyError::from)?,
+             ),
+             None => unreachable!(),
+         };
+         Ok(())
+     }
+
+     pub fn when_not_matched_by_source_delete(&self, predicate: Option<String>) -> RbResult<()> {
+         let mut binding = self._builder.borrow_mut();
+         *binding = match binding.take() {
+             Some(cmd) => Some(
+                 cmd.when_not_matched_by_source_delete(|mut delete| {
+                     if let Some(predicate) = predicate {
+                         delete = delete.predicate(predicate)
+                     };
+                     delete
+                 })
+                 .map_err(RubyError::from)?,
+             ),
+             None => unreachable!(),
+         };
+         Ok(())
+     }
+ }
@@ -26,22 +26,51 @@ module DeltaLake
        @table.version
      end

-     def files
-       @table.files
+     def partitions
+       partitions = []
+       @table.get_active_partitions.each do |partition|
+         next unless partition
+         partitions << partition.to_h
+       end
+       partitions
+     end
+
+     def files(partition_filters: nil)
+       @table.files(_stringify_partition_values(partition_filters))
      end

-     def file_uris
-       @table.file_uris
+     def file_uris(partition_filters: nil)
+       @table.file_uris(_stringify_partition_values(partition_filters))
      end

      def load_as_version(version)
        if version.is_a?(Integer)
          @table.load_version(version)
+       elsif version.is_a?(Time)
+         @table.load_with_datetime(version.utc.iso8601(9))
+       elsif version.is_a?(String)
+         @table.load_with_datetime(version)
        else
-         raise TypeError, "Invalid datatype provided for version, only Integer is accepted."
+         raise TypeError, "Invalid datatype provided for version, only Integer, String, and Time are accepted."
        end
      end

+     def load_cdf(
+       starting_version: 0,
+       ending_version: nil,
+       starting_timestamp: nil,
+       ending_timestamp: nil,
+       columns: nil
+     )
+       @table.load_cdf(
+         starting_version,
+         ending_version,
+         starting_timestamp,
+         ending_timestamp,
+         columns
+       )
+     end
+
      def table_uri
        @table.table_uri
      end
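
A brief usage sketch of the new time-travel and change-data-feed entry points. The table path, column names, and the DeltaLake::Table.new constructor are illustrative assumptions, not taken from this diff:

dt = DeltaLake::Table.new("./tmp/events")    # assumed constructor and path

dt.load_as_version(1)                        # by version number
dt.load_as_version(Time.now - 3600)          # by Time, sent as UTC ISO8601
dt.load_as_version("2024-01-01T00:00:00Z")   # by timestamp string

# change data feed between two versions; keyword arguments mirror the new load_cdf signature
cdf = dt.load_cdf(starting_version: 0, ending_version: 1, columns: ["id", "value"])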
@@ -54,10 +83,35 @@ module DeltaLake
        Metadata.new(@table)
      end

+     def protocol
+       ProtocolVersions.new(*@table.protocol_versions)
+     end
+
+     def history(limit: nil)
+       backwards_enumerate = lambda do |iterable, start_end, &block|
+         n = start_end
+         iterable.each do |elem|
+           block.call(n, elem)
+           n -= 1
+         end
+       end
+
+       commits = @table.history(limit)
+       history = []
+       backwards_enumerate.(commits, @table.get_latest_version) do |version, commit_info_raw|
+         commit = JSON.parse(commit_info_raw)
+         commit["version"] = version
+         history << commit
+       end
+       history
+     end
+
      def vacuum(
        retention_hours: nil,
        dry_run: true,
-       enforce_retention_duration: true
+       enforce_retention_duration: true,
+       post_commithook_properties: nil,
+       commit_properties: nil
      )
        if retention_hours
          if retention_hours < 0
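
For reference, a hedged sketch of how the new protocol and history readers can be called; the limit is illustrative and the table handle is an assumption as above:

dt = DeltaLake::Table.new("./tmp/events")   # assumed constructor

dt.protocol           # ProtocolVersions built from @table.protocol_versions
dt.history(limit: 5)  # array of commit hashes; each gains a "version" key, counting down from the latest version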
@@ -68,10 +122,72 @@ module DeltaLake
        @table.vacuum(
          dry_run,
          retention_hours,
-         enforce_retention_duration
+         enforce_retention_duration,
+         commit_properties,
+         post_commithook_properties
        )
      end

+     def optimize
+       TableOptimizer.new(self)
+     end
+
+     def alter
+       TableAlterer.new(self)
+     end
+
+     def merge(
+       source,
+       predicate,
+       source_alias: nil,
+       target_alias: nil,
+       error_on_type_mismatch: true,
+       writer_properties: nil,
+       post_commithook_properties: nil,
+       commit_properties: nil
+     )
+       source = Utils.convert_data(source)
+
+       rb_merge_builder =
+         @table.create_merge_builder(
+           source,
+           predicate,
+           source_alias,
+           target_alias,
+           !error_on_type_mismatch,
+           writer_properties,
+           post_commithook_properties,
+           commit_properties
+         )
+       TableMerger.new(rb_merge_builder, @table)
+     end
+
+     def restore(
+       target,
+       ignore_missing_files: false,
+       protocol_downgrade_allowed: false,
+       commit_properties: nil
+     )
+       if target.is_a?(Time)
+         metrics =
+           @table.restore(
+             target.utc.iso8601(9),
+             ignore_missing_files,
+             protocol_downgrade_allowed,
+             commit_properties
+           )
+       else
+         metrics =
+           @table.restore(
+             target,
+             ignore_missing_files,
+             protocol_downgrade_allowed,
+             commit_properties
+           )
+       end
+       JSON.parse(metrics)
+     end
+
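
A hedged sketch of the new restore call; both target forms map onto the branches above, and the values are illustrative:

dt = DeltaLake::Table.new("./tmp/events")   # assumed constructor

dt.restore(1)                     # roll back to a version number
dt.restore(Time.utc(2024, 1, 1))  # or to a point in time, sent as UTC ISO8601
# each call returns the parsed restore metrics as a Hash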
      def to_polars(eager: true)
        require "polars-df"

@@ -80,7 +196,13 @@ module DeltaLake
          if sources.empty?
            Polars::LazyFrame.new
          else
-           storage_options = @storage_options&.except("AWS_S3_ALLOW_UNSAFE_RENAME")
+           delta_keys = [
+             "AWS_S3_ALLOW_UNSAFE_RENAME",
+             "AWS_S3_LOCKING_PROVIDER",
+             "CONDITIONAL_PUT",
+             "DELTA_DYNAMO_TABLE_NAME"
+           ]
+           storage_options = @storage_options&.reject { |k, _| delta_keys.include?(k.to_s.upcase) }
            Polars.scan_parquet(sources, storage_options: storage_options)
          end
        eager ? lf.collect : lf
@@ -90,14 +212,52 @@ module DeltaLake
        @table.update_incremental
      end

-     def delete(predicate = nil)
-       metrics = @table.delete(predicate)
+     def delete(
+       predicate = nil,
+       writer_properties: nil,
+       post_commithook_properties: nil,
+       commit_properties: nil
+     )
+       metrics =
+         @table.delete(
+           predicate,
+           writer_properties,
+           post_commithook_properties,
+           commit_properties
+         )
        JSON.parse(metrics).transform_keys(&:to_sym)
      end

+     def repair(
+       dry_run: false,
+       post_commithook_properties: nil,
+       commit_properties: nil
+     )
+       metrics =
+         @table.repair(
+           dry_run,
+           commit_properties,
+           post_commithook_properties
+         )
+       JSON.parse(metrics).transform_keys(&:to_sym)
+     end
+
+     def transaction_versions
+       @table.transaction_versions
+     end
+
      # private
      def _table
        @table
      end
+
+     # private
+     def _stringify_partition_values(partition_filters)
+       if partition_filters.nil?
+         return partition_filters
+       end
+
+       raise Todo
+     end
    end
  end
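
A hedged sketch of the extended delete and the new repair entry points; the predicate is illustrative and the table handle is assumed:

dt = DeltaLake::Table.new("./tmp/events")   # assumed constructor

dt.delete("id > 100")     # returns the delete metrics as a Hash with symbol keys
dt.repair(dry_run: true)  # dry run of the repair operation; returns metrics as a Hash with symbol keys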
@@ -0,0 +1,58 @@
+ module DeltaLake
+   class TableAlterer
+     def initialize(table)
+       @table = table
+     end
+
+     def add_feature(
+       feature,
+       allow_protocol_versions_increase: false
+     )
+       if !feature.is_a?(Array)
+         feature = [feature]
+       end
+       @table._table.add_feature(
+         feature,
+         allow_protocol_versions_increase
+       )
+     end
+
+     def add_columns(fields)
+       if fields.is_a?(DeltaLake::Field)
+         fields = [fields]
+       end
+
+       @table._table.add_columns(
+         fields
+       )
+     end
+
+     def add_constraint(constraints)
+       if constraints.length > 1
+         raise ArgumentError,
+           "add_constraints is limited to a single constraint addition at once for now."
+       end
+
+       @table._table.add_constraints(
+         constraints
+       )
+     end
+
+     def drop_constraint(name, raise_if_not_exists: true)
+       @table._table.drop_constraints(
+         name,
+         raise_if_not_exists
+       )
+     end
+
+     def set_table_properties(
+       properties,
+       raise_if_not_exists: true
+     )
+       @table._table.set_table_properties(
+         properties,
+         raise_if_not_exists
+       )
+     end
+   end
+ end
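
A hedged sketch of the alterer returned by Table#alter. The constraint name, expression, and property key are illustrative, and the Hash form of the constraints argument is an assumption modeled on the equivalent Python deltalake API rather than something shown in this diff:

dt = DeltaLake::Table.new("./tmp/events")   # assumed constructor

dt.alter.add_constraint({"id_not_null" => "id IS NOT NULL"})  # one constraint per call, per the guard above
dt.alter.drop_constraint("id_not_null")
dt.alter.set_table_properties({"delta.enableChangeDataFeed" => "true"})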
@@ -0,0 +1,38 @@
+ module DeltaLake
+   class TableMerger
+     def initialize(builder, table)
+       @builder = builder
+       @table = table
+     end
+
+     def when_matched_update(updates, predicate: nil)
+       @builder.when_matched_update(updates, predicate)
+       self
+     end
+
+     def when_not_matched_insert(updates, predicate: nil)
+       @builder.when_not_matched_insert(updates, predicate)
+       self
+     end
+
+     def when_matched_delete(predicate: nil)
+       @builder.when_matched_delete(predicate)
+       self
+     end
+
+     def when_not_matched_by_source_update(updates, predicate: nil)
+       @builder.when_not_matched_by_source_update(updates, predicate)
+       self
+     end
+
+     def when_not_matched_by_source_delete(predicate: nil)
+       @builder.when_not_matched_by_source_delete(predicate)
+       self
+     end
+
+     def execute
+       metrics = @table.merge_execute(@builder)
+       JSON.parse(metrics).transform_keys(&:to_sym)
+     end
+   end
+ end
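
An end-to-end sketch tying Table#merge, TableMerger, and the RbMergeBuilder binding above together. The Polars source frame, predicate, and column names are illustrative; any object exposing arrow_c_stream would do, per Utils.convert_data later in this diff:

require "polars-df"

dt = DeltaLake::Table.new("./tmp/events")   # assumed constructor
source = Polars::DataFrame.new({"id" => [1, 2], "value" => ["a", "b"]})

metrics =
  dt.merge(source, "t.id = s.id", source_alias: "s", target_alias: "t")
    .when_matched_update({"value" => "s.value"})
    .when_not_matched_insert({"id" => "s.id", "value" => "s.value"})
    .execute
# metrics is a Hash with symbol keys describing the matched, inserted, and copied rows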
@@ -0,0 +1,67 @@
+ module DeltaLake
+   class TableOptimizer
+     def initialize(table)
+       @table = table
+     end
+
+     def compact(
+       partition_filters: nil,
+       target_size: nil,
+       max_concurrent_tasks: nil,
+       min_commit_interval: nil,
+       writer_properties: nil,
+       post_commithook_properties: nil,
+       commit_properties: nil
+     )
+       metrics =
+         @table._table.compact_optimize(
+           @table._stringify_partition_values(partition_filters),
+           target_size,
+           max_concurrent_tasks,
+           min_commit_interval,
+           writer_properties,
+           post_commithook_properties,
+           commit_properties
+         )
+       @table.update_incremental
+       result = JSON.parse(metrics)
+       ["filesAdded", "filesRemoved"].each do |key|
+         result[key] = JSON.parse(result[key]) if result[key].is_a?(String)
+       end
+       # TODO return underscore symbols like delete
+       result
+     end
+
+     def z_order(
+       columns,
+       partition_filters: nil,
+       target_size: nil,
+       max_concurrent_tasks: nil,
+       max_spill_size: 20 * 1024 * 1024 * 1024,
+       min_commit_interval: nil,
+       writer_properties: nil,
+       post_commithook_properties: nil,
+       commit_properties: nil
+     )
+       metrics =
+         @table._table.z_order_optimize(
+           Array(columns),
+           @table._stringify_partition_values(partition_filters),
+           target_size,
+           max_concurrent_tasks,
+           max_spill_size,
+           min_commit_interval,
+           writer_properties,
+           post_commithook_properties,
+           commit_properties
+         )
+       @table.update_incremental
+       result = JSON.parse(metrics)
+       ["filesAdded", "filesRemoved"].each do |key|
+         result[key] = JSON.parse(result[key]) if result[key].is_a?(String)
+       end
+       # TODO return underscore symbols like delete
+       result
+     end
+   end
+ end
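
A hedged sketch of the optimizer returned by Table#optimize; the target size and Z-order column are illustrative:

dt = DeltaLake::Table.new("./tmp/events")   # assumed constructor

dt.optimize.compact(target_size: 128 * 1024 * 1024)  # bin-packing compaction
dt.optimize.z_order(["id"])                          # Z-order by one or more columns
# both return the parsed optimize metrics, with "filesAdded"/"filesRemoved" expanded into Hashes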
@@ -0,0 +1,59 @@
+ module DeltaLake
+   module Utils
+     def self.convert_data(data)
+       if data.respond_to?(:arrow_c_stream)
+         # TODO convert other object types
+         # should probably move logic to Rust
+         if defined?(Polars::DataFrame) && data.is_a?(Polars::DataFrame)
+           data = convert_polars_data(data)
+         end
+
+         data.arrow_c_stream
+       else
+         raise TypeError, "Only objects implementing the Arrow C stream interface are valid inputs for source."
+       end
+     end
+
+     # unsigned integers are not part of the protocol
+     # https://github.com/delta-io/delta/blob/master/PROTOCOL.md#primitive-types
+     def self.convert_polars_data(data)
+       new_schema = {}
+       data.schema.each do |k, v|
+         new_type = convert_polars_type(v)
+         new_schema[k] = new_type if new_type
+       end
+
+       if new_schema.any?
+         data.cast(new_schema)
+       else
+         data
+       end
+     end
+
+     def self.convert_polars_type(t)
+       case t
+       when Polars::UInt8
+         Polars::Int8
+       when Polars::UInt16
+         Polars::Int16
+       when Polars::UInt32
+         Polars::Int32
+       when Polars::UInt64
+         Polars::Int64
+       when Polars::Datetime
+         Polars::Datetime.new("us", t.time_zone) if t.time_unit != "us"
+       when Polars::List
+         inner = convert_polars_type(t.inner)
+         Polars::List.new(inner) if inner
+       when Polars::Array
+         inner = convert_polars_type(t.inner)
+         Polars::Array.new(t.inner, t.width) if inner
+       when Polars::Struct
+         if t.fields.any? { |f| convert_polars_type(f.dtype) }
+           fields = t.fields.map { |f| Polars::Field.new(f.name, convert_polars_type(f.dtype) || f.dtype) }
+           Polars::Struct.new(fields)
+         end
+       end
+     end
+   end
+ end
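
A hedged illustration of the casting rule above: unsigned Polars columns are downcast to signed equivalents before the frame is exposed as an Arrow stream, because the Delta protocol has no unsigned primitive types. The column name is illustrative:

require "polars-df"

# illustrative frame with an unsigned column
df = Polars::DataFrame.new({"id" => [1, 2, 3]}).cast({"id" => Polars::UInt32})

converted = DeltaLake::Utils.convert_polars_data(df)
converted.schema  # the UInt32 column comes back as Int32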
@@ -1,3 +1,3 @@
  module DeltaLake
-   VERSION = "0.1.0"
+   VERSION = "0.1.2"
  end