deltalake-rb 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/Cargo.lock +506 -337
- data/README.md +33 -3
- data/ext/deltalake/Cargo.toml +7 -4
- data/ext/deltalake/src/error.rs +62 -15
- data/ext/deltalake/src/features.rs +67 -0
- data/ext/deltalake/src/lib.rs +1114 -48
- data/ext/deltalake/src/merge.rs +205 -0
- data/lib/deltalake/table.rb +170 -10
- data/lib/deltalake/table_alterer.rb +58 -0
- data/lib/deltalake/table_merger.rb +38 -0
- data/lib/deltalake/table_optimizer.rb +67 -0
- data/lib/deltalake/utils.rb +59 -0
- data/lib/deltalake/version.rb +1 -1
- data/lib/deltalake.rb +50 -12
- metadata +8 -2
data/ext/deltalake/src/merge.rs ADDED

```rust
use deltalake::arrow::array::RecordBatchReader;
use deltalake::arrow::datatypes::Schema as ArrowSchema;
use deltalake::arrow::ffi_stream::ArrowArrayStreamReader;
use deltalake::datafusion::catalog::TableProvider;
use deltalake::datafusion::datasource::MemTable;
use deltalake::datafusion::prelude::SessionContext;
use deltalake::logstore::LogStoreRef;
use deltalake::operations::merge::MergeBuilder;
use deltalake::table::state::DeltaTableState;
use deltalake::{DeltaResult, DeltaTable};
use std::cell::RefCell;
use std::collections::HashMap;
use std::future::IntoFuture;
use std::sync::Arc;

use crate::error::RubyError;
use crate::utils::rt;
use crate::RbResult;
use crate::{
    maybe_create_commit_properties, set_writer_properties, RbCommitProperties,
    RbPostCommitHookProperties, RbWriterProperties,
};

#[magnus::wrap(class = "DeltaLake::RbMergeBuilder")]
pub(crate) struct RbMergeBuilder {
    _builder: RefCell<Option<MergeBuilder>>,
    source_alias: Option<String>,
    target_alias: Option<String>,
    #[allow(dead_code)]
    arrow_schema: Arc<ArrowSchema>,
}

// getters
impl RbMergeBuilder {
    pub fn source_alias(&self) -> Option<String> {
        self.source_alias.clone()
    }

    pub fn target_alias(&self) -> Option<String> {
        self.target_alias.clone()
    }
}

impl RbMergeBuilder {
    pub fn new(
        log_store: LogStoreRef,
        snapshot: DeltaTableState,
        source: ArrowArrayStreamReader,
        predicate: String,
        source_alias: Option<String>,
        target_alias: Option<String>,
        safe_cast: bool,
        writer_properties: Option<RbWriterProperties>,
        post_commithook_properties: Option<RbPostCommitHookProperties>,
        commit_properties: Option<RbCommitProperties>,
    ) -> DeltaResult<Self> {
        let ctx = SessionContext::new();
        let schema = source.schema();
        let batches = vec![source.map(|batch| batch.unwrap()).collect::<Vec<_>>()];
        let table_provider: Arc<dyn TableProvider> =
            Arc::new(MemTable::try_new(schema.clone(), batches).unwrap());
        let source_df = ctx.read_table(table_provider).unwrap();

        let mut cmd =
            MergeBuilder::new(log_store, snapshot, predicate, source_df).with_safe_cast(safe_cast);

        if let Some(src_alias) = &source_alias {
            cmd = cmd.with_source_alias(src_alias);
        }

        if let Some(trgt_alias) = &target_alias {
            cmd = cmd.with_target_alias(trgt_alias);
        }

        if let Some(writer_props) = writer_properties {
            cmd = cmd.with_writer_properties(set_writer_properties(writer_props)?);
        }

        if let Some(commit_properties) =
            maybe_create_commit_properties(commit_properties, post_commithook_properties)
        {
            cmd = cmd.with_commit_properties(commit_properties);
        }

        Ok(Self {
            _builder: RefCell::new(Some(cmd)),
            source_alias,
            target_alias,
            arrow_schema: schema,
        })
    }

    pub fn execute(&self) -> DeltaResult<(DeltaTable, String)> {
        let (table, metrics) = rt().block_on(self._builder.take().unwrap().into_future())?;
        Ok((table, serde_json::to_string(&metrics).unwrap()))
    }
}

impl RbMergeBuilder {
    pub fn when_matched_update(
        &self,
        updates: HashMap<String, String>,
        predicate: Option<String>,
    ) -> RbResult<()> {
        let mut binding = self._builder.borrow_mut();
        *binding = match binding.take() {
            Some(cmd) => Some(
                cmd.when_matched_update(|mut update| {
                    for (column, expression) in updates {
                        update = update.update(column, expression)
                    }
                    if let Some(predicate) = predicate {
                        update = update.predicate(predicate)
                    };
                    update
                })
                .map_err(RubyError::from)?,
            ),
            None => unreachable!(),
        };
        Ok(())
    }

    pub fn when_matched_delete(&self, predicate: Option<String>) -> RbResult<()> {
        let mut binding = self._builder.borrow_mut();
        *binding = match binding.take() {
            Some(cmd) => Some(
                cmd.when_matched_delete(|mut delete| {
                    if let Some(predicate) = predicate {
                        delete = delete.predicate(predicate)
                    };
                    delete
                })
                .map_err(RubyError::from)?,
            ),
            None => unreachable!(),
        };
        Ok(())
    }

    pub fn when_not_matched_insert(
        &self,
        updates: HashMap<String, String>,
        predicate: Option<String>,
    ) -> RbResult<()> {
        let mut binding = self._builder.borrow_mut();
        *binding = match binding.take() {
            Some(cmd) => Some(
                cmd.when_not_matched_insert(|mut insert| {
                    for (column, expression) in updates {
                        insert = insert.set(column, expression)
                    }
                    if let Some(predicate) = predicate {
                        insert = insert.predicate(predicate)
                    };
                    insert
                })
                .map_err(RubyError::from)?,
            ),
            None => unreachable!(),
        };
        Ok(())
    }

    pub fn when_not_matched_by_source_update(
        &self,
        updates: HashMap<String, String>,
        predicate: Option<String>,
    ) -> RbResult<()> {
        let mut binding = self._builder.borrow_mut();
        *binding = match binding.take() {
            Some(cmd) => Some(
                cmd.when_not_matched_by_source_update(|mut update| {
                    for (column, expression) in updates {
                        update = update.update(column, expression)
                    }
                    if let Some(predicate) = predicate {
                        update = update.predicate(predicate)
                    };
                    update
                })
                .map_err(RubyError::from)?,
            ),
            None => unreachable!(),
        };
        Ok(())
    }

    pub fn when_not_matched_by_source_delete(&self, predicate: Option<String>) -> RbResult<()> {
        let mut binding = self._builder.borrow_mut();
        *binding = match binding.take() {
            Some(cmd) => Some(
                cmd.when_not_matched_by_source_delete(|mut delete| {
                    if let Some(predicate) = predicate {
                        delete = delete.predicate(predicate)
                    };
                    delete
                })
                .map_err(RubyError::from)?,
            ),
            None => unreachable!(),
        };
        Ok(())
    }
}
```
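On the Ruby side, this builder backs the new `Table#merge` API (see `data/lib/deltalake/table.rb` below): each `when_*` clause threads the `MergeBuilder` through the `RefCell`, and `execute` drives it to completion on the embedded Tokio runtime. A minimal upsert sketch against that API; the table path, column names, and Polars source are illustrative:

```ruby
require "deltalake"
require "polars-df"

dt = DeltaLake::Table.new("./events")
source = Polars::DataFrame.new({"id" => [1, 2], "value" => [10, 20]})

metrics = dt
  .merge(source, "target.id = source.id", source_alias: "source", target_alias: "target")
  .when_matched_update({"value" => "source.value"})
  .when_not_matched_insert({"id" => "source.id", "value" => "source.value"})
  .execute
```

`execute` returns the merge metrics parsed from the JSON string that `serde_json` produces above.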
data/lib/deltalake/table.rb CHANGED

```diff
@@ -26,22 +26,51 @@ module DeltaLake
       @table.version
     end
 
-    def files
-      @table.files
+    def partitions
+      partitions = []
+      @table.get_active_partitions.each do |partition|
+        next unless partition
+        partitions << partition.to_h
+      end
+      partitions
+    end
+
+    def files(partition_filters: nil)
+      @table.files(_stringify_partition_values(partition_filters))
     end
 
-    def file_uris
-      @table.file_uris
+    def file_uris(partition_filters: nil)
+      @table.file_uris(_stringify_partition_values(partition_filters))
     end
 
     def load_as_version(version)
       if version.is_a?(Integer)
         @table.load_version(version)
+      elsif version.is_a?(Time)
+        @table.load_with_datetime(version.utc.iso8601(9))
+      elsif version.is_a?(String)
+        @table.load_with_datetime(version)
       else
-        raise TypeError, "Invalid datatype provided for version, only Integer is accepted."
+        raise TypeError, "Invalid datatype provided for version, only Integer, String, and Time are accepted."
       end
     end
 
+    def load_cdf(
+      starting_version: 0,
+      ending_version: nil,
+      starting_timestamp: nil,
+      ending_timestamp: nil,
+      columns: nil
+    )
+      @table.load_cdf(
+        starting_version,
+        ending_version,
+        starting_timestamp,
+        ending_timestamp,
+        columns
+      )
+    end
+
     def table_uri
       @table.table_uri
     end
```
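A short sketch of the widened time-travel API: `load_as_version` now accepts an Integer version, an ISO 8601 String, or a Time (the path and timestamps here are illustrative):

```ruby
dt = DeltaLake::Table.new("./events")

dt.load_as_version(0)                       # by version number
dt.load_as_version("2024-01-01T00:00:00Z")  # by ISO 8601 timestamp string
dt.load_as_version(Time.now - 3600)         # by Time, converted to UTC at nanosecond precision
```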
```diff
@@ -54,10 +83,35 @@ module DeltaLake
       Metadata.new(@table)
     end
 
+    def protocol
+      ProtocolVersions.new(*@table.protocol_versions)
+    end
+
+    def history(limit: nil)
+      backwards_enumerate = lambda do |iterable, start_end, &block|
+        n = start_end
+        iterable.each do |elem|
+          block.call(n, elem)
+          n -= 1
+        end
+      end
+
+      commits = @table.history(limit)
+      history = []
+      backwards_enumerate.(commits, @table.get_latest_version) do |version, commit_info_raw|
+        commit = JSON.parse(commit_info_raw)
+        commit["version"] = version
+        history << commit
+      end
+      history
+    end
+
     def vacuum(
       retention_hours: nil,
       dry_run: true,
-      enforce_retention_duration: true
+      enforce_retention_duration: true,
+      post_commithook_properties: nil,
+      commit_properties: nil
     )
       if retention_hours
         if retention_hours < 0
```
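`history` walks the commit list newest-first, pairing each raw commit-info JSON document with its version number. A sketch; the `operation` field is an assumption about what the Delta commit info in a given log contains:

```ruby
dt = DeltaLake::Table.new("./events")

dt.protocol # => DeltaLake::ProtocolVersions with the table's reader/writer versions

dt.history(limit: 2).each do |commit|
  puts "v#{commit["version"]}: #{commit["operation"]}"
end
```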
```diff
@@ -68,10 +122,72 @@ module DeltaLake
       @table.vacuum(
         dry_run,
         retention_hours,
-        enforce_retention_duration
+        enforce_retention_duration,
+        commit_properties,
+        post_commithook_properties
       )
     end
 
+    def optimize
+      TableOptimizer.new(self)
+    end
+
+    def alter
+      TableAlterer.new(self)
+    end
+
+    def merge(
+      source,
+      predicate,
+      source_alias: nil,
+      target_alias: nil,
+      error_on_type_mismatch: true,
+      writer_properties: nil,
+      post_commithook_properties: nil,
+      commit_properties: nil
+    )
+      source = Utils.convert_data(source)
+
+      rb_merge_builder =
+        @table.create_merge_builder(
+          source,
+          predicate,
+          source_alias,
+          target_alias,
+          !error_on_type_mismatch,
+          writer_properties,
+          post_commithook_properties,
+          commit_properties
+        )
+      TableMerger.new(rb_merge_builder, @table)
+    end
+
+    def restore(
+      target,
+      ignore_missing_files: false,
+      protocol_downgrade_allowed: false,
+      commit_properties: nil
+    )
+      if target.is_a?(Time)
+        metrics =
+          @table.restore(
+            target.utc.iso8601(9),
+            ignore_missing_files,
+            protocol_downgrade_allowed,
+            commit_properties
+          )
+      else
+        metrics =
+          @table.restore(
+            target,
+            ignore_missing_files,
+            protocol_downgrade_allowed,
+            commit_properties
+          )
+      end
+      JSON.parse(metrics)
+    end
+
     def to_polars(eager: true)
       require "polars-df"
 
```
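`restore` accepts either a version number or a Time and returns parsed metrics, while `optimize` and `alter` hand back the helper objects defined in the new files below. A sketch; the path and version are illustrative:

```ruby
dt = DeltaLake::Table.new("./events")

metrics = dt.restore(1)            # roll back to version 1
# dt.restore(Time.utc(2024, 1, 1)) # or roll back to a point in time
puts metrics                       # operation metrics parsed from JSON
```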
```diff
@@ -80,7 +196,13 @@ module DeltaLake
         if sources.empty?
           Polars::LazyFrame.new
         else
-          storage_options = @storage_options
+          delta_keys = [
+            "AWS_S3_ALLOW_UNSAFE_RENAME",
+            "AWS_S3_LOCKING_PROVIDER",
+            "CONDITIONAL_PUT",
+            "DELTA_DYNAMO_TABLE_NAME"
+          ]
+          storage_options = @storage_options&.reject { |k, _| delta_keys.include?(k.to_s.upcase) }
           Polars.scan_parquet(sources, storage_options: storage_options)
         end
       eager ? lf.collect : lf
```
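The four `delta_keys` above configure delta-rs log-store behavior rather than object-store access, so they are stripped before the remaining options are handed to `Polars.scan_parquet`. A hedged sketch of the effect, assuming `DeltaLake::Table.new` accepts a `storage_options` hash as the instance variable suggests; the bucket, region, and keys are placeholders:

```ruby
dt = DeltaLake::Table.new(
  "s3://my-bucket/events",
  storage_options: {
    "AWS_REGION" => "us-east-1",
    "AWS_S3_ALLOW_UNSAFE_RENAME" => "true" # consumed by delta-rs, filtered out for Polars
  }
)
df = dt.to_polars # AWS_REGION passes through; the delta-only key does not
```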
```diff
@@ -90,14 +212,52 @@ module DeltaLake
       @table.update_incremental
     end
 
-    def delete(predicate = nil)
-      metrics = @table.delete(predicate)
+    def delete(
+      predicate = nil,
+      writer_properties: nil,
+      post_commithook_properties: nil,
+      commit_properties: nil
+    )
+      metrics =
+        @table.delete(
+          predicate,
+          writer_properties,
+          post_commithook_properties,
+          commit_properties
+        )
       JSON.parse(metrics).transform_keys(&:to_sym)
     end
 
+    def repair(
+      dry_run: false,
+      post_commithook_properties: nil,
+      commit_properties: nil
+    )
+      metrics =
+        @table.repair(
+          dry_run,
+          commit_properties,
+          post_commithook_properties
+        )
+      JSON.parse(metrics).transform_keys(&:to_sym)
+    end
+
+    def transaction_versions
+      @table.transaction_versions
+    end
+
     # private
     def _table
       @table
     end
+
+    # private
+    def _stringify_partition_values(partition_filters)
+      if partition_filters.nil?
+        return partition_filters
+      end
+
+      raise Todo
+    end
   end
 end
```
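Both `delete` and `repair` (the FSCK operation) return metrics with symbolized keys. A sketch; the predicate column is illustrative:

```ruby
dt = DeltaLake::Table.new("./events")

dt.delete("id > 100")    # delete matching rows; with no predicate, deletes all rows
dt.repair(dry_run: true) # report files in the log that are missing from storage
```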
data/lib/deltalake/table_alterer.rb ADDED

```ruby
module DeltaLake
  class TableAlterer
    def initialize(table)
      @table = table
    end

    def add_feature(
      feature,
      allow_protocol_versions_increase: false
    )
      if !feature.is_a?(Array)
        feature = [feature]
      end
      @table._table.add_feature(
        feature,
        allow_protocol_versions_increase
      )
    end

    def add_columns(fields)
      if fields.is_a?(DeltaLake::Field)
        fields = [fields]
      end

      @table._table.add_columns(
        fields
      )
    end

    def add_constraint(constraints)
      if constraints.length > 1
        raise ArgumentError,
          "add_constraints is limited to a single constraint addition at once for now."
      end

      @table._table.add_constraints(
        constraints
      )
    end

    def drop_constraint(name, raise_if_not_exists: true)
      @table._table.drop_constraints(
        name,
        raise_if_not_exists
      )
    end

    def set_table_properties(
      properties,
      raise_if_not_exists: true
    )
      @table._table.set_table_properties(
        properties,
        raise_if_not_exists
      )
    end
  end
end
```
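`TableAlterer` is reached through `Table#alter`. A sketch of constraint and property management, assuming a table with an `id` column; the constraint name and expression are illustrative:

```ruby
dt = DeltaLake::Table.new("./events")

dt.alter.add_constraint({"id_positive" => "id > 0"})
dt.alter.set_table_properties({"delta.enableChangeDataFeed" => "true"})
dt.alter.drop_constraint("id_positive")
```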
data/lib/deltalake/table_merger.rb ADDED

```ruby
module DeltaLake
  class TableMerger
    def initialize(builder, table)
      @builder = builder
      @table = table
    end

    def when_matched_update(updates, predicate: nil)
      @builder.when_matched_update(updates, predicate)
      self
    end

    def when_not_matched_insert(updates, predicate: nil)
      @builder.when_not_matched_insert(updates, predicate)
      self
    end

    def when_matched_delete(predicate: nil)
      @builder.when_matched_delete(predicate)
      self
    end

    def when_not_matched_by_source_update(updates, predicate: nil)
      @builder.when_not_matched_by_source_update(updates, predicate)
      self
    end

    def when_not_matched_by_source_delete(predicate: nil)
      @builder.when_not_matched_by_source_delete(predicate)
      self
    end

    def execute
      metrics = @table.merge_execute(@builder)
      JSON.parse(metrics).transform_keys(&:to_sym)
    end
  end
end
```
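Each clause forwards to the Rust builder and returns `self`, so clauses chain; `execute` hands the accumulated builder back to the table. Extending the earlier upsert sketch, this variant also mirrors deletions from the source (`dt` and `source` as before):

```ruby
dt
  .merge(source, "t.id = s.id", source_alias: "s", target_alias: "t")
  .when_matched_update({"value" => "s.value"})
  .when_not_matched_insert({"id" => "s.id", "value" => "s.value"})
  .when_not_matched_by_source_delete
  .execute
```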
data/lib/deltalake/table_optimizer.rb ADDED

```ruby
module DeltaLake
  class TableOptimizer
    def initialize(table)
      @table = table
    end

    def compact(
      partition_filters: nil,
      target_size: nil,
      max_concurrent_tasks: nil,
      min_commit_interval: nil,
      writer_properties: nil,
      post_commithook_properties: nil,
      commit_properties: nil
    )
      metrics =
        @table._table.compact_optimize(
          @table._stringify_partition_values(partition_filters),
          target_size,
          max_concurrent_tasks,
          min_commit_interval,
          writer_properties,
          post_commithook_properties,
          commit_properties
        )
      @table.update_incremental
      result = JSON.parse(metrics)
      ["filesAdded", "filesRemoved"].each do |key|
        result[key] = JSON.parse(result[key]) if result[key].is_a?(String)
      end
      # TODO return underscore symbols like delete
      result
    end

    def z_order(
      columns,
      partition_filters: nil,
      target_size: nil,
      max_concurrent_tasks: nil,
      max_spill_size: 20 * 1024 * 1024 * 1024,
      min_commit_interval: nil,
      writer_properties: nil,
      post_commithook_properties: nil,
      commit_properties: nil
    )
      metrics =
        @table._table.z_order_optimize(
          Array(columns),
          @table._stringify_partition_values(partition_filters),
          target_size,
          max_concurrent_tasks,
          max_spill_size,
          min_commit_interval,
          writer_properties,
          post_commithook_properties,
          commit_properties
        )
      @table.update_incremental
      result = JSON.parse(metrics)
      ["filesAdded", "filesRemoved"].each do |key|
        result[key] = JSON.parse(result[key]) if result[key].is_a?(String)
      end
      # TODO return underscore symbols like delete
      result
    end
  end
end
```
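`TableOptimizer` is reached through `Table#optimize`; both operations refresh the in-memory table afterward and return metrics that still use the camelCase keys from delta-rs (symbolizing them is a noted TODO in the source). A sketch; the target size and column names are illustrative:

```ruby
dt = DeltaLake::Table.new("./events")

result = dt.optimize.compact(target_size: 128 * 1024 * 1024)
result = dt.optimize.z_order(["id", "timestamp"])
puts result["filesAdded"] # nested JSON strings are expanded by the parsing above
```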
data/lib/deltalake/utils.rb ADDED

```ruby
module DeltaLake
  module Utils
    def self.convert_data(data)
      if data.respond_to?(:arrow_c_stream)
        # TODO convert other object types
        # should probably move logic to Rust
        if defined?(Polars::DataFrame) && data.is_a?(Polars::DataFrame)
          data = convert_polars_data(data)
        end

        data.arrow_c_stream
      else
        raise TypeError, "Only objects implementing the Arrow C stream interface are valid inputs for source."
      end
    end

    # unsigned integers are not part of the protocol
    # https://github.com/delta-io/delta/blob/master/PROTOCOL.md#primitive-types
    def self.convert_polars_data(data)
      new_schema = {}
      data.schema.each do |k, v|
        new_type = convert_polars_type(v)
        new_schema[k] = new_type if new_type
      end

      if new_schema.any?
        data.cast(new_schema)
      else
        data
      end
    end

    def self.convert_polars_type(t)
      case t
      when Polars::UInt8
        Polars::Int8
      when Polars::UInt16
        Polars::Int16
      when Polars::UInt32
        Polars::Int32
      when Polars::UInt64
        Polars::Int64
      when Polars::Datetime
        Polars::Datetime.new("us", t.time_zone) if t.time_unit != "us"
      when Polars::List
        inner = convert_polars_type(t.inner)
        Polars::List.new(inner) if inner
      when Polars::Array
        inner = convert_polars_type(t.inner)
        Polars::Array.new(t.inner, t.width) if inner
      when Polars::Struct
        if t.fields.any? { |f| convert_polars_type(f.dtype) }
          fields = t.fields.map { |f| Polars::Field.new(f.name, convert_polars_type(f.dtype) || f.dtype) }
          Polars::Struct.new(fields)
        end
      end
    end
  end
end
```
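Because the Delta protocol defines no unsigned integer types, unsigned Polars columns are downcast to their signed equivalents before the Arrow C stream is handed to Rust. A sketch of the observable effect when writing; the path and column are illustrative:

```ruby
require "deltalake"
require "polars-df"

df = Polars::DataFrame.new({"id" => [1, 2, 3]})
       .with_columns(Polars.col("id").cast(Polars::UInt32))

DeltaLake.write("./events_u32", df) # the UInt32 column is stored as a signed 32-bit integer
```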
data/lib/deltalake/version.rb CHANGED