deltalake-rb 0.1.1 → 0.1.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/Cargo.lock +504 -337
- data/ext/deltalake/Cargo.toml +5 -4
- data/ext/deltalake/src/error.rs +62 -15
- data/ext/deltalake/src/features.rs +67 -0
- data/ext/deltalake/src/lib.rs +632 -61
- data/ext/deltalake/src/merge.rs +205 -0
- data/lib/deltalake/table.rb +59 -15
- data/lib/deltalake/table_alterer.rb +33 -0
- data/lib/deltalake/table_merger.rb +38 -0
- data/lib/deltalake/table_optimizer.rb +20 -4
- data/lib/deltalake/utils.rb +59 -0
- data/lib/deltalake/version.rb +1 -1
- data/lib/deltalake.rb +34 -59
- metadata +6 -2
data/ext/deltalake/src/merge.rs
ADDED
@@ -0,0 +1,205 @@
+use deltalake::arrow::array::RecordBatchReader;
+use deltalake::arrow::datatypes::Schema as ArrowSchema;
+use deltalake::arrow::ffi_stream::ArrowArrayStreamReader;
+use deltalake::datafusion::catalog::TableProvider;
+use deltalake::datafusion::datasource::MemTable;
+use deltalake::datafusion::prelude::SessionContext;
+use deltalake::logstore::LogStoreRef;
+use deltalake::operations::merge::MergeBuilder;
+use deltalake::table::state::DeltaTableState;
+use deltalake::{DeltaResult, DeltaTable};
+use std::cell::RefCell;
+use std::collections::HashMap;
+use std::future::IntoFuture;
+use std::sync::Arc;
+
+use crate::error::RubyError;
+use crate::utils::rt;
+use crate::RbResult;
+use crate::{
+    maybe_create_commit_properties, set_writer_properties, RbCommitProperties,
+    RbPostCommitHookProperties, RbWriterProperties,
+};
+
+#[magnus::wrap(class = "DeltaLake::RbMergeBuilder")]
+pub(crate) struct RbMergeBuilder {
+    _builder: RefCell<Option<MergeBuilder>>,
+    source_alias: Option<String>,
+    target_alias: Option<String>,
+    #[allow(dead_code)]
+    arrow_schema: Arc<ArrowSchema>,
+}
+
+// getters
+impl RbMergeBuilder {
+    pub fn source_alias(&self) -> Option<String> {
+        self.source_alias.clone()
+    }
+
+    pub fn target_alias(&self) -> Option<String> {
+        self.target_alias.clone()
+    }
+}
+
+impl RbMergeBuilder {
+    pub fn new(
+        log_store: LogStoreRef,
+        snapshot: DeltaTableState,
+        source: ArrowArrayStreamReader,
+        predicate: String,
+        source_alias: Option<String>,
+        target_alias: Option<String>,
+        safe_cast: bool,
+        writer_properties: Option<RbWriterProperties>,
+        post_commithook_properties: Option<RbPostCommitHookProperties>,
+        commit_properties: Option<RbCommitProperties>,
+    ) -> DeltaResult<Self> {
+        let ctx = SessionContext::new();
+        let schema = source.schema();
+        let batches = vec![source.map(|batch| batch.unwrap()).collect::<Vec<_>>()];
+        let table_provider: Arc<dyn TableProvider> =
+            Arc::new(MemTable::try_new(schema.clone(), batches).unwrap());
+        let source_df = ctx.read_table(table_provider).unwrap();
+
+        let mut cmd =
+            MergeBuilder::new(log_store, snapshot, predicate, source_df).with_safe_cast(safe_cast);
+
+        if let Some(src_alias) = &source_alias {
+            cmd = cmd.with_source_alias(src_alias);
+        }
+
+        if let Some(trgt_alias) = &target_alias {
+            cmd = cmd.with_target_alias(trgt_alias);
+        }
+
+        if let Some(writer_props) = writer_properties {
+            cmd = cmd.with_writer_properties(set_writer_properties(writer_props)?);
+        }
+
+        if let Some(commit_properties) =
+            maybe_create_commit_properties(commit_properties, post_commithook_properties)
+        {
+            cmd = cmd.with_commit_properties(commit_properties);
+        }
+
+        Ok(Self {
+            _builder: RefCell::new(Some(cmd)),
+            source_alias,
+            target_alias,
+            arrow_schema: schema,
+        })
+    }
+
+    pub fn execute(&self) -> DeltaResult<(DeltaTable, String)> {
+        let (table, metrics) = rt().block_on(self._builder.take().unwrap().into_future())?;
+        Ok((table, serde_json::to_string(&metrics).unwrap()))
+    }
+}
+
+impl RbMergeBuilder {
+    pub fn when_matched_update(
+        &self,
+        updates: HashMap<String, String>,
+        predicate: Option<String>,
+    ) -> RbResult<()> {
+        let mut binding = self._builder.borrow_mut();
+        *binding = match binding.take() {
+            Some(cmd) => Some(
+                cmd.when_matched_update(|mut update| {
+                    for (column, expression) in updates {
+                        update = update.update(column, expression)
+                    }
+                    if let Some(predicate) = predicate {
+                        update = update.predicate(predicate)
+                    };
+                    update
+                })
+                .map_err(RubyError::from)?,
+            ),
+            None => unreachable!(),
+        };
+        Ok(())
+    }
+
+    pub fn when_matched_delete(&self, predicate: Option<String>) -> RbResult<()> {
+        let mut binding = self._builder.borrow_mut();
+        *binding = match binding.take() {
+            Some(cmd) => Some(
+                cmd.when_matched_delete(|mut delete| {
+                    if let Some(predicate) = predicate {
+                        delete = delete.predicate(predicate)
+                    };
+                    delete
+                })
+                .map_err(RubyError::from)?,
+            ),
+            None => unreachable!(),
+        };
+        Ok(())
+    }
+
+    pub fn when_not_matched_insert(
+        &self,
+        updates: HashMap<String, String>,
+        predicate: Option<String>,
+    ) -> RbResult<()> {
+        let mut binding = self._builder.borrow_mut();
+        *binding = match binding.take() {
+            Some(cmd) => Some(
+                cmd.when_not_matched_insert(|mut insert| {
+                    for (column, expression) in updates {
+                        insert = insert.set(column, expression)
+                    }
+                    if let Some(predicate) = predicate {
+                        insert = insert.predicate(predicate)
+                    };
+                    insert
+                })
+                .map_err(RubyError::from)?,
+            ),
+            None => unreachable!(),
+        };
+        Ok(())
+    }
+
+    pub fn when_not_matched_by_source_update(
+        &self,
+        updates: HashMap<String, String>,
+        predicate: Option<String>,
+    ) -> RbResult<()> {
+        let mut binding = self._builder.borrow_mut();
+        *binding = match binding.take() {
+            Some(cmd) => Some(
+                cmd.when_not_matched_by_source_update(|mut update| {
+                    for (column, expression) in updates {
+                        update = update.update(column, expression)
+                    }
+                    if let Some(predicate) = predicate {
+                        update = update.predicate(predicate)
+                    };
+                    update
+                })
+                .map_err(RubyError::from)?,
+            ),
+            None => unreachable!(),
+        };
+        Ok(())
+    }
+
+    pub fn when_not_matched_by_source_delete(&self, predicate: Option<String>) -> RbResult<()> {
+        let mut binding = self._builder.borrow_mut();
+        *binding = match binding.take() {
+            Some(cmd) => Some(
+                cmd.when_not_matched_by_source_delete(|mut delete| {
+                    if let Some(predicate) = predicate {
+                        delete = delete.predicate(predicate)
+                    };
+                    delete
+                })
+                .map_err(RubyError::from)?,
+            ),
+            None => unreachable!(),
+        };
+        Ok(())
+    }
+}
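On the Ruby side, this builder backs the new DeltaLake::Table#merge API (see the table.rb and table_merger.rb changes below). A minimal upsert sketch, assuming the polars-df gem and a hypothetical table at ./data/events with id and value columns; the Rust builder is consumed by execute, so all clauses must be chained first:

    require "deltalake"
    require "polars-df"

    df = Polars::DataFrame.new({"id" => [1, 2], "value" => ["a", "b"]})
    dt = DeltaLake::Table.new("./data/events") # hypothetical path

    # update matching rows, insert the rest, then run the merge
    metrics =
      dt.merge(df, "target.id = source.id", source_alias: "source", target_alias: "target")
        .when_matched_update({"value" => "source.value"})
        .when_not_matched_insert({"id" => "source.id", "value" => "source.value"})
        .execute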
data/lib/deltalake/table.rb
CHANGED
@@ -47,9 +47,6 @@ module DeltaLake
       if version.is_a?(Integer)
         @table.load_version(version)
       elsif version.is_a?(Time)
-        # needed for iso8601
-        require "time"
-
         @table.load_with_datetime(version.utc.iso8601(9))
       elsif version.is_a?(String)
         @table.load_with_datetime(version)
@@ -112,7 +109,9 @@ module DeltaLake
     def vacuum(
       retention_hours: nil,
       dry_run: true,
-      enforce_retention_duration: true
+      enforce_retention_duration: true,
+      post_commithook_properties: nil,
+      commit_properties: nil
     )
       if retention_hours
         if retention_hours < 0
@@ -123,7 +122,9 @@ module DeltaLake
       @table.vacuum(
         dry_run,
         retention_hours,
-        enforce_retention_duration
+        enforce_retention_duration,
+        commit_properties,
+        post_commithook_properties
       )
     end
 
@@ -135,27 +136,53 @@ module DeltaLake
       TableAlterer.new(self)
     end
 
+    def merge(
+      source,
+      predicate,
+      source_alias: nil,
+      target_alias: nil,
+      error_on_type_mismatch: true,
+      writer_properties: nil,
+      post_commithook_properties: nil,
+      commit_properties: nil
+    )
+      source = Utils.convert_data(source)
+
+      rb_merge_builder =
+        @table.create_merge_builder(
+          source,
+          predicate,
+          source_alias,
+          target_alias,
+          !error_on_type_mismatch,
+          writer_properties,
+          post_commithook_properties,
+          commit_properties
+        )
+      TableMerger.new(rb_merge_builder, @table)
+    end
+
     def restore(
       target,
       ignore_missing_files: false,
-      protocol_downgrade_allowed: false
+      protocol_downgrade_allowed: false,
+      commit_properties: nil
     )
       if target.is_a?(Time)
-        # needed for iso8601
-        require "time"
-
         metrics =
           @table.restore(
             target.utc.iso8601(9),
             ignore_missing_files,
-            protocol_downgrade_allowed
+            protocol_downgrade_allowed,
+            commit_properties
           )
       else
         metrics =
           @table.restore(
             target,
             ignore_missing_files,
-            protocol_downgrade_allowed
+            protocol_downgrade_allowed,
+            commit_properties
          )
       end
       JSON.parse(metrics)
@@ -185,15 +212,32 @@ module DeltaLake
       @table.update_incremental
     end
 
-    def delete(predicate = nil)
-      metrics = @table.delete(predicate)
+    def delete(
+      predicate = nil,
+      writer_properties: nil,
+      post_commithook_properties: nil,
+      commit_properties: nil
+    )
+      metrics =
+        @table.delete(
+          predicate,
+          writer_properties,
+          post_commithook_properties,
+          commit_properties
+        )
       JSON.parse(metrics).transform_keys(&:to_sym)
     end
 
-    def repair(dry_run: false)
+    def repair(
+      dry_run: false,
+      post_commithook_properties: nil,
+      commit_properties: nil
+    )
       metrics =
         @table.repair(
-          dry_run
+          dry_run,
+          commit_properties,
+          post_commithook_properties
        )
       JSON.parse(metrics).transform_keys(&:to_sym)
     end
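The reworked delete and repair signatures accept the commit option structs added in deltalake.rb below; a short sketch, reusing the hypothetical table from the merge example above (the metrics key name follows delta-rs):

    props = DeltaLake::CommitProperties.new(custom_metadata: {"trigger" => "nightly-job"})

    metrics = dt.delete("id > 100", commit_properties: props)
    metrics[:num_deleted_rows] # keys arrive symbolized via transform_keys(&:to_sym)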
data/lib/deltalake/table_alterer.rb
CHANGED
@@ -4,6 +4,29 @@ module DeltaLake
       @table = table
     end
 
+    def add_feature(
+      feature,
+      allow_protocol_versions_increase: false
+    )
+      if !feature.is_a?(Array)
+        feature = [feature]
+      end
+      @table._table.add_feature(
+        feature,
+        allow_protocol_versions_increase
+      )
+    end
+
+    def add_columns(fields)
+      if fields.is_a?(DeltaLake::Field)
+        fields = [fields]
+      end
+
+      @table._table.add_columns(
+        fields
+      )
+    end
+
     def add_constraint(constraints)
       if constraints.length > 1
         raise ArgumentError,
@@ -21,5 +44,15 @@ module DeltaLake
         raise_if_not_exists
       )
     end
+
+    def set_table_properties(
+      properties,
+      raise_if_not_exists: true
+    )
+      @table._table.set_table_properties(
+        properties,
+        raise_if_not_exists
+      )
+    end
   end
 end
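A sketch of the new alterer methods; the feature name and property key below are illustrative examples, not drawn from this diff. Note that add_feature wraps a single value in an array before crossing into Rust, so a string or an array of strings both work:

    dt.alter.add_feature("appendOnly", allow_protocol_versions_increase: true) # feature name is an example
    dt.alter.set_table_properties({"delta.enableChangeDataFeed" => "true"})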
data/lib/deltalake/table_merger.rb
ADDED
@@ -0,0 +1,38 @@
+module DeltaLake
+  class TableMerger
+    def initialize(builder, table)
+      @builder = builder
+      @table = table
+    end
+
+    def when_matched_update(updates, predicate: nil)
+      @builder.when_matched_update(updates, predicate)
+      self
+    end
+
+    def when_not_matched_insert(updates, predicate: nil)
+      @builder.when_not_matched_insert(updates, predicate)
+      self
+    end
+
+    def when_matched_delete(predicate: nil)
+      @builder.when_matched_delete(predicate)
+      self
+    end
+
+    def when_not_matched_by_source_update(updates, predicate: nil)
+      @builder.when_not_matched_by_source_update(updates, predicate)
+      self
+    end
+
+    def when_not_matched_by_source_delete(predicate: nil)
+      @builder.when_not_matched_by_source_delete(predicate)
+      self
+    end
+
+    def execute
+      metrics = @table.merge_execute(@builder)
+      JSON.parse(metrics).transform_keys(&:to_sym)
+    end
+  end
+end
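Because every when_* method returns self, clauses compose fluently; a sketch of a source-mirroring merge that also deletes target rows absent from the source (same hypothetical table and data frame as the earlier example):

    dt.merge(df, "target.id = source.id", source_alias: "source", target_alias: "target")
      .when_matched_update({"value" => "source.value"})
      .when_not_matched_insert({"id" => "source.id", "value" => "source.value"})
      .when_not_matched_by_source_delete
      .execute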
data/lib/deltalake/table_optimizer.rb
CHANGED
@@ -5,15 +5,23 @@ module DeltaLake
     end
 
     def compact(
+      partition_filters: nil,
       target_size: nil,
       max_concurrent_tasks: nil,
-      min_commit_interval: nil
+      min_commit_interval: nil,
+      writer_properties: nil,
+      post_commithook_properties: nil,
+      commit_properties: nil
     )
       metrics =
         @table._table.compact_optimize(
+          @table._stringify_partition_values(partition_filters),
          target_size,
           max_concurrent_tasks,
-          min_commit_interval
+          min_commit_interval,
+          writer_properties,
+          post_commithook_properties,
+          commit_properties
        )
       @table.update_incremental
       result = JSON.parse(metrics)
@@ -26,18 +34,26 @@ module DeltaLake
 
     def z_order(
       columns,
+      partition_filters: nil,
       target_size: nil,
       max_concurrent_tasks: nil,
       max_spill_size: 20 * 1024 * 1024 * 1024,
-      min_commit_interval: nil
+      min_commit_interval: nil,
+      writer_properties: nil,
+      post_commithook_properties: nil,
+      commit_properties: nil
     )
       metrics =
         @table._table.z_order_optimize(
           Array(columns),
+          @table._stringify_partition_values(partition_filters),
           target_size,
           max_concurrent_tasks,
           max_spill_size,
-          min_commit_interval
+          min_commit_interval,
+          writer_properties,
+          post_commithook_properties,
+          commit_properties
         )
       @table.update_incremental
       result = JSON.parse(metrics)
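Both optimize entry points now take partition filters, stringified on the table before being handed to Rust; a sketch assuming a table partitioned by a date column and the [column, op, value] filter form used by delta-rs:

    dt.optimize.compact(partition_filters: [["date", "=", "2024-01-01"]])
    dt.optimize.z_order(["id"], partition_filters: [["date", "=", "2024-01-01"]])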
data/lib/deltalake/utils.rb
ADDED
@@ -0,0 +1,59 @@
+module DeltaLake
+  module Utils
+    def self.convert_data(data)
+      if data.respond_to?(:arrow_c_stream)
+        # TODO convert other object types
+        # should probably move logic to Rust
+        if defined?(Polars::DataFrame) && data.is_a?(Polars::DataFrame)
+          data = convert_polars_data(data)
+        end
+
+        data.arrow_c_stream
+      else
+        raise TypeError, "Only objects implementing the Arrow C stream interface are valid inputs for source."
+      end
+    end
+
+    # unsigned integers are not part of the protocol
+    # https://github.com/delta-io/delta/blob/master/PROTOCOL.md#primitive-types
+    def self.convert_polars_data(data)
+      new_schema = {}
+      data.schema.each do |k, v|
+        new_type = convert_polars_type(v)
+        new_schema[k] = new_type if new_type
+      end
+
+      if new_schema.any?
+        data.cast(new_schema)
+      else
+        data
+      end
+    end
+
+    def self.convert_polars_type(t)
+      case t
+      when Polars::UInt8
+        Polars::Int8
+      when Polars::UInt16
+        Polars::Int16
+      when Polars::UInt32
+        Polars::Int32
+      when Polars::UInt64
+        Polars::Int64
+      when Polars::Datetime
+        Polars::Datetime.new("us", t.time_zone) if t.time_unit != "us"
+      when Polars::List
+        inner = convert_polars_type(t.inner)
+        Polars::List.new(inner) if inner
+      when Polars::Array
+        inner = convert_polars_type(t.inner)
+        Polars::Array.new(t.inner, t.width) if inner
+      when Polars::Struct
+        if t.fields.any? { |f| convert_polars_type(f.dtype) }
+          fields = t.fields.map { |f| Polars::Field.new(f.name, convert_polars_type(f.dtype) || f.dtype) }
+          Polars::Struct.new(fields)
+        end
+      end
+    end
+  end
+end
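These casts exist because the Delta protocol defines no unsigned integer types, so unsigned Polars columns are narrowed to the signed type of the same width before writing; a sketch, assuming the polars-df gem:

    df = Polars::DataFrame.new({"id" => [1, 2, 3]}, schema: {"id" => Polars::UInt32})
    DeltaLake.write("./data/ids", df) # hypothetical path; id lands as a signed 32-bit column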
data/lib/deltalake/version.rb
CHANGED
data/lib/deltalake.rb
CHANGED
@@ -7,6 +7,7 @@ end
 
 # stdlib
 require "json"
+require "time"
 
 # modules
 require_relative "deltalake/field"
@@ -14,7 +15,9 @@ require_relative "deltalake/metadata"
 require_relative "deltalake/schema"
 require_relative "deltalake/table"
 require_relative "deltalake/table_alterer"
+require_relative "deltalake/table_merger"
 require_relative "deltalake/table_optimizer"
+require_relative "deltalake/utils"
 require_relative "deltalake/version"
 
 module DeltaLake
@@ -38,6 +41,28 @@ module DeltaLake
     :reader_features
   )
 
+  CommitProperties =
+    Struct.new(
+      :custom_metadata,
+      :max_commit_retries,
+      # TODO
+      # :app_transactions,
+      keyword_init: true
+    )
+
+  PostCommitHookProperties =
+    Struct.new(
+      :create_checkpoint,
+      :cleanup_expired_logs,
+      keyword_init: true
+    )
+
+  class ArrowArrayStream
+    def arrow_c_stream
+      self
+    end
+  end
+
   class << self
     def write(
       table_or_uri,
@@ -50,7 +75,10 @@ module DeltaLake
       schema_mode: nil,
       storage_options: nil,
       predicate: nil,
-      target_file_size: nil
+      target_file_size: nil,
+      writer_properties: nil,
+      commit_properties: nil,
+      post_commithook_properties: nil
     )
       table, table_uri = try_get_table_and_table_uri(table_or_uri, storage_options)
 
@@ -62,7 +90,7 @@ module DeltaLake
         return
       end
 
-      data = convert_data(data)
+      data = Utils.convert_data(data)
 
       write_deltalake_rust(
         table_uri,
@@ -76,7 +104,10 @@ module DeltaLake
         name,
         description,
         configuration,
-        storage_options
+        storage_options,
+        writer_properties,
+        commit_properties,
+        post_commithook_properties
       )
 
       if table
@@ -107,61 +138,5 @@ module DeltaLake
     rescue TableNotFoundError
       nil
     end
-
-    def convert_data(data)
-      if data.respond_to?(:arrow_c_stream)
-        # TODO convert other object types
-        # should probably move logic to Rust
-        if defined?(Polars::DataFrame) && data.is_a?(Polars::DataFrame)
-          data = convert_polars_data(data)
-        end
-
-        data.arrow_c_stream
-      else
-        raise TypeError, "Only objects implementing the Arrow C stream interface are valid inputs for source."
-      end
-    end
-
-    # unsigned integers are not part of the protocol
-    # https://github.com/delta-io/delta/blob/master/PROTOCOL.md#primitive-types
-    def convert_polars_data(data)
-      new_schema = {}
-      data.schema.each do |k, v|
-        new_type = convert_polars_type(v)
-        new_schema[k] = new_type if new_type
-      end
-
-      if new_schema.any?
-        data.cast(new_schema)
-      else
-        data
-      end
-    end
-
-    def convert_polars_type(t)
-      case t
-      when Polars::UInt8
-        Polars::Int8
-      when Polars::UInt16
-        Polars::Int16
-      when Polars::UInt32
-        Polars::Int32
-      when Polars::UInt64
-        Polars::Int64
-      when Polars::Datetime
-        Polars::Datetime.new("us", t.time_zone) if t.time_unit != "us"
-      when Polars::List
-        inner = convert_polars_type(t.inner)
-        Polars::List.new(inner) if inner
-      when Polars::Array
-        inner = convert_polars_type(t.inner)
-        Polars::Array.new(t.inner, t.width) if inner
-      when Polars::Struct
-        if t.fields.any? { |f| convert_polars_type(f.dtype) }
-          fields = t.fields.map { |f| Polars::Field.new(f.name, convert_polars_type(f.dtype) || f.dtype) }
-          Polars::Struct.new(fields)
-        end
-      end
-    end
   end
 end
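A write-side sketch of the new commit options, using the structs this release introduces (paths and metadata values are illustrative):

    commit_props = DeltaLake::CommitProperties.new(
      custom_metadata: {"author" => "etl-job"},
      max_commit_retries: 5
    )
    hook_props = DeltaLake::PostCommitHookProperties.new(
      create_checkpoint: true,
      cleanup_expired_logs: false
    )

    DeltaLake.write(
      "./data/events", # hypothetical path
      df,
      mode: "append",
      commit_properties: commit_props,
      post_commithook_properties: hook_props
    )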