deltalake-rb 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/Cargo.lock +506 -337
- data/README.md +33 -3
- data/ext/deltalake/Cargo.toml +7 -4
- data/ext/deltalake/src/error.rs +62 -15
- data/ext/deltalake/src/features.rs +67 -0
- data/ext/deltalake/src/lib.rs +1114 -48
- data/ext/deltalake/src/merge.rs +205 -0
- data/lib/deltalake/table.rb +170 -10
- data/lib/deltalake/table_alterer.rb +58 -0
- data/lib/deltalake/table_merger.rb +38 -0
- data/lib/deltalake/table_optimizer.rb +67 -0
- data/lib/deltalake/utils.rb +59 -0
- data/lib/deltalake/version.rb +1 -1
- data/lib/deltalake.rb +50 -12
- metadata +8 -2
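
Most of the growth is in the Rust extension (`lib.rs`, plus the new `merge.rs` and `features.rs`), which registers a much larger set of table operations — optimize, merge, restore, constraints, table features, and change-data-feed reads — backed by the new Ruby wrappers (`table_merger.rb`, `table_optimizer.rb`, `table_alterer.rb`). For orientation before the diff, here is a hedged sketch of the Ruby-level calls these bindings appear to back; the binding names mirror what `init` registers below, but the exact Ruby wrapper signatures are assumptions, not something shown in this diff:

```ruby
# Hedged sketch only: binding names come from init() in the diff below;
# the Ruby wrapper signatures are assumed, not part of this diff.
dt = DeltaLake::Table.new("./events")

dt.optimize.compact                                   # RawDeltaTable#compact_optimize
dt.optimize.z_order(["account_id"])                   # RawDeltaTable#z_order_optimize
dt.alter.add_constraint({"id_positive" => "id > 0"})  # RawDeltaTable#add_constraints
dt.restore(1)                                         # RawDeltaTable#restore

# RawDeltaTable#create_merge_builder + #merge_execute via DeltaLake::TableMerger
dt.merge(source, "t.id = s.id", source_alias: "s", target_alias: "t")
  .when_matched_update({"value" => "s.value"})
  .when_not_matched_insert({"id" => "s.id", "value" => "s.value"})
  .execute
```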
data/ext/deltalake/src/lib.rs
CHANGED
```diff
@@ -1,28 +1,77 @@
 mod error;
+mod features;
+mod merge;
 mod schema;
 mod utils;
 
 use std::cell::RefCell;
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::future::IntoFuture;
+use std::str::FromStr;
+use std::time;
 
-use chrono::Duration;
+use chrono::{DateTime, Duration, FixedOffset, Utc};
+use delta_kernel::schema::StructField;
 use deltalake::arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream};
-use deltalake::
+use deltalake::arrow::record_batch::RecordBatchIterator;
+use deltalake::checkpoints::{cleanup_metadata, create_checkpoint};
+use deltalake::datafusion::physical_plan::ExecutionPlan;
+use deltalake::datafusion::prelude::SessionContext;
+use deltalake::errors::DeltaTableError;
+use deltalake::kernel::{scalars::ScalarExt, StructType, Transaction};
+use deltalake::operations::add_column::AddColumnBuilder;
+use deltalake::operations::add_feature::AddTableFeatureBuilder;
+use deltalake::operations::collect_sendable_stream;
+use deltalake::operations::constraints::ConstraintBuilder;
 use deltalake::operations::delete::DeleteBuilder;
+use deltalake::operations::drop_constraints::DropConstraintBuilder;
+use deltalake::operations::filesystem_check::FileSystemCheckBuilder;
+use deltalake::operations::load_cdf::CdfLoadBuilder;
+use deltalake::operations::optimize::{OptimizeBuilder, OptimizeType};
+use deltalake::operations::restore::RestoreBuilder;
+use deltalake::operations::set_tbl_properties::SetTablePropertiesBuilder;
+use deltalake::operations::transaction::{CommitProperties, TableReference};
 use deltalake::operations::vacuum::VacuumBuilder;
+use deltalake::parquet::basic::Compression;
+use deltalake::parquet::errors::ParquetError;
+use deltalake::parquet::file::properties::WriterProperties;
+use deltalake::partitions::PartitionFilter;
 use deltalake::storage::IORuntime;
-use deltalake::DeltaOps;
+use deltalake::{DeltaOps, DeltaResult};
 use error::DeltaError;
+use futures::future::join_all;
 
-use magnus::{
+use magnus::{
+    function, method, prelude::*, typed_data::Obj, Error, Integer, Module, RArray, RHash, Ruby,
+    TryConvert, Value,
+};
+use serde_json::Map;
 
+use crate::error::DeltaProtocolError;
+use crate::error::RbValueError;
 use crate::error::RubyError;
+use crate::features::TableFeatures;
+use crate::merge::RbMergeBuilder;
 use crate::schema::{schema_to_rbobject, Field};
 use crate::utils::rt;
 
 type RbResult<T> = Result<T, Error>;
 
+enum PartitionFilterValue {
+    Single(String),
+    Multiple(Vec<String>),
+}
+
+impl TryConvert for PartitionFilterValue {
+    fn try_convert(val: Value) -> RbResult<Self> {
+        if let Ok(v) = Vec::<String>::try_convert(val) {
+            Ok(PartitionFilterValue::Multiple(v))
+        } else {
+            Ok(PartitionFilterValue::Single(String::try_convert(val)?))
+        }
+    }
+}
+
 #[magnus::wrap(class = "DeltaLake::RawDeltaTable")]
 struct RawDeltaTable {
     _table: RefCell<deltalake::DeltaTable>,
@@ -38,6 +87,34 @@ struct RawDeltaTableMetaData {
     configuration: HashMap<String, Option<String>>,
 }
 
+impl RawDeltaTableMetaData {
+    fn id(&self) -> String {
+        self.id.clone()
+    }
+
+    fn name(&self) -> Option<String> {
+        self.name.clone()
+    }
+
+    fn description(&self) -> Option<String> {
+        self.description.clone()
+    }
+
+    fn partition_columns(&self) -> Vec<String> {
+        self.partition_columns.clone()
+    }
+
+    fn created_time(&self) -> Option<i64> {
+        self.created_time
+    }
+
+    fn configuration(&self) -> HashMap<String, Option<String>> {
+        self.configuration.clone()
+    }
+}
+
+type StringVec = Vec<String>;
+
 impl RawDeltaTable {
     pub fn new(
         table_uri: String,
@@ -113,37 +190,140 @@ impl RawDeltaTable {
         })
     }
 
+    pub fn protocol_versions(&self) -> RbResult<(i32, i32, Option<StringVec>, Option<StringVec>)> {
+        let binding = self._table.borrow();
+        let table_protocol = binding.protocol().map_err(RubyError::from)?;
+        Ok((
+            table_protocol.min_reader_version,
+            table_protocol.min_writer_version,
+            table_protocol
+                .writer_features
+                .as_ref()
+                .and_then(|features| {
+                    let empty_set = !features.is_empty();
+                    empty_set.then(|| {
+                        features
+                            .iter()
+                            .map(|v| v.to_string())
+                            .collect::<Vec<String>>()
+                    })
+                }),
+            table_protocol
+                .reader_features
+                .as_ref()
+                .and_then(|features| {
+                    let empty_set = !features.is_empty();
+                    empty_set.then(|| {
+                        features
+                            .iter()
+                            .map(|v| v.to_string())
+                            .collect::<Vec<String>>()
+                    })
+                }),
+        ))
+    }
+
     pub fn load_version(&self, version: i64) -> RbResult<()> {
         Ok(rt()
             .block_on(self._table.borrow_mut().load_version(version))
             .map_err(RubyError::from)?)
     }
 
-    pub fn
-
-
-
+    pub fn get_latest_version(&self) -> RbResult<i64> {
+        Ok(rt()
+            .block_on(self._table.borrow().get_latest_version())
+            .map_err(RubyError::from)?)
+    }
 
+    pub fn get_earliest_version(&self) -> RbResult<i64> {
+        Ok(rt()
+            .block_on(self._table.borrow().get_earliest_version())
+            .map_err(RubyError::from)?)
+    }
+
+    pub fn get_num_index_cols(&self) -> RbResult<i32> {
         Ok(self
             ._table
             .borrow()
-            .
+            .snapshot()
             .map_err(RubyError::from)?
-            .
-            .
+            .config()
+            .num_indexed_cols())
     }
 
-    pub fn
-        if !self._table.borrow().config.require_files {
-            return Err(DeltaError::new_err("Table is initiated without files."));
-        }
-
+    pub fn get_stats_columns(&self) -> RbResult<Option<Vec<String>>> {
         Ok(self
             ._table
             .borrow()
-            .
+            .snapshot()
             .map_err(RubyError::from)?
-            .
+            .config()
+            .stats_columns()
+            .map(|v| v.iter().map(|v| v.to_string()).collect::<Vec<String>>()))
+    }
+
+    pub fn load_with_datetime(&self, ds: String) -> RbResult<()> {
+        let datetime =
+            DateTime::<Utc>::from(DateTime::<FixedOffset>::parse_from_rfc3339(&ds).map_err(
+                |err| RbValueError::new_err(format!("Failed to parse datetime string: {err}")),
+            )?);
+        Ok(rt()
+            .block_on(self._table.borrow_mut().load_with_datetime(datetime))
+            .map_err(RubyError::from)?)
+    }
+
+    pub fn files(
+        &self,
+        partition_filters: Option<Vec<(String, String, PartitionFilterValue)>>,
+    ) -> RbResult<Vec<String>> {
+        if !self.has_files()? {
+            return Err(DeltaError::new_err("Table is instantiated without files."));
+        }
+
+        if let Some(filters) = partition_filters {
+            let filters = convert_partition_filters(filters).map_err(RubyError::from)?;
+            Ok(self
+                ._table
+                .borrow()
+                .get_files_by_partitions(&filters)
+                .map_err(RubyError::from)?
+                .into_iter()
+                .map(|p| p.to_string())
+                .collect())
+        } else {
+            Ok(self
+                ._table
+                .borrow()
+                .get_files_iter()
+                .map_err(RubyError::from)?
+                .map(|f| f.to_string())
+                .collect())
+        }
+    }
+
+    pub fn file_uris(
+        &self,
+        partition_filters: Option<Vec<(String, String, PartitionFilterValue)>>,
+    ) -> RbResult<Vec<String>> {
+        if !self._table.borrow().config.require_files {
+            return Err(DeltaError::new_err("Table is initiated without files."));
+        }
+
+        if let Some(filters) = partition_filters {
+            let filters = convert_partition_filters(filters).map_err(RubyError::from)?;
+            Ok(self
+                ._table
+                .borrow()
+                .get_file_uris_by_partitions(&filters)
+                .map_err(RubyError::from)?)
+        } else {
+            Ok(self
+                ._table
+                .borrow()
+                .get_file_uris()
+                .map_err(RubyError::from)?
+                .collect())
+        }
     }
 
     pub fn schema(&self) -> RbResult<Value> {
@@ -157,6 +337,8 @@ impl RawDeltaTable {
         dry_run: bool,
         retention_hours: Option<u64>,
         enforce_retention_duration: bool,
+        commit_properties: Option<RbCommitProperties>,
+        post_commithook_properties: Option<RbPostCommitHookProperties>,
     ) -> RbResult<Vec<String>> {
         let mut cmd = VacuumBuilder::new(
             self._table.borrow().log_store(),
@@ -172,11 +354,350 @@ impl RawDeltaTable {
             cmd = cmd.with_retention_period(Duration::hours(retention_period as i64));
         }
 
+        if let Some(commit_properties) =
+            maybe_create_commit_properties(commit_properties, post_commithook_properties)
+        {
+            cmd = cmd.with_commit_properties(commit_properties);
+        }
         let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
         self._table.borrow_mut().state = table.state;
         Ok(metrics.files_deleted)
     }
 
+    pub fn compact_optimize(
+        &self,
+        partition_filters: Option<Vec<(String, String, PartitionFilterValue)>>,
+        target_size: Option<i64>,
+        max_concurrent_tasks: Option<usize>,
+        min_commit_interval: Option<u64>,
+        writer_properties: Option<RbWriterProperties>,
+        commit_properties: Option<RbCommitProperties>,
+        post_commithook_properties: Option<RbPostCommitHookProperties>,
+    ) -> RbResult<String> {
+        let mut cmd = OptimizeBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_max_concurrent_tasks(max_concurrent_tasks.unwrap_or_else(num_cpus::get));
+        if let Some(size) = target_size {
+            cmd = cmd.with_target_size(size);
+        }
+        if let Some(commit_interval) = min_commit_interval {
+            cmd = cmd.with_min_commit_interval(time::Duration::from_secs(commit_interval));
+        }
+
+        if let Some(writer_props) = writer_properties {
+            cmd = cmd.with_writer_properties(
+                set_writer_properties(writer_props).map_err(RubyError::from)?,
+            );
+        }
+
+        if let Some(commit_properties) =
+            maybe_create_commit_properties(commit_properties, post_commithook_properties)
+        {
+            cmd = cmd.with_commit_properties(commit_properties);
+        }
+
+        let converted_filters = convert_partition_filters(partition_filters.unwrap_or_default())
+            .map_err(RubyError::from)?;
+        cmd = cmd.with_filters(&converted_filters);
+
+        let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(serde_json::to_string(&metrics).unwrap())
+    }
+
+    pub fn z_order_optimize(
+        &self,
+        z_order_columns: Vec<String>,
+        partition_filters: Option<Vec<(String, String, PartitionFilterValue)>>,
+        target_size: Option<i64>,
+        max_concurrent_tasks: Option<usize>,
+        max_spill_size: usize,
+        min_commit_interval: Option<u64>,
+        writer_properties: Option<RbWriterProperties>,
+        commit_properties: Option<RbCommitProperties>,
+        post_commithook_properties: Option<RbPostCommitHookProperties>,
+    ) -> RbResult<String> {
+        let mut cmd = OptimizeBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_max_concurrent_tasks(max_concurrent_tasks.unwrap_or_else(num_cpus::get))
+        .with_max_spill_size(max_spill_size)
+        .with_type(OptimizeType::ZOrder(z_order_columns));
+        if let Some(size) = target_size {
+            cmd = cmd.with_target_size(size);
+        }
+        if let Some(commit_interval) = min_commit_interval {
+            cmd = cmd.with_min_commit_interval(time::Duration::from_secs(commit_interval));
+        }
+
+        if let Some(writer_props) = writer_properties {
+            cmd = cmd.with_writer_properties(
+                set_writer_properties(writer_props).map_err(RubyError::from)?,
+            );
+        }
+
+        if let Some(commit_properties) =
+            maybe_create_commit_properties(commit_properties, post_commithook_properties)
+        {
+            cmd = cmd.with_commit_properties(commit_properties);
+        }
+
+        let converted_filters = convert_partition_filters(partition_filters.unwrap_or_default())
+            .map_err(RubyError::from)?;
+        cmd = cmd.with_filters(&converted_filters);
+
+        let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(serde_json::to_string(&metrics).unwrap())
+    }
+
+    pub fn add_columns(&self, fields: RArray) -> RbResult<()> {
+        let fields = fields.typecheck::<Obj<Field>>()?;
+        let mut cmd = AddColumnBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        );
+
+        let new_fields = fields
+            .iter()
+            .map(|v| v.inner.clone())
+            .collect::<Vec<StructField>>();
+
+        cmd = cmd.with_fields(new_fields);
+
+        let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(())
+    }
+
+    pub fn add_feature(
+        &self,
+        feature: RArray,
+        allow_protocol_versions_increase: bool,
+    ) -> RbResult<()> {
+        let feature = feature
+            .into_iter()
+            .map(|v| TableFeatures::try_convert(v))
+            .collect::<RbResult<Vec<_>>>()?;
+        let cmd = AddTableFeatureBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_features(feature)
+        .with_allow_protocol_versions_increase(allow_protocol_versions_increase);
+
+        let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(())
+    }
+
+    pub fn add_constraints(&self, constraints: HashMap<String, String>) -> RbResult<()> {
+        let mut cmd = ConstraintBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        );
+
+        for (col_name, expression) in constraints {
+            cmd = cmd.with_constraint(col_name.clone(), expression.clone());
+        }
+
+        let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(())
+    }
+
+    pub fn drop_constraints(&self, name: String, raise_if_not_exists: bool) -> RbResult<()> {
+        let cmd = DropConstraintBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_constraint(name)
+        .with_raise_if_not_exists(raise_if_not_exists);
+
+        let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(())
+    }
+
+    pub fn load_cdf(
+        &self,
+        starting_version: i64,
+        ending_version: Option<i64>,
+        starting_timestamp: Option<String>,
+        ending_timestamp: Option<String>,
+        columns: Option<Vec<String>>,
+    ) -> RbResult<ArrowArrayStream> {
+        let ctx = SessionContext::new();
+        let mut cdf_read = CdfLoadBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_starting_version(starting_version);
+
+        if let Some(ev) = ending_version {
+            cdf_read = cdf_read.with_ending_version(ev);
+        }
+        if let Some(st) = starting_timestamp {
+            let starting_ts: DateTime<Utc> = DateTime::<Utc>::from_str(&st)
+                .map_err(|pe| RbValueError::new_err(pe.to_string()))?
+                .to_utc();
+            cdf_read = cdf_read.with_starting_timestamp(starting_ts);
+        }
+        if let Some(et) = ending_timestamp {
+            let ending_ts = DateTime::<Utc>::from_str(&et)
+                .map_err(|pe| RbValueError::new_err(pe.to_string()))?
+                .to_utc();
+            cdf_read = cdf_read.with_starting_timestamp(ending_ts);
+        }
+
+        if let Some(columns) = columns {
+            cdf_read = cdf_read.with_columns(columns);
+        }
+
+        cdf_read = cdf_read.with_session_ctx(ctx.clone());
+
+        let plan = rt().block_on(cdf_read.build()).map_err(RubyError::from)?;
+
+        let mut tasks = vec![];
+        for p in 0..plan.properties().output_partitioning().partition_count() {
+            let inner_plan = plan.clone();
+            let partition_batch = inner_plan.execute(p, ctx.task_ctx()).unwrap();
+            let handle = rt().spawn(collect_sendable_stream(partition_batch));
+            tasks.push(handle);
+        }
+
+        // This is unfortunate.
+        let batches = rt()
+            .block_on(join_all(tasks))
+            .into_iter()
+            .flatten()
+            .collect::<Result<Vec<Vec<_>>, _>>()
+            .unwrap()
+            .into_iter()
+            .flatten()
+            .map(Ok);
+        let batch_iter = RecordBatchIterator::new(batches, plan.schema());
+        let ffi_stream = FFI_ArrowArrayStream::new(Box::new(batch_iter));
+        Ok(ArrowArrayStream { stream: ffi_stream })
+    }
+
+    pub fn create_merge_builder(
+        &self,
+        source: RbArrowType<ArrowArrayStreamReader>,
+        predicate: String,
+        source_alias: Option<String>,
+        target_alias: Option<String>,
+        safe_cast: bool,
+        writer_properties: Option<RbWriterProperties>,
+        post_commithook_properties: Option<RbPostCommitHookProperties>,
+        commit_properties: Option<RbCommitProperties>,
+    ) -> RbResult<RbMergeBuilder> {
+        Ok(RbMergeBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+            source.0,
+            predicate,
+            source_alias,
+            target_alias,
+            safe_cast,
+            writer_properties,
+            post_commithook_properties,
+            commit_properties,
+        )
+        .map_err(RubyError::from)?)
+    }
+
+    pub fn merge_execute(&self, merge_builder: &RbMergeBuilder) -> RbResult<String> {
+        let (table, metrics) = merge_builder.execute().map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(metrics)
+    }
+
+    pub fn restore(
+        &self,
+        target: Option<Value>,
+        ignore_missing_files: bool,
+        protocol_downgrade_allowed: bool,
+        commit_properties: Option<RbCommitProperties>,
+    ) -> RbResult<String> {
+        let mut cmd = RestoreBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        );
+        if let Some(val) = target {
+            if let Some(version) = Integer::from_value(val) {
+                cmd = cmd.with_version_to_restore(version.to_i64()?)
+            }
+            if let Ok(ds) = String::try_convert(val) {
+                let datetime = DateTime::<Utc>::from(
+                    DateTime::<FixedOffset>::parse_from_rfc3339(ds.as_ref()).map_err(|err| {
+                        RbValueError::new_err(format!("Failed to parse datetime string: {err}"))
+                    })?,
+                );
+                cmd = cmd.with_datetime_to_restore(datetime)
+            }
+        }
+        cmd = cmd.with_ignore_missing_files(ignore_missing_files);
+        cmd = cmd.with_protocol_downgrade_allowed(protocol_downgrade_allowed);
+
+        if let Some(commit_properties) = maybe_create_commit_properties(commit_properties, None) {
+            cmd = cmd.with_commit_properties(commit_properties);
+        }
+
+        let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(serde_json::to_string(&metrics).unwrap())
+    }
+
+    pub fn history(&self, limit: Option<usize>) -> RbResult<Vec<String>> {
+        let history = rt()
+            .block_on(self._table.borrow().history(limit))
+            .map_err(RubyError::from)?;
+        Ok(history
+            .iter()
+            .map(|c| serde_json::to_string(c).unwrap())
+            .collect())
+    }
+
     pub fn update_incremental(&self) -> RbResult<()> {
         #[allow(deprecated)]
         Ok(rt()
@@ -184,7 +705,89 @@ impl RawDeltaTable {
             .map_err(RubyError::from)?)
     }
 
-
+    fn get_active_partitions(&self) -> RbResult<RArray> {
+        let binding = self._table.borrow();
+        let _column_names: HashSet<&str> = binding
+            .get_schema()
+            .map_err(|_| DeltaProtocolError::new_err("table does not yet have a schema"))?
+            .fields()
+            .map(|field| field.name().as_str())
+            .collect();
+        let partition_columns: HashSet<&str> = binding
+            .metadata()
+            .map_err(RubyError::from)?
+            .partition_columns
+            .iter()
+            .map(|col| col.as_str())
+            .collect();
+
+        let converted_filters = Vec::new();
+
+        let partition_columns: Vec<&str> = partition_columns.into_iter().collect();
+
+        let adds = binding
+            .snapshot()
+            .map_err(RubyError::from)?
+            .get_active_add_actions_by_partitions(&converted_filters)
+            .map_err(RubyError::from)?
+            .collect::<Result<Vec<_>, _>>()
+            .map_err(RubyError::from)?;
+        let active_partitions: HashSet<Vec<(&str, Option<String>)>> = adds
+            .iter()
+            .flat_map(|add| {
+                Ok::<_, RubyError>(
+                    partition_columns
+                        .iter()
+                        .flat_map(|col| {
+                            Ok::<_, RubyError>((
+                                *col,
+                                add.partition_values()
+                                    .map_err(RubyError::from)?
+                                    .get(*col)
+                                    .map(|v| v.serialize()),
+                            ))
+                        })
+                        .collect(),
+                )
+            })
+            .collect();
+
+        Ok(RArray::from_iter(active_partitions))
+    }
+
+    pub fn create_checkpoint(&self) -> RbResult<()> {
+        rt().block_on(create_checkpoint(&self._table.borrow()))
+            .map_err(RubyError::from)?;
+
+        Ok(())
+    }
+
+    pub fn cleanup_metadata(&self) -> RbResult<()> {
+        rt().block_on(cleanup_metadata(&self._table.borrow()))
+            .map_err(RubyError::from)?;
+
+        Ok(())
+    }
+
+    pub fn get_add_file_sizes(&self) -> RbResult<HashMap<String, i64>> {
+        Ok(self
+            ._table
+            .borrow()
+            .snapshot()
+            .map_err(RubyError::from)?
+            .eager_snapshot()
+            .files()
+            .map(|f| (f.path().to_string(), f.size()))
+            .collect::<HashMap<String, i64>>())
+    }
+
+    pub fn delete(
+        &self,
+        predicate: Option<String>,
+        writer_properties: Option<RbWriterProperties>,
+        commit_properties: Option<RbCommitProperties>,
+        post_commithook_properties: Option<RbPostCommitHookProperties>,
+    ) -> RbResult<String> {
         let mut cmd = DeleteBuilder::new(
             self._table.borrow().log_store(),
             self._table
@@ -196,43 +799,366 @@ impl RawDeltaTable {
         if let Some(predicate) = predicate {
             cmd = cmd.with_predicate(predicate);
         }
+        if let Some(writer_props) = writer_properties {
+            cmd = cmd.with_writer_properties(
+                set_writer_properties(writer_props).map_err(RubyError::from)?,
+            );
+        }
+        if let Some(commit_properties) =
+            maybe_create_commit_properties(commit_properties, post_commithook_properties)
+        {
+            cmd = cmd.with_commit_properties(commit_properties);
+        }
+
+        let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(serde_json::to_string(&metrics).unwrap())
+    }
+
+    pub fn set_table_properties(
+        &self,
+        properties: HashMap<String, String>,
+        raise_if_not_exists: bool,
+    ) -> RbResult<()> {
+        let cmd = SetTablePropertiesBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_properties(properties)
+        .with_raise_if_not_exists(raise_if_not_exists);
+
+        let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(())
+    }
+
+    pub fn repair(
+        &self,
+        dry_run: bool,
+        commit_properties: Option<RbCommitProperties>,
+        post_commithook_properties: Option<RbPostCommitHookProperties>,
+    ) -> RbResult<String> {
+        let mut cmd = FileSystemCheckBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_dry_run(dry_run);
+
+        if let Some(commit_properties) =
+            maybe_create_commit_properties(commit_properties, post_commithook_properties)
+        {
+            cmd = cmd.with_commit_properties(commit_properties);
+        }
 
         let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
         self._table.borrow_mut().state = table.state;
         Ok(serde_json::to_string(&metrics).unwrap())
     }
+
+    pub fn transaction_versions(&self) -> RHash {
+        RHash::from_iter(
+            self._table
+                .borrow()
+                .get_app_transaction_version()
+                .into_iter()
+                .map(|(app_id, transaction)| (app_id, RbTransaction::from(transaction))),
+        )
+    }
 }
 
-
-
-
+fn set_post_commithook_properties(
+    mut commit_properties: CommitProperties,
+    post_commithook_properties: RbPostCommitHookProperties,
+) -> CommitProperties {
+    commit_properties =
+        commit_properties.with_create_checkpoint(post_commithook_properties.create_checkpoint);
+    commit_properties = commit_properties
+        .with_cleanup_expired_logs(post_commithook_properties.cleanup_expired_logs);
+    commit_properties
+}
+
+fn set_writer_properties(writer_properties: RbWriterProperties) -> DeltaResult<WriterProperties> {
+    let mut properties = WriterProperties::builder();
+    let data_page_size_limit = writer_properties.data_page_size_limit;
+    let dictionary_page_size_limit = writer_properties.dictionary_page_size_limit;
+    let data_page_row_count_limit = writer_properties.data_page_row_count_limit;
+    let write_batch_size = writer_properties.write_batch_size;
+    let max_row_group_size = writer_properties.max_row_group_size;
+    let compression = writer_properties.compression;
+    let statistics_truncate_length = writer_properties.statistics_truncate_length;
+    let default_column_properties = writer_properties.default_column_properties;
+    let column_properties = writer_properties.column_properties;
+
+    if let Some(data_page_size) = data_page_size_limit {
+        properties = properties.set_data_page_size_limit(data_page_size);
+    }
+    if let Some(dictionary_page_size) = dictionary_page_size_limit {
+        properties = properties.set_dictionary_page_size_limit(dictionary_page_size);
+    }
+    if let Some(data_page_row_count) = data_page_row_count_limit {
+        properties = properties.set_data_page_row_count_limit(data_page_row_count);
     }
+    if let Some(batch_size) = write_batch_size {
+        properties = properties.set_write_batch_size(batch_size);
+    }
+    if let Some(row_group_size) = max_row_group_size {
+        properties = properties.set_max_row_group_size(row_group_size);
+    }
+    properties = properties.set_statistics_truncate_length(statistics_truncate_length);
 
-
-
+    if let Some(compression) = compression {
+        let compress: Compression = compression
+            .parse()
+            .map_err(|err: ParquetError| DeltaTableError::Generic(err.to_string()))?;
+
+        properties = properties.set_compression(compress);
     }
 
-
-
+    if let Some(default_column_properties) = default_column_properties {
+        if let Some(dictionary_enabled) = default_column_properties.dictionary_enabled {
+            properties = properties.set_dictionary_enabled(dictionary_enabled);
+        }
+        if let Some(max_statistics_size) = default_column_properties.max_statistics_size {
+            properties = properties.set_max_statistics_size(max_statistics_size);
+        }
+        if let Some(bloom_filter_properties) = default_column_properties.bloom_filter_properties {
+            if let Some(set_bloom_filter_enabled) = bloom_filter_properties.set_bloom_filter_enabled
+            {
+                properties = properties.set_bloom_filter_enabled(set_bloom_filter_enabled);
+            }
+            if let Some(bloom_filter_fpp) = bloom_filter_properties.fpp {
+                properties = properties.set_bloom_filter_fpp(bloom_filter_fpp);
+            }
+            if let Some(bloom_filter_ndv) = bloom_filter_properties.ndv {
+                properties = properties.set_bloom_filter_ndv(bloom_filter_ndv);
+            }
+        }
+    }
+    if let Some(column_properties) = column_properties {
+        for (column_name, column_prop) in column_properties {
+            if let Some(column_prop) = column_prop {
+                if let Some(dictionary_enabled) = column_prop.dictionary_enabled {
+                    properties = properties.set_column_dictionary_enabled(
+                        column_name.clone().into(),
+                        dictionary_enabled,
+                    );
+                }
+                if let Some(bloom_filter_properties) = column_prop.bloom_filter_properties {
+                    if let Some(set_bloom_filter_enabled) =
+                        bloom_filter_properties.set_bloom_filter_enabled
+                    {
+                        properties = properties.set_column_bloom_filter_enabled(
+                            column_name.clone().into(),
+                            set_bloom_filter_enabled,
+                        );
+                    }
+                    if let Some(bloom_filter_fpp) = bloom_filter_properties.fpp {
+                        properties = properties.set_column_bloom_filter_fpp(
+                            column_name.clone().into(),
+                            bloom_filter_fpp,
+                        );
+                    }
+                    if let Some(bloom_filter_ndv) = bloom_filter_properties.ndv {
+                        properties = properties
+                            .set_column_bloom_filter_ndv(column_name.into(), bloom_filter_ndv);
+                    }
+                }
+            }
+        }
     }
+    Ok(properties.build())
+}
 
-
-
+fn convert_partition_filters(
+    partitions_filters: Vec<(String, String, PartitionFilterValue)>,
+) -> Result<Vec<PartitionFilter>, DeltaTableError> {
+    partitions_filters
+        .into_iter()
+        .map(|filter| match filter {
+            (key, op, PartitionFilterValue::Single(v)) => {
+                let key: &'_ str = key.as_ref();
+                let op: &'_ str = op.as_ref();
+                let v: &'_ str = v.as_ref();
+                PartitionFilter::try_from((key, op, v))
+            }
+            (key, op, PartitionFilterValue::Multiple(v)) => {
+                let key: &'_ str = key.as_ref();
+                let op: &'_ str = op.as_ref();
+                let v: Vec<&'_ str> = v.iter().map(|v| v.as_ref()).collect();
+                PartitionFilter::try_from((key, op, v.as_slice()))
+            }
+        })
+        .collect()
+}
+
+fn maybe_create_commit_properties(
+    maybe_commit_properties: Option<RbCommitProperties>,
+    post_commithook_properties: Option<RbPostCommitHookProperties>,
+) -> Option<CommitProperties> {
+    if maybe_commit_properties.is_none() && post_commithook_properties.is_none() {
+        return None;
     }
+    let mut commit_properties = CommitProperties::default();
 
-
-
+    if let Some(commit_props) = maybe_commit_properties {
+        if let Some(metadata) = commit_props.custom_metadata {
+            let json_metadata: Map<String, serde_json::Value> =
+                metadata.into_iter().map(|(k, v)| (k, v.into())).collect();
+            commit_properties = commit_properties.with_metadata(json_metadata);
+        };
+
+        if let Some(max_retries) = commit_props.max_commit_retries {
+            commit_properties = commit_properties.with_max_retries(max_retries);
+        };
+
+        if let Some(app_transactions) = commit_props.app_transactions {
+            let app_transactions = app_transactions.iter().map(Transaction::from).collect();
+            commit_properties = commit_properties.with_application_transactions(app_transactions);
+        }
     }
 
-
-
+    if let Some(post_commit_hook_props) = post_commithook_properties {
+        commit_properties =
+            set_post_commithook_properties(commit_properties, post_commit_hook_props)
+    }
+    Some(commit_properties)
+}
+
+fn rust_core_version() -> String {
+    deltalake::crate_version().to_string()
+}
+
+pub struct BloomFilterProperties {
+    pub set_bloom_filter_enabled: Option<bool>,
+    pub fpp: Option<f64>,
+    pub ndv: Option<u64>,
+}
+
+impl TryConvert for BloomFilterProperties {
+    fn try_convert(val: Value) -> RbResult<Self> {
+        Ok(BloomFilterProperties {
+            set_bloom_filter_enabled: val.funcall("set_bloom_filter_enabled", ())?,
+            fpp: val.funcall("fpp", ())?,
+            ndv: val.funcall("ndv", ())?,
+        })
+    }
+}
+
+pub struct ColumnProperties {
+    pub dictionary_enabled: Option<bool>,
+    pub max_statistics_size: Option<usize>,
+    pub bloom_filter_properties: Option<BloomFilterProperties>,
+}
+
+impl TryConvert for ColumnProperties {
+    fn try_convert(val: Value) -> RbResult<Self> {
+        Ok(ColumnProperties {
+            dictionary_enabled: val.funcall("dictionary_enabled", ())?,
+            max_statistics_size: val.funcall("max_statistics_size", ())?,
+            bloom_filter_properties: val.funcall("bloom_filter_properties", ())?,
+        })
+    }
+}
+
+pub struct RbWriterProperties {
+    data_page_size_limit: Option<usize>,
+    dictionary_page_size_limit: Option<usize>,
+    data_page_row_count_limit: Option<usize>,
+    write_batch_size: Option<usize>,
+    max_row_group_size: Option<usize>,
+    statistics_truncate_length: Option<usize>,
+    compression: Option<String>,
+    default_column_properties: Option<ColumnProperties>,
+    column_properties: Option<HashMap<String, Option<ColumnProperties>>>,
+}
+
+impl TryConvert for RbWriterProperties {
+    fn try_convert(val: Value) -> RbResult<Self> {
+        Ok(RbWriterProperties {
+            data_page_size_limit: val.funcall("data_page_size_limit", ())?,
+            dictionary_page_size_limit: val.funcall("dictionary_page_size_limit", ())?,
+            data_page_row_count_limit: val.funcall("data_page_row_count_limit", ())?,
+            write_batch_size: val.funcall("write_batch_size", ())?,
+            max_row_group_size: val.funcall("max_row_group_size", ())?,
+            statistics_truncate_length: val.funcall("statistics_truncate_length", ())?,
+            compression: val.funcall("compression", ())?,
+            default_column_properties: val.funcall("default_column_properties", ())?,
+            // TODO fix
+            column_properties: None,
+        })
+    }
+}
+
+pub struct RbPostCommitHookProperties {
+    create_checkpoint: bool,
+    cleanup_expired_logs: Option<bool>,
+}
+
+impl TryConvert for RbPostCommitHookProperties {
+    fn try_convert(val: Value) -> RbResult<Self> {
+        Ok(RbPostCommitHookProperties {
+            create_checkpoint: val.funcall("create_checkpoint", ())?,
+            cleanup_expired_logs: val.funcall("cleanup_expired_logs", ())?,
+        })
+    }
+}
+
+#[magnus::wrap(class = "DeltaLake::Transaction")]
+pub struct RbTransaction {
+    pub app_id: String,
+    pub version: i64,
+    pub last_updated: Option<i64>,
+}
+
+impl From<Transaction> for RbTransaction {
+    fn from(value: Transaction) -> Self {
+        RbTransaction {
+            app_id: value.app_id,
+            version: value.version,
+            last_updated: value.last_updated,
+        }
+    }
+}
+
+impl From<&RbTransaction> for Transaction {
+    fn from(value: &RbTransaction) -> Self {
+        Transaction {
+            app_id: value.app_id.clone(),
+            version: value.version,
+            last_updated: value.last_updated,
+        }
+    }
+}
+
+pub struct RbCommitProperties {
+    custom_metadata: Option<HashMap<String, String>>,
+    max_commit_retries: Option<usize>,
+    app_transactions: Option<Vec<RbTransaction>>,
+}
+
+impl TryConvert for RbCommitProperties {
+    fn try_convert(val: Value) -> RbResult<Self> {
+        Ok(RbCommitProperties {
+            custom_metadata: val.funcall("custom_metadata", ())?,
+            max_commit_retries: val.funcall("max_commit_retries", ())?,
+            // TODO fix
+            app_transactions: None,
+        })
     }
 }
 
 #[allow(clippy::too_many_arguments)]
 fn write_to_deltalake(
     table_uri: String,
-    data:
+    data: RbArrowType<ArrowArrayStreamReader>,
     mode: String,
     table: Option<&RawDeltaTable>,
     schema_mode: Option<String>,
@@ -243,16 +1169,11 @@ fn write_to_deltalake(
     description: Option<String>,
     configuration: Option<HashMap<String, Option<String>>>,
     storage_options: Option<HashMap<String, String>>,
+    writer_properties: Option<RbWriterProperties>,
+    commit_properties: Option<RbCommitProperties>,
+    post_commithook_properties: Option<RbPostCommitHookProperties>,
 ) -> RbResult<()> {
-    let
-
-    // use similar approach as Polars to avoid copy
-    let stream_ptr =
-        Box::new(unsafe { std::ptr::replace(capsule_pointer as _, FFI_ArrowArrayStream::empty()) });
-    let stream = ArrowArrayStreamReader::try_new(*stream_ptr)
-        .map_err(|err| DeltaError::new_err(err.to_string()))?;
-
-    let batches = stream.map(|batch| batch.unwrap()).collect::<Vec<_>>();
+    let batches = data.0.map(|batch| batch.unwrap()).collect::<Vec<_>>();
     let save_mode = mode.parse().map_err(RubyError::from)?;
 
     let options = storage_options.clone().unwrap_or_default();
@@ -273,6 +1194,12 @@ fn write_to_deltalake(
         builder = builder.with_partition_columns(partition_columns);
     }
 
+    if let Some(writer_props) = writer_properties {
+        builder = builder.with_writer_properties(
+            set_writer_properties(writer_props).map_err(RubyError::from)?,
+        );
+    }
+
     if let Some(name) = &name {
         builder = builder.with_table_name(name);
     };
@@ -293,18 +1220,55 @@ fn write_to_deltalake(
         builder = builder.with_configuration(config);
     };
 
+    if let Some(commit_properties) =
+        maybe_create_commit_properties(commit_properties, post_commithook_properties)
+    {
+        builder = builder.with_commit_properties(commit_properties);
+    };
+
    rt().block_on(builder.into_future())
         .map_err(RubyError::from)?;
 
     Ok(())
 }
 
+pub struct RbArrowType<T>(pub T);
+
+impl TryConvert for RbArrowType<ArrowArrayStreamReader> {
+    fn try_convert(val: Value) -> RbResult<Self> {
+        let addr: usize = val.funcall("to_i", ())?;
+
+        // use similar approach as Polars to consume pointer and avoid copy
+        let stream_ptr =
+            Box::new(unsafe { std::ptr::replace(addr as _, FFI_ArrowArrayStream::empty()) });
+
+        Ok(RbArrowType(
+            ArrowArrayStreamReader::try_new(*stream_ptr)
+                .map_err(|err| DeltaError::new_err(err.to_string()))?,
+        ))
+    }
+}
+
+#[magnus::wrap(class = "DeltaLake::ArrowArrayStream")]
+pub struct ArrowArrayStream {
+    stream: FFI_ArrowArrayStream,
+}
+
+impl ArrowArrayStream {
+    pub fn to_i(&self) -> usize {
+        (&self.stream as *const _) as usize
+    }
+}
+
 #[magnus::init]
 fn init(ruby: &Ruby) -> RbResult<()> {
     deltalake::aws::register_handlers(None);
+    deltalake::azure::register_handlers(None);
+    deltalake::gcp::register_handlers(None);
 
     let module = ruby.define_module("DeltaLake")?;
-    module.define_singleton_method("write_deltalake_rust", function!(write_to_deltalake,
+    module.define_singleton_method("write_deltalake_rust", function!(write_to_deltalake, 15))?;
+    module.define_singleton_method("rust_core_version", function!(rust_core_version, 0))?;
 
     let class = module.define_class("RawDeltaTable", ruby.class_object())?;
     class.define_singleton_method("new", function!(RawDeltaTable::new, 5))?;
@@ -313,16 +1277,91 @@ fn init(ruby: &Ruby) -> RbResult<()> {
     class.define_method("version", method!(RawDeltaTable::version, 0))?;
     class.define_method("has_files", method!(RawDeltaTable::has_files, 0))?;
     class.define_method("metadata", method!(RawDeltaTable::metadata, 0))?;
+    class.define_method(
+        "protocol_versions",
+        method!(RawDeltaTable::protocol_versions, 0),
+    )?;
     class.define_method("load_version", method!(RawDeltaTable::load_version, 1))?;
-    class.define_method(
-
+    class.define_method(
+        "get_latest_version",
+        method!(RawDeltaTable::get_latest_version, 0),
+    )?;
+    class.define_method(
+        "get_earliest_version",
+        method!(RawDeltaTable::get_earliest_version, 0),
+    )?;
+    class.define_method(
+        "get_num_index_cols",
+        method!(RawDeltaTable::get_num_index_cols, 0),
+    )?;
+    class.define_method(
+        "get_stats_columns",
+        method!(RawDeltaTable::get_stats_columns, 0),
+    )?;
+    class.define_method(
+        "load_with_datetime",
+        method!(RawDeltaTable::load_with_datetime, 1),
+    )?;
+    class.define_method("files", method!(RawDeltaTable::files, 1))?;
+    class.define_method("file_uris", method!(RawDeltaTable::file_uris, 1))?;
     class.define_method("schema", method!(RawDeltaTable::schema, 0))?;
-    class.define_method("vacuum", method!(RawDeltaTable::vacuum,
+    class.define_method("vacuum", method!(RawDeltaTable::vacuum, 5))?;
+    class.define_method(
+        "compact_optimize",
+        method!(RawDeltaTable::compact_optimize, 7),
+    )?;
+    class.define_method(
+        "z_order_optimize",
+        method!(RawDeltaTable::z_order_optimize, 9),
+    )?;
+    class.define_method("add_columns", method!(RawDeltaTable::add_columns, 1))?;
+    class.define_method("add_feature", method!(RawDeltaTable::add_feature, 2))?;
+    class.define_method(
+        "add_constraints",
+        method!(RawDeltaTable::add_constraints, 1),
+    )?;
+    class.define_method(
+        "drop_constraints",
+        method!(RawDeltaTable::drop_constraints, 2),
+    )?;
+    class.define_method("load_cdf", method!(RawDeltaTable::load_cdf, 5))?;
+    class.define_method(
+        "create_merge_builder",
+        method!(RawDeltaTable::create_merge_builder, 8),
+    )?;
+    class.define_method("merge_execute", method!(RawDeltaTable::merge_execute, 1))?;
+    class.define_method("restore", method!(RawDeltaTable::restore, 4))?;
+    class.define_method("history", method!(RawDeltaTable::history, 1))?;
     class.define_method(
         "update_incremental",
         method!(RawDeltaTable::update_incremental, 0),
     )?;
-    class.define_method(
+    class.define_method(
+        "get_active_partitions",
+        method!(RawDeltaTable::get_active_partitions, 0),
+    )?;
+    class.define_method(
+        "create_checkpoint",
+        method!(RawDeltaTable::create_checkpoint, 0),
+    )?;
+    class.define_method(
+        "cleanup_metadata",
+        method!(RawDeltaTable::cleanup_metadata, 0),
+    )?;
+    class.define_method(
+        "get_add_file_sizes",
+        method!(RawDeltaTable::get_add_file_sizes, 0),
+    )?;
+    class.define_method("delete", method!(RawDeltaTable::delete, 4))?;
+    class.define_method(
+        "set_table_properties",
+        method!(RawDeltaTable::set_table_properties, 2),
+    )?;
+    class.define_method("repair", method!(RawDeltaTable::repair, 3))?;
+    class.define_method(
+        "transaction_versions",
+        method!(RawDeltaTable::transaction_versions, 0),
+    )?;
 
     let class = module.define_class("RawDeltaTableMetaData", ruby.class_object())?;
     class.define_method("id", method!(RawDeltaTableMetaData::id, 0))?;
@@ -344,10 +1383,37 @@ fn init(ruby: &Ruby) -> RbResult<()> {
         method!(RawDeltaTableMetaData::configuration, 0),
     )?;
 
+    let class = module.define_class("ArrowArrayStream", ruby.class_object())?;
+    class.define_method("to_i", method!(ArrowArrayStream::to_i, 0))?;
+
     let class = module.define_class("Field", ruby.class_object())?;
    class.define_method("name", method!(Field::name, 0))?;
     class.define_method("type", method!(Field::get_type, 0))?;
     class.define_method("nullable", method!(Field::nullable, 0))?;
 
+    let class = module.define_class("RbMergeBuilder", ruby.class_object())?;
+    class.define_method("source_alias", method!(RbMergeBuilder::source_alias, 0))?;
+    class.define_method("target_alias", method!(RbMergeBuilder::target_alias, 0))?;
+    class.define_method(
+        "when_matched_update",
+        method!(RbMergeBuilder::when_matched_update, 2),
+    )?;
+    class.define_method(
+        "when_matched_delete",
+        method!(RbMergeBuilder::when_matched_delete, 1),
+    )?;
+    class.define_method(
+        "when_not_matched_insert",
+        method!(RbMergeBuilder::when_not_matched_insert, 2),
+    )?;
+    class.define_method(
+        "when_not_matched_by_source_update",
+        method!(RbMergeBuilder::when_not_matched_by_source_update, 2),
+    )?;
+    class.define_method(
+        "when_not_matched_by_source_delete",
+        method!(RbMergeBuilder::when_not_matched_by_source_delete, 1),
+    )?;
+
     Ok(())
 }
```