deltalake-rb 0.1.1 → 0.1.3

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
@@ -1,4 +1,6 @@
 mod error;
+mod features;
+mod merge;
 mod schema;
 mod utils;
 
@@ -9,12 +11,16 @@ use std::str::FromStr;
 use std::time;
 
 use chrono::{DateTime, Duration, FixedOffset, Utc};
+use delta_kernel::schema::StructField;
 use deltalake::arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream};
 use deltalake::arrow::record_batch::RecordBatchIterator;
+use deltalake::checkpoints::{cleanup_metadata, create_checkpoint};
 use deltalake::datafusion::physical_plan::ExecutionPlan;
 use deltalake::datafusion::prelude::SessionContext;
 use deltalake::errors::DeltaTableError;
 use deltalake::kernel::{scalars::ScalarExt, StructType, Transaction};
+use deltalake::operations::add_column::AddColumnBuilder;
+use deltalake::operations::add_feature::AddTableFeatureBuilder;
 use deltalake::operations::collect_sendable_stream;
 use deltalake::operations::constraints::ConstraintBuilder;
 use deltalake::operations::delete::DeleteBuilder;
@@ -23,25 +29,49 @@ use deltalake::operations::filesystem_check::FileSystemCheckBuilder;
 use deltalake::operations::load_cdf::CdfLoadBuilder;
 use deltalake::operations::optimize::{OptimizeBuilder, OptimizeType};
 use deltalake::operations::restore::RestoreBuilder;
-use deltalake::operations::transaction::TableReference;
+use deltalake::operations::set_tbl_properties::SetTablePropertiesBuilder;
+use deltalake::operations::transaction::{CommitProperties, TableReference};
 use deltalake::operations::vacuum::VacuumBuilder;
+use deltalake::parquet::basic::Compression;
+use deltalake::parquet::errors::ParquetError;
+use deltalake::parquet::file::properties::WriterProperties;
 use deltalake::partitions::PartitionFilter;
 use deltalake::storage::IORuntime;
-use deltalake::DeltaOps;
+use deltalake::{DeltaOps, DeltaResult};
 use error::DeltaError;
 use futures::future::join_all;
 
 use magnus::{
-    exception, function, method, prelude::*, Error, Integer, Module, RArray, RHash, Ruby, Value,
+    function, method, prelude::*, typed_data::Obj, Error, Integer, Module, RArray, RHash, Ruby,
+    TryConvert, Value,
 };
+use serde_json::Map;
 
 use crate::error::DeltaProtocolError;
+use crate::error::RbValueError;
 use crate::error::RubyError;
+use crate::features::TableFeatures;
+use crate::merge::RbMergeBuilder;
 use crate::schema::{schema_to_rbobject, Field};
 use crate::utils::rt;
 
 type RbResult<T> = Result<T, Error>;
 
+enum PartitionFilterValue {
+    Single(String),
+    Multiple(Vec<String>),
+}
+
+impl TryConvert for PartitionFilterValue {
+    fn try_convert(val: Value) -> RbResult<Self> {
+        if let Ok(v) = Vec::<String>::try_convert(val) {
+            Ok(PartitionFilterValue::Multiple(v))
+        } else {
+            Ok(PartitionFilterValue::Single(String::try_convert(val)?))
+        }
+    }
+}
+
 #[magnus::wrap(class = "DeltaLake::RawDeltaTable")]
 struct RawDeltaTable {
     _table: RefCell<deltalake::DeltaTable>,
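
Note: the PartitionFilterValue conversion added above accepts either a single string or an array of strings from Ruby, matching the two tuple shapes that delta-rs partition filters take (the convert_partition_filters body later in this diff feeds both into PartitionFilter::try_from). A minimal sketch of the delta-rs side; the column name, operators, and values here are illustrative, not from the package:

    use deltalake::errors::DeltaTableError;
    use deltalake::partitions::PartitionFilter;

    fn partition_filter_examples() -> Result<(), DeltaTableError> {
        // Single(String)        -> ("column", "op", "value")
        let eq = PartitionFilter::try_from(("year", "=", "2024"))?;
        // Multiple(Vec<String>) -> ("column", "op", &["v1", "v2"])
        let within = PartitionFilter::try_from(("year", "in", ["2023", "2024"].as_slice()))?;
        let _ = (eq, within);
        Ok(())
    }
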
@@ -57,14 +87,29 @@ struct RawDeltaTableMetaData {
     configuration: HashMap<String, Option<String>>,
 }
 
-#[magnus::wrap(class = "DeltaLake::ArrowArrayStream")]
-pub struct ArrowArrayStream {
-    stream: FFI_ArrowArrayStream,
-}
+impl RawDeltaTableMetaData {
+    fn id(&self) -> String {
+        self.id.clone()
+    }
 
-impl ArrowArrayStream {
-    pub fn to_i(&self) -> usize {
-        (&self.stream as *const _) as usize
+    fn name(&self) -> Option<String> {
+        self.name.clone()
+    }
+
+    fn description(&self) -> Option<String> {
+        self.description.clone()
+    }
+
+    fn partition_columns(&self) -> Vec<String> {
+        self.partition_columns.clone()
+    }
+
+    fn created_time(&self) -> Option<i64> {
+        self.created_time
+    }
+
+    fn configuration(&self) -> HashMap<String, Option<String>> {
+        self.configuration.clone()
     }
 }
 
@@ -218,20 +263,19 @@ impl RawDeltaTable {
     }
 
     pub fn load_with_datetime(&self, ds: String) -> RbResult<()> {
-        let datetime = DateTime::<Utc>::from(
-            DateTime::<FixedOffset>::parse_from_rfc3339(&ds).map_err(|err| {
-                Error::new(
-                    exception::arg_error(),
-                    format!("Failed to parse datetime string: {err}"),
-                )
-            })?,
-        );
+        let datetime =
+            DateTime::<Utc>::from(DateTime::<FixedOffset>::parse_from_rfc3339(&ds).map_err(
+                |err| RbValueError::new_err(format!("Failed to parse datetime string: {err}")),
+            )?);
         Ok(rt()
             .block_on(self._table.borrow_mut().load_with_datetime(datetime))
             .map_err(RubyError::from)?)
     }
 
-    pub fn files(&self, partition_filters: Option<Value>) -> RbResult<Vec<String>> {
+    pub fn files(
+        &self,
+        partition_filters: Option<Vec<(String, String, PartitionFilterValue)>>,
+    ) -> RbResult<Vec<String>> {
         if !self.has_files()? {
             return Err(DeltaError::new_err("Table is instantiated without files."));
         }
@@ -257,7 +301,10 @@ impl RawDeltaTable {
         }
     }
 
-    pub fn file_uris(&self, partition_filters: Option<Value>) -> RbResult<Vec<String>> {
+    pub fn file_uris(
+        &self,
+        partition_filters: Option<Vec<(String, String, PartitionFilterValue)>>,
+    ) -> RbResult<Vec<String>> {
         if !self._table.borrow().config.require_files {
             return Err(DeltaError::new_err("Table is initiated without files."));
         }
@@ -290,6 +337,8 @@ impl RawDeltaTable {
         dry_run: bool,
         retention_hours: Option<u64>,
         enforce_retention_duration: bool,
+        commit_properties: Option<RbCommitProperties>,
+        post_commithook_properties: Option<RbPostCommitHookProperties>,
     ) -> RbResult<Vec<String>> {
         let mut cmd = VacuumBuilder::new(
             self._table.borrow().log_store(),
@@ -305,6 +354,11 @@ impl RawDeltaTable {
             cmd = cmd.with_retention_period(Duration::hours(retention_period as i64));
         }
 
+        if let Some(commit_properties) =
+            maybe_create_commit_properties(commit_properties, post_commithook_properties)
+        {
+            cmd = cmd.with_commit_properties(commit_properties);
+        }
         let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
         self._table.borrow_mut().state = table.state;
         Ok(metrics.files_deleted)
@@ -312,9 +366,13 @@ impl RawDeltaTable {
 
     pub fn compact_optimize(
         &self,
+        partition_filters: Option<Vec<(String, String, PartitionFilterValue)>>,
        target_size: Option<i64>,
         max_concurrent_tasks: Option<usize>,
         min_commit_interval: Option<u64>,
+        writer_properties: Option<RbWriterProperties>,
+        commit_properties: Option<RbCommitProperties>,
+        post_commithook_properties: Option<RbPostCommitHookProperties>,
     ) -> RbResult<String> {
         let mut cmd = OptimizeBuilder::new(
             self._table.borrow().log_store(),
@@ -332,6 +390,22 @@ impl RawDeltaTable {
             cmd = cmd.with_min_commit_interval(time::Duration::from_secs(commit_interval));
         }
 
+        if let Some(writer_props) = writer_properties {
+            cmd = cmd.with_writer_properties(
+                set_writer_properties(writer_props).map_err(RubyError::from)?,
+            );
+        }
+
+        if let Some(commit_properties) =
+            maybe_create_commit_properties(commit_properties, post_commithook_properties)
+        {
+            cmd = cmd.with_commit_properties(commit_properties);
+        }
+
+        let converted_filters = convert_partition_filters(partition_filters.unwrap_or_default())
+            .map_err(RubyError::from)?;
+        cmd = cmd.with_filters(&converted_filters);
+
         let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
         self._table.borrow_mut().state = table.state;
         Ok(serde_json::to_string(&metrics).unwrap())
@@ -340,10 +414,14 @@ impl RawDeltaTable {
     pub fn z_order_optimize(
         &self,
         z_order_columns: Vec<String>,
+        partition_filters: Option<Vec<(String, String, PartitionFilterValue)>>,
         target_size: Option<i64>,
         max_concurrent_tasks: Option<usize>,
         max_spill_size: usize,
         min_commit_interval: Option<u64>,
+        writer_properties: Option<RbWriterProperties>,
+        commit_properties: Option<RbCommitProperties>,
+        post_commithook_properties: Option<RbPostCommitHookProperties>,
     ) -> RbResult<String> {
         let mut cmd = OptimizeBuilder::new(
             self._table.borrow().log_store(),
@@ -363,11 +441,75 @@ impl RawDeltaTable {
             cmd = cmd.with_min_commit_interval(time::Duration::from_secs(commit_interval));
         }
 
+        if let Some(writer_props) = writer_properties {
+            cmd = cmd.with_writer_properties(
+                set_writer_properties(writer_props).map_err(RubyError::from)?,
+            );
+        }
+
+        if let Some(commit_properties) =
+            maybe_create_commit_properties(commit_properties, post_commithook_properties)
+        {
+            cmd = cmd.with_commit_properties(commit_properties);
+        }
+
+        let converted_filters = convert_partition_filters(partition_filters.unwrap_or_default())
+            .map_err(RubyError::from)?;
+        cmd = cmd.with_filters(&converted_filters);
+
         let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
         self._table.borrow_mut().state = table.state;
         Ok(serde_json::to_string(&metrics).unwrap())
     }
 
+    pub fn add_columns(&self, fields: RArray) -> RbResult<()> {
+        let fields = fields.typecheck::<Obj<Field>>()?;
+        let mut cmd = AddColumnBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        );
+
+        let new_fields = fields
+            .iter()
+            .map(|v| v.inner.clone())
+            .collect::<Vec<StructField>>();
+
+        cmd = cmd.with_fields(new_fields);
+
+        let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(())
+    }
+
+    pub fn add_feature(
+        &self,
+        feature: RArray,
+        allow_protocol_versions_increase: bool,
+    ) -> RbResult<()> {
+        let feature = feature
+            .into_iter()
+            .map(|v| TableFeatures::try_convert(v))
+            .collect::<RbResult<Vec<_>>>()?;
+        let cmd = AddTableFeatureBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_features(feature)
+        .with_allow_protocol_versions_increase(allow_protocol_versions_increase);
+
+        let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(())
+    }
+
     pub fn add_constraints(&self, constraints: HashMap<String, String>) -> RbResult<()> {
         let mut cmd = ConstraintBuilder::new(
             self._table.borrow().log_store(),
@@ -428,13 +570,13 @@ impl RawDeltaTable {
         }
         if let Some(st) = starting_timestamp {
             let starting_ts: DateTime<Utc> = DateTime::<Utc>::from_str(&st)
-                .map_err(|pe| Error::new(exception::arg_error(), pe.to_string()))?
+                .map_err(|pe| RbValueError::new_err(pe.to_string()))?
                 .to_utc();
             cdf_read = cdf_read.with_starting_timestamp(starting_ts);
         }
         if let Some(et) = ending_timestamp {
             let ending_ts = DateTime::<Utc>::from_str(&et)
-                .map_err(|pe| Error::new(exception::arg_error(), pe.to_string()))?
+                .map_err(|pe| RbValueError::new_err(pe.to_string()))?
                 .to_utc();
             cdf_read = cdf_read.with_starting_timestamp(ending_ts);
         }
@@ -470,11 +612,48 @@ impl RawDeltaTable {
         Ok(ArrowArrayStream { stream: ffi_stream })
     }
 
+    pub fn create_merge_builder(
+        &self,
+        source: RbArrowType<ArrowArrayStreamReader>,
+        predicate: String,
+        source_alias: Option<String>,
+        target_alias: Option<String>,
+        safe_cast: bool,
+        writer_properties: Option<RbWriterProperties>,
+        post_commithook_properties: Option<RbPostCommitHookProperties>,
+        commit_properties: Option<RbCommitProperties>,
+    ) -> RbResult<RbMergeBuilder> {
+        Ok(RbMergeBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+            source.0,
+            predicate,
+            source_alias,
+            target_alias,
+            safe_cast,
+            writer_properties,
+            post_commithook_properties,
+            commit_properties,
+        )
+        .map_err(RubyError::from)?)
+    }
+
+    pub fn merge_execute(&self, merge_builder: &RbMergeBuilder) -> RbResult<String> {
+        let (table, metrics) = merge_builder.execute().map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(metrics)
+    }
+
     pub fn restore(
         &self,
         target: Option<Value>,
         ignore_missing_files: bool,
         protocol_downgrade_allowed: bool,
+        commit_properties: Option<RbCommitProperties>,
     ) -> RbResult<String> {
         let mut cmd = RestoreBuilder::new(
             self._table.borrow().log_store(),
@@ -491,10 +670,7 @@ impl RawDeltaTable {
             if let Ok(ds) = String::try_convert(val) {
                 let datetime = DateTime::<Utc>::from(
                     DateTime::<FixedOffset>::parse_from_rfc3339(ds.as_ref()).map_err(|err| {
-                        Error::new(
-                            exception::arg_error(),
-                            format!("Failed to parse datetime string: {err}"),
-                        )
+                        RbValueError::new_err(format!("Failed to parse datetime string: {err}"))
                     })?,
                 );
                 cmd = cmd.with_datetime_to_restore(datetime)
@@ -503,6 +679,10 @@ impl RawDeltaTable {
         cmd = cmd.with_ignore_missing_files(ignore_missing_files);
         cmd = cmd.with_protocol_downgrade_allowed(protocol_downgrade_allowed);
 
+        if let Some(commit_properties) = maybe_create_commit_properties(commit_properties, None) {
+            cmd = cmd.with_commit_properties(commit_properties);
+        }
+
         let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
         self._table.borrow_mut().state = table.state;
         Ok(serde_json::to_string(&metrics).unwrap())
@@ -575,7 +755,39 @@ impl RawDeltaTable {
         Ok(RArray::from_iter(active_partitions))
     }
 
-    pub fn delete(&self, predicate: Option<String>) -> RbResult<String> {
+    pub fn create_checkpoint(&self) -> RbResult<()> {
+        rt().block_on(create_checkpoint(&self._table.borrow()))
+            .map_err(RubyError::from)?;
+
+        Ok(())
+    }
+
+    pub fn cleanup_metadata(&self) -> RbResult<()> {
+        rt().block_on(cleanup_metadata(&self._table.borrow()))
+            .map_err(RubyError::from)?;
+
+        Ok(())
+    }
+
+    pub fn get_add_file_sizes(&self) -> RbResult<HashMap<String, i64>> {
+        Ok(self
+            ._table
+            .borrow()
+            .snapshot()
+            .map_err(RubyError::from)?
+            .eager_snapshot()
+            .files()
+            .map(|f| (f.path().to_string(), f.size()))
+            .collect::<HashMap<String, i64>>())
+    }
+
+    pub fn delete(
+        &self,
+        predicate: Option<String>,
+        writer_properties: Option<RbWriterProperties>,
+        commit_properties: Option<RbCommitProperties>,
+        post_commithook_properties: Option<RbPostCommitHookProperties>,
+    ) -> RbResult<String> {
         let mut cmd = DeleteBuilder::new(
             self._table.borrow().log_store(),
             self._table
@@ -587,14 +799,50 @@ impl RawDeltaTable {
         if let Some(predicate) = predicate {
             cmd = cmd.with_predicate(predicate);
         }
+        if let Some(writer_props) = writer_properties {
+            cmd = cmd.with_writer_properties(
+                set_writer_properties(writer_props).map_err(RubyError::from)?,
+            );
+        }
+        if let Some(commit_properties) =
+            maybe_create_commit_properties(commit_properties, post_commithook_properties)
+        {
+            cmd = cmd.with_commit_properties(commit_properties);
+        }
 
         let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
         self._table.borrow_mut().state = table.state;
         Ok(serde_json::to_string(&metrics).unwrap())
     }
 
-    pub fn repair(&self, dry_run: bool) -> RbResult<String> {
-        let cmd = FileSystemCheckBuilder::new(
+    pub fn set_table_properties(
+        &self,
+        properties: HashMap<String, String>,
+        raise_if_not_exists: bool,
+    ) -> RbResult<()> {
+        let cmd = SetTablePropertiesBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_properties(properties)
+        .with_raise_if_not_exists(raise_if_not_exists);
+
+        let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(())
+    }
+
+    pub fn repair(
+        &self,
+        dry_run: bool,
+        commit_properties: Option<RbCommitProperties>,
+        post_commithook_properties: Option<RbPostCommitHookProperties>,
+    ) -> RbResult<String> {
+        let mut cmd = FileSystemCheckBuilder::new(
             self._table.borrow().log_store(),
             self._table
                 .borrow()
@@ -604,6 +852,12 @@ impl RawDeltaTable {
         )
         .with_dry_run(dry_run);
 
+        if let Some(commit_properties) =
+            maybe_create_commit_properties(commit_properties, post_commithook_properties)
+        {
+            cmd = cmd.with_commit_properties(commit_properties);
+        }
+
         let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
         self._table.borrow_mut().state = table.state;
         Ok(serde_json::to_string(&metrics).unwrap())
@@ -620,35 +874,240 @@ impl RawDeltaTable {
     }
 }
 
+fn set_post_commithook_properties(
+    mut commit_properties: CommitProperties,
+    post_commithook_properties: RbPostCommitHookProperties,
+) -> CommitProperties {
+    commit_properties =
+        commit_properties.with_create_checkpoint(post_commithook_properties.create_checkpoint);
+    commit_properties = commit_properties
+        .with_cleanup_expired_logs(post_commithook_properties.cleanup_expired_logs);
+    commit_properties
+}
+
+fn set_writer_properties(writer_properties: RbWriterProperties) -> DeltaResult<WriterProperties> {
+    let mut properties = WriterProperties::builder();
+    let data_page_size_limit = writer_properties.data_page_size_limit;
+    let dictionary_page_size_limit = writer_properties.dictionary_page_size_limit;
+    let data_page_row_count_limit = writer_properties.data_page_row_count_limit;
+    let write_batch_size = writer_properties.write_batch_size;
+    let max_row_group_size = writer_properties.max_row_group_size;
+    let compression = writer_properties.compression;
+    let statistics_truncate_length = writer_properties.statistics_truncate_length;
+    let default_column_properties = writer_properties.default_column_properties;
+    let column_properties = writer_properties.column_properties;
+
+    if let Some(data_page_size) = data_page_size_limit {
+        properties = properties.set_data_page_size_limit(data_page_size);
+    }
+    if let Some(dictionary_page_size) = dictionary_page_size_limit {
+        properties = properties.set_dictionary_page_size_limit(dictionary_page_size);
+    }
+    if let Some(data_page_row_count) = data_page_row_count_limit {
+        properties = properties.set_data_page_row_count_limit(data_page_row_count);
+    }
+    if let Some(batch_size) = write_batch_size {
+        properties = properties.set_write_batch_size(batch_size);
+    }
+    if let Some(row_group_size) = max_row_group_size {
+        properties = properties.set_max_row_group_size(row_group_size);
+    }
+    properties = properties.set_statistics_truncate_length(statistics_truncate_length);
+
+    if let Some(compression) = compression {
+        let compress: Compression = compression
+            .parse()
+            .map_err(|err: ParquetError| DeltaTableError::Generic(err.to_string()))?;
+
+        properties = properties.set_compression(compress);
+    }
+
+    if let Some(default_column_properties) = default_column_properties {
+        if let Some(dictionary_enabled) = default_column_properties.dictionary_enabled {
+            properties = properties.set_dictionary_enabled(dictionary_enabled);
+        }
+        if let Some(max_statistics_size) = default_column_properties.max_statistics_size {
+            properties = properties.set_max_statistics_size(max_statistics_size);
+        }
+        if let Some(bloom_filter_properties) = default_column_properties.bloom_filter_properties {
+            if let Some(set_bloom_filter_enabled) = bloom_filter_properties.set_bloom_filter_enabled
+            {
+                properties = properties.set_bloom_filter_enabled(set_bloom_filter_enabled);
+            }
+            if let Some(bloom_filter_fpp) = bloom_filter_properties.fpp {
+                properties = properties.set_bloom_filter_fpp(bloom_filter_fpp);
+            }
+            if let Some(bloom_filter_ndv) = bloom_filter_properties.ndv {
+                properties = properties.set_bloom_filter_ndv(bloom_filter_ndv);
+            }
+        }
+    }
+    if let Some(column_properties) = column_properties {
+        for (column_name, column_prop) in column_properties {
+            if let Some(column_prop) = column_prop {
+                if let Some(dictionary_enabled) = column_prop.dictionary_enabled {
+                    properties = properties.set_column_dictionary_enabled(
+                        column_name.clone().into(),
+                        dictionary_enabled,
+                    );
+                }
+                if let Some(bloom_filter_properties) = column_prop.bloom_filter_properties {
+                    if let Some(set_bloom_filter_enabled) =
+                        bloom_filter_properties.set_bloom_filter_enabled
+                    {
+                        properties = properties.set_column_bloom_filter_enabled(
+                            column_name.clone().into(),
+                            set_bloom_filter_enabled,
+                        );
+                    }
+                    if let Some(bloom_filter_fpp) = bloom_filter_properties.fpp {
+                        properties = properties.set_column_bloom_filter_fpp(
+                            column_name.clone().into(),
+                            bloom_filter_fpp,
+                        );
+                    }
+                    if let Some(bloom_filter_ndv) = bloom_filter_properties.ndv {
+                        properties = properties
+                            .set_column_bloom_filter_ndv(column_name.into(), bloom_filter_ndv);
+                    }
+                }
+            }
+        }
+    }
+    Ok(properties.build())
+}
+
 fn convert_partition_filters(
-    _partitions_filters: Value,
+    partitions_filters: Vec<(String, String, PartitionFilterValue)>,
 ) -> Result<Vec<PartitionFilter>, DeltaTableError> {
-    todo!()
+    partitions_filters
+        .into_iter()
+        .map(|filter| match filter {
+            (key, op, PartitionFilterValue::Single(v)) => {
+                let key: &'_ str = key.as_ref();
+                let op: &'_ str = op.as_ref();
+                let v: &'_ str = v.as_ref();
+                PartitionFilter::try_from((key, op, v))
+            }
+            (key, op, PartitionFilterValue::Multiple(v)) => {
+                let key: &'_ str = key.as_ref();
+                let op: &'_ str = op.as_ref();
+                let v: Vec<&'_ str> = v.iter().map(|v| v.as_ref()).collect();
+                PartitionFilter::try_from((key, op, v.as_slice()))
+            }
+        })
+        .collect()
 }
 
-impl RawDeltaTableMetaData {
-    fn id(&self) -> String {
-        self.id.clone()
+fn maybe_create_commit_properties(
+    maybe_commit_properties: Option<RbCommitProperties>,
+    post_commithook_properties: Option<RbPostCommitHookProperties>,
+) -> Option<CommitProperties> {
+    if maybe_commit_properties.is_none() && post_commithook_properties.is_none() {
+        return None;
+    }
+    let mut commit_properties = CommitProperties::default();
+
+    if let Some(commit_props) = maybe_commit_properties {
+        if let Some(metadata) = commit_props.custom_metadata {
+            let json_metadata: Map<String, serde_json::Value> =
+                metadata.into_iter().map(|(k, v)| (k, v.into())).collect();
+            commit_properties = commit_properties.with_metadata(json_metadata);
+        };
+
+        if let Some(max_retries) = commit_props.max_commit_retries {
+            commit_properties = commit_properties.with_max_retries(max_retries);
+        };
+
+        if let Some(app_transactions) = commit_props.app_transactions {
+            let app_transactions = app_transactions.iter().map(Transaction::from).collect();
+            commit_properties = commit_properties.with_application_transactions(app_transactions);
+        }
     }
 
-    fn name(&self) -> Option<String> {
-        self.name.clone()
+    if let Some(post_commit_hook_props) = post_commithook_properties {
+        commit_properties =
+            set_post_commithook_properties(commit_properties, post_commit_hook_props)
     }
+    Some(commit_properties)
+}
 
-    fn description(&self) -> Option<String> {
-        self.description.clone()
+fn rust_core_version() -> String {
+    deltalake::crate_version().to_string()
+}
+
+pub struct BloomFilterProperties {
+    pub set_bloom_filter_enabled: Option<bool>,
+    pub fpp: Option<f64>,
+    pub ndv: Option<u64>,
+}
+
+impl TryConvert for BloomFilterProperties {
+    fn try_convert(val: Value) -> RbResult<Self> {
+        Ok(BloomFilterProperties {
+            set_bloom_filter_enabled: val.funcall("set_bloom_filter_enabled", ())?,
+            fpp: val.funcall("fpp", ())?,
+            ndv: val.funcall("ndv", ())?,
+        })
     }
+}
 
-    fn partition_columns(&self) -> Vec<String> {
-        self.partition_columns.clone()
+pub struct ColumnProperties {
+    pub dictionary_enabled: Option<bool>,
+    pub max_statistics_size: Option<usize>,
+    pub bloom_filter_properties: Option<BloomFilterProperties>,
+}
+
+impl TryConvert for ColumnProperties {
+    fn try_convert(val: Value) -> RbResult<Self> {
+        Ok(ColumnProperties {
+            dictionary_enabled: val.funcall("dictionary_enabled", ())?,
+            max_statistics_size: val.funcall("max_statistics_size", ())?,
+            bloom_filter_properties: val.funcall("bloom_filter_properties", ())?,
+        })
     }
+}
 
-    fn created_time(&self) -> Option<i64> {
-        self.created_time
+pub struct RbWriterProperties {
+    data_page_size_limit: Option<usize>,
+    dictionary_page_size_limit: Option<usize>,
+    data_page_row_count_limit: Option<usize>,
+    write_batch_size: Option<usize>,
+    max_row_group_size: Option<usize>,
+    statistics_truncate_length: Option<usize>,
+    compression: Option<String>,
+    default_column_properties: Option<ColumnProperties>,
+    column_properties: Option<HashMap<String, Option<ColumnProperties>>>,
+}
+
+impl TryConvert for RbWriterProperties {
    fn try_convert(val: Value) -> RbResult<Self> {
+        Ok(RbWriterProperties {
+            data_page_size_limit: val.funcall("data_page_size_limit", ())?,
+            dictionary_page_size_limit: val.funcall("dictionary_page_size_limit", ())?,
+            data_page_row_count_limit: val.funcall("data_page_row_count_limit", ())?,
+            write_batch_size: val.funcall("write_batch_size", ())?,
+            max_row_group_size: val.funcall("max_row_group_size", ())?,
+            statistics_truncate_length: val.funcall("statistics_truncate_length", ())?,
+            compression: val.funcall("compression", ())?,
+            default_column_properties: val.funcall("default_column_properties", ())?,
+            // TODO fix
+            column_properties: None,
+        })
     }
+}
 
-    fn configuration(&self) -> HashMap<String, Option<String>> {
-        self.configuration.clone()
+pub struct RbPostCommitHookProperties {
+    create_checkpoint: bool,
+    cleanup_expired_logs: Option<bool>,
+}
+
+impl TryConvert for RbPostCommitHookProperties {
+    fn try_convert(val: Value) -> RbResult<Self> {
+        Ok(RbPostCommitHookProperties {
+            create_checkpoint: val.funcall("create_checkpoint", ())?,
+            cleanup_expired_logs: val.funcall("cleanup_expired_logs", ())?,
+        })
     }
 }
 
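Note: set_writer_properties above maps the Ruby-supplied options onto the parquet crate's WriterProperties builder. A minimal sketch of what it assembles for a compression and row-group setting; the values are illustrative, not defaults from the package:

    use deltalake::parquet::basic::Compression;
    use deltalake::parquet::file::properties::WriterProperties;

    fn example_writer_properties() -> WriterProperties {
        WriterProperties::builder()
            // Equivalent of passing compression "SNAPPY" from Ruby, which
            // set_writer_properties parses with Compression's FromStr impl.
            .set_compression(Compression::SNAPPY)
            .set_max_row_group_size(64 * 1024)
            .build()
    }
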
@@ -669,10 +1128,37 @@ impl From<Transaction> for RbTransaction {
     }
 }
 
+impl From<&RbTransaction> for Transaction {
+    fn from(value: &RbTransaction) -> Self {
+        Transaction {
+            app_id: value.app_id.clone(),
+            version: value.version,
+            last_updated: value.last_updated,
+        }
+    }
+}
+
+pub struct RbCommitProperties {
+    custom_metadata: Option<HashMap<String, String>>,
+    max_commit_retries: Option<usize>,
+    app_transactions: Option<Vec<RbTransaction>>,
+}
+
+impl TryConvert for RbCommitProperties {
+    fn try_convert(val: Value) -> RbResult<Self> {
+        Ok(RbCommitProperties {
+            custom_metadata: val.funcall("custom_metadata", ())?,
+            max_commit_retries: val.funcall("max_commit_retries", ())?,
+            // TODO fix
+            app_transactions: None,
+        })
+    }
+}
+
 #[allow(clippy::too_many_arguments)]
 fn write_to_deltalake(
     table_uri: String,
-    data: Value,
+    data: RbArrowType<ArrowArrayStreamReader>,
     mode: String,
     table: Option<&RawDeltaTable>,
     schema_mode: Option<String>,
@@ -683,16 +1169,11 @@ fn write_to_deltalake(
     description: Option<String>,
     configuration: Option<HashMap<String, Option<String>>>,
     storage_options: Option<HashMap<String, String>>,
+    writer_properties: Option<RbWriterProperties>,
+    commit_properties: Option<RbCommitProperties>,
+    post_commithook_properties: Option<RbPostCommitHookProperties>,
 ) -> RbResult<()> {
-    let capsule_pointer: usize = data.funcall("to_i", ())?;
-
-    // use similar approach as Polars to avoid copy
-    let stream_ptr =
-        Box::new(unsafe { std::ptr::replace(capsule_pointer as _, FFI_ArrowArrayStream::empty()) });
-    let stream = ArrowArrayStreamReader::try_new(*stream_ptr)
-        .map_err(|err| DeltaError::new_err(err.to_string()))?;
-
-    let batches = stream.map(|batch| batch.unwrap()).collect::<Vec<_>>();
+    let batches = data.0.map(|batch| batch.unwrap()).collect::<Vec<_>>();
     let save_mode = mode.parse().map_err(RubyError::from)?;
 
     let options = storage_options.clone().unwrap_or_default();
@@ -713,6 +1194,12 @@ fn write_to_deltalake(
         builder = builder.with_partition_columns(partition_columns);
     }
 
+    if let Some(writer_props) = writer_properties {
+        builder = builder.with_writer_properties(
+            set_writer_properties(writer_props).map_err(RubyError::from)?,
+        );
+    }
+
     if let Some(name) = &name {
         builder = builder.with_table_name(name);
     };
@@ -733,18 +1220,55 @@ fn write_to_deltalake(
         builder = builder.with_configuration(config);
     };
 
+    if let Some(commit_properties) =
+        maybe_create_commit_properties(commit_properties, post_commithook_properties)
+    {
+        builder = builder.with_commit_properties(commit_properties);
+    };
+
     rt().block_on(builder.into_future())
         .map_err(RubyError::from)?;
 
     Ok(())
 }
 
+pub struct RbArrowType<T>(pub T);
+
+impl TryConvert for RbArrowType<ArrowArrayStreamReader> {
+    fn try_convert(val: Value) -> RbResult<Self> {
+        let addr: usize = val.funcall("to_i", ())?;
+
+        // use similar approach as Polars to consume pointer and avoid copy
+        let stream_ptr =
+            Box::new(unsafe { std::ptr::replace(addr as _, FFI_ArrowArrayStream::empty()) });
+
+        Ok(RbArrowType(
+            ArrowArrayStreamReader::try_new(*stream_ptr)
+                .map_err(|err| DeltaError::new_err(err.to_string()))?,
+        ))
+    }
+}
+
+#[magnus::wrap(class = "DeltaLake::ArrowArrayStream")]
+pub struct ArrowArrayStream {
+    stream: FFI_ArrowArrayStream,
+}
+
+impl ArrowArrayStream {
+    pub fn to_i(&self) -> usize {
+        (&self.stream as *const _) as usize
+    }
+}
+
 #[magnus::init]
 fn init(ruby: &Ruby) -> RbResult<()> {
     deltalake::aws::register_handlers(None);
+    deltalake::azure::register_handlers(None);
+    deltalake::gcp::register_handlers(None);
 
     let module = ruby.define_module("DeltaLake")?;
-    module.define_singleton_method("write_deltalake_rust", function!(write_to_deltalake, 12))?;
+    module.define_singleton_method("write_deltalake_rust", function!(write_to_deltalake, 15))?;
+    module.define_singleton_method("rust_core_version", function!(rust_core_version, 0))?;
 
     let class = module.define_class("RawDeltaTable", ruby.class_object())?;
     class.define_singleton_method("new", function!(RawDeltaTable::new, 5))?;
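
Note: the ArrowArrayStream#to_i / RbArrowType pair above passes an Arrow stream across the FFI boundary by address rather than by copy: the Ruby-held object exposes the address of its stream struct, and try_convert swaps an empty stream into that slot with std::ptr::replace, taking ownership of the original. A self-contained sketch of the same hand-off using a stand-in type (not the extension's actual types):

    #[derive(Debug)]
    struct Stream(Vec<u8>); // stand-in for FFI_ArrowArrayStream

    impl Stream {
        fn empty() -> Self {
            Stream(Vec::new())
        }
    }

    fn main() {
        // Producer side: owns the stream and hands out its address, as to_i does.
        let mut slot = Stream(vec![1, 2, 3]);
        let addr = &mut slot as *mut Stream as usize;

        // Consumer side: swap in an empty value and take the original, as
        // try_convert does. The producer is left holding only the empty
        // stream, so the data is moved exactly once and never double-freed.
        let taken = unsafe { std::ptr::replace(addr as *mut Stream, Stream::empty()) };

        assert_eq!(taken.0, vec![1, 2, 3]);
        assert!(slot.0.is_empty());
    }
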
@@ -781,15 +1305,17 @@ fn init(ruby: &Ruby) -> RbResult<()> {
     class.define_method("files", method!(RawDeltaTable::files, 1))?;
     class.define_method("file_uris", method!(RawDeltaTable::file_uris, 1))?;
     class.define_method("schema", method!(RawDeltaTable::schema, 0))?;
-    class.define_method("vacuum", method!(RawDeltaTable::vacuum, 3))?;
+    class.define_method("vacuum", method!(RawDeltaTable::vacuum, 5))?;
     class.define_method(
         "compact_optimize",
-        method!(RawDeltaTable::compact_optimize, 3),
+        method!(RawDeltaTable::compact_optimize, 7),
     )?;
     class.define_method(
         "z_order_optimize",
-        method!(RawDeltaTable::z_order_optimize, 5),
+        method!(RawDeltaTable::z_order_optimize, 9),
     )?;
+    class.define_method("add_columns", method!(RawDeltaTable::add_columns, 1))?;
+    class.define_method("add_feature", method!(RawDeltaTable::add_feature, 2))?;
     class.define_method(
         "add_constraints",
         method!(RawDeltaTable::add_constraints, 1),
@@ -799,7 +1325,12 @@ fn init(ruby: &Ruby) -> RbResult<()> {
         method!(RawDeltaTable::drop_constraints, 2),
     )?;
     class.define_method("load_cdf", method!(RawDeltaTable::load_cdf, 5))?;
-    class.define_method("restore", method!(RawDeltaTable::restore, 3))?;
+    class.define_method(
+        "create_merge_builder",
+        method!(RawDeltaTable::create_merge_builder, 8),
+    )?;
+    class.define_method("merge_execute", method!(RawDeltaTable::merge_execute, 1))?;
+    class.define_method("restore", method!(RawDeltaTable::restore, 4))?;
     class.define_method("history", method!(RawDeltaTable::history, 1))?;
     class.define_method(
         "update_incremental",
@@ -809,8 +1340,24 @@ fn init(ruby: &Ruby) -> RbResult<()> {
         "get_active_partitions",
         method!(RawDeltaTable::get_active_partitions, 0),
     )?;
-    class.define_method("delete", method!(RawDeltaTable::delete, 1))?;
-    class.define_method("repair", method!(RawDeltaTable::repair, 1))?;
+    class.define_method(
+        "create_checkpoint",
+        method!(RawDeltaTable::create_checkpoint, 0),
+    )?;
+    class.define_method(
+        "cleanup_metadata",
+        method!(RawDeltaTable::cleanup_metadata, 0),
+    )?;
+    class.define_method(
+        "get_add_file_sizes",
+        method!(RawDeltaTable::get_add_file_sizes, 0),
+    )?;
+    class.define_method("delete", method!(RawDeltaTable::delete, 4))?;
+    class.define_method(
+        "set_table_properties",
+        method!(RawDeltaTable::set_table_properties, 2),
+    )?;
+    class.define_method("repair", method!(RawDeltaTable::repair, 3))?;
     class.define_method(
         "transaction_versions",
         method!(RawDeltaTable::transaction_versions, 0),
@@ -844,5 +1391,29 @@ fn init(ruby: &Ruby) -> RbResult<()> {
     class.define_method("type", method!(Field::get_type, 0))?;
     class.define_method("nullable", method!(Field::nullable, 0))?;
 
+    let class = module.define_class("RbMergeBuilder", ruby.class_object())?;
+    class.define_method("source_alias", method!(RbMergeBuilder::source_alias, 0))?;
+    class.define_method("target_alias", method!(RbMergeBuilder::target_alias, 0))?;
+    class.define_method(
+        "when_matched_update",
+        method!(RbMergeBuilder::when_matched_update, 2),
+    )?;
+    class.define_method(
+        "when_matched_delete",
+        method!(RbMergeBuilder::when_matched_delete, 1),
+    )?;
+    class.define_method(
+        "when_not_matched_insert",
+        method!(RbMergeBuilder::when_not_matched_insert, 2),
+    )?;
+    class.define_method(
+        "when_not_matched_by_source_update",
+        method!(RbMergeBuilder::when_not_matched_by_source_update, 2),
+    )?;
+    class.define_method(
+        "when_not_matched_by_source_delete",
+        method!(RbMergeBuilder::when_not_matched_by_source_delete, 1),
+    )?;
+
     Ok(())
 }