deltalake-rb 0.1.1 → 0.1.2

@@ -1,4 +1,6 @@
  mod error;
+ mod features;
+ mod merge;
  mod schema;
  mod utils;

@@ -9,12 +11,16 @@ use std::str::FromStr;
  use std::time;

  use chrono::{DateTime, Duration, FixedOffset, Utc};
+ use delta_kernel::schema::StructField;
  use deltalake::arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream};
  use deltalake::arrow::record_batch::RecordBatchIterator;
+ use deltalake::checkpoints::{cleanup_metadata, create_checkpoint};
  use deltalake::datafusion::physical_plan::ExecutionPlan;
  use deltalake::datafusion::prelude::SessionContext;
  use deltalake::errors::DeltaTableError;
  use deltalake::kernel::{scalars::ScalarExt, StructType, Transaction};
+ use deltalake::operations::add_column::AddColumnBuilder;
+ use deltalake::operations::add_feature::AddTableFeatureBuilder;
  use deltalake::operations::collect_sendable_stream;
  use deltalake::operations::constraints::ConstraintBuilder;
  use deltalake::operations::delete::DeleteBuilder;
@@ -23,25 +29,49 @@ use deltalake::operations::filesystem_check::FileSystemCheckBuilder;
  use deltalake::operations::load_cdf::CdfLoadBuilder;
  use deltalake::operations::optimize::{OptimizeBuilder, OptimizeType};
  use deltalake::operations::restore::RestoreBuilder;
- use deltalake::operations::transaction::TableReference;
+ use deltalake::operations::set_tbl_properties::SetTablePropertiesBuilder;
+ use deltalake::operations::transaction::{CommitProperties, TableReference};
  use deltalake::operations::vacuum::VacuumBuilder;
+ use deltalake::parquet::basic::Compression;
+ use deltalake::parquet::errors::ParquetError;
+ use deltalake::parquet::file::properties::WriterProperties;
  use deltalake::partitions::PartitionFilter;
  use deltalake::storage::IORuntime;
- use deltalake::DeltaOps;
+ use deltalake::{DeltaOps, DeltaResult};
  use error::DeltaError;
  use futures::future::join_all;

  use magnus::{
-     exception, function, method, prelude::*, Error, Integer, Module, RArray, RHash, Ruby, Value,
+     function, method, prelude::*, typed_data::Obj, Error, Integer, Module, RArray, RHash, Ruby,
+     TryConvert, Value,
  };
+ use serde_json::Map;

  use crate::error::DeltaProtocolError;
+ use crate::error::RbValueError;
  use crate::error::RubyError;
+ use crate::features::TableFeatures;
+ use crate::merge::RbMergeBuilder;
  use crate::schema::{schema_to_rbobject, Field};
  use crate::utils::rt;

  type RbResult<T> = Result<T, Error>;

+ enum PartitionFilterValue {
+     Single(String),
+     Multiple(Vec<String>),
+ }
+
+ impl TryConvert for PartitionFilterValue {
+     fn try_convert(val: Value) -> RbResult<Self> {
+         if let Ok(v) = Vec::<String>::try_convert(val) {
+             Ok(PartitionFilterValue::Multiple(v))
+         } else {
+             Ok(PartitionFilterValue::Single(String::try_convert(val)?))
+         }
+     }
+ }
+
  #[magnus::wrap(class = "DeltaLake::RawDeltaTable")]
  struct RawDeltaTable {
      _table: RefCell<deltalake::DeltaTable>,
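The `PartitionFilterValue` conversion above lets a filter value arrive from Ruby as either a single string or an array of strings. For orientation, a minimal sketch of the (column, op, value) tuples that the reworked `files`/`file_uris` signatures accept; the enum is redeclared so the snippet compiles on its own, and the filter values are invented:

enum PartitionFilterValue {
    Single(String),
    Multiple(Vec<String>),
}

fn main() {
    // Mirrors the new Option<Vec<(String, String, PartitionFilterValue)>> parameter.
    let filters = vec![
        // year = "2021" (converted from a Ruby String)
        (
            "year".to_string(),
            "=".to_string(),
            PartitionFilterValue::Single("2021".into()),
        ),
        // month IN ("01", "02") (converted from a Ruby Array of Strings)
        (
            "month".to_string(),
            "in".to_string(),
            PartitionFilterValue::Multiple(vec!["01".into(), "02".into()]),
        ),
    ];
    assert_eq!(filters.len(), 2);
}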
@@ -57,14 +87,29 @@ struct RawDeltaTableMetaData {
      configuration: HashMap<String, Option<String>>,
  }

- #[magnus::wrap(class = "DeltaLake::ArrowArrayStream")]
- pub struct ArrowArrayStream {
-     stream: FFI_ArrowArrayStream,
- }
+ impl RawDeltaTableMetaData {
+     fn id(&self) -> String {
+         self.id.clone()
+     }

- impl ArrowArrayStream {
-     pub fn to_i(&self) -> usize {
-         (&self.stream as *const _) as usize
+     fn name(&self) -> Option<String> {
+         self.name.clone()
+     }
+
+     fn description(&self) -> Option<String> {
+         self.description.clone()
+     }
+
+     fn partition_columns(&self) -> Vec<String> {
+         self.partition_columns.clone()
+     }
+
+     fn created_time(&self) -> Option<i64> {
+         self.created_time
+     }
+
+     fn configuration(&self) -> HashMap<String, Option<String>> {
+         self.configuration.clone()
      }
  }

@@ -218,20 +263,19 @@ impl RawDeltaTable {
      }

      pub fn load_with_datetime(&self, ds: String) -> RbResult<()> {
-         let datetime = DateTime::<Utc>::from(
-             DateTime::<FixedOffset>::parse_from_rfc3339(&ds).map_err(|err| {
-                 Error::new(
-                     exception::arg_error(),
-                     format!("Failed to parse datetime string: {err}"),
-                 )
-             })?,
-         );
+         let datetime =
+             DateTime::<Utc>::from(DateTime::<FixedOffset>::parse_from_rfc3339(&ds).map_err(
+                 |err| RbValueError::new_err(format!("Failed to parse datetime string: {err}")),
+             )?);
          Ok(rt()
              .block_on(self._table.borrow_mut().load_with_datetime(datetime))
              .map_err(RubyError::from)?)
      }

-     pub fn files(&self, partition_filters: Option<Value>) -> RbResult<Vec<String>> {
+     pub fn files(
+         &self,
+         partition_filters: Option<Vec<(String, String, PartitionFilterValue)>>,
+     ) -> RbResult<Vec<String>> {
          if !self.has_files()? {
              return Err(DeltaError::new_err("Table is instantiated without files."));
          }
@@ -257,7 +301,10 @@ impl RawDeltaTable {
          }
      }

-     pub fn file_uris(&self, partition_filters: Option<Value>) -> RbResult<Vec<String>> {
+     pub fn file_uris(
+         &self,
+         partition_filters: Option<Vec<(String, String, PartitionFilterValue)>>,
+     ) -> RbResult<Vec<String>> {
          if !self._table.borrow().config.require_files {
              return Err(DeltaError::new_err("Table is initiated without files."));
          }
@@ -290,6 +337,8 @@ impl RawDeltaTable {
          dry_run: bool,
          retention_hours: Option<u64>,
          enforce_retention_duration: bool,
+         commit_properties: Option<RbCommitProperties>,
+         post_commithook_properties: Option<RbPostCommitHookProperties>,
      ) -> RbResult<Vec<String>> {
          let mut cmd = VacuumBuilder::new(
              self._table.borrow().log_store(),
@@ -305,6 +354,11 @@ impl RawDeltaTable {
              cmd = cmd.with_retention_period(Duration::hours(retention_period as i64));
          }

+         if let Some(commit_properties) =
+             maybe_create_commit_properties(commit_properties, post_commithook_properties)
+         {
+             cmd = cmd.with_commit_properties(commit_properties);
+         }
          let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
          self._table.borrow_mut().state = table.state;
          Ok(metrics.files_deleted)
@@ -312,9 +366,13 @@ impl RawDeltaTable {

      pub fn compact_optimize(
          &self,
+         partition_filters: Option<Vec<(String, String, PartitionFilterValue)>>,
          target_size: Option<i64>,
          max_concurrent_tasks: Option<usize>,
          min_commit_interval: Option<u64>,
+         writer_properties: Option<RbWriterProperties>,
+         commit_properties: Option<RbCommitProperties>,
+         post_commithook_properties: Option<RbPostCommitHookProperties>,
      ) -> RbResult<String> {
          let mut cmd = OptimizeBuilder::new(
              self._table.borrow().log_store(),
@@ -332,6 +390,22 @@ impl RawDeltaTable {
              cmd = cmd.with_min_commit_interval(time::Duration::from_secs(commit_interval));
          }

+         if let Some(writer_props) = writer_properties {
+             cmd = cmd.with_writer_properties(
+                 set_writer_properties(writer_props).map_err(RubyError::from)?,
+             );
+         }
+
+         if let Some(commit_properties) =
+             maybe_create_commit_properties(commit_properties, post_commithook_properties)
+         {
+             cmd = cmd.with_commit_properties(commit_properties);
+         }
+
+         let converted_filters = convert_partition_filters(partition_filters.unwrap_or_default())
+             .map_err(RubyError::from)?;
+         cmd = cmd.with_filters(&converted_filters);
+
          let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
          self._table.borrow_mut().state = table.state;
          Ok(serde_json::to_string(&metrics).unwrap())
@@ -340,10 +414,14 @@ impl RawDeltaTable {
      pub fn z_order_optimize(
          &self,
          z_order_columns: Vec<String>,
+         partition_filters: Option<Vec<(String, String, PartitionFilterValue)>>,
          target_size: Option<i64>,
          max_concurrent_tasks: Option<usize>,
          max_spill_size: usize,
          min_commit_interval: Option<u64>,
+         writer_properties: Option<RbWriterProperties>,
+         commit_properties: Option<RbCommitProperties>,
+         post_commithook_properties: Option<RbPostCommitHookProperties>,
      ) -> RbResult<String> {
          let mut cmd = OptimizeBuilder::new(
              self._table.borrow().log_store(),
@@ -363,11 +441,75 @@ impl RawDeltaTable {
              cmd = cmd.with_min_commit_interval(time::Duration::from_secs(commit_interval));
          }

+         if let Some(writer_props) = writer_properties {
+             cmd = cmd.with_writer_properties(
+                 set_writer_properties(writer_props).map_err(RubyError::from)?,
+             );
+         }
+
+         if let Some(commit_properties) =
+             maybe_create_commit_properties(commit_properties, post_commithook_properties)
+         {
+             cmd = cmd.with_commit_properties(commit_properties);
+         }
+
+         let converted_filters = convert_partition_filters(partition_filters.unwrap_or_default())
+             .map_err(RubyError::from)?;
+         cmd = cmd.with_filters(&converted_filters);
+
          let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
          self._table.borrow_mut().state = table.state;
          Ok(serde_json::to_string(&metrics).unwrap())
      }

+     pub fn add_columns(&self, fields: RArray) -> RbResult<()> {
+         let fields = fields.typecheck::<Obj<Field>>()?;
+         let mut cmd = AddColumnBuilder::new(
+             self._table.borrow().log_store(),
+             self._table
+                 .borrow()
+                 .snapshot()
+                 .map_err(RubyError::from)?
+                 .clone(),
+         );
+
+         let new_fields = fields
+             .iter()
+             .map(|v| v.inner.clone())
+             .collect::<Vec<StructField>>();
+
+         cmd = cmd.with_fields(new_fields);
+
+         let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+         self._table.borrow_mut().state = table.state;
+         Ok(())
+     }
+
+     pub fn add_feature(
+         &self,
+         feature: RArray,
+         allow_protocol_versions_increase: bool,
+     ) -> RbResult<()> {
+         let feature = feature
+             .into_iter()
+             .map(|v| TableFeatures::try_convert(v))
+             .collect::<RbResult<Vec<_>>>()?;
+         let cmd = AddTableFeatureBuilder::new(
+             self._table.borrow().log_store(),
+             self._table
+                 .borrow()
+                 .snapshot()
+                 .map_err(RubyError::from)?
+                 .clone(),
+         )
+         .with_features(feature)
+         .with_allow_protocol_versions_increase(allow_protocol_versions_increase);
+
+         let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+         self._table.borrow_mut().state = table.state;
+         Ok(())
+     }
+
      pub fn add_constraints(&self, constraints: HashMap<String, String>) -> RbResult<()> {
          let mut cmd = ConstraintBuilder::new(
              self._table.borrow().log_store(),
@@ -428,13 +570,13 @@ impl RawDeltaTable {
          }
          if let Some(st) = starting_timestamp {
              let starting_ts: DateTime<Utc> = DateTime::<Utc>::from_str(&st)
-                 .map_err(|pe| Error::new(exception::arg_error(), pe.to_string()))?
+                 .map_err(|pe| RbValueError::new_err(pe.to_string()))?
                  .to_utc();
              cdf_read = cdf_read.with_starting_timestamp(starting_ts);
          }
          if let Some(et) = ending_timestamp {
              let ending_ts = DateTime::<Utc>::from_str(&et)
-                 .map_err(|pe| Error::new(exception::arg_error(), pe.to_string()))?
+                 .map_err(|pe| RbValueError::new_err(pe.to_string()))?
                  .to_utc();
              cdf_read = cdf_read.with_starting_timestamp(ending_ts);
          }
@@ -470,11 +612,48 @@ impl RawDeltaTable {
          Ok(ArrowArrayStream { stream: ffi_stream })
      }

+     pub fn create_merge_builder(
+         &self,
+         source: RbArrowType<ArrowArrayStreamReader>,
+         predicate: String,
+         source_alias: Option<String>,
+         target_alias: Option<String>,
+         safe_cast: bool,
+         writer_properties: Option<RbWriterProperties>,
+         post_commithook_properties: Option<RbPostCommitHookProperties>,
+         commit_properties: Option<RbCommitProperties>,
+     ) -> RbResult<RbMergeBuilder> {
+         Ok(RbMergeBuilder::new(
+             self._table.borrow().log_store(),
+             self._table
+                 .borrow()
+                 .snapshot()
+                 .map_err(RubyError::from)?
+                 .clone(),
+             source.0,
+             predicate,
+             source_alias,
+             target_alias,
+             safe_cast,
+             writer_properties,
+             post_commithook_properties,
+             commit_properties,
+         )
+         .map_err(RubyError::from)?)
+     }
+
+     pub fn merge_execute(&self, merge_builder: &RbMergeBuilder) -> RbResult<String> {
+         let (table, metrics) = merge_builder.execute().map_err(RubyError::from)?;
+         self._table.borrow_mut().state = table.state;
+         Ok(metrics)
+     }
+
      pub fn restore(
          &self,
          target: Option<Value>,
          ignore_missing_files: bool,
          protocol_downgrade_allowed: bool,
+         commit_properties: Option<RbCommitProperties>,
      ) -> RbResult<String> {
          let mut cmd = RestoreBuilder::new(
              self._table.borrow().log_store(),
@@ -491,10 +670,7 @@ impl RawDeltaTable {
              if let Ok(ds) = String::try_convert(val) {
                  let datetime = DateTime::<Utc>::from(
                      DateTime::<FixedOffset>::parse_from_rfc3339(ds.as_ref()).map_err(|err| {
-                         Error::new(
-                             exception::arg_error(),
-                             format!("Failed to parse datetime string: {err}"),
-                         )
+                         RbValueError::new_err(format!("Failed to parse datetime string: {err}"))
                      })?,
                  );
                  cmd = cmd.with_datetime_to_restore(datetime)
@@ -503,6 +679,10 @@ impl RawDeltaTable {
          cmd = cmd.with_ignore_missing_files(ignore_missing_files);
          cmd = cmd.with_protocol_downgrade_allowed(protocol_downgrade_allowed);

+         if let Some(commit_properties) = maybe_create_commit_properties(commit_properties, None) {
+             cmd = cmd.with_commit_properties(commit_properties);
+         }
+
          let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
          self._table.borrow_mut().state = table.state;
          Ok(serde_json::to_string(&metrics).unwrap())
@@ -575,7 +755,39 @@ impl RawDeltaTable {
          Ok(RArray::from_iter(active_partitions))
      }

-     pub fn delete(&self, predicate: Option<String>) -> RbResult<String> {
+     pub fn create_checkpoint(&self) -> RbResult<()> {
+         rt().block_on(create_checkpoint(&self._table.borrow()))
+             .map_err(RubyError::from)?;
+
+         Ok(())
+     }
+
+     pub fn cleanup_metadata(&self) -> RbResult<()> {
+         rt().block_on(cleanup_metadata(&self._table.borrow()))
+             .map_err(RubyError::from)?;
+
+         Ok(())
+     }
+
+     pub fn get_add_file_sizes(&self) -> RbResult<HashMap<String, i64>> {
+         Ok(self
+             ._table
+             .borrow()
+             .snapshot()
+             .map_err(RubyError::from)?
+             .eager_snapshot()
+             .files()
+             .map(|f| (f.path().to_string(), f.size()))
+             .collect::<HashMap<String, i64>>())
+     }
+
+     pub fn delete(
+         &self,
+         predicate: Option<String>,
+         writer_properties: Option<RbWriterProperties>,
+         commit_properties: Option<RbCommitProperties>,
+         post_commithook_properties: Option<RbPostCommitHookProperties>,
+     ) -> RbResult<String> {
          let mut cmd = DeleteBuilder::new(
              self._table.borrow().log_store(),
              self._table
@@ -587,14 +799,50 @@ impl RawDeltaTable {
          if let Some(predicate) = predicate {
              cmd = cmd.with_predicate(predicate);
          }
+         if let Some(writer_props) = writer_properties {
+             cmd = cmd.with_writer_properties(
+                 set_writer_properties(writer_props).map_err(RubyError::from)?,
+             );
+         }
+         if let Some(commit_properties) =
+             maybe_create_commit_properties(commit_properties, post_commithook_properties)
+         {
+             cmd = cmd.with_commit_properties(commit_properties);
+         }

          let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
          self._table.borrow_mut().state = table.state;
          Ok(serde_json::to_string(&metrics).unwrap())
      }

-     pub fn repair(&self, dry_run: bool) -> RbResult<String> {
-         let cmd = FileSystemCheckBuilder::new(
+     pub fn set_table_properties(
+         &self,
+         properties: HashMap<String, String>,
+         raise_if_not_exists: bool,
+     ) -> RbResult<()> {
+         let cmd = SetTablePropertiesBuilder::new(
+             self._table.borrow().log_store(),
+             self._table
+                 .borrow()
+                 .snapshot()
+                 .map_err(RubyError::from)?
+                 .clone(),
+         )
+         .with_properties(properties)
+         .with_raise_if_not_exists(raise_if_not_exists);
+
+         let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+         self._table.borrow_mut().state = table.state;
+         Ok(())
+     }
+
+     pub fn repair(
+         &self,
+         dry_run: bool,
+         commit_properties: Option<RbCommitProperties>,
+         post_commithook_properties: Option<RbPostCommitHookProperties>,
+     ) -> RbResult<String> {
+         let mut cmd = FileSystemCheckBuilder::new(
              self._table.borrow().log_store(),
              self._table
                  .borrow()
@@ -604,6 +852,12 @@ impl RawDeltaTable {
          )
          .with_dry_run(dry_run);

+         if let Some(commit_properties) =
+             maybe_create_commit_properties(commit_properties, post_commithook_properties)
+         {
+             cmd = cmd.with_commit_properties(commit_properties);
+         }
+
          let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
          self._table.borrow_mut().state = table.state;
          Ok(serde_json::to_string(&metrics).unwrap())
@@ -620,35 +874,240 @@ impl RawDeltaTable {
      }
  }

+ fn set_post_commithook_properties(
+     mut commit_properties: CommitProperties,
+     post_commithook_properties: RbPostCommitHookProperties,
+ ) -> CommitProperties {
+     commit_properties =
+         commit_properties.with_create_checkpoint(post_commithook_properties.create_checkpoint);
+     commit_properties = commit_properties
+         .with_cleanup_expired_logs(post_commithook_properties.cleanup_expired_logs);
+     commit_properties
+ }
+
+ fn set_writer_properties(writer_properties: RbWriterProperties) -> DeltaResult<WriterProperties> {
+     let mut properties = WriterProperties::builder();
+     let data_page_size_limit = writer_properties.data_page_size_limit;
+     let dictionary_page_size_limit = writer_properties.dictionary_page_size_limit;
+     let data_page_row_count_limit = writer_properties.data_page_row_count_limit;
+     let write_batch_size = writer_properties.write_batch_size;
+     let max_row_group_size = writer_properties.max_row_group_size;
+     let compression = writer_properties.compression;
+     let statistics_truncate_length = writer_properties.statistics_truncate_length;
+     let default_column_properties = writer_properties.default_column_properties;
+     let column_properties = writer_properties.column_properties;
+
+     if let Some(data_page_size) = data_page_size_limit {
+         properties = properties.set_data_page_size_limit(data_page_size);
+     }
+     if let Some(dictionary_page_size) = dictionary_page_size_limit {
+         properties = properties.set_dictionary_page_size_limit(dictionary_page_size);
+     }
+     if let Some(data_page_row_count) = data_page_row_count_limit {
+         properties = properties.set_data_page_row_count_limit(data_page_row_count);
+     }
+     if let Some(batch_size) = write_batch_size {
+         properties = properties.set_write_batch_size(batch_size);
+     }
+     if let Some(row_group_size) = max_row_group_size {
+         properties = properties.set_max_row_group_size(row_group_size);
+     }
+     properties = properties.set_statistics_truncate_length(statistics_truncate_length);
+
+     if let Some(compression) = compression {
+         let compress: Compression = compression
+             .parse()
+             .map_err(|err: ParquetError| DeltaTableError::Generic(err.to_string()))?;
+
+         properties = properties.set_compression(compress);
+     }
+
+     if let Some(default_column_properties) = default_column_properties {
+         if let Some(dictionary_enabled) = default_column_properties.dictionary_enabled {
+             properties = properties.set_dictionary_enabled(dictionary_enabled);
+         }
+         if let Some(max_statistics_size) = default_column_properties.max_statistics_size {
+             properties = properties.set_max_statistics_size(max_statistics_size);
+         }
+         if let Some(bloom_filter_properties) = default_column_properties.bloom_filter_properties {
+             if let Some(set_bloom_filter_enabled) = bloom_filter_properties.set_bloom_filter_enabled
+             {
+                 properties = properties.set_bloom_filter_enabled(set_bloom_filter_enabled);
+             }
+             if let Some(bloom_filter_fpp) = bloom_filter_properties.fpp {
+                 properties = properties.set_bloom_filter_fpp(bloom_filter_fpp);
+             }
+             if let Some(bloom_filter_ndv) = bloom_filter_properties.ndv {
+                 properties = properties.set_bloom_filter_ndv(bloom_filter_ndv);
+             }
+         }
+     }
+     if let Some(column_properties) = column_properties {
+         for (column_name, column_prop) in column_properties {
+             if let Some(column_prop) = column_prop {
+                 if let Some(dictionary_enabled) = column_prop.dictionary_enabled {
+                     properties = properties.set_column_dictionary_enabled(
+                         column_name.clone().into(),
+                         dictionary_enabled,
+                     );
+                 }
+                 if let Some(bloom_filter_properties) = column_prop.bloom_filter_properties {
+                     if let Some(set_bloom_filter_enabled) =
+                         bloom_filter_properties.set_bloom_filter_enabled
+                     {
+                         properties = properties.set_column_bloom_filter_enabled(
+                             column_name.clone().into(),
+                             set_bloom_filter_enabled,
+                         );
+                     }
+                     if let Some(bloom_filter_fpp) = bloom_filter_properties.fpp {
+                         properties = properties.set_column_bloom_filter_fpp(
+                             column_name.clone().into(),
+                             bloom_filter_fpp,
+                         );
+                     }
+                     if let Some(bloom_filter_ndv) = bloom_filter_properties.ndv {
+                         properties = properties
+                             .set_column_bloom_filter_ndv(column_name.into(), bloom_filter_ndv);
+                     }
+                 }
+             }
+         }
+     }
+     Ok(properties.build())
+ }
+
  fn convert_partition_filters(
-     _partitions_filters: Value,
+     partitions_filters: Vec<(String, String, PartitionFilterValue)>,
  ) -> Result<Vec<PartitionFilter>, DeltaTableError> {
-     todo!()
+     partitions_filters
+         .into_iter()
+         .map(|filter| match filter {
+             (key, op, PartitionFilterValue::Single(v)) => {
+                 let key: &'_ str = key.as_ref();
+                 let op: &'_ str = op.as_ref();
+                 let v: &'_ str = v.as_ref();
+                 PartitionFilter::try_from((key, op, v))
+             }
+             (key, op, PartitionFilterValue::Multiple(v)) => {
+                 let key: &'_ str = key.as_ref();
+                 let op: &'_ str = op.as_ref();
+                 let v: Vec<&'_ str> = v.iter().map(|v| v.as_ref()).collect();
+                 PartitionFilter::try_from((key, op, v.as_slice()))
+             }
+         })
+         .collect()
  }

- impl RawDeltaTableMetaData {
-     fn id(&self) -> String {
-         self.id.clone()
+ fn maybe_create_commit_properties(
+     maybe_commit_properties: Option<RbCommitProperties>,
+     post_commithook_properties: Option<RbPostCommitHookProperties>,
+ ) -> Option<CommitProperties> {
+     if maybe_commit_properties.is_none() && post_commithook_properties.is_none() {
+         return None;
+     }
+     let mut commit_properties = CommitProperties::default();
+
+     if let Some(commit_props) = maybe_commit_properties {
+         if let Some(metadata) = commit_props.custom_metadata {
+             let json_metadata: Map<String, serde_json::Value> =
+                 metadata.into_iter().map(|(k, v)| (k, v.into())).collect();
+             commit_properties = commit_properties.with_metadata(json_metadata);
+         };
+
+         if let Some(max_retries) = commit_props.max_commit_retries {
+             commit_properties = commit_properties.with_max_retries(max_retries);
+         };
+
+         if let Some(app_transactions) = commit_props.app_transactions {
+             let app_transactions = app_transactions.iter().map(Transaction::from).collect();
+             commit_properties = commit_properties.with_application_transactions(app_transactions);
+         }
      }

-     fn name(&self) -> Option<String> {
-         self.name.clone()
+     if let Some(post_commit_hook_props) = post_commithook_properties {
+         commit_properties =
+             set_post_commithook_properties(commit_properties, post_commit_hook_props)
      }
+     Some(commit_properties)
+ }

-     fn description(&self) -> Option<String> {
-         self.description.clone()
+ fn rust_core_version() -> String {
+     deltalake::crate_version().to_string()
+ }
+
+ pub struct BloomFilterProperties {
+     pub set_bloom_filter_enabled: Option<bool>,
+     pub fpp: Option<f64>,
+     pub ndv: Option<u64>,
+ }
+
+ impl TryConvert for BloomFilterProperties {
+     fn try_convert(val: Value) -> RbResult<Self> {
+         Ok(BloomFilterProperties {
+             set_bloom_filter_enabled: val.funcall("set_bloom_filter_enabled", ())?,
+             fpp: val.funcall("fpp", ())?,
+             ndv: val.funcall("ndv", ())?,
+         })
      }
+ }

-     fn partition_columns(&self) -> Vec<String> {
-         self.partition_columns.clone()
+ pub struct ColumnProperties {
+     pub dictionary_enabled: Option<bool>,
+     pub max_statistics_size: Option<usize>,
+     pub bloom_filter_properties: Option<BloomFilterProperties>,
+ }
+
+ impl TryConvert for ColumnProperties {
+     fn try_convert(val: Value) -> RbResult<Self> {
+         Ok(ColumnProperties {
+             dictionary_enabled: val.funcall("dictionary_enabled", ())?,
+             max_statistics_size: val.funcall("max_statistics_size", ())?,
+             bloom_filter_properties: val.funcall("bloom_filter_properties", ())?,
+         })
      }
+ }

-     fn created_time(&self) -> Option<i64> {
-         self.created_time
+ pub struct RbWriterProperties {
+     data_page_size_limit: Option<usize>,
+     dictionary_page_size_limit: Option<usize>,
+     data_page_row_count_limit: Option<usize>,
+     write_batch_size: Option<usize>,
+     max_row_group_size: Option<usize>,
+     statistics_truncate_length: Option<usize>,
+     compression: Option<String>,
+     default_column_properties: Option<ColumnProperties>,
+     column_properties: Option<HashMap<String, Option<ColumnProperties>>>,
+ }
+
+ impl TryConvert for RbWriterProperties {
+     fn try_convert(val: Value) -> RbResult<Self> {
+         Ok(RbWriterProperties {
+             data_page_size_limit: val.funcall("data_page_size_limit", ())?,
+             dictionary_page_size_limit: val.funcall("dictionary_page_size_limit", ())?,
+             data_page_row_count_limit: val.funcall("data_page_row_count_limit", ())?,
+             write_batch_size: val.funcall("write_batch_size", ())?,
+             max_row_group_size: val.funcall("max_row_group_size", ())?,
+             statistics_truncate_length: val.funcall("statistics_truncate_length", ())?,
+             compression: val.funcall("compression", ())?,
+             default_column_properties: val.funcall("default_column_properties", ())?,
+             // TODO fix
+             column_properties: None,
+         })
      }
+ }

-     fn configuration(&self) -> HashMap<String, Option<String>> {
-         self.configuration.clone()
+ pub struct RbPostCommitHookProperties {
+     create_checkpoint: bool,
+     cleanup_expired_logs: Option<bool>,
+ }
+
+ impl TryConvert for RbPostCommitHookProperties {
+     fn try_convert(val: Value) -> RbResult<Self> {
+         Ok(RbPostCommitHookProperties {
+             create_checkpoint: val.funcall("create_checkpoint", ())?,
+             cleanup_expired_logs: val.funcall("cleanup_expired_logs", ())?,
+         })
      }
  }

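`convert_partition_filters` above leans on deltalake's tuple `TryFrom` impls for `PartitionFilter`, one for a single value and one for a slice of values, exactly as invoked in the two match arms. A standalone sketch of those two conversions (column names and values here are invented for illustration):

use deltalake::errors::DeltaTableError;
use deltalake::partitions::PartitionFilter;

fn demo_filters() -> Result<(), DeltaTableError> {
    // Scalar comparison: year = "2021"
    let single = PartitionFilter::try_from(("year", "=", "2021"))?;
    // Set membership: month IN ("01", "02")
    let values: Vec<&str> = vec!["01", "02"];
    let multiple = PartitionFilter::try_from(("month", "in", values.as_slice()))?;
    // Both forms yield a PartitionFilter ready to pass to with_filters(&[...]).
    let _ = (single, multiple);
    Ok(())
}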
@@ -669,10 +1128,37 @@ impl From<Transaction> for RbTransaction {
      }
  }

+ impl From<&RbTransaction> for Transaction {
+     fn from(value: &RbTransaction) -> Self {
+         Transaction {
+             app_id: value.app_id.clone(),
+             version: value.version,
+             last_updated: value.last_updated,
+         }
+     }
+ }
+
+ pub struct RbCommitProperties {
+     custom_metadata: Option<HashMap<String, String>>,
+     max_commit_retries: Option<usize>,
+     app_transactions: Option<Vec<RbTransaction>>,
+ }
+
+ impl TryConvert for RbCommitProperties {
+     fn try_convert(val: Value) -> RbResult<Self> {
+         Ok(RbCommitProperties {
+             custom_metadata: val.funcall("custom_metadata", ())?,
+             max_commit_retries: val.funcall("max_commit_retries", ())?,
+             // TODO fix
+             app_transactions: None,
+         })
+     }
+ }
+
  #[allow(clippy::too_many_arguments)]
  fn write_to_deltalake(
      table_uri: String,
-     data: Value,
+     data: RbArrowType<ArrowArrayStreamReader>,
      mode: String,
      table: Option<&RawDeltaTable>,
      schema_mode: Option<String>,
@@ -683,16 +1169,11 @@ fn write_to_deltalake(
      description: Option<String>,
      configuration: Option<HashMap<String, Option<String>>>,
      storage_options: Option<HashMap<String, String>>,
+     writer_properties: Option<RbWriterProperties>,
+     commit_properties: Option<RbCommitProperties>,
+     post_commithook_properties: Option<RbPostCommitHookProperties>,
  ) -> RbResult<()> {
-     let capsule_pointer: usize = data.funcall("to_i", ())?;
-
-     // use similar approach as Polars to avoid copy
-     let stream_ptr =
-         Box::new(unsafe { std::ptr::replace(capsule_pointer as _, FFI_ArrowArrayStream::empty()) });
-     let stream = ArrowArrayStreamReader::try_new(*stream_ptr)
-         .map_err(|err| DeltaError::new_err(err.to_string()))?;
-
-     let batches = stream.map(|batch| batch.unwrap()).collect::<Vec<_>>();
+     let batches = data.0.map(|batch| batch.unwrap()).collect::<Vec<_>>();
      let save_mode = mode.parse().map_err(RubyError::from)?;

      let options = storage_options.clone().unwrap_or_default();
@@ -713,6 +1194,12 @@ fn write_to_deltalake(
          builder = builder.with_partition_columns(partition_columns);
      }

+     if let Some(writer_props) = writer_properties {
+         builder = builder.with_writer_properties(
+             set_writer_properties(writer_props).map_err(RubyError::from)?,
+         );
+     }
+
      if let Some(name) = &name {
          builder = builder.with_table_name(name);
      };
@@ -733,18 +1220,55 @@ fn write_to_deltalake(
          builder = builder.with_configuration(config);
      };

+     if let Some(commit_properties) =
+         maybe_create_commit_properties(commit_properties, post_commithook_properties)
+     {
+         builder = builder.with_commit_properties(commit_properties);
+     };
+
      rt().block_on(builder.into_future())
          .map_err(RubyError::from)?;

      Ok(())
  }

+ pub struct RbArrowType<T>(pub T);
+
+ impl TryConvert for RbArrowType<ArrowArrayStreamReader> {
+     fn try_convert(val: Value) -> RbResult<Self> {
+         let addr: usize = val.funcall("to_i", ())?;
+
+         // use similar approach as Polars to consume pointer and avoid copy
+         let stream_ptr =
+             Box::new(unsafe { std::ptr::replace(addr as _, FFI_ArrowArrayStream::empty()) });
+
+         Ok(RbArrowType(
+             ArrowArrayStreamReader::try_new(*stream_ptr)
+                 .map_err(|err| DeltaError::new_err(err.to_string()))?,
+         ))
+     }
+ }
+
+ #[magnus::wrap(class = "DeltaLake::ArrowArrayStream")]
+ pub struct ArrowArrayStream {
+     stream: FFI_ArrowArrayStream,
+ }
+
+ impl ArrowArrayStream {
+     pub fn to_i(&self) -> usize {
+         (&self.stream as *const _) as usize
+     }
+ }
+
  #[magnus::init]
  fn init(ruby: &Ruby) -> RbResult<()> {
      deltalake::aws::register_handlers(None);
+     deltalake::azure::register_handlers(None);
+     deltalake::gcp::register_handlers(None);

      let module = ruby.define_module("DeltaLake")?;
-     module.define_singleton_method("write_deltalake_rust", function!(write_to_deltalake, 12))?;
+     module.define_singleton_method("write_deltalake_rust", function!(write_to_deltalake, 15))?;
+     module.define_singleton_method("rust_core_version", function!(rust_core_version, 0))?;

      let class = module.define_class("RawDeltaTable", ruby.class_object())?;
      class.define_singleton_method("new", function!(RawDeltaTable::new, 5))?;
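For context on `RbArrowType` and `ArrowArrayStream` above: the Ruby side exposes the address of an exported FFI_ArrowArrayStream via `to_i`, and `try_convert` swaps an empty stream into that slot, taking ownership of the batches without copying them. A condensed sketch of the consuming side (the function name is ours, not part of this diff):

use deltalake::arrow::error::ArrowError;
use deltalake::arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream};

fn reader_from_address(addr: usize) -> Result<ArrowArrayStreamReader, ArrowError> {
    // Safety contract (assumed, as in RbArrowType::try_convert): `addr` points
    // at a live FFI_ArrowArrayStream whose producer is handing over ownership.
    let stream = unsafe {
        std::ptr::replace(addr as *mut FFI_ArrowArrayStream, FFI_ArrowArrayStream::empty())
    };
    ArrowArrayStreamReader::try_new(stream)
}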
@@ -781,15 +1305,17 @@ fn init(ruby: &Ruby) -> RbResult<()> {
      class.define_method("files", method!(RawDeltaTable::files, 1))?;
      class.define_method("file_uris", method!(RawDeltaTable::file_uris, 1))?;
      class.define_method("schema", method!(RawDeltaTable::schema, 0))?;
-     class.define_method("vacuum", method!(RawDeltaTable::vacuum, 3))?;
+     class.define_method("vacuum", method!(RawDeltaTable::vacuum, 5))?;
      class.define_method(
          "compact_optimize",
-         method!(RawDeltaTable::compact_optimize, 3),
+         method!(RawDeltaTable::compact_optimize, 7),
      )?;
      class.define_method(
          "z_order_optimize",
-         method!(RawDeltaTable::z_order_optimize, 5),
+         method!(RawDeltaTable::z_order_optimize, 9),
      )?;
+     class.define_method("add_columns", method!(RawDeltaTable::add_columns, 1))?;
+     class.define_method("add_feature", method!(RawDeltaTable::add_feature, 2))?;
      class.define_method(
          "add_constraints",
          method!(RawDeltaTable::add_constraints, 1),
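The arity bumps in this hunk follow from magnus's rule that `method!` declares the number of arguments after `&self`, so each new optional parameter raises the count (`vacuum` 3 to 5, `compact_optimize` 3 to 7, `z_order_optimize` 5 to 9). A toy illustration of that rule, with invented names:

use magnus::{method, prelude::*, Error, Ruby};

#[magnus::wrap(class = "Demo")]
struct Demo;

impl Demo {
    // Two parameters after &self...
    fn scaled(&self, value: i64, factor: Option<i64>) -> i64 {
        value * factor.unwrap_or(1)
    }
}

fn register(ruby: &Ruby) -> Result<(), Error> {
    let class = ruby.define_class("Demo", ruby.class_object())?;
    // ...so the declared arity is 2, matching the pattern above.
    class.define_method("scaled", method!(Demo::scaled, 2))?;
    Ok(())
}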
@@ -799,7 +1325,12 @@ fn init(ruby: &Ruby) -> RbResult<()> {
          method!(RawDeltaTable::drop_constraints, 2),
      )?;
      class.define_method("load_cdf", method!(RawDeltaTable::load_cdf, 5))?;
-     class.define_method("restore", method!(RawDeltaTable::restore, 3))?;
+     class.define_method(
+         "create_merge_builder",
+         method!(RawDeltaTable::create_merge_builder, 8),
+     )?;
+     class.define_method("merge_execute", method!(RawDeltaTable::merge_execute, 1))?;
+     class.define_method("restore", method!(RawDeltaTable::restore, 4))?;
      class.define_method("history", method!(RawDeltaTable::history, 1))?;
      class.define_method(
          "update_incremental",
@@ -809,8 +1340,24 @@ fn init(ruby: &Ruby) -> RbResult<()> {
          "get_active_partitions",
          method!(RawDeltaTable::get_active_partitions, 0),
      )?;
-     class.define_method("delete", method!(RawDeltaTable::delete, 1))?;
-     class.define_method("repair", method!(RawDeltaTable::repair, 1))?;
+     class.define_method(
+         "create_checkpoint",
+         method!(RawDeltaTable::create_checkpoint, 0),
+     )?;
+     class.define_method(
+         "cleanup_metadata",
+         method!(RawDeltaTable::cleanup_metadata, 0),
+     )?;
+     class.define_method(
+         "get_add_file_sizes",
+         method!(RawDeltaTable::get_add_file_sizes, 0),
+     )?;
+     class.define_method("delete", method!(RawDeltaTable::delete, 4))?;
+     class.define_method(
+         "set_table_properties",
+         method!(RawDeltaTable::set_table_properties, 2),
+     )?;
+     class.define_method("repair", method!(RawDeltaTable::repair, 3))?;
      class.define_method(
          "transaction_versions",
          method!(RawDeltaTable::transaction_versions, 0),
@@ -844,5 +1391,29 @@ fn init(ruby: &Ruby) -> RbResult<()> {
      class.define_method("type", method!(Field::get_type, 0))?;
      class.define_method("nullable", method!(Field::nullable, 0))?;

+     let class = module.define_class("RbMergeBuilder", ruby.class_object())?;
+     class.define_method("source_alias", method!(RbMergeBuilder::source_alias, 0))?;
+     class.define_method("target_alias", method!(RbMergeBuilder::target_alias, 0))?;
+     class.define_method(
+         "when_matched_update",
+         method!(RbMergeBuilder::when_matched_update, 2),
+     )?;
+     class.define_method(
+         "when_matched_delete",
+         method!(RbMergeBuilder::when_matched_delete, 1),
+     )?;
+     class.define_method(
+         "when_not_matched_insert",
+         method!(RbMergeBuilder::when_not_matched_insert, 2),
+     )?;
+     class.define_method(
+         "when_not_matched_by_source_update",
+         method!(RbMergeBuilder::when_not_matched_by_source_update, 2),
+     )?;
+     class.define_method(
+         "when_not_matched_by_source_delete",
+         method!(RbMergeBuilder::when_not_matched_by_source_delete, 1),
+     )?;
+
      Ok(())
  }