deltalake-rb 0.1.0 → 0.1.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 55b36ac54ad1d38070efaa4d555cbe368dcb47c98c88a56fd27d90cc3633e563
-  data.tar.gz: 6a4464866554b770725d2f7e9e3ad39983933349d3ac57df97aa6204f90111d4
+  metadata.gz: 1ad1a2f352a83da63ccbde0126430b052d44801be630f3ad5e8326832205dc52
+  data.tar.gz: 1ce59b16589b891390d4ab1e81284d57389b6e8ff950faf11d1b3cf4736ce235
 SHA512:
-  metadata.gz: 9a545da9d049c2519dfe075f9ccd0b186e0c5369401bbb258548205a72e0dfe3b57a1167675641a7df235d717159ec1fa9547858ba7db2f78ba5503fa31a3bd2
-  data.tar.gz: 976cf69691f1e13e02ea374232be12a0afbef4cb35e1e7889d618c04dca10b1bd05e6b359765fe752a36b4109920bb88c1a95cffbeedcb36fca7040db5afe3aa
+  metadata.gz: 28c7d8f93e8dc78d9e81490e62d27be6d431efea6e1242c14ecb931b82b766926cb65a35b9440d372d27513e95c1f5c90e042fa42fb97a6407f1c3d30abb15ca
+  data.tar.gz: 547aa9019ac83f8ae9955f6f2cafe062a14223bdf18a29471b98e68d86d9655c69d47e84f520f9e1b5331e10f00e807141ed18d3786ae9995b120230c7217855
data/CHANGELOG.md CHANGED
@@ -1,3 +1,13 @@
+## 0.1.1 (2024-11-22)
+
+- Added support for constraints
+- Added support for small file compaction
+- Added support for Z Ordering
+- Added `history`, `partitions`, `protocol`, `repair`, and `restore` methods to `Table`
+- Added experimental `load_cdf` method to `Table`
+- Fixed handling of unsigned integers
+- Fixed error with timestamps
+
 ## 0.1.0 (2024-11-20)
 
 - First release
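
For orientation, a quick sketch of the new `Table` surface listed above, using the signatures added in this release (return values are illustrative):

```ruby
dt = DeltaLake::Table.new("./data/delta")

dt.protocol              # ProtocolVersions struct (min reader/writer versions, features)
dt.partitions            # array of partition-value hashes
dt.history(limit: 5)     # parsed commit info, newest first, with "version" merged in
dt.repair(dry_run: true) # filesystem check metrics as a symbol-keyed hash
dt.restore(1)            # restore to version 1; a Time is also accepted
```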
data/Cargo.lock CHANGED
@@ -1488,13 +1488,15 @@ dependencies = [
 
 [[package]]
 name = "deltalake"
-version = "0.1.0"
+version = "0.1.1"
 dependencies = [
  "arrow",
  "arrow-schema",
  "chrono",
  "deltalake 0.21.0",
+ "futures",
  "magnus",
+ "num_cpus",
  "serde",
  "serde_json",
  "tokio",
data/README.md CHANGED
@@ -14,7 +14,7 @@ Add this line to your application’s Gemfile:
 gem "deltalake-rb"
 ```
 
-It can take a few minutes to compile the gem.
+It can take 5-10 minutes to compile the gem.
 
 ## Getting Started
 
@@ -50,6 +50,18 @@ Overwrite a table
 DeltaLake.write("./data/delta", df, mode: "overwrite")
 ```
 
+Add a constraint
+
+```ruby
+dt.alter.add_constraint({"a_gt_0" => "a > 0"})
+```
+
+Drop a constraint
+
+```ruby
+dt.alter.drop_constraint("a_gt_0")
+```
+
 Delete rows
 
 ```ruby
@@ -62,6 +74,18 @@ Vacuum
 dt.vacuum(dry_run: false)
 ```
 
+Perform small file compaction
+
+```ruby
+dt.optimize.compact
+```
+
+Colocate similar data in the same files
+
+```ruby
+dt.optimize.z_order(["a"])
+```
+
 Load a previous version of a table
 
 ```ruby
@@ -70,16 +94,22 @@ dt = DeltaLake::Table.new("./data/delta", version: 1)
 dt.load_as_version(1)
 ```
 
+Get the schema
+
+```ruby
+dt.schema
+```
+
 Get metadata
 
 ```ruby
 dt.metadata
 ```
 
-Get the schema
+Get history
 
 ```ruby
-dt.schema
+dt.history
 ```
 
 ## API
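
A sketch of what the new `history` call returns, based on the `Table#history` implementation further down: each entry is the parsed commit info with the commit's `version` merged in (field values here are illustrative):

```ruby
dt.history(limit: 1)
# => [{"timestamp" => 1732233600000, "operation" => "WRITE", "clientVersion" => "delta-rs.0.21.0", "version" => 2}]
```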
data/ext/deltalake/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
 [package]
 name = "deltalake"
-version = "0.1.0"
+version = "0.1.1"
 license = "Apache-2.0"
 authors = ["Andrew Kane <andrew@ankane.org>"]
 edition = "2021"
@@ -15,7 +15,9 @@ arrow = { version = "52", features = ["ffi"] }
 arrow-schema = { version = "52", features = ["serde"] }
 chrono = "0.4"
 deltalake = { version = "=0.21.0", features = ["datafusion", "s3"] }
+futures = "0.3"
 magnus = "0.7"
+num_cpus = "1"
 serde = "1"
 serde_json = "1"
 tokio = { version = "1", features = ["rt-multi-thread"] }
data/ext/deltalake/src/lib.rs CHANGED
@@ -3,20 +3,39 @@ mod schema;
 mod utils;
 
 use std::cell::RefCell;
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::future::IntoFuture;
+use std::str::FromStr;
+use std::time;
 
-use chrono::Duration;
+use chrono::{DateTime, Duration, FixedOffset, Utc};
 use deltalake::arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream};
-use deltalake::kernel::StructType;
+use deltalake::arrow::record_batch::RecordBatchIterator;
+use deltalake::datafusion::physical_plan::ExecutionPlan;
+use deltalake::datafusion::prelude::SessionContext;
+use deltalake::errors::DeltaTableError;
+use deltalake::kernel::{scalars::ScalarExt, StructType, Transaction};
+use deltalake::operations::collect_sendable_stream;
+use deltalake::operations::constraints::ConstraintBuilder;
 use deltalake::operations::delete::DeleteBuilder;
+use deltalake::operations::drop_constraints::DropConstraintBuilder;
+use deltalake::operations::filesystem_check::FileSystemCheckBuilder;
+use deltalake::operations::load_cdf::CdfLoadBuilder;
+use deltalake::operations::optimize::{OptimizeBuilder, OptimizeType};
+use deltalake::operations::restore::RestoreBuilder;
+use deltalake::operations::transaction::TableReference;
 use deltalake::operations::vacuum::VacuumBuilder;
+use deltalake::partitions::PartitionFilter;
 use deltalake::storage::IORuntime;
 use deltalake::DeltaOps;
 use error::DeltaError;
+use futures::future::join_all;
 
-use magnus::{function, method, prelude::*, Error, Module, Ruby, Value};
+use magnus::{
+    exception, function, method, prelude::*, Error, Integer, Module, RArray, RHash, Ruby, Value,
+};
 
+use crate::error::DeltaProtocolError;
 use crate::error::RubyError;
 use crate::schema::{schema_to_rbobject, Field};
 use crate::utils::rt;
@@ -38,6 +57,19 @@ struct RawDeltaTableMetaData {
     configuration: HashMap<String, Option<String>>,
 }
 
+#[magnus::wrap(class = "DeltaLake::ArrowArrayStream")]
+pub struct ArrowArrayStream {
+    stream: FFI_ArrowArrayStream,
+}
+
+impl ArrowArrayStream {
+    pub fn to_i(&self) -> usize {
+        (&self.stream as *const _) as usize
+    }
+}
+
+type StringVec = Vec<String>;
+
 impl RawDeltaTable {
     pub fn new(
         table_uri: String,
@@ -113,37 +145,138 @@ impl RawDeltaTable {
         })
     }
 
+    pub fn protocol_versions(&self) -> RbResult<(i32, i32, Option<StringVec>, Option<StringVec>)> {
+        let binding = self._table.borrow();
+        let table_protocol = binding.protocol().map_err(RubyError::from)?;
+        Ok((
+            table_protocol.min_reader_version,
+            table_protocol.min_writer_version,
+            table_protocol
+                .writer_features
+                .as_ref()
+                .and_then(|features| {
+                    let empty_set = !features.is_empty();
+                    empty_set.then(|| {
+                        features
+                            .iter()
+                            .map(|v| v.to_string())
+                            .collect::<Vec<String>>()
+                    })
+                }),
+            table_protocol
+                .reader_features
+                .as_ref()
+                .and_then(|features| {
+                    let empty_set = !features.is_empty();
+                    empty_set.then(|| {
+                        features
+                            .iter()
+                            .map(|v| v.to_string())
+                            .collect::<Vec<String>>()
+                    })
+                }),
+        ))
+    }
+
     pub fn load_version(&self, version: i64) -> RbResult<()> {
         Ok(rt()
             .block_on(self._table.borrow_mut().load_version(version))
             .map_err(RubyError::from)?)
     }
 
-    pub fn files(&self) -> RbResult<Vec<String>> {
-        if !self.has_files()? {
-            return Err(DeltaError::new_err("Table is instantiated without files."));
-        }
+    pub fn get_latest_version(&self) -> RbResult<i64> {
+        Ok(rt()
+            .block_on(self._table.borrow().get_latest_version())
+            .map_err(RubyError::from)?)
+    }
 
+    pub fn get_earliest_version(&self) -> RbResult<i64> {
+        Ok(rt()
+            .block_on(self._table.borrow().get_earliest_version())
+            .map_err(RubyError::from)?)
+    }
+
+    pub fn get_num_index_cols(&self) -> RbResult<i32> {
         Ok(self
             ._table
             .borrow()
-            .get_files_iter()
+            .snapshot()
             .map_err(RubyError::from)?
-            .map(|f| f.to_string())
-            .collect())
+            .config()
+            .num_indexed_cols())
     }
 
-    pub fn file_uris(&self) -> RbResult<Vec<String>> {
-        if !self._table.borrow().config.require_files {
-            return Err(DeltaError::new_err("Table is initiated without files."));
-        }
-
+    pub fn get_stats_columns(&self) -> RbResult<Option<Vec<String>>> {
         Ok(self
             ._table
            .borrow()
-            .get_file_uris()
+            .snapshot()
             .map_err(RubyError::from)?
-            .collect())
+            .config()
+            .stats_columns()
+            .map(|v| v.iter().map(|v| v.to_string()).collect::<Vec<String>>()))
+    }
+
+    pub fn load_with_datetime(&self, ds: String) -> RbResult<()> {
+        let datetime = DateTime::<Utc>::from(
+            DateTime::<FixedOffset>::parse_from_rfc3339(&ds).map_err(|err| {
+                Error::new(
+                    exception::arg_error(),
+                    format!("Failed to parse datetime string: {err}"),
+                )
+            })?,
+        );
+        Ok(rt()
+            .block_on(self._table.borrow_mut().load_with_datetime(datetime))
+            .map_err(RubyError::from)?)
+    }
+
+    pub fn files(&self, partition_filters: Option<Value>) -> RbResult<Vec<String>> {
+        if !self.has_files()? {
+            return Err(DeltaError::new_err("Table is instantiated without files."));
+        }
+
+        if let Some(filters) = partition_filters {
+            let filters = convert_partition_filters(filters).map_err(RubyError::from)?;
+            Ok(self
+                ._table
+                .borrow()
+                .get_files_by_partitions(&filters)
+                .map_err(RubyError::from)?
+                .into_iter()
+                .map(|p| p.to_string())
+                .collect())
+        } else {
+            Ok(self
+                ._table
+                .borrow()
+                .get_files_iter()
+                .map_err(RubyError::from)?
+                .map(|f| f.to_string())
+                .collect())
+        }
+    }
+
+    pub fn file_uris(&self, partition_filters: Option<Value>) -> RbResult<Vec<String>> {
+        if !self._table.borrow().config.require_files {
+            return Err(DeltaError::new_err("Table is initiated without files."));
+        }
+
+        if let Some(filters) = partition_filters {
+            let filters = convert_partition_filters(filters).map_err(RubyError::from)?;
+            Ok(self
+                ._table
+                .borrow()
+                .get_file_uris_by_partitions(&filters)
+                .map_err(RubyError::from)?)
+        } else {
+            Ok(self
+                ._table
+                .borrow()
+                .get_file_uris()
+                .map_err(RubyError::from)?
+                .collect())
+        }
     }
 
     pub fn schema(&self) -> RbResult<Value> {
@@ -177,6 +310,214 @@ impl RawDeltaTable {
         Ok(metrics.files_deleted)
     }
 
+    pub fn compact_optimize(
+        &self,
+        target_size: Option<i64>,
+        max_concurrent_tasks: Option<usize>,
+        min_commit_interval: Option<u64>,
+    ) -> RbResult<String> {
+        let mut cmd = OptimizeBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_max_concurrent_tasks(max_concurrent_tasks.unwrap_or_else(num_cpus::get));
+        if let Some(size) = target_size {
+            cmd = cmd.with_target_size(size);
+        }
+        if let Some(commit_interval) = min_commit_interval {
+            cmd = cmd.with_min_commit_interval(time::Duration::from_secs(commit_interval));
+        }
+
+        let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(serde_json::to_string(&metrics).unwrap())
+    }
+
+    pub fn z_order_optimize(
+        &self,
+        z_order_columns: Vec<String>,
+        target_size: Option<i64>,
+        max_concurrent_tasks: Option<usize>,
+        max_spill_size: usize,
+        min_commit_interval: Option<u64>,
+    ) -> RbResult<String> {
+        let mut cmd = OptimizeBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_max_concurrent_tasks(max_concurrent_tasks.unwrap_or_else(num_cpus::get))
+        .with_max_spill_size(max_spill_size)
+        .with_type(OptimizeType::ZOrder(z_order_columns));
+        if let Some(size) = target_size {
+            cmd = cmd.with_target_size(size);
+        }
+        if let Some(commit_interval) = min_commit_interval {
+            cmd = cmd.with_min_commit_interval(time::Duration::from_secs(commit_interval));
+        }
+
+        let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(serde_json::to_string(&metrics).unwrap())
+    }
+
+    pub fn add_constraints(&self, constraints: HashMap<String, String>) -> RbResult<()> {
+        let mut cmd = ConstraintBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        );
+
+        for (col_name, expression) in constraints {
+            cmd = cmd.with_constraint(col_name.clone(), expression.clone());
+        }
+
+        let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(())
+    }
+
+    pub fn drop_constraints(&self, name: String, raise_if_not_exists: bool) -> RbResult<()> {
+        let cmd = DropConstraintBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_constraint(name)
+        .with_raise_if_not_exists(raise_if_not_exists);
+
+        let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(())
+    }
+
+    pub fn load_cdf(
+        &self,
+        starting_version: i64,
+        ending_version: Option<i64>,
+        starting_timestamp: Option<String>,
+        ending_timestamp: Option<String>,
+        columns: Option<Vec<String>>,
+    ) -> RbResult<ArrowArrayStream> {
+        let ctx = SessionContext::new();
+        let mut cdf_read = CdfLoadBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_starting_version(starting_version);
+
+        if let Some(ev) = ending_version {
+            cdf_read = cdf_read.with_ending_version(ev);
+        }
+        if let Some(st) = starting_timestamp {
+            let starting_ts: DateTime<Utc> = DateTime::<Utc>::from_str(&st)
+                .map_err(|pe| Error::new(exception::arg_error(), pe.to_string()))?
+                .to_utc();
+            cdf_read = cdf_read.with_starting_timestamp(starting_ts);
+        }
+        if let Some(et) = ending_timestamp {
+            let ending_ts = DateTime::<Utc>::from_str(&et)
+                .map_err(|pe| Error::new(exception::arg_error(), pe.to_string()))?
+                .to_utc();
+            cdf_read = cdf_read.with_ending_timestamp(ending_ts);
+        }
+
+        if let Some(columns) = columns {
+            cdf_read = cdf_read.with_columns(columns);
+        }
+
+        cdf_read = cdf_read.with_session_ctx(ctx.clone());
+
+        let plan = rt().block_on(cdf_read.build()).map_err(RubyError::from)?;
+
+        let mut tasks = vec![];
+        for p in 0..plan.properties().output_partitioning().partition_count() {
+            let inner_plan = plan.clone();
+            let partition_batch = inner_plan.execute(p, ctx.task_ctx()).unwrap();
+            let handle = rt().spawn(collect_sendable_stream(partition_batch));
+            tasks.push(handle);
+        }
+
+        // This is unfortunate.
+        let batches = rt()
+            .block_on(join_all(tasks))
+            .into_iter()
+            .flatten()
+            .collect::<Result<Vec<Vec<_>>, _>>()
+            .unwrap()
+            .into_iter()
+            .flatten()
+            .map(Ok);
+        let batch_iter = RecordBatchIterator::new(batches, plan.schema());
+        let ffi_stream = FFI_ArrowArrayStream::new(Box::new(batch_iter));
+        Ok(ArrowArrayStream { stream: ffi_stream })
+    }
+
+    pub fn restore(
+        &self,
+        target: Option<Value>,
+        ignore_missing_files: bool,
+        protocol_downgrade_allowed: bool,
+    ) -> RbResult<String> {
+        let mut cmd = RestoreBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        );
+        if let Some(val) = target {
+            if let Some(version) = Integer::from_value(val) {
+                cmd = cmd.with_version_to_restore(version.to_i64()?)
+            }
+            if let Ok(ds) = String::try_convert(val) {
+                let datetime = DateTime::<Utc>::from(
+                    DateTime::<FixedOffset>::parse_from_rfc3339(ds.as_ref()).map_err(|err| {
+                        Error::new(
+                            exception::arg_error(),
+                            format!("Failed to parse datetime string: {err}"),
+                        )
+                    })?,
+                );
+                cmd = cmd.with_datetime_to_restore(datetime)
+            }
+        }
+        cmd = cmd.with_ignore_missing_files(ignore_missing_files);
+        cmd = cmd.with_protocol_downgrade_allowed(protocol_downgrade_allowed);
+
+        let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(serde_json::to_string(&metrics).unwrap())
+    }
+
+    pub fn history(&self, limit: Option<usize>) -> RbResult<Vec<String>> {
+        let history = rt()
+            .block_on(self._table.borrow().history(limit))
+            .map_err(RubyError::from)?;
+        Ok(history
+            .iter()
+            .map(|c| serde_json::to_string(c).unwrap())
+            .collect())
+    }
+
     pub fn update_incremental(&self) -> RbResult<()> {
         #[allow(deprecated)]
         Ok(rt()
@@ -184,6 +525,56 @@ impl RawDeltaTable {
             .map_err(RubyError::from)?)
     }
 
+    fn get_active_partitions(&self) -> RbResult<RArray> {
+        let binding = self._table.borrow();
+        let _column_names: HashSet<&str> = binding
+            .get_schema()
+            .map_err(|_| DeltaProtocolError::new_err("table does not yet have a schema"))?
+            .fields()
+            .map(|field| field.name().as_str())
+            .collect();
+        let partition_columns: HashSet<&str> = binding
+            .metadata()
+            .map_err(RubyError::from)?
+            .partition_columns
+            .iter()
+            .map(|col| col.as_str())
+            .collect();
+
+        let converted_filters = Vec::new();
+
+        let partition_columns: Vec<&str> = partition_columns.into_iter().collect();
+
+        let adds = binding
+            .snapshot()
+            .map_err(RubyError::from)?
+            .get_active_add_actions_by_partitions(&converted_filters)
+            .map_err(RubyError::from)?
+            .collect::<Result<Vec<_>, _>>()
+            .map_err(RubyError::from)?;
+        let active_partitions: HashSet<Vec<(&str, Option<String>)>> = adds
+            .iter()
+            .flat_map(|add| {
+                Ok::<_, RubyError>(
+                    partition_columns
+                        .iter()
+                        .flat_map(|col| {
+                            Ok::<_, RubyError>((
+                                *col,
+                                add.partition_values()
+                                    .map_err(RubyError::from)?
+                                    .get(*col)
+                                    .map(|v| v.serialize()),
+                            ))
+                        })
+                        .collect(),
+                )
+            })
+            .collect();
+
+        Ok(RArray::from_iter(active_partitions))
+    }
+
     pub fn delete(&self, predicate: Option<String>) -> RbResult<String> {
         let mut cmd = DeleteBuilder::new(
             self._table.borrow().log_store(),
@@ -201,6 +592,38 @@ impl RawDeltaTable {
         self._table.borrow_mut().state = table.state;
         Ok(serde_json::to_string(&metrics).unwrap())
     }
+
+    pub fn repair(&self, dry_run: bool) -> RbResult<String> {
+        let cmd = FileSystemCheckBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_dry_run(dry_run);
+
+        let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(serde_json::to_string(&metrics).unwrap())
+    }
+
+    pub fn transaction_versions(&self) -> RHash {
+        RHash::from_iter(
+            self._table
+                .borrow()
+                .get_app_transaction_version()
+                .into_iter()
+                .map(|(app_id, transaction)| (app_id, RbTransaction::from(transaction))),
+        )
+    }
+}
+
+fn convert_partition_filters(
+    _partitions_filters: Value,
+) -> Result<Vec<PartitionFilter>, DeltaTableError> {
+    todo!()
 }
 
 impl RawDeltaTableMetaData {
@@ -229,6 +652,23 @@ impl RawDeltaTableMetaData {
     }
 }
 
+#[magnus::wrap(class = "DeltaLake::Transaction")]
+pub struct RbTransaction {
+    pub app_id: String,
+    pub version: i64,
+    pub last_updated: Option<i64>,
+}
+
+impl From<Transaction> for RbTransaction {
+    fn from(value: Transaction) -> Self {
+        RbTransaction {
+            app_id: value.app_id,
+            version: value.version,
+            last_updated: value.last_updated,
+        }
+    }
+}
+
 #[allow(clippy::too_many_arguments)]
 fn write_to_deltalake(
     table_uri: String,
@@ -313,16 +753,68 @@ fn init(ruby: &Ruby) -> RbResult<()> {
     class.define_method("version", method!(RawDeltaTable::version, 0))?;
     class.define_method("has_files", method!(RawDeltaTable::has_files, 0))?;
     class.define_method("metadata", method!(RawDeltaTable::metadata, 0))?;
+    class.define_method(
+        "protocol_versions",
+        method!(RawDeltaTable::protocol_versions, 0),
+    )?;
     class.define_method("load_version", method!(RawDeltaTable::load_version, 1))?;
-    class.define_method("files", method!(RawDeltaTable::files, 0))?;
-    class.define_method("file_uris", method!(RawDeltaTable::file_uris, 0))?;
+    class.define_method(
+        "get_latest_version",
+        method!(RawDeltaTable::get_latest_version, 0),
+    )?;
+    class.define_method(
+        "get_earliest_version",
+        method!(RawDeltaTable::get_earliest_version, 0),
+    )?;
+    class.define_method(
+        "get_num_index_cols",
+        method!(RawDeltaTable::get_num_index_cols, 0),
+    )?;
+    class.define_method(
+        "get_stats_columns",
+        method!(RawDeltaTable::get_stats_columns, 0),
+    )?;
+    class.define_method(
+        "load_with_datetime",
+        method!(RawDeltaTable::load_with_datetime, 1),
+    )?;
+    class.define_method("files", method!(RawDeltaTable::files, 1))?;
+    class.define_method("file_uris", method!(RawDeltaTable::file_uris, 1))?;
     class.define_method("schema", method!(RawDeltaTable::schema, 0))?;
     class.define_method("vacuum", method!(RawDeltaTable::vacuum, 3))?;
+    class.define_method(
+        "compact_optimize",
+        method!(RawDeltaTable::compact_optimize, 3),
+    )?;
+    class.define_method(
+        "z_order_optimize",
+        method!(RawDeltaTable::z_order_optimize, 5),
+    )?;
+    class.define_method(
+        "add_constraints",
+        method!(RawDeltaTable::add_constraints, 1),
+    )?;
+    class.define_method(
+        "drop_constraints",
+        method!(RawDeltaTable::drop_constraints, 2),
+    )?;
+    class.define_method("load_cdf", method!(RawDeltaTable::load_cdf, 5))?;
+    class.define_method("restore", method!(RawDeltaTable::restore, 3))?;
+    class.define_method("history", method!(RawDeltaTable::history, 1))?;
     class.define_method(
         "update_incremental",
         method!(RawDeltaTable::update_incremental, 0),
     )?;
+    class.define_method(
+        "get_active_partitions",
+        method!(RawDeltaTable::get_active_partitions, 0),
+    )?;
     class.define_method("delete", method!(RawDeltaTable::delete, 1))?;
+    class.define_method("repair", method!(RawDeltaTable::repair, 1))?;
+    class.define_method(
+        "transaction_versions",
+        method!(RawDeltaTable::transaction_versions, 0),
+    )?;
 
     let class = module.define_class("RawDeltaTableMetaData", ruby.class_object())?;
     class.define_method("id", method!(RawDeltaTableMetaData::id, 0))?;
@@ -344,6 +836,9 @@ fn init(ruby: &Ruby) -> RbResult<()> {
         method!(RawDeltaTableMetaData::configuration, 0),
     )?;
 
+    let class = module.define_class("ArrowArrayStream", ruby.class_object())?;
+    class.define_method("to_i", method!(ArrowArrayStream::to_i, 0))?;
+
     let class = module.define_class("Field", ruby.class_object())?;
     class.define_method("name", method!(Field::name, 0))?;
    class.define_method("type", method!(Field::get_type, 0))?;
data/lib/deltalake/table.rb CHANGED
@@ -26,22 +26,54 @@ module DeltaLake
       @table.version
     end
 
-    def files
-      @table.files
+    def partitions
+      partitions = []
+      @table.get_active_partitions.each do |partition|
+        next unless partition
+        partitions << partition.to_h
+      end
+      partitions
+    end
+
+    def files(partition_filters: nil)
+      @table.files(_stringify_partition_values(partition_filters))
     end
 
-    def file_uris
-      @table.file_uris
+    def file_uris(partition_filters: nil)
+      @table.file_uris(_stringify_partition_values(partition_filters))
     end
 
     def load_as_version(version)
       if version.is_a?(Integer)
         @table.load_version(version)
+      elsif version.is_a?(Time)
+        # needed for iso8601
+        require "time"
+
+        @table.load_with_datetime(version.utc.iso8601(9))
+      elsif version.is_a?(String)
+        @table.load_with_datetime(version)
       else
-        raise TypeError, "Invalid datatype provided for version, only Integer is accepted."
+        raise TypeError, "Invalid datatype provided for version, only Integer, String, and Time are accepted."
       end
     end
 
+    def load_cdf(
+      starting_version: 0,
+      ending_version: nil,
+      starting_timestamp: nil,
+      ending_timestamp: nil,
+      columns: nil
+    )
+      @table.load_cdf(
+        starting_version,
+        ending_version,
+        starting_timestamp,
+        ending_timestamp,
+        columns
+      )
+    end
+
     def table_uri
       @table.table_uri
     end
@@ -54,6 +86,29 @@ module DeltaLake
       Metadata.new(@table)
     end
 
+    def protocol
+      ProtocolVersions.new(*@table.protocol_versions)
+    end
+
+    def history(limit: nil)
+      backwards_enumerate = lambda do |iterable, start_end, &block|
+        n = start_end
+        iterable.each do |elem|
+          block.call(n, elem)
+          n -= 1
+        end
+      end
+
+      commits = @table.history(limit)
+      history = []
+      backwards_enumerate.(commits, @table.get_latest_version) do |version, commit_info_raw|
+        commit = JSON.parse(commit_info_raw)
+        commit["version"] = version
+        history << commit
+      end
+      history
+    end
+
     def vacuum(
       retention_hours: nil,
       dry_run: true,
@@ -72,6 +127,40 @@ module DeltaLake
       )
     end
 
+    def optimize
+      TableOptimizer.new(self)
+    end
+
+    def alter
+      TableAlterer.new(self)
+    end
+
+    def restore(
+      target,
+      ignore_missing_files: false,
+      protocol_downgrade_allowed: false
+    )
+      if target.is_a?(Time)
+        # needed for iso8601
+        require "time"
+
+        metrics =
+          @table.restore(
+            target.utc.iso8601(9),
+            ignore_missing_files,
+            protocol_downgrade_allowed
+          )
+      else
+        metrics =
+          @table.restore(
+            target,
+            ignore_missing_files,
+            protocol_downgrade_allowed
+          )
+      end
+      JSON.parse(metrics)
+    end
+
     def to_polars(eager: true)
       require "polars-df"
 
@@ -80,7 +169,13 @@ module DeltaLake
         if sources.empty?
           Polars::LazyFrame.new
         else
-          storage_options = @storage_options&.except("AWS_S3_ALLOW_UNSAFE_RENAME")
+          delta_keys = [
+            "AWS_S3_ALLOW_UNSAFE_RENAME",
+            "AWS_S3_LOCKING_PROVIDER",
+            "CONDITIONAL_PUT",
+            "DELTA_DYNAMO_TABLE_NAME"
+          ]
+          storage_options = @storage_options&.reject { |k, _| delta_keys.include?(k.to_s.upcase) }
           Polars.scan_parquet(sources, storage_options: storage_options)
         end
         eager ? lf.collect : lf
@@ -95,9 +190,30 @@ module DeltaLake
       JSON.parse(metrics).transform_keys(&:to_sym)
     end
 
+    def repair(dry_run: false)
+      metrics =
+        @table.repair(
+          dry_run
+        )
+      JSON.parse(metrics).transform_keys(&:to_sym)
+    end
+
+    def transaction_versions
+      @table.transaction_versions
+    end
+
     # private
     def _table
       @table
     end
+
+    # private
+    def _stringify_partition_values(partition_filters)
+      if partition_filters.nil?
+        return partition_filters
+      end
+
+      raise Todo
+    end
   end
 end
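
With the `load_as_version` change above, a `Time` or an RFC 3339 string now works in addition to an integer version (the string form mirrors the Rust side's `parse_from_rfc3339`):

```ruby
dt.load_as_version(1)                      # by version number
dt.load_as_version(Time.now - 3600)        # by timestamp
dt.load_as_version("2024-11-22T00:00:00Z") # RFC 3339 string
```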
data/lib/deltalake/table_alterer.rb ADDED
@@ -0,0 +1,25 @@
+module DeltaLake
+  class TableAlterer
+    def initialize(table)
+      @table = table
+    end
+
+    def add_constraint(constraints)
+      if constraints.length > 1
+        raise ArgumentError,
+          "add_constraints is limited to a single constraint addition at once for now."
+      end
+
+      @table._table.add_constraints(
+        constraints
+      )
+    end
+
+    def drop_constraint(name, raise_if_not_exists: true)
+      @table._table.drop_constraints(
+        name,
+        raise_if_not_exists
+      )
+    end
+  end
+end
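
Note the guard in `add_constraint`: only one constraint may be added per call for now. A short sketch:

```ruby
dt.alter.add_constraint({"a_gt_0" => "a > 0"})

# passing two constraints at once raises ArgumentError
# dt.alter.add_constraint({"a_gt_0" => "a > 0", "b_gt_0" => "b > 0"})

# drop quietly even if the constraint is absent
dt.alter.drop_constraint("a_gt_0", raise_if_not_exists: false)
```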
data/lib/deltalake/table_optimizer.rb ADDED
@@ -0,0 +1,51 @@
+module DeltaLake
+  class TableOptimizer
+    def initialize(table)
+      @table = table
+    end
+
+    def compact(
+      target_size: nil,
+      max_concurrent_tasks: nil,
+      min_commit_interval: nil
+    )
+      metrics =
+        @table._table.compact_optimize(
+          target_size,
+          max_concurrent_tasks,
+          min_commit_interval
+        )
+      @table.update_incremental
+      result = JSON.parse(metrics)
+      ["filesAdded", "filesRemoved"].each do |key|
+        result[key] = JSON.parse(result[key]) if result[key].is_a?(String)
+      end
+      # TODO return underscore symbols like delete
+      result
+    end
+
+    def z_order(
+      columns,
+      target_size: nil,
+      max_concurrent_tasks: nil,
+      max_spill_size: 20 * 1024 * 1024 * 1024,
+      min_commit_interval: nil
+    )
+      metrics =
+        @table._table.z_order_optimize(
+          Array(columns),
+          target_size,
+          max_concurrent_tasks,
+          max_spill_size,
+          min_commit_interval
+        )
+      @table.update_incremental
+      result = JSON.parse(metrics)
+      ["filesAdded", "filesRemoved"].each do |key|
+        result[key] = JSON.parse(result[key]) if result[key].is_a?(String)
+      end
+      # TODO return underscore symbols like delete
+      result
+    end
+  end
+end
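
Both methods forward the same tuning knobs to the Rust `OptimizeBuilder`; the values below are illustrative:

```ruby
dt.optimize.compact(
  target_size: 256 * 1024 * 1024, # desired max file size, in bytes
  min_commit_interval: 60         # commit at least every 60 seconds
)

dt.optimize.z_order(["a", "b"], max_spill_size: 4 * 1024 * 1024 * 1024)
```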
data/lib/deltalake/version.rb CHANGED
@@ -1,3 +1,3 @@
 module DeltaLake
-  VERSION = "0.1.0"
+  VERSION = "0.1.1"
 end
data/lib/deltalake.rb CHANGED
@@ -13,6 +13,8 @@ require_relative "deltalake/field"
 require_relative "deltalake/metadata"
 require_relative "deltalake/schema"
 require_relative "deltalake/table"
+require_relative "deltalake/table_alterer"
+require_relative "deltalake/table_optimizer"
 require_relative "deltalake/version"
 
 module DeltaLake
@@ -22,8 +24,21 @@ module DeltaLake
   class CommitFailedError < Error; end
   class SchemaMismatchError < Error; end
 
-  class << self
+  class Todo < Error
+    def message
+      "not implemented yet"
+    end
+  end
+
+  ProtocolVersions =
+    Struct.new(
+      :min_reader_version,
+      :min_writer_version,
+      :writer_features,
+      :reader_features
+    )
 
+  class << self
     def write(
       table_or_uri,
       data,
@@ -95,10 +110,58 @@ module DeltaLake
 
     def convert_data(data)
       if data.respond_to?(:arrow_c_stream)
+        # TODO convert other object types
+        # should probably move logic to Rust
+        if defined?(Polars::DataFrame) && data.is_a?(Polars::DataFrame)
+          data = convert_polars_data(data)
+        end
+
         data.arrow_c_stream
       else
         raise TypeError, "Only objects implementing the Arrow C stream interface are valid inputs for source."
       end
     end
+
+    # unsigned integers are not part of the protocol
+    # https://github.com/delta-io/delta/blob/master/PROTOCOL.md#primitive-types
+    def convert_polars_data(data)
+      new_schema = {}
+      data.schema.each do |k, v|
+        new_type = convert_polars_type(v)
+        new_schema[k] = new_type if new_type
+      end
+
+      if new_schema.any?
+        data.cast(new_schema)
+      else
+        data
+      end
+    end
+
+    def convert_polars_type(t)
+      case t
+      when Polars::UInt8
+        Polars::Int8
+      when Polars::UInt16
+        Polars::Int16
+      when Polars::UInt32
+        Polars::Int32
+      when Polars::UInt64
+        Polars::Int64
+      when Polars::Datetime
+        Polars::Datetime.new("us", t.time_zone) if t.time_unit != "us"
+      when Polars::List
+        inner = convert_polars_type(t.inner)
+        Polars::List.new(inner) if inner
+      when Polars::Array
+        inner = convert_polars_type(t.inner)
+        Polars::Array.new(inner, t.width) if inner
+      when Polars::Struct
+        if t.fields.any? { |f| convert_polars_type(f.dtype) }
+          fields = t.fields.map { |f| Polars::Field.new(f.name, convert_polars_type(f.dtype) || f.dtype) }
+          Polars::Struct.new(fields)
+        end
+      end
+    end
   end
 end
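
Since the Delta protocol has no unsigned integer types, `convert_polars_data` recasts unsigned columns to the signed type of the same width (and non-microsecond timestamps to microseconds) before writing. A sketch, assuming the polars-df gem:

```ruby
require "polars-df"

df = Polars::DataFrame.new({"a" => [1, 2, 3]}, schema: {"a" => Polars::UInt16})
DeltaLake.write("./data/delta2", df) # "a" is cast to Int16 before the write
```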
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: deltalake-rb
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-11-21 00:00:00.000000000 Z
+date: 2024-11-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -48,6 +48,8 @@ files:
 - lib/deltalake/metadata.rb
 - lib/deltalake/schema.rb
 - lib/deltalake/table.rb
+- lib/deltalake/table_alterer.rb
+- lib/deltalake/table_optimizer.rb
 - lib/deltalake/version.rb
 homepage: https://github.com/ankane/delta-ruby
 licenses: