deltalake-rb 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 55b36ac54ad1d38070efaa4d555cbe368dcb47c98c88a56fd27d90cc3633e563
- data.tar.gz: 6a4464866554b770725d2f7e9e3ad39983933349d3ac57df97aa6204f90111d4
+ metadata.gz: 1ad1a2f352a83da63ccbde0126430b052d44801be630f3ad5e8326832205dc52
+ data.tar.gz: 1ce59b16589b891390d4ab1e81284d57389b6e8ff950faf11d1b3cf4736ce235
  SHA512:
- metadata.gz: 9a545da9d049c2519dfe075f9ccd0b186e0c5369401bbb258548205a72e0dfe3b57a1167675641a7df235d717159ec1fa9547858ba7db2f78ba5503fa31a3bd2
- data.tar.gz: 976cf69691f1e13e02ea374232be12a0afbef4cb35e1e7889d618c04dca10b1bd05e6b359765fe752a36b4109920bb88c1a95cffbeedcb36fca7040db5afe3aa
+ metadata.gz: 28c7d8f93e8dc78d9e81490e62d27be6d431efea6e1242c14ecb931b82b766926cb65a35b9440d372d27513e95c1f5c90e042fa42fb97a6407f1c3d30abb15ca
+ data.tar.gz: 547aa9019ac83f8ae9955f6f2cafe062a14223bdf18a29471b98e68d86d9655c69d47e84f520f9e1b5331e10f00e807141ed18d3786ae9995b120230c7217855
data/CHANGELOG.md CHANGED
@@ -1,3 +1,13 @@
+ ## 0.1.1 (2024-11-22)
+
+ - Added support for constraints
+ - Added support for small file compaction
+ - Added support for Z Ordering
+ - Added `history`, `partitions`, `protocol`, `repair`, and `restore` methods to `Table`
+ - Added experimental `load_cdf` method to `Table`
+ - Fixed handling of unsigned integers
+ - Fixed error with timestamps
+
  ## 0.1.0 (2024-11-20)

  - First release
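
Taken together, these entries extend the public `DeltaLake::Table` API. An illustrative sketch of the new surface area, based on the README additions and the Ruby method signatures further down in this diff (the table path is just an example):

```ruby
dt = DeltaLake::Table.new("./data/delta")

dt.alter.add_constraint({"a_gt_0" => "a > 0"}) # constraints
dt.optimize.compact                            # small file compaction
dt.optimize.z_order(["a"])                     # Z Ordering
dt.history(limit: 10)                          # commit history
dt.partitions                                  # active partition values
dt.protocol                                    # reader/writer protocol versions
dt.repair(dry_run: true)                       # filesystem check
dt.restore(1)                                  # restore an earlier version
```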
data/Cargo.lock CHANGED
@@ -1488,13 +1488,15 @@ dependencies = [

  [[package]]
  name = "deltalake"
- version = "0.1.0"
+ version = "0.1.1"
  dependencies = [
  "arrow",
  "arrow-schema",
  "chrono",
  "deltalake 0.21.0",
+ "futures",
  "magnus",
+ "num_cpus",
  "serde",
  "serde_json",
  "tokio",
data/README.md CHANGED
@@ -14,7 +14,7 @@ Add this line to your application’s Gemfile:
  gem "deltalake-rb"
  ```

- It can take a few minutes to compile the gem.
+ It can take 5-10 minutes to compile the gem.

  ## Getting Started

@@ -50,6 +50,18 @@ Overwrite a table
  DeltaLake.write("./data/delta", df, mode: "overwrite")
  ```

+ Add a constraint
+
+ ```ruby
+ dt.alter.add_constraint({"a_gt_0" => "a > 0"})
+ ```
+
+ Drop a constraint
+
+ ```ruby
+ dt.alter.drop_constraint("a_gt_0")
+ ```
+
  Delete rows

  ```ruby
@@ -62,6 +74,18 @@ Vacuum
  dt.vacuum(dry_run: false)
  ```

+ Perform small file compaction
+
+ ```ruby
+ dt.optimize.compact
+ ```
+
+ Colocate similar data in the same files
+
+ ```ruby
+ dt.optimize.z_order(["a"])
+ ```
+
  Load a previous version of a table

  ```ruby
@@ -70,16 +94,22 @@ dt = DeltaLake::Table.new("./data/delta", version: 1)
  dt.load_as_version(1)
  ```

+ Get the schema
+
+ ```ruby
+ dt.schema
+ ```
+
  Get metadata

  ```ruby
  dt.metadata
  ```

- Get the schema
+ Get history

  ```ruby
- dt.schema
+ dt.history
  ```

  ## API
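
The README additions above do not cover the experimental `load_cdf` method from the changelog. A minimal sketch, using the keyword arguments defined in `lib/deltalake/table.rb` later in this diff; the return value wraps an Arrow C stream (`DeltaLake::ArrowArrayStream`), which only exposes the raw stream pointer via `to_i`:

```ruby
# experimental in 0.1.1; arguments follow lib/deltalake/table.rb below
stream = dt.load_cdf(starting_version: 0, ending_version: 1)
stream.to_i # address of the underlying FFI_ArrowArrayStream
```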
@@ -1,6 +1,6 @@
  [package]
  name = "deltalake"
- version = "0.1.0"
+ version = "0.1.1"
  license = "Apache-2.0"
  authors = ["Andrew Kane <andrew@ankane.org>"]
  edition = "2021"
@@ -15,7 +15,9 @@ arrow = { version = "52", features = ["ffi"] }
  arrow-schema = { version = "52", features = ["serde"] }
  chrono = "0.4"
  deltalake = { version = "=0.21.0", features = ["datafusion", "s3"] }
+ futures = "0.3"
  magnus = "0.7"
+ num_cpus = "1"
  serde = "1"
  serde_json = "1"
  tokio = { version = "1", features = ["rt-multi-thread"] }
@@ -3,20 +3,39 @@ mod schema;
  mod utils;

  use std::cell::RefCell;
- use std::collections::HashMap;
+ use std::collections::{HashMap, HashSet};
  use std::future::IntoFuture;
+ use std::str::FromStr;
+ use std::time;

- use chrono::Duration;
+ use chrono::{DateTime, Duration, FixedOffset, Utc};
  use deltalake::arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream};
- use deltalake::kernel::StructType;
+ use deltalake::arrow::record_batch::RecordBatchIterator;
+ use deltalake::datafusion::physical_plan::ExecutionPlan;
+ use deltalake::datafusion::prelude::SessionContext;
+ use deltalake::errors::DeltaTableError;
+ use deltalake::kernel::{scalars::ScalarExt, StructType, Transaction};
+ use deltalake::operations::collect_sendable_stream;
+ use deltalake::operations::constraints::ConstraintBuilder;
  use deltalake::operations::delete::DeleteBuilder;
+ use deltalake::operations::drop_constraints::DropConstraintBuilder;
+ use deltalake::operations::filesystem_check::FileSystemCheckBuilder;
+ use deltalake::operations::load_cdf::CdfLoadBuilder;
+ use deltalake::operations::optimize::{OptimizeBuilder, OptimizeType};
+ use deltalake::operations::restore::RestoreBuilder;
+ use deltalake::operations::transaction::TableReference;
  use deltalake::operations::vacuum::VacuumBuilder;
+ use deltalake::partitions::PartitionFilter;
  use deltalake::storage::IORuntime;
  use deltalake::DeltaOps;
  use error::DeltaError;
+ use futures::future::join_all;

- use magnus::{function, method, prelude::*, Error, Module, Ruby, Value};
+ use magnus::{
+ exception, function, method, prelude::*, Error, Integer, Module, RArray, RHash, Ruby, Value,
+ };

+ use crate::error::DeltaProtocolError;
  use crate::error::RubyError;
  use crate::schema::{schema_to_rbobject, Field};
  use crate::utils::rt;
@@ -38,6 +57,19 @@ struct RawDeltaTableMetaData {
  configuration: HashMap<String, Option<String>>,
  }

+ #[magnus::wrap(class = "DeltaLake::ArrowArrayStream")]
+ pub struct ArrowArrayStream {
+ stream: FFI_ArrowArrayStream,
+ }
+
+ impl ArrowArrayStream {
+ pub fn to_i(&self) -> usize {
+ (&self.stream as *const _) as usize
+ }
+ }
+
+ type StringVec = Vec<String>;
+
  impl RawDeltaTable {
  pub fn new(
  table_uri: String,
@@ -113,37 +145,138 @@ impl RawDeltaTable {
  })
  }

+ pub fn protocol_versions(&self) -> RbResult<(i32, i32, Option<StringVec>, Option<StringVec>)> {
+ let binding = self._table.borrow();
+ let table_protocol = binding.protocol().map_err(RubyError::from)?;
+ Ok((
+ table_protocol.min_reader_version,
+ table_protocol.min_writer_version,
+ table_protocol
+ .writer_features
+ .as_ref()
+ .and_then(|features| {
+ let empty_set = !features.is_empty();
+ empty_set.then(|| {
+ features
+ .iter()
+ .map(|v| v.to_string())
+ .collect::<Vec<String>>()
+ })
+ }),
+ table_protocol
+ .reader_features
+ .as_ref()
+ .and_then(|features| {
+ let empty_set = !features.is_empty();
+ empty_set.then(|| {
+ features
+ .iter()
+ .map(|v| v.to_string())
+ .collect::<Vec<String>>()
+ })
+ }),
+ ))
+ }
+
  pub fn load_version(&self, version: i64) -> RbResult<()> {
  Ok(rt()
  .block_on(self._table.borrow_mut().load_version(version))
  .map_err(RubyError::from)?)
  }

- pub fn files(&self) -> RbResult<Vec<String>> {
- if !self.has_files()? {
- return Err(DeltaError::new_err("Table is instantiated without files."));
- }
+ pub fn get_latest_version(&self) -> RbResult<i64> {
+ Ok(rt()
+ .block_on(self._table.borrow().get_latest_version())
+ .map_err(RubyError::from)?)
+ }

+ pub fn get_earliest_version(&self) -> RbResult<i64> {
+ Ok(rt()
+ .block_on(self._table.borrow().get_earliest_version())
+ .map_err(RubyError::from)?)
+ }
+
+ pub fn get_num_index_cols(&self) -> RbResult<i32> {
  Ok(self
  ._table
  .borrow()
- .get_files_iter()
+ .snapshot()
  .map_err(RubyError::from)?
- .map(|f| f.to_string())
- .collect())
+ .config()
+ .num_indexed_cols())
  }

- pub fn file_uris(&self) -> RbResult<Vec<String>> {
- if !self._table.borrow().config.require_files {
- return Err(DeltaError::new_err("Table is initiated without files."));
- }
-
+ pub fn get_stats_columns(&self) -> RbResult<Option<Vec<String>>> {
  Ok(self
  ._table
  .borrow()
- .get_file_uris()
+ .snapshot()
  .map_err(RubyError::from)?
- .collect())
+ .config()
+ .stats_columns()
+ .map(|v| v.iter().map(|v| v.to_string()).collect::<Vec<String>>()))
+ }
+
+ pub fn load_with_datetime(&self, ds: String) -> RbResult<()> {
+ let datetime = DateTime::<Utc>::from(
+ DateTime::<FixedOffset>::parse_from_rfc3339(&ds).map_err(|err| {
+ Error::new(
+ exception::arg_error(),
+ format!("Failed to parse datetime string: {err}"),
+ )
+ })?,
+ );
+ Ok(rt()
+ .block_on(self._table.borrow_mut().load_with_datetime(datetime))
+ .map_err(RubyError::from)?)
+ }
+
+ pub fn files(&self, partition_filters: Option<Value>) -> RbResult<Vec<String>> {
+ if !self.has_files()? {
+ return Err(DeltaError::new_err("Table is instantiated without files."));
+ }
+
+ if let Some(filters) = partition_filters {
+ let filters = convert_partition_filters(filters).map_err(RubyError::from)?;
+ Ok(self
+ ._table
+ .borrow()
+ .get_files_by_partitions(&filters)
+ .map_err(RubyError::from)?
+ .into_iter()
+ .map(|p| p.to_string())
+ .collect())
+ } else {
+ Ok(self
+ ._table
+ .borrow()
+ .get_files_iter()
+ .map_err(RubyError::from)?
+ .map(|f| f.to_string())
+ .collect())
+ }
+ }
+
+ pub fn file_uris(&self, partition_filters: Option<Value>) -> RbResult<Vec<String>> {
+ if !self._table.borrow().config.require_files {
+ return Err(DeltaError::new_err("Table is initiated without files."));
+ }
+
+ if let Some(filters) = partition_filters {
+ let filters = convert_partition_filters(filters).map_err(RubyError::from)?;
+ Ok(self
+ ._table
+ .borrow()
+ .get_file_uris_by_partitions(&filters)
+ .map_err(RubyError::from)?)
+ } else {
+ Ok(self
+ ._table
+ .borrow()
+ .get_file_uris()
+ .map_err(RubyError::from)?
+ .collect())
+ }
  }

  pub fn schema(&self) -> RbResult<Value> {
@@ -177,6 +310,214 @@ impl RawDeltaTable {
  Ok(metrics.files_deleted)
  }

+ pub fn compact_optimize(
+ &self,
+ target_size: Option<i64>,
+ max_concurrent_tasks: Option<usize>,
+ min_commit_interval: Option<u64>,
+ ) -> RbResult<String> {
+ let mut cmd = OptimizeBuilder::new(
+ self._table.borrow().log_store(),
+ self._table
+ .borrow()
+ .snapshot()
+ .map_err(RubyError::from)?
+ .clone(),
+ )
+ .with_max_concurrent_tasks(max_concurrent_tasks.unwrap_or_else(num_cpus::get));
+ if let Some(size) = target_size {
+ cmd = cmd.with_target_size(size);
+ }
+ if let Some(commit_interval) = min_commit_interval {
+ cmd = cmd.with_min_commit_interval(time::Duration::from_secs(commit_interval));
+ }
+
+ let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+ self._table.borrow_mut().state = table.state;
+ Ok(serde_json::to_string(&metrics).unwrap())
+ }
+
+ pub fn z_order_optimize(
+ &self,
+ z_order_columns: Vec<String>,
+ target_size: Option<i64>,
+ max_concurrent_tasks: Option<usize>,
+ max_spill_size: usize,
+ min_commit_interval: Option<u64>,
+ ) -> RbResult<String> {
+ let mut cmd = OptimizeBuilder::new(
+ self._table.borrow().log_store(),
+ self._table
+ .borrow()
+ .snapshot()
+ .map_err(RubyError::from)?
+ .clone(),
+ )
+ .with_max_concurrent_tasks(max_concurrent_tasks.unwrap_or_else(num_cpus::get))
+ .with_max_spill_size(max_spill_size)
+ .with_type(OptimizeType::ZOrder(z_order_columns));
+ if let Some(size) = target_size {
+ cmd = cmd.with_target_size(size);
+ }
+ if let Some(commit_interval) = min_commit_interval {
+ cmd = cmd.with_min_commit_interval(time::Duration::from_secs(commit_interval));
+ }
+
+ let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+ self._table.borrow_mut().state = table.state;
+ Ok(serde_json::to_string(&metrics).unwrap())
+ }
+
+ pub fn add_constraints(&self, constraints: HashMap<String, String>) -> RbResult<()> {
+ let mut cmd = ConstraintBuilder::new(
+ self._table.borrow().log_store(),
+ self._table
+ .borrow()
+ .snapshot()
+ .map_err(RubyError::from)?
+ .clone(),
+ );
+
+ for (col_name, expression) in constraints {
+ cmd = cmd.with_constraint(col_name.clone(), expression.clone());
+ }
+
+ let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+ self._table.borrow_mut().state = table.state;
+ Ok(())
+ }
+
+ pub fn drop_constraints(&self, name: String, raise_if_not_exists: bool) -> RbResult<()> {
+ let cmd = DropConstraintBuilder::new(
+ self._table.borrow().log_store(),
+ self._table
+ .borrow()
+ .snapshot()
+ .map_err(RubyError::from)?
+ .clone(),
+ )
+ .with_constraint(name)
+ .with_raise_if_not_exists(raise_if_not_exists);
+
+ let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+ self._table.borrow_mut().state = table.state;
+ Ok(())
+ }
+
+ pub fn load_cdf(
+ &self,
+ starting_version: i64,
+ ending_version: Option<i64>,
+ starting_timestamp: Option<String>,
+ ending_timestamp: Option<String>,
+ columns: Option<Vec<String>>,
+ ) -> RbResult<ArrowArrayStream> {
+ let ctx = SessionContext::new();
+ let mut cdf_read = CdfLoadBuilder::new(
+ self._table.borrow().log_store(),
+ self._table
+ .borrow()
+ .snapshot()
+ .map_err(RubyError::from)?
+ .clone(),
+ )
+ .with_starting_version(starting_version);
+
+ if let Some(ev) = ending_version {
+ cdf_read = cdf_read.with_ending_version(ev);
+ }
+ if let Some(st) = starting_timestamp {
+ let starting_ts: DateTime<Utc> = DateTime::<Utc>::from_str(&st)
+ .map_err(|pe| Error::new(exception::arg_error(), pe.to_string()))?
+ .to_utc();
+ cdf_read = cdf_read.with_starting_timestamp(starting_ts);
+ }
+ if let Some(et) = ending_timestamp {
+ let ending_ts = DateTime::<Utc>::from_str(&et)
+ .map_err(|pe| Error::new(exception::arg_error(), pe.to_string()))?
+ .to_utc();
+ cdf_read = cdf_read.with_starting_timestamp(ending_ts);
+ }
+
+ if let Some(columns) = columns {
+ cdf_read = cdf_read.with_columns(columns);
+ }
+
+ cdf_read = cdf_read.with_session_ctx(ctx.clone());
+
+ let plan = rt().block_on(cdf_read.build()).map_err(RubyError::from)?;
+
+ let mut tasks = vec![];
+ for p in 0..plan.properties().output_partitioning().partition_count() {
+ let inner_plan = plan.clone();
+ let partition_batch = inner_plan.execute(p, ctx.task_ctx()).unwrap();
+ let handle = rt().spawn(collect_sendable_stream(partition_batch));
+ tasks.push(handle);
+ }
+
+ // This is unfortunate.
+ let batches = rt()
+ .block_on(join_all(tasks))
+ .into_iter()
+ .flatten()
+ .collect::<Result<Vec<Vec<_>>, _>>()
+ .unwrap()
+ .into_iter()
+ .flatten()
+ .map(Ok);
+ let batch_iter = RecordBatchIterator::new(batches, plan.schema());
+ let ffi_stream = FFI_ArrowArrayStream::new(Box::new(batch_iter));
+ Ok(ArrowArrayStream { stream: ffi_stream })
+ }
+
+ pub fn restore(
+ &self,
+ target: Option<Value>,
+ ignore_missing_files: bool,
+ protocol_downgrade_allowed: bool,
+ ) -> RbResult<String> {
+ let mut cmd = RestoreBuilder::new(
+ self._table.borrow().log_store(),
+ self._table
+ .borrow()
+ .snapshot()
+ .map_err(RubyError::from)?
+ .clone(),
+ );
+ if let Some(val) = target {
+ if let Some(version) = Integer::from_value(val) {
+ cmd = cmd.with_version_to_restore(version.to_i64()?)
+ }
+ if let Ok(ds) = String::try_convert(val) {
+ let datetime = DateTime::<Utc>::from(
+ DateTime::<FixedOffset>::parse_from_rfc3339(ds.as_ref()).map_err(|err| {
+ Error::new(
+ exception::arg_error(),
+ format!("Failed to parse datetime string: {err}"),
+ )
+ })?,
+ );
+ cmd = cmd.with_datetime_to_restore(datetime)
+ }
+ }
+ cmd = cmd.with_ignore_missing_files(ignore_missing_files);
+ cmd = cmd.with_protocol_downgrade_allowed(protocol_downgrade_allowed);
+
+ let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+ self._table.borrow_mut().state = table.state;
+ Ok(serde_json::to_string(&metrics).unwrap())
+ }
+
+ pub fn history(&self, limit: Option<usize>) -> RbResult<Vec<String>> {
+ let history = rt()
+ .block_on(self._table.borrow().history(limit))
+ .map_err(RubyError::from)?;
+ Ok(history
+ .iter()
+ .map(|c| serde_json::to_string(c).unwrap())
+ .collect())
+ }
+
  pub fn update_incremental(&self) -> RbResult<()> {
  #[allow(deprecated)]
  Ok(rt()
@@ -184,6 +525,56 @@ impl RawDeltaTable {
  .map_err(RubyError::from)?)
  }

+ fn get_active_partitions(&self) -> RbResult<RArray> {
+ let binding = self._table.borrow();
+ let _column_names: HashSet<&str> = binding
+ .get_schema()
+ .map_err(|_| DeltaProtocolError::new_err("table does not yet have a schema"))?
+ .fields()
+ .map(|field| field.name().as_str())
+ .collect();
+ let partition_columns: HashSet<&str> = binding
+ .metadata()
+ .map_err(RubyError::from)?
+ .partition_columns
+ .iter()
+ .map(|col| col.as_str())
+ .collect();
+
+ let converted_filters = Vec::new();
+
+ let partition_columns: Vec<&str> = partition_columns.into_iter().collect();
+
+ let adds = binding
+ .snapshot()
+ .map_err(RubyError::from)?
+ .get_active_add_actions_by_partitions(&converted_filters)
+ .map_err(RubyError::from)?
+ .collect::<Result<Vec<_>, _>>()
+ .map_err(RubyError::from)?;
+ let active_partitions: HashSet<Vec<(&str, Option<String>)>> = adds
+ .iter()
+ .flat_map(|add| {
+ Ok::<_, RubyError>(
+ partition_columns
+ .iter()
+ .flat_map(|col| {
+ Ok::<_, RubyError>((
+ *col,
+ add.partition_values()
+ .map_err(RubyError::from)?
+ .get(*col)
+ .map(|v| v.serialize()),
+ ))
+ })
+ .collect(),
+ )
+ })
+ .collect();
+
+ Ok(RArray::from_iter(active_partitions))
+ }
+
  pub fn delete(&self, predicate: Option<String>) -> RbResult<String> {
  let mut cmd = DeleteBuilder::new(
  self._table.borrow().log_store(),
@@ -201,6 +592,38 @@ impl RawDeltaTable {
  self._table.borrow_mut().state = table.state;
  Ok(serde_json::to_string(&metrics).unwrap())
  }
+
+ pub fn repair(&self, dry_run: bool) -> RbResult<String> {
+ let cmd = FileSystemCheckBuilder::new(
+ self._table.borrow().log_store(),
+ self._table
+ .borrow()
+ .snapshot()
+ .map_err(RubyError::from)?
+ .clone(),
+ )
+ .with_dry_run(dry_run);
+
+ let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+ self._table.borrow_mut().state = table.state;
+ Ok(serde_json::to_string(&metrics).unwrap())
+ }
+
+ pub fn transaction_versions(&self) -> RHash {
+ RHash::from_iter(
+ self._table
+ .borrow()
+ .get_app_transaction_version()
+ .into_iter()
+ .map(|(app_id, transaction)| (app_id, RbTransaction::from(transaction))),
+ )
+ }
+ }
+
+ fn convert_partition_filters(
+ _partitions_filters: Value,
+ ) -> Result<Vec<PartitionFilter>, DeltaTableError> {
+ todo!()
  }

  impl RawDeltaTableMetaData {
@@ -229,6 +652,23 @@ impl RawDeltaTableMetaData {
  }
  }

+ #[magnus::wrap(class = "DeltaLake::Transaction")]
+ pub struct RbTransaction {
+ pub app_id: String,
+ pub version: i64,
+ pub last_updated: Option<i64>,
+ }
+
+ impl From<Transaction> for RbTransaction {
+ fn from(value: Transaction) -> Self {
+ RbTransaction {
+ app_id: value.app_id,
+ version: value.version,
+ last_updated: value.last_updated,
+ }
+ }
+ }
+
  #[allow(clippy::too_many_arguments)]
  fn write_to_deltalake(
  table_uri: String,
@@ -313,16 +753,68 @@ fn init(ruby: &Ruby) -> RbResult<()> {
  class.define_method("version", method!(RawDeltaTable::version, 0))?;
  class.define_method("has_files", method!(RawDeltaTable::has_files, 0))?;
  class.define_method("metadata", method!(RawDeltaTable::metadata, 0))?;
+ class.define_method(
+ "protocol_versions",
+ method!(RawDeltaTable::protocol_versions, 0),
+ )?;
  class.define_method("load_version", method!(RawDeltaTable::load_version, 1))?;
- class.define_method("files", method!(RawDeltaTable::files, 0))?;
- class.define_method("file_uris", method!(RawDeltaTable::file_uris, 0))?;
+ class.define_method(
+ "get_latest_version",
+ method!(RawDeltaTable::get_latest_version, 0),
+ )?;
+ class.define_method(
+ "get_earliest_version",
+ method!(RawDeltaTable::get_earliest_version, 0),
+ )?;
+ class.define_method(
+ "get_num_index_cols",
+ method!(RawDeltaTable::get_num_index_cols, 0),
+ )?;
+ class.define_method(
+ "get_stats_columns",
+ method!(RawDeltaTable::get_stats_columns, 0),
+ )?;
+ class.define_method(
+ "load_with_datetime",
+ method!(RawDeltaTable::load_with_datetime, 1),
+ )?;
+ class.define_method("files", method!(RawDeltaTable::files, 1))?;
+ class.define_method("file_uris", method!(RawDeltaTable::file_uris, 1))?;
  class.define_method("schema", method!(RawDeltaTable::schema, 0))?;
  class.define_method("vacuum", method!(RawDeltaTable::vacuum, 3))?;
+ class.define_method(
+ "compact_optimize",
+ method!(RawDeltaTable::compact_optimize, 3),
+ )?;
+ class.define_method(
+ "z_order_optimize",
+ method!(RawDeltaTable::z_order_optimize, 5),
+ )?;
+ class.define_method(
+ "add_constraints",
+ method!(RawDeltaTable::add_constraints, 1),
+ )?;
+ class.define_method(
+ "drop_constraints",
+ method!(RawDeltaTable::drop_constraints, 2),
+ )?;
+ class.define_method("load_cdf", method!(RawDeltaTable::load_cdf, 5))?;
+ class.define_method("restore", method!(RawDeltaTable::restore, 3))?;
+ class.define_method("history", method!(RawDeltaTable::history, 1))?;
  class.define_method(
  "update_incremental",
  method!(RawDeltaTable::update_incremental, 0),
  )?;
+ class.define_method(
+ "get_active_partitions",
+ method!(RawDeltaTable::get_active_partitions, 0),
+ )?;
  class.define_method("delete", method!(RawDeltaTable::delete, 1))?;
+ class.define_method("repair", method!(RawDeltaTable::repair, 1))?;
+ class.define_method(
+ "transaction_versions",
+ method!(RawDeltaTable::transaction_versions, 0),
+ )?;

  let class = module.define_class("RawDeltaTableMetaData", ruby.class_object())?;
  class.define_method("id", method!(RawDeltaTableMetaData::id, 0))?;
@@ -344,6 +836,9 @@ fn init(ruby: &Ruby) -> RbResult<()> {
  method!(RawDeltaTableMetaData::configuration, 0),
  )?;

+ let class = module.define_class("ArrowArrayStream", ruby.class_object())?;
+ class.define_method("to_i", method!(ArrowArrayStream::to_i, 0))?;
+
  let class = module.define_class("Field", ruby.class_object())?;
  class.define_method("name", method!(Field::name, 0))?;
  class.define_method("type", method!(Field::get_type, 0))?;
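
The methods registered above back the Ruby `DeltaLake::Table` class in the next part of this diff. For example, `protocol_versions` returns a four-element array that the Ruby side splats into the `ProtocolVersions` struct defined in `lib/deltalake.rb`; the values below are illustrative:

```ruby
dt.protocol
# => #<struct DeltaLake::ProtocolVersions
#      min_reader_version=1, min_writer_version=2,
#      writer_features=nil, reader_features=nil>
```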
@@ -26,22 +26,54 @@ module DeltaLake
  @table.version
  end

- def files
- @table.files
+ def partitions
+ partitions = []
+ @table.get_active_partitions.each do |partition|
+ next unless partition
+ partitions << partition.to_h
+ end
+ partitions
+ end
+
+ def files(partition_filters: nil)
+ @table.files(_stringify_partition_values(partition_filters))
  end

- def file_uris
- @table.file_uris
+ def file_uris(partition_filters: nil)
+ @table.file_uris(_stringify_partition_values(partition_filters))
  end

  def load_as_version(version)
  if version.is_a?(Integer)
  @table.load_version(version)
+ elsif version.is_a?(Time)
+ # needed for iso8601
+ require "time"
+
+ @table.load_with_datetime(version.utc.iso8601(9))
+ elsif version.is_a?(String)
+ @table.load_with_datetime(version)
  else
- raise TypeError, "Invalid datatype provided for version, only Integer is accepted."
+ raise TypeError, "Invalid datatype provided for version, only Integer, String, and Time are accepted."
  end
  end

+ def load_cdf(
+ starting_version: 0,
+ ending_version: nil,
+ starting_timestamp: nil,
+ ending_timestamp: nil,
+ columns: nil
+ )
+ @table.load_cdf(
+ starting_version,
+ ending_version,
+ starting_timestamp,
+ ending_timestamp,
+ columns
+ )
+ end
+
  def table_uri
  @table.table_uri
  end
@@ -54,6 +86,29 @@ module DeltaLake
  Metadata.new(@table)
  end

+ def protocol
+ ProtocolVersions.new(*@table.protocol_versions)
+ end
+
+ def history(limit: nil)
+ backwards_enumerate = lambda do |iterable, start_end, &block|
+ n = start_end
+ iterable.each do |elem|
+ block.call(n, elem)
+ n -= 1
+ end
+ end
+
+ commits = @table.history(limit)
+ history = []
+ backwards_enumerate.(commits, @table.get_latest_version) do |version, commit_info_raw|
+ commit = JSON.parse(commit_info_raw)
+ commit["version"] = version
+ history << commit
+ end
+ history
+ end
+
  def vacuum(
  retention_hours: nil,
  dry_run: true,
@@ -72,6 +127,40 @@ module DeltaLake
  )
  end

+ def optimize
+ TableOptimizer.new(self)
+ end
+
+ def alter
+ TableAlterer.new(self)
+ end
+
+ def restore(
+ target,
+ ignore_missing_files: false,
+ protocol_downgrade_allowed: false
+ )
+ if target.is_a?(Time)
+ # needed for iso8601
+ require "time"
+
+ metrics =
+ @table.restore(
+ target.utc.iso8601(9),
+ ignore_missing_files,
+ protocol_downgrade_allowed
+ )
+ else
+ metrics =
+ @table.restore(
+ target,
+ ignore_missing_files,
+ protocol_downgrade_allowed
+ )
+ end
+ JSON.parse(metrics)
+ end
+
  def to_polars(eager: true)
  require "polars-df"

@@ -80,7 +169,13 @@ module DeltaLake
  if sources.empty?
  Polars::LazyFrame.new
  else
- storage_options = @storage_options&.except("AWS_S3_ALLOW_UNSAFE_RENAME")
+ delta_keys = [
+ "AWS_S3_ALLOW_UNSAFE_RENAME",
+ "AWS_S3_LOCKING_PROVIDER",
+ "CONDITIONAL_PUT",
+ "DELTA_DYNAMO_TABLE_NAME"
+ ]
+ storage_options = @storage_options&.reject { |k, _| delta_keys.include?(k.to_s.upcase) }
  Polars.scan_parquet(sources, storage_options: storage_options)
  end
  eager ? lf.collect : lf
@@ -95,9 +190,30 @@ module DeltaLake
  JSON.parse(metrics).transform_keys(&:to_sym)
  end

+ def repair(dry_run: false)
+ metrics =
+ @table.repair(
+ dry_run
+ )
+ JSON.parse(metrics).transform_keys(&:to_sym)
+ end
+
+ def transaction_versions
+ @table.transaction_versions
+ end
+
  # private
  def _table
  @table
  end
+
+ # private
+ def _stringify_partition_values(partition_filters)
+ if partition_filters.nil?
+ return partition_filters
+ end
+
+ raise Todo
+ end
  end
  end
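
A rough sketch of the new readers defined above; the commit hashes come from each version's `commitInfo` entry (fields vary by operation), with the `"version"` key added by `history` itself, and the partition hashes map partition column names to their serialized values:

```ruby
dt.history(limit: 2)
# => [{"operation" => "WRITE", ..., "version" => 3}, {..., "version" => 2}] (illustrative)
dt.partitions
# => [{"year" => "2024"}, {"year" => "2023"}] (illustrative)
dt.transaction_versions
# => {} (Hash of app id => DeltaLake::Transaction)
```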
@@ -0,0 +1,25 @@
+ module DeltaLake
+ class TableAlterer
+ def initialize(table)
+ @table = table
+ end
+
+ def add_constraint(constraints)
+ if constraints.length > 1
+ raise ArgumentError,
+ "add_constraints is limited to a single constraint addition at once for now."
+ end
+
+ @table._table.add_constraints(
+ constraints
+ )
+ end
+
+ def drop_constraint(name, raise_if_not_exists: true)
+ @table._table.drop_constraints(
+ name,
+ raise_if_not_exists
+ )
+ end
+ end
+ end
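
As the guard above shows, `add_constraint` accepts a single constraint per call in this release, while `drop_constraint` can be told not to raise when the constraint does not exist:

```ruby
dt.alter.add_constraint({"a_gt_0" => "a > 0"})
dt.alter.drop_constraint("a_gt_0", raise_if_not_exists: false)
```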
@@ -0,0 +1,51 @@
+ module DeltaLake
+ class TableOptimizer
+ def initialize(table)
+ @table = table
+ end
+
+ def compact(
+ target_size: nil,
+ max_concurrent_tasks: nil,
+ min_commit_interval: nil
+ )
+ metrics =
+ @table._table.compact_optimize(
+ target_size,
+ max_concurrent_tasks,
+ min_commit_interval
+ )
+ @table.update_incremental
+ result = JSON.parse(metrics)
+ ["filesAdded", "filesRemoved"].each do |key|
+ result[key] = JSON.parse(result[key]) if result[key].is_a?(String)
+ end
+ # TODO return underscore symbols like delete
+ result
+ end
+
+ def z_order(
+ columns,
+ target_size: nil,
+ max_concurrent_tasks: nil,
+ max_spill_size: 20 * 1024 * 1024 * 1024,
+ min_commit_interval: nil
+ )
+ metrics =
+ @table._table.z_order_optimize(
+ Array(columns),
+ target_size,
+ max_concurrent_tasks,
+ max_spill_size,
+ min_commit_interval
+ )
+ @table.update_incremental
+ result = JSON.parse(metrics)
+ ["filesAdded", "filesRemoved"].each do |key|
+ result[key] = JSON.parse(result[key]) if result[key].is_a?(String)
+ end
+ # TODO return underscore symbols like delete
+ result
+ end
+ end
+ end
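
Both optimize entry points accept tuning keywords that pass straight through to the Rust `OptimizeBuilder`: the sizes are in bytes and `min_commit_interval` is in seconds (the values below are illustrative):

```ruby
dt.optimize.compact(target_size: 128 * 1024 * 1024, min_commit_interval: 60)
dt.optimize.z_order(["a", "b"], max_concurrent_tasks: 4)
```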
@@ -1,3 +1,3 @@
  module DeltaLake
- VERSION = "0.1.0"
+ VERSION = "0.1.1"
  end
data/lib/deltalake.rb CHANGED
@@ -13,6 +13,8 @@ require_relative "deltalake/field"
  require_relative "deltalake/metadata"
  require_relative "deltalake/schema"
  require_relative "deltalake/table"
+ require_relative "deltalake/table_alterer"
+ require_relative "deltalake/table_optimizer"
  require_relative "deltalake/version"

  module DeltaLake
@@ -22,8 +24,21 @@ module DeltaLake
  class CommitFailedError < Error; end
  class SchemaMismatchError < Error; end

- class << self
+ class Todo < Error
+ def message
+ "not implemented yet"
+ end
+ end
+
+ ProtocolVersions =
+ Struct.new(
+ :min_reader_version,
+ :min_writer_version,
+ :writer_features,
+ :reader_features
+ )

+ class << self
  def write(
  table_or_uri,
  data,
@@ -95,10 +110,58 @@ module DeltaLake

  def convert_data(data)
  if data.respond_to?(:arrow_c_stream)
+ # TODO convert other object types
+ # should probably move logic to Rust
+ if defined?(Polars::DataFrame) && data.is_a?(Polars::DataFrame)
+ data = convert_polars_data(data)
+ end
+
  data.arrow_c_stream
  else
  raise TypeError, "Only objects implementing the Arrow C stream interface are valid inputs for source."
  end
  end
+
+ # unsigned integers are not part of the protocol
+ # https://github.com/delta-io/delta/blob/master/PROTOCOL.md#primitive-types
+ def convert_polars_data(data)
+ new_schema = {}
+ data.schema.each do |k, v|
+ new_type = convert_polars_type(v)
+ new_schema[k] = new_type if new_type
+ end
+
+ if new_schema.any?
+ data.cast(new_schema)
+ else
+ data
+ end
+ end
+
+ def convert_polars_type(t)
+ case t
+ when Polars::UInt8
+ Polars::Int8
+ when Polars::UInt16
+ Polars::Int16
+ when Polars::UInt32
+ Polars::Int32
+ when Polars::UInt64
+ Polars::Int64
+ when Polars::Datetime
+ Polars::Datetime.new("us", t.time_zone) if t.time_unit != "us"
+ when Polars::List
+ inner = convert_polars_type(t.inner)
+ Polars::List.new(inner) if inner
+ when Polars::Array
+ inner = convert_polars_type(t.inner)
+ Polars::Array.new(t.inner, t.width) if inner
+ when Polars::Struct
+ if t.fields.any? { |f| convert_polars_type(f.dtype) }
+ fields = t.fields.map { |f| Polars::Field.new(f.name, convert_polars_type(f.dtype) || f.dtype) }
+ Polars::Struct.new(fields)
+ end
+ end
+ end
  end
  end
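
Because the Delta protocol has no unsigned integer types, `convert_polars_data` above casts unsigned Polars columns to their signed counterparts (and non-microsecond datetimes to microsecond precision) before writing. A hedged sketch; the `Polars::DataFrame` constructor options come from the polars-df gem, not from this diff:

```ruby
df = Polars::DataFrame.new({"id" => [1, 2, 3]}, schema: {"id" => Polars::UInt32})
DeltaLake.write("./data/delta", df) # the "id" column is written as Int32
```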
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: deltalake-rb
  version: !ruby/object:Gem::Version
- version: 0.1.0
+ version: 0.1.1
  platform: ruby
  authors:
  - Andrew Kane
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2024-11-21 00:00:00.000000000 Z
+ date: 2024-11-23 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rb_sys
@@ -48,6 +48,8 @@ files:
  - lib/deltalake/metadata.rb
  - lib/deltalake/schema.rb
  - lib/deltalake/table.rb
+ - lib/deltalake/table_alterer.rb
+ - lib/deltalake/table_optimizer.rb
  - lib/deltalake/version.rb
  homepage: https://github.com/ankane/delta-ruby
  licenses: