deltalake-rb 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/Cargo.lock +3 -1
- data/README.md +33 -3
- data/ext/deltalake/Cargo.toml +3 -1
- data/ext/deltalake/src/lib.rs +515 -20
- data/lib/deltalake/table.rb +122 -6
- data/lib/deltalake/table_alterer.rb +25 -0
- data/lib/deltalake/table_optimizer.rb +51 -0
- data/lib/deltalake/version.rb +1 -1
- data/lib/deltalake.rb +64 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1ad1a2f352a83da63ccbde0126430b052d44801be630f3ad5e8326832205dc52
+  data.tar.gz: 1ce59b16589b891390d4ab1e81284d57389b6e8ff950faf11d1b3cf4736ce235
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 28c7d8f93e8dc78d9e81490e62d27be6d431efea6e1242c14ecb931b82b766926cb65a35b9440d372d27513e95c1f5c90e042fa42fb97a6407f1c3d30abb15ca
+  data.tar.gz: 547aa9019ac83f8ae9955f6f2cafe062a14223bdf18a29471b98e68d86d9655c69d47e84f520f9e1b5331e10f00e807141ed18d3786ae9995b120230c7217855
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,13 @@
+## 0.1.1 (2024-11-22)
+
+- Added support for constraints
+- Added support for small file compaction
+- Added support for Z Ordering
+- Added `history`, `partitions`, `protocol`, `repair`, and `restore` methods to `Table`
+- Added experimental `load_cdf` method to `Table`
+- Fixed handling of unsigned integers
+- Fixed error with timestamps
+
 ## 0.1.0 (2024-11-20)
 
 - First release
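Taken together, these entries describe a small new API surface on `DeltaLake::Table`. A quick sketch of how the 0.1.1 additions are invoked, assuming a table already exists at `./data/delta` (the path used in the README below):

```ruby
require "deltalake"

dt = DeltaLake::Table.new("./data/delta")

dt.alter.add_constraint({"a_gt_0" => "a > 0"}) # constraints
dt.optimize.compact                            # small file compaction
dt.optimize.z_order(["a"])                     # Z Ordering
dt.history(limit: 5)                           # commit history as parsed JSON hashes
dt.protocol                                    # reader/writer protocol versions
dt.partitions                                  # active partition values
dt.repair(dry_run: true)                       # filesystem check
```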
data/Cargo.lock
CHANGED
@@ -1488,13 +1488,15 @@ dependencies = [
 
 [[package]]
 name = "deltalake"
-version = "0.1.0"
+version = "0.1.1"
 dependencies = [
  "arrow",
  "arrow-schema",
  "chrono",
  "deltalake 0.21.0",
+ "futures",
  "magnus",
+ "num_cpus",
  "serde",
  "serde_json",
  "tokio",
data/README.md
CHANGED
@@ -14,7 +14,7 @@ Add this line to your application’s Gemfile:
 gem "deltalake-rb"
 ```
 
-It can take
+It can take 5-10 minutes to compile the gem.
 
 ## Getting Started
 
@@ -50,6 +50,18 @@ Overwrite a table
 DeltaLake.write("./data/delta", df, mode: "overwrite")
 ```
 
+Add a constraint
+
+```ruby
+dt.alter.add_constraint({"a_gt_0" => "a > 0"})
+```
+
+Drop a constraint
+
+```ruby
+dt.alter.drop_constraint("a_gt_0")
+```
+
 Delete rows
 
 ```ruby
@@ -62,6 +74,18 @@ Vacuum
 dt.vacuum(dry_run: false)
 ```
 
+Perform small file compaction
+
+```ruby
+dt.optimize.compact
+```
+
+Colocate similar data in the same files
+
+```ruby
+dt.optimize.z_order(["a"])
+```
+
 Load a previous version of a table
 
 ```ruby
@@ -70,16 +94,22 @@ dt = DeltaLake::Table.new("./data/delta", version: 1)
 dt.load_as_version(1)
 ```
 
+Get the schema
+
+```ruby
+dt.schema
+```
+
 Get metadata
 
 ```ruby
 dt.metadata
 ```
 
-Get
+Get history
 
 ```ruby
-dt.
+dt.history
 ```
 
 ## API
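The README additions stop at `dt.history`, but this release also extends time travel: `load_as_version` now accepts RFC 3339 strings and `Time` objects in addition to integers (see the `table.rb` changes below). A minimal sketch:

```ruby
dt = DeltaLake::Table.new("./data/delta")

dt.load_as_version(1)                      # integer version
dt.load_as_version("2024-11-22T00:00:00Z") # RFC 3339 / ISO 8601 string
dt.load_as_version(Time.now - 3600)        # Time, converted to UTC ISO 8601 internally
```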
data/ext/deltalake/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
 [package]
 name = "deltalake"
-version = "0.1.0"
+version = "0.1.1"
 license = "Apache-2.0"
 authors = ["Andrew Kane <andrew@ankane.org>"]
 edition = "2021"
@@ -15,7 +15,9 @@ arrow = { version = "52", features = ["ffi"] }
 arrow-schema = { version = "52", features = ["serde"] }
 chrono = "0.4"
 deltalake = { version = "=0.21.0", features = ["datafusion", "s3"] }
+futures = "0.3"
 magnus = "0.7"
+num_cpus = "1"
 serde = "1"
 serde_json = "1"
 tokio = { version = "1", features = ["rt-multi-thread"] }
data/ext/deltalake/src/lib.rs
CHANGED
@@ -3,20 +3,39 @@ mod schema;
 mod utils;
 
 use std::cell::RefCell;
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::future::IntoFuture;
+use std::str::FromStr;
+use std::time;
 
-use chrono::Duration;
+use chrono::{DateTime, Duration, FixedOffset, Utc};
 use deltalake::arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream};
-use deltalake::
+use deltalake::arrow::record_batch::RecordBatchIterator;
+use deltalake::datafusion::physical_plan::ExecutionPlan;
+use deltalake::datafusion::prelude::SessionContext;
+use deltalake::errors::DeltaTableError;
+use deltalake::kernel::{scalars::ScalarExt, StructType, Transaction};
+use deltalake::operations::collect_sendable_stream;
+use deltalake::operations::constraints::ConstraintBuilder;
 use deltalake::operations::delete::DeleteBuilder;
+use deltalake::operations::drop_constraints::DropConstraintBuilder;
+use deltalake::operations::filesystem_check::FileSystemCheckBuilder;
+use deltalake::operations::load_cdf::CdfLoadBuilder;
+use deltalake::operations::optimize::{OptimizeBuilder, OptimizeType};
+use deltalake::operations::restore::RestoreBuilder;
+use deltalake::operations::transaction::TableReference;
 use deltalake::operations::vacuum::VacuumBuilder;
+use deltalake::partitions::PartitionFilter;
 use deltalake::storage::IORuntime;
 use deltalake::DeltaOps;
 use error::DeltaError;
+use futures::future::join_all;
 
-use magnus::{
+use magnus::{
+    exception, function, method, prelude::*, Error, Integer, Module, RArray, RHash, Ruby, Value,
+};
 
+use crate::error::DeltaProtocolError;
 use crate::error::RubyError;
 use crate::schema::{schema_to_rbobject, Field};
 use crate::utils::rt;
@@ -38,6 +57,19 @@ struct RawDeltaTableMetaData {
     configuration: HashMap<String, Option<String>>,
 }
 
+#[magnus::wrap(class = "DeltaLake::ArrowArrayStream")]
+pub struct ArrowArrayStream {
+    stream: FFI_ArrowArrayStream,
+}
+
+impl ArrowArrayStream {
+    pub fn to_i(&self) -> usize {
+        (&self.stream as *const _) as usize
+    }
+}
+
+type StringVec = Vec<String>;
+
 impl RawDeltaTable {
     pub fn new(
         table_uri: String,
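The new `ArrowArrayStream` wrapper exposes only the raw address of the underlying `FFI_ArrowArrayStream`. The intended pattern (a sketch; the consuming side is not part of this gem) is to hand that address to a library that implements the Arrow C stream interface:

```ruby
stream = dt.load_cdf  # DeltaLake::ArrowArrayStream (experimental)
address = stream.to_i # pointer address of the FFI_ArrowArrayStream
# pass `address` to an Arrow C-data-interface consumer to import the record batches
```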
@@ -113,37 +145,138 @@ impl RawDeltaTable {
         })
     }
 
+    pub fn protocol_versions(&self) -> RbResult<(i32, i32, Option<StringVec>, Option<StringVec>)> {
+        let binding = self._table.borrow();
+        let table_protocol = binding.protocol().map_err(RubyError::from)?;
+        Ok((
+            table_protocol.min_reader_version,
+            table_protocol.min_writer_version,
+            table_protocol
+                .writer_features
+                .as_ref()
+                .and_then(|features| {
+                    let empty_set = !features.is_empty();
+                    empty_set.then(|| {
+                        features
+                            .iter()
+                            .map(|v| v.to_string())
+                            .collect::<Vec<String>>()
+                    })
+                }),
+            table_protocol
+                .reader_features
+                .as_ref()
+                .and_then(|features| {
+                    let empty_set = !features.is_empty();
+                    empty_set.then(|| {
+                        features
+                            .iter()
+                            .map(|v| v.to_string())
+                            .collect::<Vec<String>>()
+                    })
+                }),
+        ))
+    }
+
     pub fn load_version(&self, version: i64) -> RbResult<()> {
         Ok(rt()
             .block_on(self._table.borrow_mut().load_version(version))
             .map_err(RubyError::from)?)
     }
 
-    pub fn
-
-
-
+    pub fn get_latest_version(&self) -> RbResult<i64> {
+        Ok(rt()
+            .block_on(self._table.borrow().get_latest_version())
+            .map_err(RubyError::from)?)
+    }
 
+    pub fn get_earliest_version(&self) -> RbResult<i64> {
+        Ok(rt()
+            .block_on(self._table.borrow().get_earliest_version())
+            .map_err(RubyError::from)?)
+    }
+
+    pub fn get_num_index_cols(&self) -> RbResult<i32> {
         Ok(self
             ._table
             .borrow()
-            .
+            .snapshot()
             .map_err(RubyError::from)?
-            .
-            .
+            .config()
+            .num_indexed_cols())
     }
 
-    pub fn
-        if !self._table.borrow().config.require_files {
-            return Err(DeltaError::new_err("Table is initiated without files."));
-        }
-
+    pub fn get_stats_columns(&self) -> RbResult<Option<Vec<String>>> {
         Ok(self
             ._table
             .borrow()
-            .
+            .snapshot()
             .map_err(RubyError::from)?
-            .
+            .config()
+            .stats_columns()
+            .map(|v| v.iter().map(|v| v.to_string()).collect::<Vec<String>>()))
+    }
+
+    pub fn load_with_datetime(&self, ds: String) -> RbResult<()> {
+        let datetime = DateTime::<Utc>::from(
+            DateTime::<FixedOffset>::parse_from_rfc3339(&ds).map_err(|err| {
+                Error::new(
+                    exception::arg_error(),
+                    format!("Failed to parse datetime string: {err}"),
+                )
+            })?,
+        );
+        Ok(rt()
+            .block_on(self._table.borrow_mut().load_with_datetime(datetime))
+            .map_err(RubyError::from)?)
+    }
+
+    pub fn files(&self, partition_filters: Option<Value>) -> RbResult<Vec<String>> {
+        if !self.has_files()? {
+            return Err(DeltaError::new_err("Table is instantiated without files."));
+        }
+
+        if let Some(filters) = partition_filters {
+            let filters = convert_partition_filters(filters).map_err(RubyError::from)?;
+            Ok(self
+                ._table
+                .borrow()
+                .get_files_by_partitions(&filters)
+                .map_err(RubyError::from)?
+                .into_iter()
+                .map(|p| p.to_string())
+                .collect())
+        } else {
+            Ok(self
+                ._table
+                .borrow()
+                .get_files_iter()
+                .map_err(RubyError::from)?
+                .map(|f| f.to_string())
+                .collect())
+        }
+    }
+
+    pub fn file_uris(&self, partition_filters: Option<Value>) -> RbResult<Vec<String>> {
+        if !self._table.borrow().config.require_files {
+            return Err(DeltaError::new_err("Table is initiated without files."));
+        }
+
+        if let Some(filters) = partition_filters {
+            let filters = convert_partition_filters(filters).map_err(RubyError::from)?;
+            Ok(self
+                ._table
+                .borrow()
+                .get_file_uris_by_partitions(&filters)
+                .map_err(RubyError::from)?)
+        } else {
+            Ok(self
+                ._table
+                .borrow()
+                .get_file_uris()
+                .map_err(RubyError::from)?
+                .collect())
+        }
     }
 
     pub fn schema(&self) -> RbResult<Value> {
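`protocol_versions` returns a 4-tuple that the Ruby layer wraps in the new `ProtocolVersions` struct (see `deltalake.rb` below), and `files`/`file_uris` gain an optional partition-filter argument; note that `convert_partition_filters` is still `todo!()` in this version, so only the no-filter path is usable. Expected usage, as a sketch:

```ruby
dt.protocol  # struct: min_reader_version, min_writer_version, writer_features, reader_features
dt.files     # active data file paths, relative to the table root
dt.file_uris # absolute URIs for the same files
```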
@@ -177,6 +310,214 @@ impl RawDeltaTable {
         Ok(metrics.files_deleted)
     }
 
+    pub fn compact_optimize(
+        &self,
+        target_size: Option<i64>,
+        max_concurrent_tasks: Option<usize>,
+        min_commit_interval: Option<u64>,
+    ) -> RbResult<String> {
+        let mut cmd = OptimizeBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_max_concurrent_tasks(max_concurrent_tasks.unwrap_or_else(num_cpus::get));
+        if let Some(size) = target_size {
+            cmd = cmd.with_target_size(size);
+        }
+        if let Some(commit_interval) = min_commit_interval {
+            cmd = cmd.with_min_commit_interval(time::Duration::from_secs(commit_interval));
+        }
+
+        let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(serde_json::to_string(&metrics).unwrap())
+    }
+
+    pub fn z_order_optimize(
+        &self,
+        z_order_columns: Vec<String>,
+        target_size: Option<i64>,
+        max_concurrent_tasks: Option<usize>,
+        max_spill_size: usize,
+        min_commit_interval: Option<u64>,
+    ) -> RbResult<String> {
+        let mut cmd = OptimizeBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_max_concurrent_tasks(max_concurrent_tasks.unwrap_or_else(num_cpus::get))
+        .with_max_spill_size(max_spill_size)
+        .with_type(OptimizeType::ZOrder(z_order_columns));
+        if let Some(size) = target_size {
+            cmd = cmd.with_target_size(size);
+        }
+        if let Some(commit_interval) = min_commit_interval {
+            cmd = cmd.with_min_commit_interval(time::Duration::from_secs(commit_interval));
+        }
+
+        let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(serde_json::to_string(&metrics).unwrap())
+    }
+
+    pub fn add_constraints(&self, constraints: HashMap<String, String>) -> RbResult<()> {
+        let mut cmd = ConstraintBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        );
+
+        for (col_name, expression) in constraints {
+            cmd = cmd.with_constraint(col_name.clone(), expression.clone());
+        }
+
+        let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(())
+    }
+
+    pub fn drop_constraints(&self, name: String, raise_if_not_exists: bool) -> RbResult<()> {
+        let cmd = DropConstraintBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_constraint(name)
+        .with_raise_if_not_exists(raise_if_not_exists);
+
+        let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(())
+    }
+
+    pub fn load_cdf(
+        &self,
+        starting_version: i64,
+        ending_version: Option<i64>,
+        starting_timestamp: Option<String>,
+        ending_timestamp: Option<String>,
+        columns: Option<Vec<String>>,
+    ) -> RbResult<ArrowArrayStream> {
+        let ctx = SessionContext::new();
+        let mut cdf_read = CdfLoadBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_starting_version(starting_version);
+
+        if let Some(ev) = ending_version {
+            cdf_read = cdf_read.with_ending_version(ev);
+        }
+        if let Some(st) = starting_timestamp {
+            let starting_ts: DateTime<Utc> = DateTime::<Utc>::from_str(&st)
+                .map_err(|pe| Error::new(exception::arg_error(), pe.to_string()))?
+                .to_utc();
+            cdf_read = cdf_read.with_starting_timestamp(starting_ts);
+        }
+        if let Some(et) = ending_timestamp {
+            let ending_ts = DateTime::<Utc>::from_str(&et)
+                .map_err(|pe| Error::new(exception::arg_error(), pe.to_string()))?
+                .to_utc();
+            cdf_read = cdf_read.with_starting_timestamp(ending_ts);
+        }
+
+        if let Some(columns) = columns {
+            cdf_read = cdf_read.with_columns(columns);
+        }
+
+        cdf_read = cdf_read.with_session_ctx(ctx.clone());
+
+        let plan = rt().block_on(cdf_read.build()).map_err(RubyError::from)?;
+
+        let mut tasks = vec![];
+        for p in 0..plan.properties().output_partitioning().partition_count() {
+            let inner_plan = plan.clone();
+            let partition_batch = inner_plan.execute(p, ctx.task_ctx()).unwrap();
+            let handle = rt().spawn(collect_sendable_stream(partition_batch));
+            tasks.push(handle);
+        }
+
+        // This is unfortunate.
+        let batches = rt()
+            .block_on(join_all(tasks))
+            .into_iter()
+            .flatten()
+            .collect::<Result<Vec<Vec<_>>, _>>()
+            .unwrap()
+            .into_iter()
+            .flatten()
+            .map(Ok);
+        let batch_iter = RecordBatchIterator::new(batches, plan.schema());
+        let ffi_stream = FFI_ArrowArrayStream::new(Box::new(batch_iter));
+        Ok(ArrowArrayStream { stream: ffi_stream })
+    }
+
+    pub fn restore(
+        &self,
+        target: Option<Value>,
+        ignore_missing_files: bool,
+        protocol_downgrade_allowed: bool,
+    ) -> RbResult<String> {
+        let mut cmd = RestoreBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        );
+        if let Some(val) = target {
+            if let Some(version) = Integer::from_value(val) {
+                cmd = cmd.with_version_to_restore(version.to_i64()?)
+            }
+            if let Ok(ds) = String::try_convert(val) {
+                let datetime = DateTime::<Utc>::from(
+                    DateTime::<FixedOffset>::parse_from_rfc3339(ds.as_ref()).map_err(|err| {
+                        Error::new(
+                            exception::arg_error(),
+                            format!("Failed to parse datetime string: {err}"),
+                        )
+                    })?,
+                );
+                cmd = cmd.with_datetime_to_restore(datetime)
+            }
+        }
+        cmd = cmd.with_ignore_missing_files(ignore_missing_files);
+        cmd = cmd.with_protocol_downgrade_allowed(protocol_downgrade_allowed);
+
+        let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(serde_json::to_string(&metrics).unwrap())
+    }
+
+    pub fn history(&self, limit: Option<usize>) -> RbResult<Vec<String>> {
+        let history = rt()
+            .block_on(self._table.borrow().history(limit))
+            .map_err(RubyError::from)?;
+        Ok(history
+            .iter()
+            .map(|c| serde_json::to_string(c).unwrap())
+            .collect())
+    }
+
     pub fn update_incremental(&self) -> RbResult<()> {
         #[allow(deprecated)]
         Ok(rt()
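Each of the operations above returns its delta-rs metrics serialized as a JSON string, which the Ruby wrappers parse back into a hash; for optimize, the nested `filesAdded`/`filesRemoved` values are themselves re-parsed (see `table_optimizer.rb` below). A sketch:

```ruby
metrics = dt.optimize.compact(target_size: 128 * 1024 * 1024)
metrics["filesAdded"] # per-file-size stats, re-parsed from nested JSON by the wrapper
```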
@@ -184,6 +525,56 @@ impl RawDeltaTable {
             .map_err(RubyError::from)?)
     }
 
+    fn get_active_partitions(&self) -> RbResult<RArray> {
+        let binding = self._table.borrow();
+        let _column_names: HashSet<&str> = binding
+            .get_schema()
+            .map_err(|_| DeltaProtocolError::new_err("table does not yet have a schema"))?
+            .fields()
+            .map(|field| field.name().as_str())
+            .collect();
+        let partition_columns: HashSet<&str> = binding
+            .metadata()
+            .map_err(RubyError::from)?
+            .partition_columns
+            .iter()
+            .map(|col| col.as_str())
+            .collect();
+
+        let converted_filters = Vec::new();
+
+        let partition_columns: Vec<&str> = partition_columns.into_iter().collect();
+
+        let adds = binding
+            .snapshot()
+            .map_err(RubyError::from)?
+            .get_active_add_actions_by_partitions(&converted_filters)
+            .map_err(RubyError::from)?
+            .collect::<Result<Vec<_>, _>>()
+            .map_err(RubyError::from)?;
+        let active_partitions: HashSet<Vec<(&str, Option<String>)>> = adds
+            .iter()
+            .flat_map(|add| {
+                Ok::<_, RubyError>(
+                    partition_columns
+                        .iter()
+                        .flat_map(|col| {
+                            Ok::<_, RubyError>((
+                                *col,
+                                add.partition_values()
+                                    .map_err(RubyError::from)?
+                                    .get(*col)
+                                    .map(|v| v.serialize()),
+                            ))
+                        })
+                        .collect(),
+                )
+            })
+            .collect();
+
+        Ok(RArray::from_iter(active_partitions))
+    }
+
     pub fn delete(&self, predicate: Option<String>) -> RbResult<String> {
         let mut cmd = DeleteBuilder::new(
             self._table.borrow().log_store(),
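`get_active_partitions` backs the new `Table#partitions` method (added in `table.rb` below), which turns each column/value pair list into a hash. For a table partitioned by a column `b`, one would expect output along these lines (values illustrative):

```ruby
dt.partitions
# => [{"b" => "1"}, {"b" => "2"}]
```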
@@ -201,6 +592,38 @@ impl RawDeltaTable {
         self._table.borrow_mut().state = table.state;
         Ok(serde_json::to_string(&metrics).unwrap())
     }
+
+    pub fn repair(&self, dry_run: bool) -> RbResult<String> {
+        let cmd = FileSystemCheckBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_dry_run(dry_run);
+
+        let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(serde_json::to_string(&metrics).unwrap())
+    }
+
+    pub fn transaction_versions(&self) -> RHash {
+        RHash::from_iter(
+            self._table
+                .borrow()
+                .get_app_transaction_version()
+                .into_iter()
+                .map(|(app_id, transaction)| (app_id, RbTransaction::from(transaction))),
+        )
+    }
+}
+
+fn convert_partition_filters(
+    _partitions_filters: Value,
+) -> Result<Vec<PartitionFilter>, DeltaTableError> {
+    todo!()
 }
 
 impl RawDeltaTableMetaData {
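`repair` wraps the delta-rs filesystem check (FSCK), which drops table entries whose backing data files have gone missing; with `dry_run` it only reports. Via the Ruby wrapper added below:

```ruby
metrics = dt.repair(dry_run: true) # symbol-keyed hash parsed from the FSCK metrics JSON
```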
@@ -229,6 +652,23 @@ impl RawDeltaTableMetaData {
     }
 }
 
+#[magnus::wrap(class = "DeltaLake::Transaction")]
+pub struct RbTransaction {
+    pub app_id: String,
+    pub version: i64,
+    pub last_updated: Option<i64>,
+}
+
+impl From<Transaction> for RbTransaction {
+    fn from(value: Transaction) -> Self {
+        RbTransaction {
+            app_id: value.app_id,
+            version: value.version,
+            last_updated: value.last_updated,
+        }
+    }
+}
+
 #[allow(clippy::too_many_arguments)]
 fn write_to_deltalake(
     table_uri: String,
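`RbTransaction` backs the new `transaction_versions` method, mapping each writer application id to its latest transaction action. No accessor methods are defined for the wrapped class in this version, so the objects are largely opaque from Ruby:

```ruby
dt.transaction_versions # => Hash of app_id => DeltaLake::Transaction
```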
@@ -313,16 +753,68 @@ fn init(ruby: &Ruby) -> RbResult<()> {
     class.define_method("version", method!(RawDeltaTable::version, 0))?;
     class.define_method("has_files", method!(RawDeltaTable::has_files, 0))?;
     class.define_method("metadata", method!(RawDeltaTable::metadata, 0))?;
+    class.define_method(
+        "protocol_versions",
+        method!(RawDeltaTable::protocol_versions, 0),
+    )?;
     class.define_method("load_version", method!(RawDeltaTable::load_version, 1))?;
-    class.define_method(
-
+    class.define_method(
+        "get_latest_version",
+        method!(RawDeltaTable::get_latest_version, 0),
+    )?;
+    class.define_method(
+        "get_earliest_version",
+        method!(RawDeltaTable::get_earliest_version, 0),
+    )?;
+    class.define_method(
+        "get_num_index_cols",
+        method!(RawDeltaTable::get_num_index_cols, 0),
+    )?;
+    class.define_method(
+        "get_stats_columns",
+        method!(RawDeltaTable::get_stats_columns, 0),
+    )?;
+    class.define_method(
+        "load_with_datetime",
+        method!(RawDeltaTable::load_with_datetime, 1),
+    )?;
+    class.define_method("files", method!(RawDeltaTable::files, 1))?;
+    class.define_method("file_uris", method!(RawDeltaTable::file_uris, 1))?;
     class.define_method("schema", method!(RawDeltaTable::schema, 0))?;
     class.define_method("vacuum", method!(RawDeltaTable::vacuum, 3))?;
+    class.define_method(
+        "compact_optimize",
+        method!(RawDeltaTable::compact_optimize, 3),
+    )?;
+    class.define_method(
+        "z_order_optimize",
+        method!(RawDeltaTable::z_order_optimize, 5),
+    )?;
+    class.define_method(
+        "add_constraints",
+        method!(RawDeltaTable::add_constraints, 1),
+    )?;
+    class.define_method(
+        "drop_constraints",
+        method!(RawDeltaTable::drop_constraints, 2),
+    )?;
+    class.define_method("load_cdf", method!(RawDeltaTable::load_cdf, 5))?;
+    class.define_method("restore", method!(RawDeltaTable::restore, 3))?;
+    class.define_method("history", method!(RawDeltaTable::history, 1))?;
     class.define_method(
         "update_incremental",
         method!(RawDeltaTable::update_incremental, 0),
     )?;
+    class.define_method(
+        "get_active_partitions",
+        method!(RawDeltaTable::get_active_partitions, 0),
+    )?;
     class.define_method("delete", method!(RawDeltaTable::delete, 1))?;
+    class.define_method("repair", method!(RawDeltaTable::repair, 1))?;
+    class.define_method(
+        "transaction_versions",
+        method!(RawDeltaTable::transaction_versions, 0),
+    )?;
 
     let class = module.define_class("RawDeltaTableMetaData", ruby.class_object())?;
     class.define_method("id", method!(RawDeltaTableMetaData::id, 0))?;
@@ -344,6 +836,9 @@ fn init(ruby: &Ruby) -> RbResult<()> {
         method!(RawDeltaTableMetaData::configuration, 0),
     )?;
 
+    let class = module.define_class("ArrowArrayStream", ruby.class_object())?;
+    class.define_method("to_i", method!(ArrowArrayStream::to_i, 0))?;
+
     let class = module.define_class("Field", ruby.class_object())?;
     class.define_method("name", method!(Field::name, 0))?;
     class.define_method("type", method!(Field::get_type, 0))?;
data/lib/deltalake/table.rb
CHANGED
@@ -26,22 +26,54 @@ module DeltaLake
       @table.version
     end
 
-    def
-
+    def partitions
+      partitions = []
+      @table.get_active_partitions.each do |partition|
+        next unless partition
+        partitions << partition.to_h
+      end
+      partitions
+    end
+
+    def files(partition_filters: nil)
+      @table.files(_stringify_partition_values(partition_filters))
     end
 
-    def file_uris
-      @table.file_uris
+    def file_uris(partition_filters: nil)
+      @table.file_uris(_stringify_partition_values(partition_filters))
     end
 
     def load_as_version(version)
       if version.is_a?(Integer)
         @table.load_version(version)
+      elsif version.is_a?(Time)
+        # needed for iso8601
+        require "time"
+
+        @table.load_with_datetime(version.utc.iso8601(9))
+      elsif version.is_a?(String)
+        @table.load_with_datetime(version)
       else
-        raise TypeError, "Invalid datatype provided for version, only Integer
+        raise TypeError, "Invalid datatype provided for version, only Integer, String, and Time are accepted."
       end
     end
 
+    def load_cdf(
+      starting_version: 0,
+      ending_version: nil,
+      starting_timestamp: nil,
+      ending_timestamp: nil,
+      columns: nil
+    )
+      @table.load_cdf(
+        starting_version,
+        ending_version,
+        starting_timestamp,
+        ending_timestamp,
+        columns
+      )
+    end
+
     def table_uri
       @table.table_uri
     end
@@ -54,6 +86,29 @@ module DeltaLake
       Metadata.new(@table)
     end
 
+    def protocol
+      ProtocolVersions.new(*@table.protocol_versions)
+    end
+
+    def history(limit: nil)
+      backwards_enumerate = lambda do |iterable, start_end, &block|
+        n = start_end
+        iterable.each do |elem|
+          block.call(n, elem)
+          n -= 1
+        end
+      end
+
+      commits = @table.history(limit)
+      history = []
+      backwards_enumerate.(commits, @table.get_latest_version) do |version, commit_info_raw|
+        commit = JSON.parse(commit_info_raw)
+        commit["version"] = version
+        history << commit
+      end
+      history
+    end
+
     def vacuum(
       retention_hours: nil,
       dry_run: true,
@@ -72,6 +127,40 @@ module DeltaLake
       )
     end
 
+    def optimize
+      TableOptimizer.new(self)
+    end
+
+    def alter
+      TableAlterer.new(self)
+    end
+
+    def restore(
+      target,
+      ignore_missing_files: false,
+      protocol_downgrade_allowed: false
+    )
+      if target.is_a?(Time)
+        # needed for iso8601
+        require "time"
+
+        metrics =
+          @table.restore(
+            target.utc.iso8601(9),
+            ignore_missing_files,
+            protocol_downgrade_allowed
+          )
+      else
+        metrics =
+          @table.restore(
+            target,
+            ignore_missing_files,
+            protocol_downgrade_allowed
+          )
+      end
+      JSON.parse(metrics)
+    end
+
     def to_polars(eager: true)
       require "polars-df"
 
@@ -80,7 +169,13 @@ module DeltaLake
       if sources.empty?
         Polars::LazyFrame.new
       else
-
+        delta_keys = [
+          "AWS_S3_ALLOW_UNSAFE_RENAME",
+          "AWS_S3_LOCKING_PROVIDER",
+          "CONDITIONAL_PUT",
+          "DELTA_DYNAMO_TABLE_NAME"
+        ]
+        storage_options = @storage_options&.reject { |k, _| delta_keys.include?(k.to_s.upcase) }
         Polars.scan_parquet(sources, storage_options: storage_options)
       end
       eager ? lf.collect : lf
@@ -95,9 +190,30 @@ module DeltaLake
       JSON.parse(metrics).transform_keys(&:to_sym)
     end
 
+    def repair(dry_run: false)
+      metrics =
+        @table.repair(
+          dry_run
+        )
+      JSON.parse(metrics).transform_keys(&:to_sym)
+    end
+
+    def transaction_versions
+      @table.transaction_versions
+    end
+
     # private
     def _table
       @table
     end
+
+    # private
+    def _stringify_partition_values(partition_filters)
+      if partition_filters.nil?
+        return partition_filters
+      end
+
+      raise Todo
+    end
   end
 end
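Like `load_as_version`, the new `restore` accepts either a version number or a point in time, and returns the parsed restore metrics:

```ruby
dt.restore(1)                # roll back to version 1
dt.restore(Time.now - 86400) # or to the table state as of a timestamp
```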
data/lib/deltalake/table_alterer.rb
ADDED
@@ -0,0 +1,25 @@
+module DeltaLake
+  class TableAlterer
+    def initialize(table)
+      @table = table
+    end
+
+    def add_constraint(constraints)
+      if constraints.length > 1
+        raise ArgumentError,
+              "add_constraints is limited to a single constraint addition at once for now."
+      end
+
+      @table._table.add_constraints(
+        constraints
+      )
+    end
+
+    def drop_constraint(name, raise_if_not_exists: true)
+      @table._table.drop_constraints(
+        name,
+        raise_if_not_exists
+      )
+    end
+  end
+end
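Usage of the alterer, including the tolerant drop path exposed by `raise_if_not_exists:` (a sketch):

```ruby
dt.alter.add_constraint({"a_gt_0" => "a > 0"})
dt.alter.drop_constraint("a_gt_0")
dt.alter.drop_constraint("missing", raise_if_not_exists: false) # no error if absent
```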
data/lib/deltalake/table_optimizer.rb
ADDED
@@ -0,0 +1,51 @@
+module DeltaLake
+  class TableOptimizer
+    def initialize(table)
+      @table = table
+    end
+
+    def compact(
+      target_size: nil,
+      max_concurrent_tasks: nil,
+      min_commit_interval: nil
+    )
+      metrics =
+        @table._table.compact_optimize(
+          target_size,
+          max_concurrent_tasks,
+          min_commit_interval
+        )
+      @table.update_incremental
+      result = JSON.parse(metrics)
+      ["filesAdded", "filesRemoved"].each do |key|
+        result[key] = JSON.parse(result[key]) if result[key].is_a?(String)
+      end
+      # TODO return underscore symbols like delete
+      result
+    end
+
+    def z_order(
+      columns,
+      target_size: nil,
+      max_concurrent_tasks: nil,
+      max_spill_size: 20 * 1024 * 1024 * 1024,
+      min_commit_interval: nil
+    )
+      metrics =
+        @table._table.z_order_optimize(
+          Array(columns),
+          target_size,
+          max_concurrent_tasks,
+          max_spill_size,
+          min_commit_interval
+        )
+      @table.update_incremental
+      result = JSON.parse(metrics)
+      ["filesAdded", "filesRemoved"].each do |key|
+        result[key] = JSON.parse(result[key]) if result[key].is_a?(String)
+      end
+      # TODO return underscore symbols like delete
+      result
+    end
+  end
+end
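`min_commit_interval` is forwarded to Rust as seconds (`Duration::from_secs`), and `max_spill_size` defaults to 20 GiB for Z Ordering. A sketch with the tunables spelled out:

```ruby
dt.optimize.compact(target_size: 256 * 1024 * 1024, max_concurrent_tasks: 4)
dt.optimize.z_order(["a", "b"], min_commit_interval: 300) # incremental commits, in seconds
```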
data/lib/deltalake/version.rb
CHANGED
data/lib/deltalake.rb
CHANGED
@@ -13,6 +13,8 @@ require_relative "deltalake/field"
 require_relative "deltalake/metadata"
 require_relative "deltalake/schema"
 require_relative "deltalake/table"
+require_relative "deltalake/table_alterer"
+require_relative "deltalake/table_optimizer"
 require_relative "deltalake/version"
 
 module DeltaLake
@@ -22,8 +24,21 @@ module DeltaLake
   class CommitFailedError < Error; end
   class SchemaMismatchError < Error; end
 
-  class
+  class Todo < Error
+    def message
+      "not implemented yet"
+    end
+  end
+
+  ProtocolVersions =
+    Struct.new(
+      :min_reader_version,
+      :min_writer_version,
+      :writer_features,
+      :reader_features
+    )
 
+  class << self
     def write(
       table_or_uri,
       data,
@@ -95,10 +110,58 @@ module DeltaLake
 
     def convert_data(data)
       if data.respond_to?(:arrow_c_stream)
+        # TODO convert other object types
+        # should probably move logic to Rust
+        if defined?(Polars::DataFrame) && data.is_a?(Polars::DataFrame)
+          data = convert_polars_data(data)
+        end
+
         data.arrow_c_stream
       else
         raise TypeError, "Only objects implementing the Arrow C stream interface are valid inputs for source."
       end
     end
+
+    # unsigned integers are not part of the protocol
+    # https://github.com/delta-io/delta/blob/master/PROTOCOL.md#primitive-types
+    def convert_polars_data(data)
+      new_schema = {}
+      data.schema.each do |k, v|
+        new_type = convert_polars_type(v)
+        new_schema[k] = new_type if new_type
+      end
+
+      if new_schema.any?
+        data.cast(new_schema)
+      else
+        data
+      end
+    end
+
+    def convert_polars_type(t)
+      case t
+      when Polars::UInt8
+        Polars::Int8
+      when Polars::UInt16
+        Polars::Int16
+      when Polars::UInt32
+        Polars::Int32
+      when Polars::UInt64
+        Polars::Int64
+      when Polars::Datetime
+        Polars::Datetime.new("us", t.time_zone) if t.time_unit != "us"
+      when Polars::List
+        inner = convert_polars_type(t.inner)
+        Polars::List.new(inner) if inner
+      when Polars::Array
+        inner = convert_polars_type(t.inner)
+        Polars::Array.new(t.inner, t.width) if inner
+      when Polars::Struct
+        if t.fields.any? { |f| convert_polars_type(f.dtype) }
+          fields = t.fields.map { |f| Polars::Field.new(f.name, convert_polars_type(f.dtype) || f.dtype) }
+          Polars::Struct.new(fields)
+        end
+      end
+    end
   end
 end
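The unsigned-integer fix from the changelog lives in `convert_polars_type`: unsigned Polars columns are cast to the signed type of the same width before writing (the Delta protocol defines no unsigned primitives), and non-microsecond datetimes are cast to microsecond precision, which covers the timestamp fix. A sketch of the observable behavior, assuming `polars-df` is installed:

```ruby
require "polars-df"
require "deltalake"

df = Polars::DataFrame.new({"a" => [1, 2, 3]}, schema: {"a" => Polars::UInt16})
DeltaLake.write("./data/unsigned", df) # column "a" is written as Int16
```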
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: deltalake-rb
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-11-
+date: 2024-11-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -48,6 +48,8 @@ files:
 - lib/deltalake/metadata.rb
 - lib/deltalake/schema.rb
 - lib/deltalake/table.rb
+- lib/deltalake/table_alterer.rb
+- lib/deltalake/table_optimizer.rb
 - lib/deltalake/version.rb
 homepage: https://github.com/ankane/delta-ruby
 licenses: