deltalake-rb 0.1.0 → 0.1.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/Cargo.lock +3 -1
- data/README.md +33 -3
- data/ext/deltalake/Cargo.toml +3 -1
- data/ext/deltalake/src/lib.rs +515 -20
- data/lib/deltalake/table.rb +122 -6
- data/lib/deltalake/table_alterer.rb +25 -0
- data/lib/deltalake/table_optimizer.rb +51 -0
- data/lib/deltalake/version.rb +1 -1
- data/lib/deltalake.rb +64 -1
- metadata +4 -2
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1ad1a2f352a83da63ccbde0126430b052d44801be630f3ad5e8326832205dc52
+  data.tar.gz: 1ce59b16589b891390d4ab1e81284d57389b6e8ff950faf11d1b3cf4736ce235
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 28c7d8f93e8dc78d9e81490e62d27be6d431efea6e1242c14ecb931b82b766926cb65a35b9440d372d27513e95c1f5c90e042fa42fb97a6407f1c3d30abb15ca
+  data.tar.gz: 547aa9019ac83f8ae9955f6f2cafe062a14223bdf18a29471b98e68d86d9655c69d47e84f520f9e1b5331e10f00e807141ed18d3786ae9995b120230c7217855
data/CHANGELOG.md
CHANGED

@@ -1,3 +1,13 @@
+## 0.1.1 (2024-11-22)
+
+- Added support for constraints
+- Added support for small file compaction
+- Added support for Z Ordering
+- Added `history`, `partitions`, `protocol`, `repair`, and `restore` methods to `Table`
+- Added experimental `load_cdf` method to `Table`
+- Fixed handling of unsigned integers
+- Fixed error with timestamps
+
 ## 0.1.0 (2024-11-20)
 
 - First release
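Taken together, the new surface is small enough to tour in a few lines. A minimal sketch against an existing table, using only methods named in this release (the `./data/delta` path is illustrative):

```ruby
require "deltalake"

dt = DeltaLake::Table.new("./data/delta")

dt.optimize.compact                             # small file compaction
dt.optimize.z_order(["a"])                      # Z Ordering
dt.alter.add_constraint({"a_gt_0" => "a > 0"})  # constraints

dt.history(limit: 5)      # commit info as an array of hashes
dt.partitions             # active partition values
dt.protocol               # min reader/writer versions and features
dt.repair(dry_run: true)  # filesystem check, report only
dt.restore(0)             # restore by version; a Time also works
```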
data/Cargo.lock
CHANGED

@@ -1488,13 +1488,15 @@ dependencies = [
 
 [[package]]
 name = "deltalake"
-version = "0.1.0"
+version = "0.1.1"
 dependencies = [
  "arrow",
  "arrow-schema",
  "chrono",
  "deltalake 0.21.0",
+ "futures",
  "magnus",
+ "num_cpus",
  "serde",
  "serde_json",
  "tokio",
data/README.md
CHANGED

@@ -14,7 +14,7 @@ Add this line to your application’s Gemfile:
 gem "deltalake-rb"
 ```
 
-It can take
+It can take 5-10 minutes to compile the gem.
 
 ## Getting Started
 
@@ -50,6 +50,18 @@ Overwrite a table
 DeltaLake.write("./data/delta", df, mode: "overwrite")
 ```
 
+Add a constraint
+
+```ruby
+dt.alter.add_constraint({"a_gt_0" => "a > 0"})
+```
+
+Drop a constraint
+
+```ruby
+dt.alter.drop_constraint("a_gt_0")
+```
+
 Delete rows
 
 ```ruby
@@ -62,6 +74,18 @@ Vacuum
 dt.vacuum(dry_run: false)
 ```
 
+Perform small file compaction
+
+```ruby
+dt.optimize.compact
+```
+
+Colocate similar data in the same files
+
+```ruby
+dt.optimize.z_order(["a"])
+```
+
 Load a previous version of a table
 
 ```ruby
@@ -70,16 +94,22 @@ dt = DeltaLake::Table.new("./data/delta", version: 1)
 dt.load_as_version(1)
 ```
 
+Get the schema
+
+```ruby
+dt.schema
+```
+
 Get metadata
 
 ```ruby
 dt.metadata
 ```
 
-Get
+Get history
 
 ```ruby
-dt.
+dt.history
 ```
 
 ## API
data/ext/deltalake/Cargo.toml
CHANGED

@@ -1,6 +1,6 @@
 [package]
 name = "deltalake"
-version = "0.1.0"
+version = "0.1.1"
 license = "Apache-2.0"
 authors = ["Andrew Kane <andrew@ankane.org>"]
 edition = "2021"

@@ -15,7 +15,9 @@ arrow = { version = "52", features = ["ffi"] }
 arrow-schema = { version = "52", features = ["serde"] }
 chrono = "0.4"
 deltalake = { version = "=0.21.0", features = ["datafusion", "s3"] }
+futures = "0.3"
 magnus = "0.7"
+num_cpus = "1"
 serde = "1"
 serde_json = "1"
 tokio = { version = "1", features = ["rt-multi-thread"] }
data/ext/deltalake/src/lib.rs
CHANGED

@@ -3,20 +3,39 @@ mod schema;
 mod utils;
 
 use std::cell::RefCell;
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::future::IntoFuture;
+use std::str::FromStr;
+use std::time;
 
-use chrono::Duration;
+use chrono::{DateTime, Duration, FixedOffset, Utc};
 use deltalake::arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream};
-use deltalake::
+use deltalake::arrow::record_batch::RecordBatchIterator;
+use deltalake::datafusion::physical_plan::ExecutionPlan;
+use deltalake::datafusion::prelude::SessionContext;
+use deltalake::errors::DeltaTableError;
+use deltalake::kernel::{scalars::ScalarExt, StructType, Transaction};
+use deltalake::operations::collect_sendable_stream;
+use deltalake::operations::constraints::ConstraintBuilder;
 use deltalake::operations::delete::DeleteBuilder;
+use deltalake::operations::drop_constraints::DropConstraintBuilder;
+use deltalake::operations::filesystem_check::FileSystemCheckBuilder;
+use deltalake::operations::load_cdf::CdfLoadBuilder;
+use deltalake::operations::optimize::{OptimizeBuilder, OptimizeType};
+use deltalake::operations::restore::RestoreBuilder;
+use deltalake::operations::transaction::TableReference;
 use deltalake::operations::vacuum::VacuumBuilder;
+use deltalake::partitions::PartitionFilter;
 use deltalake::storage::IORuntime;
 use deltalake::DeltaOps;
 use error::DeltaError;
+use futures::future::join_all;
 
-use magnus::{
+use magnus::{
+    exception, function, method, prelude::*, Error, Integer, Module, RArray, RHash, Ruby, Value,
+};
 
+use crate::error::DeltaProtocolError;
 use crate::error::RubyError;
 use crate::schema::{schema_to_rbobject, Field};
 use crate::utils::rt;
@@ -38,6 +57,19 @@ struct RawDeltaTableMetaData {
     configuration: HashMap<String, Option<String>>,
 }
 
+#[magnus::wrap(class = "DeltaLake::ArrowArrayStream")]
+pub struct ArrowArrayStream {
+    stream: FFI_ArrowArrayStream,
+}
+
+impl ArrowArrayStream {
+    pub fn to_i(&self) -> usize {
+        (&self.stream as *const _) as usize
+    }
+}
+
+type StringVec = Vec<String>;
+
 impl RawDeltaTable {
     pub fn new(
         table_uri: String,
@@ -113,37 +145,138 @@ impl RawDeltaTable {
         })
     }
 
+    pub fn protocol_versions(&self) -> RbResult<(i32, i32, Option<StringVec>, Option<StringVec>)> {
+        let binding = self._table.borrow();
+        let table_protocol = binding.protocol().map_err(RubyError::from)?;
+        Ok((
+            table_protocol.min_reader_version,
+            table_protocol.min_writer_version,
+            table_protocol
+                .writer_features
+                .as_ref()
+                .and_then(|features| {
+                    let empty_set = !features.is_empty();
+                    empty_set.then(|| {
+                        features
+                            .iter()
+                            .map(|v| v.to_string())
+                            .collect::<Vec<String>>()
+                    })
+                }),
+            table_protocol
+                .reader_features
+                .as_ref()
+                .and_then(|features| {
+                    let empty_set = !features.is_empty();
+                    empty_set.then(|| {
+                        features
+                            .iter()
+                            .map(|v| v.to_string())
+                            .collect::<Vec<String>>()
+                    })
+                }),
+        ))
+    }
+
     pub fn load_version(&self, version: i64) -> RbResult<()> {
         Ok(rt()
             .block_on(self._table.borrow_mut().load_version(version))
             .map_err(RubyError::from)?)
     }
 
-    pub fn
-
-
-
+    pub fn get_latest_version(&self) -> RbResult<i64> {
+        Ok(rt()
+            .block_on(self._table.borrow().get_latest_version())
+            .map_err(RubyError::from)?)
+    }
 
+    pub fn get_earliest_version(&self) -> RbResult<i64> {
+        Ok(rt()
+            .block_on(self._table.borrow().get_earliest_version())
+            .map_err(RubyError::from)?)
+    }
+
+    pub fn get_num_index_cols(&self) -> RbResult<i32> {
         Ok(self
             ._table
             .borrow()
-            .
+            .snapshot()
             .map_err(RubyError::from)?
-            .
-            .
+            .config()
+            .num_indexed_cols())
     }
 
-    pub fn
-        if !self._table.borrow().config.require_files {
-            return Err(DeltaError::new_err("Table is initiated without files."));
-        }
-
+    pub fn get_stats_columns(&self) -> RbResult<Option<Vec<String>>> {
         Ok(self
             ._table
             .borrow()
-            .
+            .snapshot()
             .map_err(RubyError::from)?
-            .
+            .config()
+            .stats_columns()
+            .map(|v| v.iter().map(|v| v.to_string()).collect::<Vec<String>>()))
+    }
+
+    pub fn load_with_datetime(&self, ds: String) -> RbResult<()> {
+        let datetime = DateTime::<Utc>::from(
+            DateTime::<FixedOffset>::parse_from_rfc3339(&ds).map_err(|err| {
+                Error::new(
+                    exception::arg_error(),
+                    format!("Failed to parse datetime string: {err}"),
+                )
+            })?,
+        );
+        Ok(rt()
+            .block_on(self._table.borrow_mut().load_with_datetime(datetime))
+            .map_err(RubyError::from)?)
+    }
+
+    pub fn files(&self, partition_filters: Option<Value>) -> RbResult<Vec<String>> {
+        if !self.has_files()? {
+            return Err(DeltaError::new_err("Table is instantiated without files."));
+        }
+
+        if let Some(filters) = partition_filters {
+            let filters = convert_partition_filters(filters).map_err(RubyError::from)?;
+            Ok(self
+                ._table
+                .borrow()
+                .get_files_by_partitions(&filters)
+                .map_err(RubyError::from)?
+                .into_iter()
+                .map(|p| p.to_string())
+                .collect())
+        } else {
+            Ok(self
+                ._table
+                .borrow()
+                .get_files_iter()
+                .map_err(RubyError::from)?
+                .map(|f| f.to_string())
+                .collect())
+        }
+    }
+
+    pub fn file_uris(&self, partition_filters: Option<Value>) -> RbResult<Vec<String>> {
+        if !self._table.borrow().config.require_files {
+            return Err(DeltaError::new_err("Table is initiated without files."));
+        }
+
+        if let Some(filters) = partition_filters {
+            let filters = convert_partition_filters(filters).map_err(RubyError::from)?;
+            Ok(self
+                ._table
+                .borrow()
+                .get_file_uris_by_partitions(&filters)
+                .map_err(RubyError::from)?)
+        } else {
+            Ok(self
+                ._table
+                .borrow()
+                .get_file_uris()
+                .map_err(RubyError::from)?
+                .collect())
+        }
     }
 
     pub fn schema(&self) -> RbResult<Value> {
@@ -177,6 +310,214 @@ impl RawDeltaTable {
         Ok(metrics.files_deleted)
     }
 
+    pub fn compact_optimize(
+        &self,
+        target_size: Option<i64>,
+        max_concurrent_tasks: Option<usize>,
+        min_commit_interval: Option<u64>,
+    ) -> RbResult<String> {
+        let mut cmd = OptimizeBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_max_concurrent_tasks(max_concurrent_tasks.unwrap_or_else(num_cpus::get));
+        if let Some(size) = target_size {
+            cmd = cmd.with_target_size(size);
+        }
+        if let Some(commit_interval) = min_commit_interval {
+            cmd = cmd.with_min_commit_interval(time::Duration::from_secs(commit_interval));
+        }
+
+        let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(serde_json::to_string(&metrics).unwrap())
+    }
+
+    pub fn z_order_optimize(
+        &self,
+        z_order_columns: Vec<String>,
+        target_size: Option<i64>,
+        max_concurrent_tasks: Option<usize>,
+        max_spill_size: usize,
+        min_commit_interval: Option<u64>,
+    ) -> RbResult<String> {
+        let mut cmd = OptimizeBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_max_concurrent_tasks(max_concurrent_tasks.unwrap_or_else(num_cpus::get))
+        .with_max_spill_size(max_spill_size)
+        .with_type(OptimizeType::ZOrder(z_order_columns));
+        if let Some(size) = target_size {
+            cmd = cmd.with_target_size(size);
+        }
+        if let Some(commit_interval) = min_commit_interval {
+            cmd = cmd.with_min_commit_interval(time::Duration::from_secs(commit_interval));
+        }
+
+        let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(serde_json::to_string(&metrics).unwrap())
+    }
+
+    pub fn add_constraints(&self, constraints: HashMap<String, String>) -> RbResult<()> {
+        let mut cmd = ConstraintBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        );
+
+        for (col_name, expression) in constraints {
+            cmd = cmd.with_constraint(col_name.clone(), expression.clone());
+        }
+
+        let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(())
+    }
+
+    pub fn drop_constraints(&self, name: String, raise_if_not_exists: bool) -> RbResult<()> {
+        let cmd = DropConstraintBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_constraint(name)
+        .with_raise_if_not_exists(raise_if_not_exists);
+
+        let table = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(())
+    }
+
+    pub fn load_cdf(
+        &self,
+        starting_version: i64,
+        ending_version: Option<i64>,
+        starting_timestamp: Option<String>,
+        ending_timestamp: Option<String>,
+        columns: Option<Vec<String>>,
+    ) -> RbResult<ArrowArrayStream> {
+        let ctx = SessionContext::new();
+        let mut cdf_read = CdfLoadBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_starting_version(starting_version);
+
+        if let Some(ev) = ending_version {
+            cdf_read = cdf_read.with_ending_version(ev);
+        }
+        if let Some(st) = starting_timestamp {
+            let starting_ts: DateTime<Utc> = DateTime::<Utc>::from_str(&st)
+                .map_err(|pe| Error::new(exception::arg_error(), pe.to_string()))?
+                .to_utc();
+            cdf_read = cdf_read.with_starting_timestamp(starting_ts);
+        }
+        if let Some(et) = ending_timestamp {
+            let ending_ts = DateTime::<Utc>::from_str(&et)
+                .map_err(|pe| Error::new(exception::arg_error(), pe.to_string()))?
+                .to_utc();
+            cdf_read = cdf_read.with_starting_timestamp(ending_ts);
+        }
+
+        if let Some(columns) = columns {
+            cdf_read = cdf_read.with_columns(columns);
+        }
+
+        cdf_read = cdf_read.with_session_ctx(ctx.clone());
+
+        let plan = rt().block_on(cdf_read.build()).map_err(RubyError::from)?;
+
+        let mut tasks = vec![];
+        for p in 0..plan.properties().output_partitioning().partition_count() {
+            let inner_plan = plan.clone();
+            let partition_batch = inner_plan.execute(p, ctx.task_ctx()).unwrap();
+            let handle = rt().spawn(collect_sendable_stream(partition_batch));
+            tasks.push(handle);
+        }
+
+        // This is unfortunate.
+        let batches = rt()
+            .block_on(join_all(tasks))
+            .into_iter()
+            .flatten()
+            .collect::<Result<Vec<Vec<_>>, _>>()
+            .unwrap()
+            .into_iter()
+            .flatten()
+            .map(Ok);
+        let batch_iter = RecordBatchIterator::new(batches, plan.schema());
+        let ffi_stream = FFI_ArrowArrayStream::new(Box::new(batch_iter));
+        Ok(ArrowArrayStream { stream: ffi_stream })
+    }
+
+    pub fn restore(
+        &self,
+        target: Option<Value>,
+        ignore_missing_files: bool,
+        protocol_downgrade_allowed: bool,
+    ) -> RbResult<String> {
+        let mut cmd = RestoreBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        );
+        if let Some(val) = target {
+            if let Some(version) = Integer::from_value(val) {
+                cmd = cmd.with_version_to_restore(version.to_i64()?)
+            }
+            if let Ok(ds) = String::try_convert(val) {
+                let datetime = DateTime::<Utc>::from(
+                    DateTime::<FixedOffset>::parse_from_rfc3339(ds.as_ref()).map_err(|err| {
+                        Error::new(
+                            exception::arg_error(),
+                            format!("Failed to parse datetime string: {err}"),
+                        )
+                    })?,
+                );
+                cmd = cmd.with_datetime_to_restore(datetime)
+            }
+        }
+        cmd = cmd.with_ignore_missing_files(ignore_missing_files);
+        cmd = cmd.with_protocol_downgrade_allowed(protocol_downgrade_allowed);
+
+        let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(serde_json::to_string(&metrics).unwrap())
+    }
+
+    pub fn history(&self, limit: Option<usize>) -> RbResult<Vec<String>> {
+        let history = rt()
+            .block_on(self._table.borrow().history(limit))
+            .map_err(RubyError::from)?;
+        Ok(history
+            .iter()
+            .map(|c| serde_json::to_string(c).unwrap())
+            .collect())
+    }
+
     pub fn update_incremental(&self) -> RbResult<()> {
         #[allow(deprecated)]
         Ok(rt()
@@ -184,6 +525,56 @@ impl RawDeltaTable {
             .map_err(RubyError::from)?)
     }
 
+    fn get_active_partitions(&self) -> RbResult<RArray> {
+        let binding = self._table.borrow();
+        let _column_names: HashSet<&str> = binding
+            .get_schema()
+            .map_err(|_| DeltaProtocolError::new_err("table does not yet have a schema"))?
+            .fields()
+            .map(|field| field.name().as_str())
+            .collect();
+        let partition_columns: HashSet<&str> = binding
+            .metadata()
+            .map_err(RubyError::from)?
+            .partition_columns
+            .iter()
+            .map(|col| col.as_str())
+            .collect();
+
+        let converted_filters = Vec::new();
+
+        let partition_columns: Vec<&str> = partition_columns.into_iter().collect();
+
+        let adds = binding
+            .snapshot()
+            .map_err(RubyError::from)?
+            .get_active_add_actions_by_partitions(&converted_filters)
+            .map_err(RubyError::from)?
+            .collect::<Result<Vec<_>, _>>()
+            .map_err(RubyError::from)?;
+        let active_partitions: HashSet<Vec<(&str, Option<String>)>> = adds
+            .iter()
+            .flat_map(|add| {
+                Ok::<_, RubyError>(
+                    partition_columns
+                        .iter()
+                        .flat_map(|col| {
+                            Ok::<_, RubyError>((
+                                *col,
+                                add.partition_values()
+                                    .map_err(RubyError::from)?
+                                    .get(*col)
+                                    .map(|v| v.serialize()),
+                            ))
+                        })
+                        .collect(),
+                )
+            })
+            .collect();
+
+        Ok(RArray::from_iter(active_partitions))
+    }
+
     pub fn delete(&self, predicate: Option<String>) -> RbResult<String> {
         let mut cmd = DeleteBuilder::new(
             self._table.borrow().log_store(),
@@ -201,6 +592,38 @@ impl RawDeltaTable {
         self._table.borrow_mut().state = table.state;
         Ok(serde_json::to_string(&metrics).unwrap())
     }
+
+    pub fn repair(&self, dry_run: bool) -> RbResult<String> {
+        let cmd = FileSystemCheckBuilder::new(
+            self._table.borrow().log_store(),
+            self._table
+                .borrow()
+                .snapshot()
+                .map_err(RubyError::from)?
+                .clone(),
+        )
+        .with_dry_run(dry_run);
+
+        let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+        self._table.borrow_mut().state = table.state;
+        Ok(serde_json::to_string(&metrics).unwrap())
+    }
+
+    pub fn transaction_versions(&self) -> RHash {
+        RHash::from_iter(
+            self._table
+                .borrow()
+                .get_app_transaction_version()
+                .into_iter()
+                .map(|(app_id, transaction)| (app_id, RbTransaction::from(transaction))),
+        )
+    }
+}
+
+fn convert_partition_filters(
+    _partitions_filters: Value,
+) -> Result<Vec<PartitionFilter>, DeltaTableError> {
+    todo!()
 }
 
 impl RawDeltaTableMetaData {
@@ -229,6 +652,23 @@ impl RawDeltaTableMetaData {
     }
 }
 
+#[magnus::wrap(class = "DeltaLake::Transaction")]
+pub struct RbTransaction {
+    pub app_id: String,
+    pub version: i64,
+    pub last_updated: Option<i64>,
+}
+
+impl From<Transaction> for RbTransaction {
+    fn from(value: Transaction) -> Self {
+        RbTransaction {
+            app_id: value.app_id,
+            version: value.version,
+            last_updated: value.last_updated,
+        }
+    }
+}
+
 #[allow(clippy::too_many_arguments)]
 fn write_to_deltalake(
     table_uri: String,
@@ -313,16 +753,68 @@ fn init(ruby: &Ruby) -> RbResult<()> {
     class.define_method("version", method!(RawDeltaTable::version, 0))?;
     class.define_method("has_files", method!(RawDeltaTable::has_files, 0))?;
     class.define_method("metadata", method!(RawDeltaTable::metadata, 0))?;
+    class.define_method(
+        "protocol_versions",
+        method!(RawDeltaTable::protocol_versions, 0),
+    )?;
     class.define_method("load_version", method!(RawDeltaTable::load_version, 1))?;
-    class.define_method(
-
+    class.define_method(
+        "get_latest_version",
+        method!(RawDeltaTable::get_latest_version, 0),
+    )?;
+    class.define_method(
+        "get_earliest_version",
+        method!(RawDeltaTable::get_earliest_version, 0),
+    )?;
+    class.define_method(
+        "get_num_index_cols",
+        method!(RawDeltaTable::get_num_index_cols, 0),
+    )?;
+    class.define_method(
+        "get_stats_columns",
+        method!(RawDeltaTable::get_stats_columns, 0),
+    )?;
+    class.define_method(
+        "load_with_datetime",
+        method!(RawDeltaTable::load_with_datetime, 1),
+    )?;
+    class.define_method("files", method!(RawDeltaTable::files, 1))?;
+    class.define_method("file_uris", method!(RawDeltaTable::file_uris, 1))?;
     class.define_method("schema", method!(RawDeltaTable::schema, 0))?;
     class.define_method("vacuum", method!(RawDeltaTable::vacuum, 3))?;
+    class.define_method(
+        "compact_optimize",
+        method!(RawDeltaTable::compact_optimize, 3),
+    )?;
+    class.define_method(
+        "z_order_optimize",
+        method!(RawDeltaTable::z_order_optimize, 5),
+    )?;
+    class.define_method(
+        "add_constraints",
+        method!(RawDeltaTable::add_constraints, 1),
+    )?;
+    class.define_method(
+        "drop_constraints",
+        method!(RawDeltaTable::drop_constraints, 2),
+    )?;
+    class.define_method("load_cdf", method!(RawDeltaTable::load_cdf, 5))?;
+    class.define_method("restore", method!(RawDeltaTable::restore, 3))?;
+    class.define_method("history", method!(RawDeltaTable::history, 1))?;
     class.define_method(
         "update_incremental",
         method!(RawDeltaTable::update_incremental, 0),
     )?;
+    class.define_method(
+        "get_active_partitions",
+        method!(RawDeltaTable::get_active_partitions, 0),
+    )?;
     class.define_method("delete", method!(RawDeltaTable::delete, 1))?;
+    class.define_method("repair", method!(RawDeltaTable::repair, 1))?;
+    class.define_method(
+        "transaction_versions",
+        method!(RawDeltaTable::transaction_versions, 0),
+    )?;
 
     let class = module.define_class("RawDeltaTableMetaData", ruby.class_object())?;
     class.define_method("id", method!(RawDeltaTableMetaData::id, 0))?;
@@ -344,6 +836,9 @@ fn init(ruby: &Ruby) -> RbResult<()> {
         method!(RawDeltaTableMetaData::configuration, 0),
     )?;
 
+    let class = module.define_class("ArrowArrayStream", ruby.class_object())?;
+    class.define_method("to_i", method!(ArrowArrayStream::to_i, 0))?;
+
     let class = module.define_class("Field", ruby.class_object())?;
     class.define_method("name", method!(Field::name, 0))?;
     class.define_method("type", method!(Field::get_type, 0))?;
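Two things worth noting in the `load_cdf` implementation above: it eagerly collects every partition's record batches before handing them back as an FFI Arrow stream (the `// This is unfortunate.` comment marks this), and the `ending_timestamp` branch calls `with_starting_timestamp`, which looks like a copy-paste slip in this release. From Ruby, the experimental API surfaces as a `DeltaLake::ArrowArrayStream` whose only method is `to_i`. A hedged sketch, since how the pointer is consumed depends on which Arrow-capable library you hand it to:

```ruby
dt = DeltaLake::Table.new("./data/delta")

# experimental: change data feed between two versions
stream = dt.load_cdf(starting_version: 0, ending_version: 1)

# to_i returns the address of the underlying FFI_ArrowArrayStream,
# suitable for a consumer that can import an Arrow C stream pointer
ptr = stream.to_i
```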
data/lib/deltalake/table.rb
CHANGED

@@ -26,22 +26,54 @@ module DeltaLake
       @table.version
     end
 
-    def
-
+    def partitions
+      partitions = []
+      @table.get_active_partitions.each do |partition|
+        next unless partition
+        partitions << partition.to_h
+      end
+      partitions
+    end
+
+    def files(partition_filters: nil)
+      @table.files(_stringify_partition_values(partition_filters))
     end
 
-    def file_uris
-      @table.file_uris
+    def file_uris(partition_filters: nil)
+      @table.file_uris(_stringify_partition_values(partition_filters))
     end
 
     def load_as_version(version)
       if version.is_a?(Integer)
         @table.load_version(version)
+      elsif version.is_a?(Time)
+        # needed for iso8601
+        require "time"
+
+        @table.load_with_datetime(version.utc.iso8601(9))
+      elsif version.is_a?(String)
+        @table.load_with_datetime(version)
       else
-        raise TypeError, "Invalid datatype provided for version, only Integer
+        raise TypeError, "Invalid datatype provided for version, only Integer, String, and Time are accepted."
       end
     end
 
+    def load_cdf(
+      starting_version: 0,
+      ending_version: nil,
+      starting_timestamp: nil,
+      ending_timestamp: nil,
+      columns: nil
+    )
+      @table.load_cdf(
+        starting_version,
+        ending_version,
+        starting_timestamp,
+        ending_timestamp,
+        columns
+      )
+    end
+
     def table_uri
       @table.table_uri
     end
@@ -54,6 +86,29 @@ module DeltaLake
       Metadata.new(@table)
     end
 
+    def protocol
+      ProtocolVersions.new(*@table.protocol_versions)
+    end
+
+    def history(limit: nil)
+      backwards_enumerate = lambda do |iterable, start_end, &block|
+        n = start_end
+        iterable.each do |elem|
+          block.call(n, elem)
+          n -= 1
+        end
+      end
+
+      commits = @table.history(limit)
+      history = []
+      backwards_enumerate.(commits, @table.get_latest_version) do |version, commit_info_raw|
+        commit = JSON.parse(commit_info_raw)
+        commit["version"] = version
+        history << commit
+      end
+      history
+    end
+
     def vacuum(
       retention_hours: nil,
       dry_run: true,
@@ -72,6 +127,40 @@ module DeltaLake
       )
     end
 
+    def optimize
+      TableOptimizer.new(self)
+    end
+
+    def alter
+      TableAlterer.new(self)
+    end
+
+    def restore(
+      target,
+      ignore_missing_files: false,
+      protocol_downgrade_allowed: false
+    )
+      if target.is_a?(Time)
+        # needed for iso8601
+        require "time"
+
+        metrics =
+          @table.restore(
+            target.utc.iso8601(9),
+            ignore_missing_files,
+            protocol_downgrade_allowed
+          )
+      else
+        metrics =
+          @table.restore(
+            target,
+            ignore_missing_files,
+            protocol_downgrade_allowed
+          )
+      end
+      JSON.parse(metrics)
+    end
+
     def to_polars(eager: true)
       require "polars-df"
 
@@ -80,7 +169,13 @@ module DeltaLake
       if sources.empty?
         Polars::LazyFrame.new
       else
-
+        delta_keys = [
+          "AWS_S3_ALLOW_UNSAFE_RENAME",
+          "AWS_S3_LOCKING_PROVIDER",
+          "CONDITIONAL_PUT",
+          "DELTA_DYNAMO_TABLE_NAME"
+        ]
+        storage_options = @storage_options&.reject { |k, _| delta_keys.include?(k.to_s.upcase) }
         Polars.scan_parquet(sources, storage_options: storage_options)
       end
       eager ? lf.collect : lf
@@ -95,9 +190,30 @@ module DeltaLake
       JSON.parse(metrics).transform_keys(&:to_sym)
     end
 
+    def repair(dry_run: false)
+      metrics =
+        @table.repair(
+          dry_run
+        )
+      JSON.parse(metrics).transform_keys(&:to_sym)
+    end
+
+    def transaction_versions
+      @table.transaction_versions
+    end
+
     # private
     def _table
       @table
     end
+
+    # private
+    def _stringify_partition_values(partition_filters)
+      if partition_filters.nil?
+        return partition_filters
+      end
+
+      raise Todo
+    end
   end
 end
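`load_as_version` now accepts a `Time` or an RFC 3339 string in addition to an integer, and `history` tags each commit hash with its version by counting down from the latest version. A short sketch of the new `Table` methods (the path is illustrative):

```ruby
dt = DeltaLake::Table.new("./data/delta")

dt.load_as_version(1)
dt.load_as_version(Time.now - 3600)          # converted to ISO 8601 internally
dt.load_as_version("2024-11-22T00:00:00Z")   # parsed as RFC 3339

dt.history(limit: 2).each do |commit|
  puts commit["version"]  # version number added by the Ruby side
end

dt.restore(1)                # by version number
dt.restore(Time.now - 3600)  # or by a point in time
dt.repair(dry_run: true)     # metrics hash with symbol keys
```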
data/lib/deltalake/table_alterer.rb
ADDED

@@ -0,0 +1,25 @@
+module DeltaLake
+  class TableAlterer
+    def initialize(table)
+      @table = table
+    end
+
+    def add_constraint(constraints)
+      if constraints.length > 1
+        raise ArgumentError,
+          "add_constraints is limited to a single constraint addition at once for now."
+      end
+
+      @table._table.add_constraints(
+        constraints
+      )
+    end
+
+    def drop_constraint(name, raise_if_not_exists: true)
+      @table._table.drop_constraints(
+        name,
+        raise_if_not_exists
+      )
+    end
+  end
+end
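As the guard above shows, `add_constraint` takes a hash but allows only one constraint per call for now, and `drop_constraint` raises on a missing constraint unless told otherwise. A brief sketch:

```ruby
dt = DeltaLake::Table.new("./data/delta")

dt.alter.add_constraint({"a_gt_0" => "a > 0"})

# passing more than one entry raises ArgumentError:
# dt.alter.add_constraint({"a_gt_0" => "a > 0", "b_gt_0" => "b > 0"})

dt.alter.drop_constraint("a_gt_0")
dt.alter.drop_constraint("missing", raise_if_not_exists: false)  # does not raise
```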
data/lib/deltalake/table_optimizer.rb
ADDED

@@ -0,0 +1,51 @@
+module DeltaLake
+  class TableOptimizer
+    def initialize(table)
+      @table = table
+    end
+
+    def compact(
+      target_size: nil,
+      max_concurrent_tasks: nil,
+      min_commit_interval: nil
+    )
+      metrics =
+        @table._table.compact_optimize(
+          target_size,
+          max_concurrent_tasks,
+          min_commit_interval
+        )
+      @table.update_incremental
+      result = JSON.parse(metrics)
+      ["filesAdded", "filesRemoved"].each do |key|
+        result[key] = JSON.parse(result[key]) if result[key].is_a?(String)
+      end
+      # TODO return underscore symbols like delete
+      result
+    end
+
+    def z_order(
+      columns,
+      target_size: nil,
+      max_concurrent_tasks: nil,
+      max_spill_size: 20 * 1024 * 1024 * 1024,
+      min_commit_interval: nil
+    )
+      metrics =
+        @table._table.z_order_optimize(
+          Array(columns),
+          target_size,
+          max_concurrent_tasks,
+          max_spill_size,
+          min_commit_interval
+        )
+      @table.update_incremental
+      result = JSON.parse(metrics)
+      ["filesAdded", "filesRemoved"].each do |key|
+        result[key] = JSON.parse(result[key]) if result[key].is_a?(String)
+      end
+      # TODO return underscore symbols like delete
+      result
+    end
+  end
+end
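Both methods pass their options straight through to the Rust `OptimizeBuilder`: `target_size` is a byte count, `min_commit_interval` is in seconds (the Rust side wraps it in `Duration::from_secs`), `max_concurrent_tasks` defaults to the number of CPUs, and `max_spill_size` for Z-ordering defaults to 20 GiB. A sketch of tuned calls (the values are illustrative):

```ruby
dt = DeltaLake::Table.new("./data/delta")

metrics = dt.optimize.compact(
  target_size: 256 * 1024 * 1024,  # aim for ~256 MB files
  max_concurrent_tasks: 4,
  min_commit_interval: 60          # commit at least every 60 seconds
)
metrics["filesAdded"]  # camelCase string keys for now, per the TODO above

dt.optimize.z_order(
  ["a", "b"],
  max_spill_size: 4 * 1024 * 1024 * 1024
)
```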
data/lib/deltalake/version.rb
CHANGED
data/lib/deltalake.rb
CHANGED

@@ -13,6 +13,8 @@ require_relative "deltalake/field"
 require_relative "deltalake/metadata"
 require_relative "deltalake/schema"
 require_relative "deltalake/table"
+require_relative "deltalake/table_alterer"
+require_relative "deltalake/table_optimizer"
 require_relative "deltalake/version"
 
 module DeltaLake
@@ -22,8 +24,21 @@ module DeltaLake
   class CommitFailedError < Error; end
   class SchemaMismatchError < Error; end
 
-  class
+  class Todo < Error
+    def message
+      "not implemented yet"
+    end
+  end
+
+  ProtocolVersions =
+    Struct.new(
+      :min_reader_version,
+      :min_writer_version,
+      :writer_features,
+      :reader_features
+    )
 
+  class << self
     def write(
       table_or_uri,
       data,
@@ -95,10 +110,58 @@ module DeltaLake
 
     def convert_data(data)
       if data.respond_to?(:arrow_c_stream)
+        # TODO convert other object types
+        # should probably move logic to Rust
+        if defined?(Polars::DataFrame) && data.is_a?(Polars::DataFrame)
+          data = convert_polars_data(data)
+        end
+
         data.arrow_c_stream
       else
         raise TypeError, "Only objects implementing the Arrow C stream interface are valid inputs for source."
       end
     end
+
+    # unsigned integers are not part of the protocol
+    # https://github.com/delta-io/delta/blob/master/PROTOCOL.md#primitive-types
+    def convert_polars_data(data)
+      new_schema = {}
+      data.schema.each do |k, v|
+        new_type = convert_polars_type(v)
+        new_schema[k] = new_type if new_type
+      end
+
+      if new_schema.any?
+        data.cast(new_schema)
+      else
+        data
+      end
+    end
+
+    def convert_polars_type(t)
+      case t
+      when Polars::UInt8
+        Polars::Int8
+      when Polars::UInt16
+        Polars::Int16
+      when Polars::UInt32
+        Polars::Int32
+      when Polars::UInt64
+        Polars::Int64
+      when Polars::Datetime
+        Polars::Datetime.new("us", t.time_zone) if t.time_unit != "us"
+      when Polars::List
+        inner = convert_polars_type(t.inner)
+        Polars::List.new(inner) if inner
+      when Polars::Array
+        inner = convert_polars_type(t.inner)
+        Polars::Array.new(t.inner, t.width) if inner
+      when Polars::Struct
+        if t.fields.any? { |f| convert_polars_type(f.dtype) }
+          fields = t.fields.map { |f| Polars::Field.new(f.name, convert_polars_type(f.dtype) || f.dtype) }
+          Polars::Struct.new(fields)
+        end
+      end
+    end
   end
 end
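Since the Delta protocol has no unsigned primitive types, `convert_data` now rewrites a Polars schema before writing: unsigned integers become their signed counterparts and timestamps are coerced to microsecond precision. A sketch of the effect, assuming polars-df is installed and that `DataFrame.new` accepts a `schema:` option as in recent polars-df releases (the path is illustrative):

```ruby
require "deltalake"
require "polars-df"

df = Polars::DataFrame.new({"a" => [1, 2, 3]}, schema: {"a" => Polars::UInt8})

# the UInt8 column is cast to Int8 on the way in
DeltaLake.write("./data/delta2", df)

DeltaLake::Table.new("./data/delta2").schema  # "a" comes back signed
```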
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: deltalake-rb
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-11-
+date: 2024-11-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys

@@ -48,6 +48,8 @@ files:
 - lib/deltalake/metadata.rb
 - lib/deltalake/schema.rb
 - lib/deltalake/table.rb
+- lib/deltalake/table_alterer.rb
+- lib/deltalake/table_optimizer.rb
 - lib/deltalake/version.rb
 homepage: https://github.com/ankane/delta-ruby
 licenses: