deltalake-rb 0.1.0
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Cargo.lock +4371 -0
- data/Cargo.toml +6 -0
- data/LICENSE.txt +179 -0
- data/README.md +110 -0
- data/ext/deltalake/Cargo.toml +21 -0
- data/ext/deltalake/extconf.rb +4 -0
- data/ext/deltalake/src/error.rs +98 -0
- data/ext/deltalake/src/lib.rs +353 -0
- data/ext/deltalake/src/schema.rs +37 -0
- data/ext/deltalake/src/utils.rs +21 -0
- data/lib/deltalake/field.rb +12 -0
- data/lib/deltalake/metadata.rb +43 -0
- data/lib/deltalake/schema.rb +9 -0
- data/lib/deltalake/table.rb +103 -0
- data/lib/deltalake/version.rb +3 -0
- data/lib/deltalake-rb.rb +1 -0
- data/lib/deltalake.rb +104 -0
- metadata +75 -0
data/ext/deltalake/src/lib.rs
ADDED
@@ -0,0 +1,353 @@
mod error;
mod schema;
mod utils;

use std::cell::RefCell;
use std::collections::HashMap;
use std::future::IntoFuture;

use chrono::Duration;
use deltalake::arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream};
use deltalake::kernel::StructType;
use deltalake::operations::delete::DeleteBuilder;
use deltalake::operations::vacuum::VacuumBuilder;
use deltalake::storage::IORuntime;
use deltalake::DeltaOps;
use error::DeltaError;

use magnus::{function, method, prelude::*, Error, Module, Ruby, Value};

use crate::error::RubyError;
use crate::schema::{schema_to_rbobject, Field};
use crate::utils::rt;

type RbResult<T> = Result<T, Error>;

#[magnus::wrap(class = "DeltaLake::RawDeltaTable")]
struct RawDeltaTable {
    _table: RefCell<deltalake::DeltaTable>,
}

#[magnus::wrap(class = "DeltaLake::RawDeltaTableMetaData")]
struct RawDeltaTableMetaData {
    id: String,
    name: Option<String>,
    description: Option<String>,
    partition_columns: Vec<String>,
    created_time: Option<i64>,
    configuration: HashMap<String, Option<String>>,
}

impl RawDeltaTable {
    pub fn new(
        table_uri: String,
        version: Option<i64>,
        storage_options: Option<HashMap<String, String>>,
        without_files: bool,
        log_buffer_size: Option<usize>,
    ) -> RbResult<Self> {
        let mut builder = deltalake::DeltaTableBuilder::from_uri(&table_uri)
            .with_io_runtime(IORuntime::default());

        if let Some(storage_options) = storage_options {
            builder = builder.with_storage_options(storage_options)
        }
        if let Some(version) = version {
            builder = builder.with_version(version)
        }
        if without_files {
            builder = builder.without_files()
        }
        if let Some(buf_size) = log_buffer_size {
            builder = builder
                .with_log_buffer_size(buf_size)
                .map_err(RubyError::from)?;
        }

        let table = rt().block_on(builder.load()).map_err(RubyError::from)?;
        Ok(RawDeltaTable {
            _table: RefCell::new(table),
        })
    }

    pub fn is_deltatable(
        table_uri: String,
        storage_options: Option<HashMap<String, String>>,
    ) -> RbResult<bool> {
        let mut builder = deltalake::DeltaTableBuilder::from_uri(&table_uri);
        if let Some(storage_options) = storage_options {
            builder = builder.with_storage_options(storage_options)
        }
        Ok(rt()
            .block_on(async {
                match builder.build() {
                    Ok(table) => table.verify_deltatable_existence().await,
                    Err(err) => Err(err),
                }
            })
            .map_err(RubyError::from)?)
    }

    pub fn table_uri(&self) -> RbResult<String> {
        Ok(self._table.borrow().table_uri())
    }

    pub fn version(&self) -> RbResult<i64> {
        Ok(self._table.borrow().version())
    }

    pub fn has_files(&self) -> RbResult<bool> {
        Ok(self._table.borrow().config.require_files)
    }

    pub fn metadata(&self) -> RbResult<RawDeltaTableMetaData> {
        let binding = self._table.borrow();
        let metadata = binding.metadata().map_err(RubyError::from)?;
        Ok(RawDeltaTableMetaData {
            id: metadata.id.clone(),
            name: metadata.name.clone(),
            description: metadata.description.clone(),
            partition_columns: metadata.partition_columns.clone(),
            created_time: metadata.created_time,
            configuration: metadata.configuration.clone(),
        })
    }

    pub fn load_version(&self, version: i64) -> RbResult<()> {
        Ok(rt()
            .block_on(self._table.borrow_mut().load_version(version))
            .map_err(RubyError::from)?)
    }

    pub fn files(&self) -> RbResult<Vec<String>> {
        if !self.has_files()? {
            return Err(DeltaError::new_err("Table is instantiated without files."));
        }

        Ok(self
            ._table
            .borrow()
            .get_files_iter()
            .map_err(RubyError::from)?
            .map(|f| f.to_string())
            .collect())
    }

    pub fn file_uris(&self) -> RbResult<Vec<String>> {
        if !self._table.borrow().config.require_files {
            return Err(DeltaError::new_err("Table is instantiated without files."));
        }

        Ok(self
            ._table
            .borrow()
            .get_file_uris()
            .map_err(RubyError::from)?
            .collect())
    }

    pub fn schema(&self) -> RbResult<Value> {
        let binding = self._table.borrow();
        let schema: &StructType = binding.get_schema().map_err(RubyError::from)?;
        schema_to_rbobject(schema.to_owned())
    }

    pub fn vacuum(
        &self,
        dry_run: bool,
        retention_hours: Option<u64>,
        enforce_retention_duration: bool,
    ) -> RbResult<Vec<String>> {
        let mut cmd = VacuumBuilder::new(
            self._table.borrow().log_store(),
            self._table
                .borrow()
                .snapshot()
                .map_err(RubyError::from)?
                .clone(),
        )
        .with_enforce_retention_duration(enforce_retention_duration)
        .with_dry_run(dry_run);
        if let Some(retention_period) = retention_hours {
            cmd = cmd.with_retention_period(Duration::hours(retention_period as i64));
        }

        let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
        self._table.borrow_mut().state = table.state;
        Ok(metrics.files_deleted)
    }

    pub fn update_incremental(&self) -> RbResult<()> {
        #[allow(deprecated)]
        Ok(rt()
            .block_on(self._table.borrow_mut().update_incremental(None))
            .map_err(RubyError::from)?)
    }

    pub fn delete(&self, predicate: Option<String>) -> RbResult<String> {
        let mut cmd = DeleteBuilder::new(
            self._table.borrow().log_store(),
            self._table
                .borrow()
                .snapshot()
                .map_err(RubyError::from)?
                .clone(),
        );
        if let Some(predicate) = predicate {
            cmd = cmd.with_predicate(predicate);
        }

        let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
        self._table.borrow_mut().state = table.state;
        Ok(serde_json::to_string(&metrics).unwrap())
    }
}

impl RawDeltaTableMetaData {
    fn id(&self) -> String {
        self.id.clone()
    }

    fn name(&self) -> Option<String> {
        self.name.clone()
    }

    fn description(&self) -> Option<String> {
        self.description.clone()
    }

    fn partition_columns(&self) -> Vec<String> {
        self.partition_columns.clone()
    }

    fn created_time(&self) -> Option<i64> {
        self.created_time
    }

    fn configuration(&self) -> HashMap<String, Option<String>> {
        self.configuration.clone()
    }
}

#[allow(clippy::too_many_arguments)]
fn write_to_deltalake(
    table_uri: String,
    data: Value,
    mode: String,
    table: Option<&RawDeltaTable>,
    schema_mode: Option<String>,
    partition_by: Option<Vec<String>>,
    predicate: Option<String>,
    target_file_size: Option<usize>,
    name: Option<String>,
    description: Option<String>,
    configuration: Option<HashMap<String, Option<String>>>,
    storage_options: Option<HashMap<String, String>>,
) -> RbResult<()> {
    let capsule_pointer: usize = data.funcall("to_i", ())?;

    // use similar approach as Polars to avoid copy
    let stream_ptr =
        Box::new(unsafe { std::ptr::replace(capsule_pointer as _, FFI_ArrowArrayStream::empty()) });
    let stream = ArrowArrayStreamReader::try_new(*stream_ptr)
        .map_err(|err| DeltaError::new_err(err.to_string()))?;

    let batches = stream.map(|batch| batch.unwrap()).collect::<Vec<_>>();
    let save_mode = mode.parse().map_err(RubyError::from)?;

    let options = storage_options.clone().unwrap_or_default();
    let table = if let Some(table) = table {
        DeltaOps(table._table.borrow().clone())
    } else {
        rt().block_on(DeltaOps::try_from_uri_with_storage_options(
            &table_uri, options,
        ))
        .map_err(RubyError::from)?
    };

    let mut builder = table.write(batches).with_save_mode(save_mode);
    if let Some(schema_mode) = schema_mode {
        builder = builder.with_schema_mode(schema_mode.parse().map_err(RubyError::from)?);
    }
    if let Some(partition_columns) = partition_by {
        builder = builder.with_partition_columns(partition_columns);
    }

    if let Some(name) = &name {
        builder = builder.with_table_name(name);
    };

    if let Some(description) = &description {
        builder = builder.with_description(description);
    };

    if let Some(predicate) = predicate {
        builder = builder.with_replace_where(predicate);
    };

    if let Some(target_file_size) = target_file_size {
        builder = builder.with_target_file_size(target_file_size)
    };

    if let Some(config) = configuration {
        builder = builder.with_configuration(config);
    };

    rt().block_on(builder.into_future())
        .map_err(RubyError::from)?;

    Ok(())
}

#[magnus::init]
fn init(ruby: &Ruby) -> RbResult<()> {
    deltalake::aws::register_handlers(None);

    let module = ruby.define_module("DeltaLake")?;
    module.define_singleton_method("write_deltalake_rust", function!(write_to_deltalake, 12))?;

    let class = module.define_class("RawDeltaTable", ruby.class_object())?;
    class.define_singleton_method("new", function!(RawDeltaTable::new, 5))?;
    class.define_singleton_method("is_deltatable", function!(RawDeltaTable::is_deltatable, 2))?;
    class.define_method("table_uri", method!(RawDeltaTable::table_uri, 0))?;
    class.define_method("version", method!(RawDeltaTable::version, 0))?;
    class.define_method("has_files", method!(RawDeltaTable::has_files, 0))?;
    class.define_method("metadata", method!(RawDeltaTable::metadata, 0))?;
    class.define_method("load_version", method!(RawDeltaTable::load_version, 1))?;
    class.define_method("files", method!(RawDeltaTable::files, 0))?;
    class.define_method("file_uris", method!(RawDeltaTable::file_uris, 0))?;
    class.define_method("schema", method!(RawDeltaTable::schema, 0))?;
    class.define_method("vacuum", method!(RawDeltaTable::vacuum, 3))?;
    class.define_method(
        "update_incremental",
        method!(RawDeltaTable::update_incremental, 0),
    )?;
    class.define_method("delete", method!(RawDeltaTable::delete, 1))?;

    let class = module.define_class("RawDeltaTableMetaData", ruby.class_object())?;
    class.define_method("id", method!(RawDeltaTableMetaData::id, 0))?;
    class.define_method("name", method!(RawDeltaTableMetaData::name, 0))?;
    class.define_method(
        "description",
        method!(RawDeltaTableMetaData::description, 0),
    )?;
    class.define_method(
        "partition_columns",
        method!(RawDeltaTableMetaData::partition_columns, 0),
    )?;
    class.define_method(
        "created_time",
        method!(RawDeltaTableMetaData::created_time, 0),
    )?;
    class.define_method(
        "configuration",
        method!(RawDeltaTableMetaData::configuration, 0),
    )?;

    let class = module.define_class("Field", ruby.class_object())?;
    class.define_method("name", method!(Field::name, 0))?;
    class.define_method("type", method!(Field::get_type, 0))?;
    class.define_method("nullable", method!(Field::nullable, 0))?;

    Ok(())
}
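The one non-obvious piece of the binding above is the zero-copy handoff in write_to_deltalake: Ruby passes an object whose to_i returns the address of an FFI_ArrowArrayStream, and the Rust side swaps the struct out of that address instead of copying record batches. A minimal sketch of the Ruby half of that contract, assuming polars-df as the data source (DeltaLake.write in lib/deltalake.rb below performs this conversion for you):

require "polars-df"

df = Polars::DataFrame.new({"id" => [1, 2, 3]})
stream = df.arrow_c_stream # Arrow C stream wrapper over the frame's data
stream.to_i                # => address of the FFI_ArrowArrayStream struct
# After write_to_deltalake returns, that address holds an empty stream (the
# native side took ownership), so the stream must not be consumed again.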
data/ext/deltalake/src/schema.rs
ADDED
@@ -0,0 +1,37 @@
use deltalake::kernel::{StructField, StructType as DeltaStructType};
use magnus::{value::ReprValue, Module, RArray, RModule, Ruby, Value};

use crate::RbResult;

pub fn schema_to_rbobject(schema: DeltaStructType) -> RbResult<Value> {
    let fields = schema.fields().map(|field| Field {
        inner: field.clone(),
    });

    let rb_schema: Value = Ruby::get()
        .unwrap()
        .class_object()
        .const_get::<_, RModule>("DeltaLake")?
        .const_get("Schema")?;

    rb_schema.funcall("new", (RArray::from_iter(fields),))
}

#[magnus::wrap(class = "DeltaLake::Field")]
pub struct Field {
    pub inner: StructField,
}

impl Field {
    pub fn name(&self) -> String {
        self.inner.name().to_string()
    }

    pub fn get_type(&self) -> String {
        self.inner.data_type().to_string()
    }

    pub fn nullable(&self) -> bool {
        self.inner.is_nullable()
    }
}
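schema_to_rbobject builds the Ruby-side schema by wrapping each kernel StructField in a DeltaLake::Field and handing the array to DeltaLake::Schema.new. A short sketch of how that surfaces in Ruby, assuming DeltaLake::Schema (lib/deltalake/schema.rb, not shown in this diff) exposes the fields it was constructed with, and "./events" is a hypothetical table path:

table = DeltaLake::Table.new("./events")
table.schema.fields.each do |field|
  # name, type, and nullable are the native Field methods bound in lib.rs
  puts "#{field.name}: #{field.type} (nullable: #{field.nullable})"
end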
data/ext/deltalake/src/utils.rs
ADDED
@@ -0,0 +1,21 @@
use std::sync::OnceLock;

use tokio::runtime::Runtime;

#[inline]
pub fn rt() -> &'static Runtime {
    static TOKIO_RT: OnceLock<Runtime> = OnceLock::new();
    static PID: OnceLock<u32> = OnceLock::new();
    let pid = std::process::id();
    let runtime_pid = *PID.get_or_init(|| pid);
    if pid != runtime_pid {
        panic!(
            "Forked process detected - current PID is {} but the tokio runtime was created by {}. The tokio \
            runtime does not support forked processes https://github.com/tokio-rs/tokio/issues/4301. If you are \
            seeing this message while using Ruby multithreading make sure to use the `spawn` or `forkserver` \
            mode.",
            pid, runtime_pid
        );
    }
    TOKIO_RT.get_or_init(|| Runtime::new().expect("Failed to create a tokio runtime."))
}
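rt() creates a single Tokio runtime per process on first use and records the creating PID; since Tokio does not survive fork, any native call from a forked child trips the panic above instead of hanging. A hedged illustration of the failure mode from the Ruby side (hypothetical path, for exposition only):

table = DeltaLake::Table.new("./events") # first native call creates the runtime
pid = fork do
  table.version # child PID differs from the runtime creator's, so rt() panics
end
Process.wait(pid)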
data/lib/deltalake/metadata.rb
ADDED
@@ -0,0 +1,43 @@
module DeltaLake
  class Metadata
    def initialize(table)
      @metadata = table.metadata
    end

    def id
      @metadata.id
    end

    def name
      @metadata.name
    end

    def description
      @metadata.description
    end

    def partition_columns
      @metadata.partition_columns
    end

    def created_time
      @metadata.created_time
    end

    def configuration
      @metadata.configuration
    end

    def inspect
      attributes = {
        id: id,
        name: name,
        description: description,
        partition_columns: partition_columns,
        created_time: created_time,
        configuration: configuration
      }
      "<#{self.class.name} #{attributes.map { |k, v| "#{k}=#{v.inspect}" }.join(", ")}>"
    end
  end
end
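DeltaLake::Metadata is a thin Ruby wrapper over the native RawDeltaTableMetaData with a readable inspect. A quick sketch of its use (path and values illustrative):

meta = DeltaLake::Table.new("./events").metadata
meta.partition_columns # => ["date"]
meta.configuration     # => {"delta.appendOnly" => "false"}
puts meta.inspect      # <DeltaLake::Metadata id="...", name=nil, ...>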
data/lib/deltalake/table.rb
ADDED
@@ -0,0 +1,103 @@
module DeltaLake
  class Table
    def initialize(
      table_uri,
      version: nil,
      storage_options: nil,
      without_files: false,
      log_buffer_size: nil
    )
      @storage_options = storage_options
      @table =
        RawDeltaTable.new(
          table_uri,
          version,
          storage_options,
          without_files,
          log_buffer_size
        )
    end

    def self.exists?(table_uri, storage_options: nil)
      RawDeltaTable.is_deltatable(table_uri, storage_options)
    end

    def version
      @table.version
    end

    def files
      @table.files
    end

    def file_uris
      @table.file_uris
    end

    def load_as_version(version)
      if version.is_a?(Integer)
        @table.load_version(version)
      else
        raise TypeError, "Invalid datatype provided for version, only Integer is accepted."
      end
    end

    def table_uri
      @table.table_uri
    end

    def schema
      @table.schema
    end

    def metadata
      Metadata.new(@table)
    end

    def vacuum(
      retention_hours: nil,
      dry_run: true,
      enforce_retention_duration: true
    )
      if retention_hours
        if retention_hours < 0
          raise ArgumentError, "The retention period should be positive."
        end
      end

      @table.vacuum(
        dry_run,
        retention_hours,
        enforce_retention_duration
      )
    end

    def to_polars(eager: true)
      require "polars-df"

      sources = file_uris
      lf =
        if sources.empty?
          Polars::LazyFrame.new
        else
          storage_options = @storage_options&.except("AWS_S3_ALLOW_UNSAFE_RENAME")
          Polars.scan_parquet(sources, storage_options: storage_options)
        end
      eager ? lf.collect : lf
    end

    def update_incremental
      @table.update_incremental
    end

    def delete(predicate = nil)
      metrics = @table.delete(predicate)
      JSON.parse(metrics).transform_keys(&:to_sym)
    end

    # private
    def _table
      @table
    end
  end
end
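Putting the Table API together, a minimal end-to-end sketch ("./events" is a hypothetical local table; the delete metrics keys come from delta-rs and may vary by version):

if DeltaLake::Table.exists?("./events")
  table = DeltaLake::Table.new("./events")
  table.version               # => 1
  table.load_as_version(0)    # time travel to an earlier version
  df = table.to_polars        # eager DataFrame; pass eager: false for a LazyFrame
  table.vacuum(dry_run: true) # lists removable files without deleting them
  table.delete("id > 100")    # => {num_added_files: ..., num_removed_files: ...}
end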
data/lib/deltalake-rb.rb
ADDED
@@ -0,0 +1 @@
require_relative "deltalake"
data/lib/deltalake.rb
ADDED
@@ -0,0 +1,104 @@
# ext
begin
  require "deltalake/#{RUBY_VERSION.to_f}/deltalake"
rescue LoadError
  require "deltalake/deltalake"
end

# stdlib
require "json"

# modules
require_relative "deltalake/field"
require_relative "deltalake/metadata"
require_relative "deltalake/schema"
require_relative "deltalake/table"
require_relative "deltalake/version"

module DeltaLake
  class Error < StandardError; end
  class TableNotFoundError < Error; end
  class DeltaProtocolError < Error; end
  class CommitFailedError < Error; end
  class SchemaMismatchError < Error; end

  class << self

    def write(
      table_or_uri,
      data,
      partition_by: nil,
      mode: "error",
      name: nil,
      description: nil,
      configuration: nil,
      schema_mode: nil,
      storage_options: nil,
      predicate: nil,
      target_file_size: nil
    )
      table, table_uri = try_get_table_and_table_uri(table_or_uri, storage_options)

      if partition_by.is_a?(String)
        partition_by = [partition_by]
      end

      if !table.nil? && mode == "ignore"
        return
      end

      data = convert_data(data)

      write_deltalake_rust(
        table_uri,
        data,
        mode,
        table&._table,
        schema_mode,
        partition_by,
        predicate,
        target_file_size,
        name,
        description,
        configuration,
        storage_options
      )

      if table
        table.update_incremental
      end
    end

    private

    def try_get_table_and_table_uri(table_or_uri, storage_options)
      if !table_or_uri.is_a?(String) && !table_or_uri.is_a?(Table)
        raise ArgumentError, "table_or_uri must be a String or Table"
      end

      if table_or_uri.is_a?(String)
        table = try_get_deltatable(table_or_uri, storage_options)
        table_uri = table_or_uri.to_s
      else
        table = table_or_uri
        table_uri = table._table.table_uri
      end

      [table, table_uri]
    end

    def try_get_deltatable(table_uri, storage_options)
      Table.new(table_uri, storage_options: storage_options)
    rescue TableNotFoundError
      nil
    end

    def convert_data(data)
      if data.respond_to?(:arrow_c_stream)
        data.arrow_c_stream
      else
        raise TypeError, "Only objects implementing the Arrow C stream interface are valid inputs for source."
      end
    end
  end
end
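Finally, the top-level entry point in action: DeltaLake.write accepts any object implementing arrow_c_stream, creates the table on the first write, and errors, appends, or overwrites on later calls depending on mode. A short usage sketch assuming polars-df ("./events" is a hypothetical path):

require "deltalake-rb"
require "polars-df"

df = Polars::DataFrame.new({"id" => [1, 2, 3], "value" => ["a", "b", "c"]})
DeltaLake.write("./events", df)                 # creates the table (default mode: "error")
DeltaLake.write("./events", df, mode: "append") # adds new files to it

table = DeltaLake::Table.new("./events")
table.to_polars                                 # read it back as a Polars DataFrame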