deltalake-rb 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Cargo.lock +4371 -0
- data/Cargo.toml +6 -0
- data/LICENSE.txt +179 -0
- data/README.md +110 -0
- data/ext/deltalake/Cargo.toml +21 -0
- data/ext/deltalake/extconf.rb +4 -0
- data/ext/deltalake/src/error.rs +98 -0
- data/ext/deltalake/src/lib.rs +353 -0
- data/ext/deltalake/src/schema.rs +37 -0
- data/ext/deltalake/src/utils.rs +21 -0
- data/lib/deltalake/field.rb +12 -0
- data/lib/deltalake/metadata.rb +43 -0
- data/lib/deltalake/schema.rb +9 -0
- data/lib/deltalake/table.rb +103 -0
- data/lib/deltalake/version.rb +3 -0
- data/lib/deltalake-rb.rb +1 -0
- data/lib/deltalake.rb +104 -0
- metadata +75 -0
data/ext/deltalake/src/lib.rs
ADDED
@@ -0,0 +1,353 @@
mod error;
mod schema;
mod utils;

use std::cell::RefCell;
use std::collections::HashMap;
use std::future::IntoFuture;

use chrono::Duration;
use deltalake::arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream};
use deltalake::kernel::StructType;
use deltalake::operations::delete::DeleteBuilder;
use deltalake::operations::vacuum::VacuumBuilder;
use deltalake::storage::IORuntime;
use deltalake::DeltaOps;
use error::DeltaError;

use magnus::{function, method, prelude::*, Error, Module, Ruby, Value};

use crate::error::RubyError;
use crate::schema::{schema_to_rbobject, Field};
use crate::utils::rt;

type RbResult<T> = Result<T, Error>;

#[magnus::wrap(class = "DeltaLake::RawDeltaTable")]
struct RawDeltaTable {
    _table: RefCell<deltalake::DeltaTable>,
}

#[magnus::wrap(class = "DeltaLake::RawDeltaTableMetaData")]
struct RawDeltaTableMetaData {
    id: String,
    name: Option<String>,
    description: Option<String>,
    partition_columns: Vec<String>,
    created_time: Option<i64>,
    configuration: HashMap<String, Option<String>>,
}

impl RawDeltaTable {
    pub fn new(
        table_uri: String,
        version: Option<i64>,
        storage_options: Option<HashMap<String, String>>,
        without_files: bool,
        log_buffer_size: Option<usize>,
    ) -> RbResult<Self> {
        let mut builder = deltalake::DeltaTableBuilder::from_uri(&table_uri)
            .with_io_runtime(IORuntime::default());

        if let Some(storage_options) = storage_options {
            builder = builder.with_storage_options(storage_options)
        }
        if let Some(version) = version {
            builder = builder.with_version(version)
        }
        if without_files {
            builder = builder.without_files()
        }
        if let Some(buf_size) = log_buffer_size {
            builder = builder
                .with_log_buffer_size(buf_size)
                .map_err(RubyError::from)?;
        }

        let table = rt().block_on(builder.load()).map_err(RubyError::from)?;
        Ok(RawDeltaTable {
            _table: RefCell::new(table),
        })
    }

    pub fn is_deltatable(
        table_uri: String,
        storage_options: Option<HashMap<String, String>>,
    ) -> RbResult<bool> {
        let mut builder = deltalake::DeltaTableBuilder::from_uri(&table_uri);
        if let Some(storage_options) = storage_options {
            builder = builder.with_storage_options(storage_options)
        }
        Ok(rt()
            .block_on(async {
                match builder.build() {
                    Ok(table) => table.verify_deltatable_existence().await,
                    Err(err) => Err(err),
                }
            })
            .map_err(RubyError::from)?)
    }

    pub fn table_uri(&self) -> RbResult<String> {
        Ok(self._table.borrow().table_uri())
    }

    pub fn version(&self) -> RbResult<i64> {
        Ok(self._table.borrow().version())
    }

    pub fn has_files(&self) -> RbResult<bool> {
        Ok(self._table.borrow().config.require_files)
    }

    pub fn metadata(&self) -> RbResult<RawDeltaTableMetaData> {
        let binding = self._table.borrow();
        let metadata = binding.metadata().map_err(RubyError::from)?;
        Ok(RawDeltaTableMetaData {
            id: metadata.id.clone(),
            name: metadata.name.clone(),
            description: metadata.description.clone(),
            partition_columns: metadata.partition_columns.clone(),
            created_time: metadata.created_time,
            configuration: metadata.configuration.clone(),
        })
    }

    pub fn load_version(&self, version: i64) -> RbResult<()> {
        Ok(rt()
            .block_on(self._table.borrow_mut().load_version(version))
            .map_err(RubyError::from)?)
    }

    pub fn files(&self) -> RbResult<Vec<String>> {
        if !self.has_files()? {
            return Err(DeltaError::new_err("Table is instantiated without files."));
        }

        Ok(self
            ._table
            .borrow()
            .get_files_iter()
            .map_err(RubyError::from)?
            .map(|f| f.to_string())
            .collect())
    }

    pub fn file_uris(&self) -> RbResult<Vec<String>> {
        if !self._table.borrow().config.require_files {
            return Err(DeltaError::new_err("Table is initiated without files."));
        }

        Ok(self
            ._table
            .borrow()
            .get_file_uris()
            .map_err(RubyError::from)?
            .collect())
    }

    pub fn schema(&self) -> RbResult<Value> {
        let binding = self._table.borrow();
        let schema: &StructType = binding.get_schema().map_err(RubyError::from)?;
        schema_to_rbobject(schema.to_owned())
    }

    pub fn vacuum(
        &self,
        dry_run: bool,
        retention_hours: Option<u64>,
        enforce_retention_duration: bool,
    ) -> RbResult<Vec<String>> {
        let mut cmd = VacuumBuilder::new(
            self._table.borrow().log_store(),
            self._table
                .borrow()
                .snapshot()
                .map_err(RubyError::from)?
                .clone(),
        )
        .with_enforce_retention_duration(enforce_retention_duration)
        .with_dry_run(dry_run);
        if let Some(retention_period) = retention_hours {
            cmd = cmd.with_retention_period(Duration::hours(retention_period as i64));
        }

        let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
        self._table.borrow_mut().state = table.state;
        Ok(metrics.files_deleted)
    }

    pub fn update_incremental(&self) -> RbResult<()> {
        #[allow(deprecated)]
        Ok(rt()
            .block_on(self._table.borrow_mut().update_incremental(None))
            .map_err(RubyError::from)?)
    }

    pub fn delete(&self, predicate: Option<String>) -> RbResult<String> {
        let mut cmd = DeleteBuilder::new(
            self._table.borrow().log_store(),
            self._table
                .borrow()
                .snapshot()
                .map_err(RubyError::from)?
                .clone(),
        );
        if let Some(predicate) = predicate {
            cmd = cmd.with_predicate(predicate);
        }

        let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
        self._table.borrow_mut().state = table.state;
        Ok(serde_json::to_string(&metrics).unwrap())
    }
}

impl RawDeltaTableMetaData {
    fn id(&self) -> String {
        self.id.clone()
    }

    fn name(&self) -> Option<String> {
        self.name.clone()
    }

    fn description(&self) -> Option<String> {
        self.description.clone()
    }

    fn partition_columns(&self) -> Vec<String> {
        self.partition_columns.clone()
    }

    fn created_time(&self) -> Option<i64> {
        self.created_time
    }

    fn configuration(&self) -> HashMap<String, Option<String>> {
        self.configuration.clone()
    }
}

#[allow(clippy::too_many_arguments)]
fn write_to_deltalake(
    table_uri: String,
    data: Value,
    mode: String,
    table: Option<&RawDeltaTable>,
    schema_mode: Option<String>,
    partition_by: Option<Vec<String>>,
    predicate: Option<String>,
    target_file_size: Option<usize>,
    name: Option<String>,
    description: Option<String>,
    configuration: Option<HashMap<String, Option<String>>>,
    storage_options: Option<HashMap<String, String>>,
) -> RbResult<()> {
    let capsule_pointer: usize = data.funcall("to_i", ())?;

    // use similar approach as Polars to avoid copy
    let stream_ptr =
        Box::new(unsafe { std::ptr::replace(capsule_pointer as _, FFI_ArrowArrayStream::empty()) });
    let stream = ArrowArrayStreamReader::try_new(*stream_ptr)
        .map_err(|err| DeltaError::new_err(err.to_string()))?;

    let batches = stream.map(|batch| batch.unwrap()).collect::<Vec<_>>();
    let save_mode = mode.parse().map_err(RubyError::from)?;

    let options = storage_options.clone().unwrap_or_default();
    let table = if let Some(table) = table {
        DeltaOps(table._table.borrow().clone())
    } else {
        rt().block_on(DeltaOps::try_from_uri_with_storage_options(
            &table_uri, options,
        ))
        .map_err(RubyError::from)?
    };

    let mut builder = table.write(batches).with_save_mode(save_mode);
    if let Some(schema_mode) = schema_mode {
        builder = builder.with_schema_mode(schema_mode.parse().map_err(RubyError::from)?);
    }
    if let Some(partition_columns) = partition_by {
        builder = builder.with_partition_columns(partition_columns);
    }

    if let Some(name) = &name {
        builder = builder.with_table_name(name);
    };

    if let Some(description) = &description {
        builder = builder.with_description(description);
    };

    if let Some(predicate) = predicate {
        builder = builder.with_replace_where(predicate);
    };

    if let Some(target_file_size) = target_file_size {
        builder = builder.with_target_file_size(target_file_size)
    };

    if let Some(config) = configuration {
        builder = builder.with_configuration(config);
    };

    rt().block_on(builder.into_future())
        .map_err(RubyError::from)?;

    Ok(())
}

#[magnus::init]
fn init(ruby: &Ruby) -> RbResult<()> {
    deltalake::aws::register_handlers(None);

    let module = ruby.define_module("DeltaLake")?;
    module.define_singleton_method("write_deltalake_rust", function!(write_to_deltalake, 12))?;

    let class = module.define_class("RawDeltaTable", ruby.class_object())?;
    class.define_singleton_method("new", function!(RawDeltaTable::new, 5))?;
    class.define_singleton_method("is_deltatable", function!(RawDeltaTable::is_deltatable, 2))?;
    class.define_method("table_uri", method!(RawDeltaTable::table_uri, 0))?;
    class.define_method("version", method!(RawDeltaTable::version, 0))?;
    class.define_method("has_files", method!(RawDeltaTable::has_files, 0))?;
    class.define_method("metadata", method!(RawDeltaTable::metadata, 0))?;
    class.define_method("load_version", method!(RawDeltaTable::load_version, 1))?;
    class.define_method("files", method!(RawDeltaTable::files, 0))?;
    class.define_method("file_uris", method!(RawDeltaTable::file_uris, 0))?;
    class.define_method("schema", method!(RawDeltaTable::schema, 0))?;
    class.define_method("vacuum", method!(RawDeltaTable::vacuum, 3))?;
    class.define_method(
        "update_incremental",
        method!(RawDeltaTable::update_incremental, 0),
    )?;
    class.define_method("delete", method!(RawDeltaTable::delete, 1))?;

    let class = module.define_class("RawDeltaTableMetaData", ruby.class_object())?;
    class.define_method("id", method!(RawDeltaTableMetaData::id, 0))?;
    class.define_method("name", method!(RawDeltaTableMetaData::name, 0))?;
    class.define_method(
        "description",
        method!(RawDeltaTableMetaData::description, 0),
    )?;
    class.define_method(
        "partition_columns",
        method!(RawDeltaTableMetaData::partition_columns, 0),
    )?;
    class.define_method(
        "created_time",
        method!(RawDeltaTableMetaData::created_time, 0),
    )?;
    class.define_method(
        "configuration",
        method!(RawDeltaTableMetaData::configuration, 0),
    )?;

    let class = module.define_class("Field", ruby.class_object())?;
    class.define_method("name", method!(Field::name, 0))?;
    class.define_method("type", method!(Field::get_type, 0))?;
    class.define_method("nullable", method!(Field::nullable, 0))?;

    Ok(())
}
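Note that `init` registers each Rust function with a fixed arity, so the raw binding takes every argument positionally; the `DeltaLake::Table` wrapper further down in this diff normally does this for you. A minimal sketch of calling the raw class directly, assuming a local table at the hypothetical path `./data/events`:

# All five arguments of RawDeltaTable.new are positional,
# matching function!(RawDeltaTable::new, 5) above.
raw = DeltaLake::RawDeltaTable.new(
  "./data/events", # table_uri
  nil,             # version (nil maps to None)
  nil,             # storage_options
  false,           # without_files
  nil              # log_buffer_size
)
raw.version # => Integer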
data/ext/deltalake/src/schema.rs
ADDED
@@ -0,0 +1,37 @@
use deltalake::kernel::{StructField, StructType as DeltaStructType};
use magnus::{value::ReprValue, Module, RArray, RModule, Ruby, Value};

use crate::RbResult;

pub fn schema_to_rbobject(schema: DeltaStructType) -> RbResult<Value> {
    let fields = schema.fields().map(|field| Field {
        inner: field.clone(),
    });

    let rb_schema: Value = Ruby::get()
        .unwrap()
        .class_object()
        .const_get::<_, RModule>("DeltaLake")?
        .const_get("Schema")?;

    rb_schema.funcall("new", (RArray::from_iter(fields),))
}

#[magnus::wrap(class = "DeltaLake::Field")]
pub struct Field {
    pub inner: StructField,
}

impl Field {
    pub fn name(&self) -> String {
        self.inner.name().to_string()
    }

    pub fn get_type(&self) -> String {
        self.inner.data_type().to_string()
    }

    pub fn nullable(&self) -> bool {
        self.inner.is_nullable()
    }
}
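From Ruby, `RawDeltaTable#schema` returns a `DeltaLake::Schema` constructed from these `Field` wrappers. A sketch of reading it, assuming `DeltaLake::Schema` (defined in `lib/deltalake/schema.rb`, whose body is not shown in this diff) exposes the fields it is built from:

table = DeltaLake::Table.new("./data/events") # hypothetical path
table.schema.fields.each do |field|
  # name, type, and nullable are the methods registered on DeltaLake::Field in init
  puts "#{field.name}: #{field.type} (nullable: #{field.nullable})"
end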
data/ext/deltalake/src/utils.rs
ADDED
@@ -0,0 +1,21 @@
use std::sync::OnceLock;

use tokio::runtime::Runtime;

#[inline]
pub fn rt() -> &'static Runtime {
    static TOKIO_RT: OnceLock<Runtime> = OnceLock::new();
    static PID: OnceLock<u32> = OnceLock::new();
    let pid = std::process::id();
    let runtime_pid = *PID.get_or_init(|| pid);
    if pid != runtime_pid {
        panic!(
            "Forked process detected - current PID is {} but the tokio runtime was created by {}. The tokio \
            runtime does not support forked processes https://github.com/tokio-rs/tokio/issues/4301. If you are \
            seeing this message while using Ruby multithreading make sure to use the `spawn` or `forkserver` \
            mode.",
            pid, runtime_pid
        );
    }
    TOKIO_RT.get_or_init(|| Runtime::new().expect("Failed to create a tokio runtime."))
}
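The PID check guards the shared Tokio runtime against use across `fork`, which Tokio does not support. A sketch of the failure mode it detects, assuming a table at the hypothetical `./data/events`:

table = DeltaLake::Table.new("./data/events") # first call creates the runtime and records the parent PID
table.version

fork do
  # any method that re-enters rt() in the child, such as update_incremental,
  # sees a PID mismatch and panics with the message above
  table.update_incremental
end
Process.wait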
data/lib/deltalake/metadata.rb
ADDED
@@ -0,0 +1,43 @@
module DeltaLake
  class Metadata
    def initialize(table)
      @metadata = table.metadata
    end

    def id
      @metadata.id
    end

    def name
      @metadata.name
    end

    def description
      @metadata.description
    end

    def partition_columns
      @metadata.partition_columns
    end

    def created_time
      @metadata.created_time
    end

    def configuration
      @metadata.configuration
    end

    def inspect
      attributes = {
        id: id,
        name: name,
        description: description,
        partition_columns: partition_columns,
        created_time: created_time,
        configuration: configuration
      }
      "<#{self.class.name} #{attributes.map { |k, v| "#{k}=#{v.inspect}" }.join(", ")}>"
    end
  end
end
data/lib/deltalake/table.rb
ADDED
@@ -0,0 +1,103 @@
module DeltaLake
  class Table
    def initialize(
      table_uri,
      version: nil,
      storage_options: nil,
      without_files: false,
      log_buffer_size: nil
    )
      @storage_options = storage_options
      @table =
        RawDeltaTable.new(
          table_uri,
          version,
          storage_options,
          without_files,
          log_buffer_size
        )
    end

    def self.exists?(table_uri, storage_options: nil)
      RawDeltaTable.is_deltatable(table_uri, storage_options)
    end

    def version
      @table.version
    end

    def files
      @table.files
    end

    def file_uris
      @table.file_uris
    end

    def load_as_version(version)
      if version.is_a?(Integer)
        @table.load_version(version)
      else
        raise TypeError, "Invalid datatype provided for version, only Integer is accepted."
      end
    end

    def table_uri
      @table.table_uri
    end

    def schema
      @table.schema
    end

    def metadata
      Metadata.new(@table)
    end

    def vacuum(
      retention_hours: nil,
      dry_run: true,
      enforce_retention_duration: true
    )
      if retention_hours
        if retention_hours < 0
          raise ArgumentError, "The retention periods should be positive."
        end
      end

      @table.vacuum(
        dry_run,
        retention_hours,
        enforce_retention_duration
      )
    end

    def to_polars(eager: true)
      require "polars-df"

      sources = file_uris
      lf =
        if sources.empty?
          Polars::LazyFrame.new
        else
          storage_options = @storage_options&.except("AWS_S3_ALLOW_UNSAFE_RENAME")
          Polars.scan_parquet(sources, storage_options: storage_options)
        end
      eager ? lf.collect : lf
    end

    def update_incremental
      @table.update_incremental
    end

    def delete(predicate = nil)
      metrics = @table.delete(predicate)
      JSON.parse(metrics).transform_keys(&:to_sym)
    end

    # private
    def _table
      @table
    end
  end
end
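A minimal tour of the `Table` API above, again assuming a local table at the hypothetical `./data/events` and `polars-df` installed for `to_polars`:

table = DeltaLake::Table.new("./data/events")
table.version                 # => Integer
table.metadata.partition_columns
table.load_as_version(0)      # time travel to the first commit

table.to_polars               # eager: a Polars::DataFrame
table.to_polars(eager: false) # a Polars::LazyFrame

table.vacuum(dry_run: true)   # => Array of files that would be deleted
table.delete("id > 100")      # => Hash of delete metrics with symbol keys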
data/lib/deltalake-rb.rb
ADDED
@@ -0,0 +1 @@
require_relative "deltalake"
data/lib/deltalake.rb
ADDED
@@ -0,0 +1,104 @@
# ext
begin
  require "deltalake/#{RUBY_VERSION.to_f}/deltalake"
rescue LoadError
  require "deltalake/deltalake"
end

# stdlib
require "json"

# modules
require_relative "deltalake/field"
require_relative "deltalake/metadata"
require_relative "deltalake/schema"
require_relative "deltalake/table"
require_relative "deltalake/version"

module DeltaLake
  class Error < StandardError; end
  class TableNotFoundError < Error; end
  class DeltaProtocolError < Error; end
  class CommitFailedError < Error; end
  class SchemaMismatchError < Error; end

  class << self

    def write(
      table_or_uri,
      data,
      partition_by: nil,
      mode: "error",
      name: nil,
      description: nil,
      configuration: nil,
      schema_mode: nil,
      storage_options: nil,
      predicate: nil,
      target_file_size: nil
    )
      table, table_uri = try_get_table_and_table_uri(table_or_uri, storage_options)

      if partition_by.is_a?(String)
        partition_by = [partition_by]
      end

      if !table.nil? && mode == "ignore"
        return
      end

      data = convert_data(data)

      write_deltalake_rust(
        table_uri,
        data,
        mode,
        table&._table,
        schema_mode,
        partition_by,
        predicate,
        target_file_size,
        name,
        description,
        configuration,
        storage_options
      )

      if table
        table.update_incremental
      end
    end

    private

    def try_get_table_and_table_uri(table_or_uri, storage_options)
      if !table_or_uri.is_a?(String) && !table_or_uri.is_a?(Table)
        raise ArgumentError, "table_or_uri must be a String or Table"
      end

      if table_or_uri.is_a?(String)
        table = try_get_deltatable(table_or_uri, storage_options)
        table_uri = table_or_uri.to_s
      else
        table = table_or_uri
        table_uri = table._table.table_uri
      end

      [table, table_uri]
    end

    def try_get_deltatable(table_uri, storage_options)
      Table.new(table_uri, storage_options: storage_options)
    rescue TableNotFoundError
      nil
    end

    def convert_data(data)
      if data.respond_to?(:arrow_c_stream)
        data.arrow_c_stream
      else
        raise TypeError, "Only objects implementing the Arrow C stream interface are valid inputs for source."
      end
    end
  end
end
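Putting it together: a minimal end-to-end sketch, assuming `polars-df` is installed (its frames implement `arrow_c_stream`, which `convert_data` requires) and a hypothetical local path:

require "deltalake-rb"
require "polars-df"

df = Polars::DataFrame.new({"id" => [1, 2, 3], "country" => ["us", "us", "de"]})

# First write creates the table; mode: "error" is the default and raises
# if the table already exists. partition_by accepts a String or an Array.
DeltaLake.write("./data/events", df, partition_by: "country")
DeltaLake.write("./data/events", df, mode: "append")

DeltaLake::Table.exists?("./data/events") # => true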