deltalake-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/deltalake/src/lib.rs ADDED
@@ -0,0 +1,353 @@
+ mod error;
+ mod schema;
+ mod utils;
+
+ use std::cell::RefCell;
+ use std::collections::HashMap;
+ use std::future::IntoFuture;
+
+ use chrono::Duration;
+ use deltalake::arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream};
+ use deltalake::kernel::StructType;
+ use deltalake::operations::delete::DeleteBuilder;
+ use deltalake::operations::vacuum::VacuumBuilder;
+ use deltalake::storage::IORuntime;
+ use deltalake::DeltaOps;
+ use error::DeltaError;
+
+ use magnus::{function, method, prelude::*, Error, Module, Ruby, Value};
+
+ use crate::error::RubyError;
+ use crate::schema::{schema_to_rbobject, Field};
+ use crate::utils::rt;
+
+ type RbResult<T> = Result<T, Error>;
+
+ #[magnus::wrap(class = "DeltaLake::RawDeltaTable")]
+ struct RawDeltaTable {
+     _table: RefCell<deltalake::DeltaTable>,
+ }
+
+ #[magnus::wrap(class = "DeltaLake::RawDeltaTableMetaData")]
+ struct RawDeltaTableMetaData {
+     id: String,
+     name: Option<String>,
+     description: Option<String>,
+     partition_columns: Vec<String>,
+     created_time: Option<i64>,
+     configuration: HashMap<String, Option<String>>,
+ }
+
+ impl RawDeltaTable {
+     pub fn new(
+         table_uri: String,
+         version: Option<i64>,
+         storage_options: Option<HashMap<String, String>>,
+         without_files: bool,
+         log_buffer_size: Option<usize>,
+     ) -> RbResult<Self> {
+         let mut builder = deltalake::DeltaTableBuilder::from_uri(&table_uri)
+             .with_io_runtime(IORuntime::default());
+
+         if let Some(storage_options) = storage_options {
+             builder = builder.with_storage_options(storage_options)
+         }
+         if let Some(version) = version {
+             builder = builder.with_version(version)
+         }
+         if without_files {
+             builder = builder.without_files()
+         }
+         if let Some(buf_size) = log_buffer_size {
+             builder = builder
+                 .with_log_buffer_size(buf_size)
+                 .map_err(RubyError::from)?;
+         }
+
+         let table = rt().block_on(builder.load()).map_err(RubyError::from)?;
+         Ok(RawDeltaTable {
+             _table: RefCell::new(table),
+         })
+     }
+
+     pub fn is_deltatable(
+         table_uri: String,
+         storage_options: Option<HashMap<String, String>>,
+     ) -> RbResult<bool> {
+         let mut builder = deltalake::DeltaTableBuilder::from_uri(&table_uri);
+         if let Some(storage_options) = storage_options {
+             builder = builder.with_storage_options(storage_options)
+         }
+         Ok(rt()
+             .block_on(async {
+                 match builder.build() {
+                     Ok(table) => table.verify_deltatable_existence().await,
+                     Err(err) => Err(err),
+                 }
+             })
+             .map_err(RubyError::from)?)
+     }
+
+     pub fn table_uri(&self) -> RbResult<String> {
+         Ok(self._table.borrow().table_uri())
+     }
+
+     pub fn version(&self) -> RbResult<i64> {
+         Ok(self._table.borrow().version())
+     }
+
+     pub fn has_files(&self) -> RbResult<bool> {
+         Ok(self._table.borrow().config.require_files)
+     }
+
+     pub fn metadata(&self) -> RbResult<RawDeltaTableMetaData> {
+         let binding = self._table.borrow();
+         let metadata = binding.metadata().map_err(RubyError::from)?;
+         Ok(RawDeltaTableMetaData {
+             id: metadata.id.clone(),
+             name: metadata.name.clone(),
+             description: metadata.description.clone(),
+             partition_columns: metadata.partition_columns.clone(),
+             created_time: metadata.created_time,
+             configuration: metadata.configuration.clone(),
+         })
+     }
+
+     pub fn load_version(&self, version: i64) -> RbResult<()> {
+         Ok(rt()
+             .block_on(self._table.borrow_mut().load_version(version))
+             .map_err(RubyError::from)?)
+     }
+
+     pub fn files(&self) -> RbResult<Vec<String>> {
+         if !self.has_files()? {
+             return Err(DeltaError::new_err("Table is instantiated without files."));
+         }
+
+         Ok(self
+             ._table
+             .borrow()
+             .get_files_iter()
+             .map_err(RubyError::from)?
+             .map(|f| f.to_string())
+             .collect())
+     }
+
+     pub fn file_uris(&self) -> RbResult<Vec<String>> {
+         if !self._table.borrow().config.require_files {
+             return Err(DeltaError::new_err("Table is instantiated without files."));
+         }
+
+         Ok(self
+             ._table
+             .borrow()
+             .get_file_uris()
+             .map_err(RubyError::from)?
+             .collect())
+     }
+
+     pub fn schema(&self) -> RbResult<Value> {
+         let binding = self._table.borrow();
+         let schema: &StructType = binding.get_schema().map_err(RubyError::from)?;
+         schema_to_rbobject(schema.to_owned())
+     }
+
+     pub fn vacuum(
+         &self,
+         dry_run: bool,
+         retention_hours: Option<u64>,
+         enforce_retention_duration: bool,
+     ) -> RbResult<Vec<String>> {
+         let mut cmd = VacuumBuilder::new(
+             self._table.borrow().log_store(),
+             self._table
+                 .borrow()
+                 .snapshot()
+                 .map_err(RubyError::from)?
+                 .clone(),
+         )
+         .with_enforce_retention_duration(enforce_retention_duration)
+         .with_dry_run(dry_run);
+         if let Some(retention_period) = retention_hours {
+             cmd = cmd.with_retention_period(Duration::hours(retention_period as i64));
+         }
+
+         let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+         self._table.borrow_mut().state = table.state;
+         Ok(metrics.files_deleted)
+     }
+
+     pub fn update_incremental(&self) -> RbResult<()> {
+         #[allow(deprecated)]
+         Ok(rt()
+             .block_on(self._table.borrow_mut().update_incremental(None))
+             .map_err(RubyError::from)?)
+     }
+
+     pub fn delete(&self, predicate: Option<String>) -> RbResult<String> {
+         let mut cmd = DeleteBuilder::new(
+             self._table.borrow().log_store(),
+             self._table
+                 .borrow()
+                 .snapshot()
+                 .map_err(RubyError::from)?
+                 .clone(),
+         );
+         if let Some(predicate) = predicate {
+             cmd = cmd.with_predicate(predicate);
+         }
+
+         let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+         self._table.borrow_mut().state = table.state;
+         Ok(serde_json::to_string(&metrics).unwrap())
+     }
+ }
+
+ impl RawDeltaTableMetaData {
+     fn id(&self) -> String {
+         self.id.clone()
+     }
+
+     fn name(&self) -> Option<String> {
+         self.name.clone()
+     }
+
+     fn description(&self) -> Option<String> {
+         self.description.clone()
+     }
+
+     fn partition_columns(&self) -> Vec<String> {
+         self.partition_columns.clone()
+     }
+
+     fn created_time(&self) -> Option<i64> {
+         self.created_time
+     }
+
+     fn configuration(&self) -> HashMap<String, Option<String>> {
+         self.configuration.clone()
+     }
+ }
+
+ #[allow(clippy::too_many_arguments)]
+ fn write_to_deltalake(
+     table_uri: String,
+     data: Value,
+     mode: String,
+     table: Option<&RawDeltaTable>,
+     schema_mode: Option<String>,
+     partition_by: Option<Vec<String>>,
+     predicate: Option<String>,
+     target_file_size: Option<usize>,
+     name: Option<String>,
+     description: Option<String>,
+     configuration: Option<HashMap<String, Option<String>>>,
+     storage_options: Option<HashMap<String, String>>,
+ ) -> RbResult<()> {
+     let capsule_pointer: usize = data.funcall("to_i", ())?;
+
+     // use similar approach as Polars to avoid copy
+     let stream_ptr =
+         Box::new(unsafe { std::ptr::replace(capsule_pointer as _, FFI_ArrowArrayStream::empty()) });
+     let stream = ArrowArrayStreamReader::try_new(*stream_ptr)
+         .map_err(|err| DeltaError::new_err(err.to_string()))?;
+
+     let batches = stream.map(|batch| batch.unwrap()).collect::<Vec<_>>();
+     let save_mode = mode.parse().map_err(RubyError::from)?;
+
+     let options = storage_options.clone().unwrap_or_default();
+     let table = if let Some(table) = table {
+         DeltaOps(table._table.borrow().clone())
+     } else {
+         rt().block_on(DeltaOps::try_from_uri_with_storage_options(
+             &table_uri, options,
+         ))
+         .map_err(RubyError::from)?
+     };
+
+     let mut builder = table.write(batches).with_save_mode(save_mode);
+     if let Some(schema_mode) = schema_mode {
+         builder = builder.with_schema_mode(schema_mode.parse().map_err(RubyError::from)?);
+     }
+     if let Some(partition_columns) = partition_by {
+         builder = builder.with_partition_columns(partition_columns);
+     }
+
+     if let Some(name) = &name {
+         builder = builder.with_table_name(name);
+     };
+
+     if let Some(description) = &description {
+         builder = builder.with_description(description);
+     };
+
+     if let Some(predicate) = predicate {
+         builder = builder.with_replace_where(predicate);
+     };
+
+     if let Some(target_file_size) = target_file_size {
+         builder = builder.with_target_file_size(target_file_size)
+     };
+
+     if let Some(config) = configuration {
+         builder = builder.with_configuration(config);
+     };
+
+     rt().block_on(builder.into_future())
+         .map_err(RubyError::from)?;
+
+     Ok(())
+ }
+
+ #[magnus::init]
+ fn init(ruby: &Ruby) -> RbResult<()> {
+     deltalake::aws::register_handlers(None);
+
+     let module = ruby.define_module("DeltaLake")?;
+     module.define_singleton_method("write_deltalake_rust", function!(write_to_deltalake, 12))?;
+
+     let class = module.define_class("RawDeltaTable", ruby.class_object())?;
+     class.define_singleton_method("new", function!(RawDeltaTable::new, 5))?;
+     class.define_singleton_method("is_deltatable", function!(RawDeltaTable::is_deltatable, 2))?;
+     class.define_method("table_uri", method!(RawDeltaTable::table_uri, 0))?;
+     class.define_method("version", method!(RawDeltaTable::version, 0))?;
+     class.define_method("has_files", method!(RawDeltaTable::has_files, 0))?;
+     class.define_method("metadata", method!(RawDeltaTable::metadata, 0))?;
+     class.define_method("load_version", method!(RawDeltaTable::load_version, 1))?;
+     class.define_method("files", method!(RawDeltaTable::files, 0))?;
+     class.define_method("file_uris", method!(RawDeltaTable::file_uris, 0))?;
+     class.define_method("schema", method!(RawDeltaTable::schema, 0))?;
+     class.define_method("vacuum", method!(RawDeltaTable::vacuum, 3))?;
+     class.define_method(
+         "update_incremental",
+         method!(RawDeltaTable::update_incremental, 0),
+     )?;
+     class.define_method("delete", method!(RawDeltaTable::delete, 1))?;
+
+     let class = module.define_class("RawDeltaTableMetaData", ruby.class_object())?;
+     class.define_method("id", method!(RawDeltaTableMetaData::id, 0))?;
+     class.define_method("name", method!(RawDeltaTableMetaData::name, 0))?;
+     class.define_method(
+         "description",
+         method!(RawDeltaTableMetaData::description, 0),
+     )?;
+     class.define_method(
+         "partition_columns",
+         method!(RawDeltaTableMetaData::partition_columns, 0),
+     )?;
+     class.define_method(
+         "created_time",
+         method!(RawDeltaTableMetaData::created_time, 0),
+     )?;
+     class.define_method(
+         "configuration",
+         method!(RawDeltaTableMetaData::configuration, 0),
+     )?;
+
+     let class = module.define_class("Field", ruby.class_object())?;
+     class.define_method("name", method!(Field::name, 0))?;
+     class.define_method("type", method!(Field::get_type, 0))?;
+     class.define_method("nullable", method!(Field::nullable, 0))?;
+
+     Ok(())
+ }
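From Ruby, the class registered above surfaces as `DeltaLake::RawDeltaTable`. A minimal sketch of calling it directly, assuming a hypothetical local table at ./my_table (positional arguments follow the `new` signature registered in `init`):

    require "deltalake"

    # new(table_uri, version, storage_options, without_files, log_buffer_size)
    raw = DeltaLake::RawDeltaTable.new("./my_table", nil, nil, false, nil)
    raw.version                  # table version as an Integer
    raw.files                    # relative paths of the data files
    raw.vacuum(true, 168, true)  # dry run: lists files that would be deleted

In practice the higher-level `DeltaLake::Table` wrapper later in this diff is the intended entry point; `RawDeltaTable` is the FFI boundary.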
data/ext/deltalake/src/schema.rs ADDED
@@ -0,0 +1,37 @@
+ use deltalake::kernel::{StructField, StructType as DeltaStructType};
+ use magnus::{value::ReprValue, Module, RArray, RModule, Ruby, Value};
+
+ use crate::RbResult;
+
+ pub fn schema_to_rbobject(schema: DeltaStructType) -> RbResult<Value> {
+     let fields = schema.fields().map(|field| Field {
+         inner: field.clone(),
+     });
+
+     let rb_schema: Value = Ruby::get()
+         .unwrap()
+         .class_object()
+         .const_get::<_, RModule>("DeltaLake")?
+         .const_get("Schema")?;
+
+     rb_schema.funcall("new", (RArray::from_iter(fields),))
+ }
+
+ #[magnus::wrap(class = "DeltaLake::Field")]
+ pub struct Field {
+     pub inner: StructField,
+ }
+
+ impl Field {
+     pub fn name(&self) -> String {
+         self.inner.name().to_string()
+     }
+
+     pub fn get_type(&self) -> String {
+         self.inner.data_type().to_string()
+     }
+
+     pub fn nullable(&self) -> bool {
+         self.inner.is_nullable()
+     }
+ }
data/ext/deltalake/src/utils.rs ADDED
@@ -0,0 +1,21 @@
+ use std::sync::OnceLock;
+
+ use tokio::runtime::Runtime;
+
+ #[inline]
+ pub fn rt() -> &'static Runtime {
+     static TOKIO_RT: OnceLock<Runtime> = OnceLock::new();
+     static PID: OnceLock<u32> = OnceLock::new();
+     let pid = std::process::id();
+     let runtime_pid = *PID.get_or_init(|| pid);
+     if pid != runtime_pid {
+         panic!(
+             "Forked process detected - current PID is {} but the tokio runtime was created by {}. The tokio \
+             runtime does not support forked processes (https://github.com/tokio-rs/tokio/issues/4301). If you \
+             are seeing this message, fork before the first DeltaLake call so that each process creates its \
+             own runtime.",
+             pid, runtime_pid
+         );
+     }
+     TOKIO_RT.get_or_init(|| Runtime::new().expect("Failed to create a tokio runtime."))
+ }
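The PID guard in `rt()` exists because a tokio runtime does not survive `fork`: once the parent process has created the runtime, a DeltaLake call in a forked child could hang or crash, so the extension panics loudly instead. A sketch of the failure mode it catches, assuming a hypothetical table path:

    DeltaLake::Table.exists?("./my_table")   # first call creates the runtime under the parent PID
    fork do
      DeltaLake::Table.exists?("./my_table") # panics: "Forked process detected ..."
    end

The safe pattern is to fork first and let each child make its own first DeltaLake call.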
data/lib/deltalake/field.rb ADDED
@@ -0,0 +1,12 @@
+ module DeltaLake
+   class Field
+     def inspect
+       attributes = {
+         name: name,
+         type: type,
+         nullable: nullable
+       }
+       "<#{self.class.name} #{attributes.map { |k, v| "#{k}=#{v.inspect}" }.join(", ")}>"
+     end
+   end
+ end
data/lib/deltalake/metadata.rb ADDED
@@ -0,0 +1,43 @@
+ module DeltaLake
+   class Metadata
+     def initialize(table)
+       @metadata = table.metadata
+     end
+
+     def id
+       @metadata.id
+     end
+
+     def name
+       @metadata.name
+     end
+
+     def description
+       @metadata.description
+     end
+
+     def partition_columns
+       @metadata.partition_columns
+     end
+
+     def created_time
+       @metadata.created_time
+     end
+
+     def configuration
+       @metadata.configuration
+     end
+
+     def inspect
+       attributes = {
+         id: id,
+         name: name,
+         description: description,
+         partition_columns: partition_columns,
+         created_time: created_time,
+         configuration: configuration
+       }
+       "<#{self.class.name} #{attributes.map { |k, v| "#{k}=#{v.inspect}" }.join(", ")}>"
+     end
+   end
+ end
data/lib/deltalake/schema.rb ADDED
@@ -0,0 +1,9 @@
+ module DeltaLake
+   class Schema
+     attr_reader :fields
+
+     def initialize(fields)
+       @fields = fields
+     end
+   end
+ end
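`DeltaLake::Schema` is deliberately a plain value object: `schema_to_rbobject` on the Rust side looks the class up by name and instantiates it with an array of `DeltaLake::Field` wrappers. A small introspection sketch (table path and column layout hypothetical):

    schema = DeltaLake::Table.new("./my_table").schema
    schema.fields.map { |f| [f.name, f.type, f.nullable] }
    # e.g. [["id", "long", true], ["value", "string", true]]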
data/lib/deltalake/table.rb ADDED
@@ -0,0 +1,103 @@
+ module DeltaLake
+   class Table
+     def initialize(
+       table_uri,
+       version: nil,
+       storage_options: nil,
+       without_files: false,
+       log_buffer_size: nil
+     )
+       @storage_options = storage_options
+       @table =
+         RawDeltaTable.new(
+           table_uri,
+           version,
+           storage_options,
+           without_files,
+           log_buffer_size
+         )
+     end
+
+     def self.exists?(table_uri, storage_options: nil)
+       RawDeltaTable.is_deltatable(table_uri, storage_options)
+     end
+
+     def version
+       @table.version
+     end
+
+     def files
+       @table.files
+     end
+
+     def file_uris
+       @table.file_uris
+     end
+
+     def load_as_version(version)
+       if version.is_a?(Integer)
+         @table.load_version(version)
+       else
+         raise TypeError, "Invalid datatype provided for version, only Integer is accepted."
+       end
+     end
+
+     def table_uri
+       @table.table_uri
+     end
+
+     def schema
+       @table.schema
+     end
+
+     def metadata
+       Metadata.new(@table)
+     end
+
+     def vacuum(
+       retention_hours: nil,
+       dry_run: true,
+       enforce_retention_duration: true
+     )
+       if retention_hours
+         if retention_hours < 0
+           raise ArgumentError, "The retention period must be positive."
+         end
+       end
+
+       @table.vacuum(
+         dry_run,
+         retention_hours,
+         enforce_retention_duration
+       )
+     end
+
+     def to_polars(eager: true)
+       require "polars-df"
+
+       sources = file_uris
+       lf =
+         if sources.empty?
+           Polars::LazyFrame.new
+         else
+           storage_options = @storage_options&.except("AWS_S3_ALLOW_UNSAFE_RENAME")
+           Polars.scan_parquet(sources, storage_options: storage_options)
+         end
+       eager ? lf.collect : lf
+     end
+
+     def update_incremental
+       @table.update_incremental
+     end
+
+     def delete(predicate = nil)
+       metrics = @table.delete(predicate)
+       JSON.parse(metrics).transform_keys(&:to_sym)
+     end
+
+     # private
+     def _table
+       @table
+     end
+   end
+ end
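A usage sketch of the `Table` conveniences above (path and predicate hypothetical; `delete` returns the Rust-side metrics JSON parsed into a symbol-keyed Hash):

    table = DeltaLake::Table.new("./my_table")
    table.version
    table.metadata.partition_columns
    table.delete("id > 100")            # => {num_added_files: ..., num_removed_files: ...}
    df = table.to_polars                # eager Polars::DataFrame
    lf = table.to_polars(eager: false)  # Polars::LazyFrame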
data/lib/deltalake/version.rb ADDED
@@ -0,0 +1,3 @@
+ module DeltaLake
+   VERSION = "0.1.0"
+ end
data/lib/deltalake-rb.rb ADDED
@@ -0,0 +1 @@
+ require_relative "deltalake"
data/lib/deltalake.rb ADDED
@@ -0,0 +1,104 @@
+ # ext
+ begin
+   require "deltalake/#{RUBY_VERSION.to_f}/deltalake"
+ rescue LoadError
+   require "deltalake/deltalake"
+ end
+
+ # stdlib
+ require "json"
+
+ # modules
+ require_relative "deltalake/field"
+ require_relative "deltalake/metadata"
+ require_relative "deltalake/schema"
+ require_relative "deltalake/table"
+ require_relative "deltalake/version"
+
+ module DeltaLake
+   class Error < StandardError; end
+   class TableNotFoundError < Error; end
+   class DeltaProtocolError < Error; end
+   class CommitFailedError < Error; end
+   class SchemaMismatchError < Error; end
+
+   class << self
+
+     def write(
+       table_or_uri,
+       data,
+       partition_by: nil,
+       mode: "error",
+       name: nil,
+       description: nil,
+       configuration: nil,
+       schema_mode: nil,
+       storage_options: nil,
+       predicate: nil,
+       target_file_size: nil
+     )
+       table, table_uri = try_get_table_and_table_uri(table_or_uri, storage_options)
+
+       if partition_by.is_a?(String)
+         partition_by = [partition_by]
+       end
+
+       if !table.nil? && mode == "ignore"
+         return
+       end
+
+       data = convert_data(data)
+
+       write_deltalake_rust(
+         table_uri,
+         data,
+         mode,
+         table&._table,
+         schema_mode,
+         partition_by,
+         predicate,
+         target_file_size,
+         name,
+         description,
+         configuration,
+         storage_options
+       )
+
+       if table
+         table.update_incremental
+       end
+     end
+
+     private
+
+     def try_get_table_and_table_uri(table_or_uri, storage_options)
+       if !table_or_uri.is_a?(String) && !table_or_uri.is_a?(Table)
+         raise ArgumentError, "table_or_uri must be a String or Table"
+       end
+
+       if table_or_uri.is_a?(String)
+         table = try_get_deltatable(table_or_uri, storage_options)
+         table_uri = table_or_uri.to_s
+       else
+         table = table_or_uri
+         table_uri = table._table.table_uri
+       end
+
+       [table, table_uri]
+     end
+
+     def try_get_deltatable(table_uri, storage_options)
+       Table.new(table_uri, storage_options: storage_options)
+     rescue TableNotFoundError
+       nil
+     end
+
+     def convert_data(data)
+       if data.respond_to?(:arrow_c_stream)
+         data.arrow_c_stream
+       else
+         raise TypeError, "Only objects implementing the Arrow C stream interface are valid inputs for source."
+       end
+     end
+   end
+ end
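End to end, `DeltaLake.write` accepts any object that implements `arrow_c_stream` (the Arrow C stream interface); a `Polars::DataFrame` from the polars-df gem qualifies. A minimal round trip, assuming a hypothetical local path:

    require "deltalake"
    require "polars-df"

    df = Polars::DataFrame.new({"id" => [1, 2, 3], "value" => ["a", "b", "c"]})
    DeltaLake.write("./my_table", df, mode: "overwrite", partition_by: "value")
    DeltaLake::Table.new("./my_table").to_polars

Passing the stream pointer as an integer (see `write_to_deltalake` above) hands the Arrow data across the FFI boundary without copying it.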