deltalake-rb 0.1.0

data/ext/deltalake/src/lib.rs ADDED
@@ -0,0 +1,353 @@
+ mod error;
+ mod schema;
+ mod utils;
+
+ use std::cell::RefCell;
+ use std::collections::HashMap;
+ use std::future::IntoFuture;
+
+ use chrono::Duration;
+ use deltalake::arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream};
+ use deltalake::kernel::StructType;
+ use deltalake::operations::delete::DeleteBuilder;
+ use deltalake::operations::vacuum::VacuumBuilder;
+ use deltalake::storage::IORuntime;
+ use deltalake::DeltaOps;
+ use error::DeltaError;
+
+ use magnus::{function, method, prelude::*, Error, Module, Ruby, Value};
+
+ use crate::error::RubyError;
+ use crate::schema::{schema_to_rbobject, Field};
+ use crate::utils::rt;
+
+ type RbResult<T> = Result<T, Error>;
+
+ #[magnus::wrap(class = "DeltaLake::RawDeltaTable")]
+ struct RawDeltaTable {
+     _table: RefCell<deltalake::DeltaTable>,
+ }
+
+ #[magnus::wrap(class = "DeltaLake::RawDeltaTableMetaData")]
+ struct RawDeltaTableMetaData {
+     id: String,
+     name: Option<String>,
+     description: Option<String>,
+     partition_columns: Vec<String>,
+     created_time: Option<i64>,
+     configuration: HashMap<String, Option<String>>,
+ }
+
+ impl RawDeltaTable {
+     pub fn new(
+         table_uri: String,
+         version: Option<i64>,
+         storage_options: Option<HashMap<String, String>>,
+         without_files: bool,
+         log_buffer_size: Option<usize>,
+     ) -> RbResult<Self> {
+         let mut builder = deltalake::DeltaTableBuilder::from_uri(&table_uri)
+             .with_io_runtime(IORuntime::default());
+
+         if let Some(storage_options) = storage_options {
+             builder = builder.with_storage_options(storage_options)
+         }
+         if let Some(version) = version {
+             builder = builder.with_version(version)
+         }
+         if without_files {
+             builder = builder.without_files()
+         }
+         if let Some(buf_size) = log_buffer_size {
+             builder = builder
+                 .with_log_buffer_size(buf_size)
+                 .map_err(RubyError::from)?;
+         }
+
+         let table = rt().block_on(builder.load()).map_err(RubyError::from)?;
+         Ok(RawDeltaTable {
+             _table: RefCell::new(table),
+         })
+     }
+
+     pub fn is_deltatable(
+         table_uri: String,
+         storage_options: Option<HashMap<String, String>>,
+     ) -> RbResult<bool> {
+         let mut builder = deltalake::DeltaTableBuilder::from_uri(&table_uri);
+         if let Some(storage_options) = storage_options {
+             builder = builder.with_storage_options(storage_options)
+         }
+         Ok(rt()
+             .block_on(async {
+                 match builder.build() {
+                     Ok(table) => table.verify_deltatable_existence().await,
+                     Err(err) => Err(err),
+                 }
+             })
+             .map_err(RubyError::from)?)
+     }
+
+     pub fn table_uri(&self) -> RbResult<String> {
+         Ok(self._table.borrow().table_uri())
+     }
+
+     pub fn version(&self) -> RbResult<i64> {
+         Ok(self._table.borrow().version())
+     }
+
+     pub fn has_files(&self) -> RbResult<bool> {
+         Ok(self._table.borrow().config.require_files)
+     }
+
+     pub fn metadata(&self) -> RbResult<RawDeltaTableMetaData> {
+         let binding = self._table.borrow();
+         let metadata = binding.metadata().map_err(RubyError::from)?;
+         Ok(RawDeltaTableMetaData {
+             id: metadata.id.clone(),
+             name: metadata.name.clone(),
+             description: metadata.description.clone(),
+             partition_columns: metadata.partition_columns.clone(),
+             created_time: metadata.created_time,
+             configuration: metadata.configuration.clone(),
+         })
+     }
+
+     pub fn load_version(&self, version: i64) -> RbResult<()> {
+         Ok(rt()
+             .block_on(self._table.borrow_mut().load_version(version))
+             .map_err(RubyError::from)?)
+     }
+
+     pub fn files(&self) -> RbResult<Vec<String>> {
+         if !self.has_files()? {
+             return Err(DeltaError::new_err("Table is instantiated without files."));
+         }
+
+         Ok(self
+             ._table
+             .borrow()
+             .get_files_iter()
+             .map_err(RubyError::from)?
+             .map(|f| f.to_string())
+             .collect())
+     }
+
+     pub fn file_uris(&self) -> RbResult<Vec<String>> {
+         if !self._table.borrow().config.require_files {
+             return Err(DeltaError::new_err("Table is instantiated without files."));
+         }
+
+         Ok(self
+             ._table
+             .borrow()
+             .get_file_uris()
+             .map_err(RubyError::from)?
+             .collect())
+     }
+
+     pub fn schema(&self) -> RbResult<Value> {
+         let binding = self._table.borrow();
+         let schema: &StructType = binding.get_schema().map_err(RubyError::from)?;
+         schema_to_rbobject(schema.to_owned())
+     }
+
+     pub fn vacuum(
+         &self,
+         dry_run: bool,
+         retention_hours: Option<u64>,
+         enforce_retention_duration: bool,
+     ) -> RbResult<Vec<String>> {
+         let mut cmd = VacuumBuilder::new(
+             self._table.borrow().log_store(),
+             self._table
+                 .borrow()
+                 .snapshot()
+                 .map_err(RubyError::from)?
+                 .clone(),
+         )
+         .with_enforce_retention_duration(enforce_retention_duration)
+         .with_dry_run(dry_run);
+         if let Some(retention_period) = retention_hours {
+             cmd = cmd.with_retention_period(Duration::hours(retention_period as i64));
+         }
+
+         let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+         self._table.borrow_mut().state = table.state;
+         Ok(metrics.files_deleted)
+     }
+
+     pub fn update_incremental(&self) -> RbResult<()> {
+         #[allow(deprecated)]
+         Ok(rt()
+             .block_on(self._table.borrow_mut().update_incremental(None))
+             .map_err(RubyError::from)?)
+     }
+
+     pub fn delete(&self, predicate: Option<String>) -> RbResult<String> {
+         let mut cmd = DeleteBuilder::new(
+             self._table.borrow().log_store(),
+             self._table
+                 .borrow()
+                 .snapshot()
+                 .map_err(RubyError::from)?
+                 .clone(),
+         );
+         if let Some(predicate) = predicate {
+             cmd = cmd.with_predicate(predicate);
+         }
+
+         let (table, metrics) = rt().block_on(cmd.into_future()).map_err(RubyError::from)?;
+         self._table.borrow_mut().state = table.state;
+         Ok(serde_json::to_string(&metrics).unwrap())
+     }
+ }
+
+ impl RawDeltaTableMetaData {
+     fn id(&self) -> String {
+         self.id.clone()
+     }
+
+     fn name(&self) -> Option<String> {
+         self.name.clone()
+     }
+
+     fn description(&self) -> Option<String> {
+         self.description.clone()
+     }
+
+     fn partition_columns(&self) -> Vec<String> {
+         self.partition_columns.clone()
+     }
+
+     fn created_time(&self) -> Option<i64> {
+         self.created_time
+     }
+
+     fn configuration(&self) -> HashMap<String, Option<String>> {
+         self.configuration.clone()
+     }
+ }
+
+ #[allow(clippy::too_many_arguments)]
+ fn write_to_deltalake(
+     table_uri: String,
+     data: Value,
+     mode: String,
+     table: Option<&RawDeltaTable>,
+     schema_mode: Option<String>,
+     partition_by: Option<Vec<String>>,
+     predicate: Option<String>,
+     target_file_size: Option<usize>,
+     name: Option<String>,
+     description: Option<String>,
+     configuration: Option<HashMap<String, Option<String>>>,
+     storage_options: Option<HashMap<String, String>>,
+ ) -> RbResult<()> {
+     let capsule_pointer: usize = data.funcall("to_i", ())?;
+
+     // use similar approach as Polars to avoid copy
+     let stream_ptr =
+         Box::new(unsafe { std::ptr::replace(capsule_pointer as _, FFI_ArrowArrayStream::empty()) });
+     let stream = ArrowArrayStreamReader::try_new(*stream_ptr)
+         .map_err(|err| DeltaError::new_err(err.to_string()))?;
+
+     let batches = stream.map(|batch| batch.unwrap()).collect::<Vec<_>>();
+     let save_mode = mode.parse().map_err(RubyError::from)?;
+
+     let options = storage_options.clone().unwrap_or_default();
+     let table = if let Some(table) = table {
+         DeltaOps(table._table.borrow().clone())
+     } else {
+         rt().block_on(DeltaOps::try_from_uri_with_storage_options(
+             &table_uri, options,
+         ))
+         .map_err(RubyError::from)?
+     };
+
+     let mut builder = table.write(batches).with_save_mode(save_mode);
+     if let Some(schema_mode) = schema_mode {
+         builder = builder.with_schema_mode(schema_mode.parse().map_err(RubyError::from)?);
+     }
+     if let Some(partition_columns) = partition_by {
+         builder = builder.with_partition_columns(partition_columns);
+     }
+
+     if let Some(name) = &name {
+         builder = builder.with_table_name(name);
+     };
+
+     if let Some(description) = &description {
+         builder = builder.with_description(description);
+     };
+
+     if let Some(predicate) = predicate {
+         builder = builder.with_replace_where(predicate);
+     };
+
+     if let Some(target_file_size) = target_file_size {
+         builder = builder.with_target_file_size(target_file_size)
+     };
+
+     if let Some(config) = configuration {
+         builder = builder.with_configuration(config);
+     };
+
+     rt().block_on(builder.into_future())
+         .map_err(RubyError::from)?;
+
+     Ok(())
+ }
+
+ #[magnus::init]
+ fn init(ruby: &Ruby) -> RbResult<()> {
+     deltalake::aws::register_handlers(None);
+
+     let module = ruby.define_module("DeltaLake")?;
+     module.define_singleton_method("write_deltalake_rust", function!(write_to_deltalake, 12))?;
+
+     let class = module.define_class("RawDeltaTable", ruby.class_object())?;
+     class.define_singleton_method("new", function!(RawDeltaTable::new, 5))?;
+     class.define_singleton_method("is_deltatable", function!(RawDeltaTable::is_deltatable, 2))?;
+     class.define_method("table_uri", method!(RawDeltaTable::table_uri, 0))?;
+     class.define_method("version", method!(RawDeltaTable::version, 0))?;
+     class.define_method("has_files", method!(RawDeltaTable::has_files, 0))?;
+     class.define_method("metadata", method!(RawDeltaTable::metadata, 0))?;
+     class.define_method("load_version", method!(RawDeltaTable::load_version, 1))?;
+     class.define_method("files", method!(RawDeltaTable::files, 0))?;
+     class.define_method("file_uris", method!(RawDeltaTable::file_uris, 0))?;
+     class.define_method("schema", method!(RawDeltaTable::schema, 0))?;
+     class.define_method("vacuum", method!(RawDeltaTable::vacuum, 3))?;
+     class.define_method(
+         "update_incremental",
+         method!(RawDeltaTable::update_incremental, 0),
+     )?;
+     class.define_method("delete", method!(RawDeltaTable::delete, 1))?;
+
+     let class = module.define_class("RawDeltaTableMetaData", ruby.class_object())?;
+     class.define_method("id", method!(RawDeltaTableMetaData::id, 0))?;
+     class.define_method("name", method!(RawDeltaTableMetaData::name, 0))?;
+     class.define_method(
+         "description",
+         method!(RawDeltaTableMetaData::description, 0),
+     )?;
+     class.define_method(
+         "partition_columns",
+         method!(RawDeltaTableMetaData::partition_columns, 0),
+     )?;
+     class.define_method(
+         "created_time",
+         method!(RawDeltaTableMetaData::created_time, 0),
+     )?;
+     class.define_method(
+         "configuration",
+         method!(RawDeltaTableMetaData::configuration, 0),
+     )?;
+
+     let class = module.define_class("Field", ruby.class_object())?;
+     class.define_method("name", method!(Field::name, 0))?;
+     class.define_method("type", method!(Field::get_type, 0))?;
+     class.define_method("nullable", method!(Field::nullable, 0))?;
+
+     Ok(())
+ }
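This file is the entire native surface: magnus wraps `RawDeltaTable` and `RawDeltaTableMetaData` and registers their methods on the `DeltaLake` module in `init`. A minimal sketch of driving the raw binding directly, assuming a local table at `./events` (the positional arguments mirror `RawDeltaTable::new` above; application code should prefer the `DeltaLake::Table` wrapper defined later in this diff):

    # table_uri, version, storage_options, without_files, log_buffer_size
    raw = DeltaLake::RawDeltaTable.new("./events", nil, nil, false, nil)
    raw.version    # current snapshot version (Integer)
    raw.files      # relative paths of the data files in the snapshot
    DeltaLake::RawDeltaTable.is_deltatable("./events", nil) # => true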
data/ext/deltalake/src/schema.rs ADDED
@@ -0,0 +1,37 @@
+ use deltalake::kernel::{StructField, StructType as DeltaStructType};
+ use magnus::{value::ReprValue, Module, RArray, RModule, Ruby, Value};
+
+ use crate::RbResult;
+
+ pub fn schema_to_rbobject(schema: DeltaStructType) -> RbResult<Value> {
+     let fields = schema.fields().map(|field| Field {
+         inner: field.clone(),
+     });
+
+     let rb_schema: Value = Ruby::get()
+         .unwrap()
+         .class_object()
+         .const_get::<_, RModule>("DeltaLake")?
+         .const_get("Schema")?;
+
+     rb_schema.funcall("new", (RArray::from_iter(fields),))
+ }
+
+ #[magnus::wrap(class = "DeltaLake::Field")]
+ pub struct Field {
+     pub inner: StructField,
+ }
+
+ impl Field {
+     pub fn name(&self) -> String {
+         self.inner.name().to_string()
+     }
+
+     pub fn get_type(&self) -> String {
+         self.inner.data_type().to_string()
+     }
+
+     pub fn nullable(&self) -> bool {
+         self.inner.is_nullable()
+     }
+ }
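Because `schema_to_rbobject` copies each kernel `StructField` into a `DeltaLake::Field`, schema introspection on the Ruby side involves no Arrow objects. A sketch, with illustrative output for a two-column table at the assumed path `./events`:

    dt = DeltaLake::Table.new("./events")
    dt.schema.fields.map { |f| [f.name, f.type, f.nullable] }
    # => [["id", "long", true], ["value", "string", true]] (illustrative)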
data/ext/deltalake/src/utils.rs ADDED
@@ -0,0 +1,21 @@
+ use std::sync::OnceLock;
+
+ use tokio::runtime::Runtime;
+
+ #[inline]
+ pub fn rt() -> &'static Runtime {
+     static TOKIO_RT: OnceLock<Runtime> = OnceLock::new();
+     static PID: OnceLock<u32> = OnceLock::new();
+     let pid = std::process::id();
+     let runtime_pid = *PID.get_or_init(|| pid);
+     if pid != runtime_pid {
+         panic!(
+             "Forked process detected - current PID is {} but the tokio runtime was created by {}. The tokio \
+             runtime does not support forked processes https://github.com/tokio-rs/tokio/issues/4301. If you \
+             need table access from another process, start it with `spawn` (a fresh interpreter) rather \
+             than `fork`.",
+             pid, runtime_pid
+         );
+     }
+     TOKIO_RT.get_or_init(|| Runtime::new().expect("Failed to create a tokio runtime."))
+ }
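The PID guard exists because a forked child inherits the parent's tokio runtime without its worker threads, so any table call made after `fork` panics. A hedged illustration, assuming a table at `./events`:

    dt = DeltaLake::Table.new("./events") # first call creates the runtime for this PID

    fork do
      dt.version # panics: the runtime was created by the parent PID
    end
    Process.wait

    # A freshly spawned interpreter builds its own runtime and works:
    pid = spawn(RbConfig.ruby, "-e", 'require "deltalake-rb"; p DeltaLake::Table.new("./events").version')
    Process.wait(pid)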
data/lib/deltalake/field.rb ADDED
@@ -0,0 +1,12 @@
+ module DeltaLake
+   class Field
+     def inspect
+       attributes = {
+         name: name,
+         type: type,
+         nullable: nullable
+       }
+       "<#{self.class.name} #{attributes.map { |k, v| "#{k}=#{v.inspect}" }.join(", ")}>"
+     end
+   end
+ end
data/lib/deltalake/metadata.rb ADDED
@@ -0,0 +1,43 @@
+ module DeltaLake
+   class Metadata
+     def initialize(table)
+       @metadata = table.metadata
+     end
+
+     def id
+       @metadata.id
+     end
+
+     def name
+       @metadata.name
+     end
+
+     def description
+       @metadata.description
+     end
+
+     def partition_columns
+       @metadata.partition_columns
+     end
+
+     def created_time
+       @metadata.created_time
+     end
+
+     def configuration
+       @metadata.configuration
+     end
+
+     def inspect
+       attributes = {
+         id: id,
+         name: name,
+         description: description,
+         partition_columns: partition_columns,
+         created_time: created_time,
+         configuration: configuration
+       }
+       "<#{self.class.name} #{attributes.map { |k, v| "#{k}=#{v.inspect}" }.join(", ")}>"
+     end
+   end
+ end
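A sketch of reading metadata through this wrapper, assuming the `./events` table from earlier (values are illustrative):

    dt = DeltaLake::Table.new("./events")
    md = dt.metadata
    md.id                 # UUID assigned when the table was created
    md.partition_columns  # => []
    md.created_time       # epoch milliseconds, or nil
    md.configuration      # => {} (delta.* table properties)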
data/lib/deltalake/schema.rb ADDED
@@ -0,0 +1,9 @@
+ module DeltaLake
+   class Schema
+     attr_reader :fields
+
+     def initialize(fields)
+       @fields = fields
+     end
+   end
+ end
data/lib/deltalake/table.rb ADDED
@@ -0,0 +1,103 @@
+ module DeltaLake
+   class Table
+     def initialize(
+       table_uri,
+       version: nil,
+       storage_options: nil,
+       without_files: false,
+       log_buffer_size: nil
+     )
+       @storage_options = storage_options
+       @table =
+         RawDeltaTable.new(
+           table_uri,
+           version,
+           storage_options,
+           without_files,
+           log_buffer_size
+         )
+     end
+
+     def self.exists?(table_uri, storage_options: nil)
+       RawDeltaTable.is_deltatable(table_uri, storage_options)
+     end
+
+     def version
+       @table.version
+     end
+
+     def files
+       @table.files
+     end
+
+     def file_uris
+       @table.file_uris
+     end
+
+     def load_as_version(version)
+       if version.is_a?(Integer)
+         @table.load_version(version)
+       else
+         raise TypeError, "Invalid datatype provided for version, only Integer is accepted."
+       end
+     end
+
+     def table_uri
+       @table.table_uri
+     end
+
+     def schema
+       @table.schema
+     end
+
+     def metadata
+       Metadata.new(@table)
+     end
+
+     def vacuum(
+       retention_hours: nil,
+       dry_run: true,
+       enforce_retention_duration: true
+     )
+       if retention_hours
+         if retention_hours < 0
+           raise ArgumentError, "The retention period must not be negative."
+         end
+       end
+
+       @table.vacuum(
+         dry_run,
+         retention_hours,
+         enforce_retention_duration
+       )
+     end
+
+     def to_polars(eager: true)
+       require "polars-df"
+
+       sources = file_uris
+       lf =
+         if sources.empty?
+           Polars::LazyFrame.new
+         else
+           storage_options = @storage_options&.except("AWS_S3_ALLOW_UNSAFE_RENAME")
+           Polars.scan_parquet(sources, storage_options: storage_options)
+         end
+       eager ? lf.collect : lf
+     end
+
+     def update_incremental
+       @table.update_incremental
+     end
+
+     def delete(predicate = nil)
+       metrics = @table.delete(predicate)
+       JSON.parse(metrics).transform_keys(&:to_sym)
+     end
+
+     # private
+     def _table
+       @table
+     end
+   end
+ end
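A sketch of the query and maintenance surface `Table` exposes, again assuming `./events` (`vacuum` defaults to `dry_run: true`, so nothing is deleted here):

    dt = DeltaLake::Table.new("./events")
    dt.vacuum(retention_hours: 168)  # files that *would* be removed
    dt.delete("id > 100")            # => metrics Hash with symbolized keys
    dt.load_as_version(0)            # time travel back to the first commit
    dt.to_polars(eager: false)       # => Polars::LazyFrame over the data files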
data/lib/deltalake/version.rb ADDED
@@ -0,0 +1,3 @@
+ module DeltaLake
+   VERSION = "0.1.0"
+ end
data/lib/deltalake-rb.rb ADDED
@@ -0,0 +1 @@
+ require_relative "deltalake"
data/lib/deltalake.rb ADDED
@@ -0,0 +1,104 @@
+ # ext
+ begin
+   require "deltalake/#{RUBY_VERSION.to_f}/deltalake"
+ rescue LoadError
+   require "deltalake/deltalake"
+ end
+
+ # stdlib
+ require "json"
+
+ # modules
+ require_relative "deltalake/field"
+ require_relative "deltalake/metadata"
+ require_relative "deltalake/schema"
+ require_relative "deltalake/table"
+ require_relative "deltalake/version"
+
+ module DeltaLake
+   class Error < StandardError; end
+   class TableNotFoundError < Error; end
+   class DeltaProtocolError < Error; end
+   class CommitFailedError < Error; end
+   class SchemaMismatchError < Error; end
+
+   class << self
+
+     def write(
+       table_or_uri,
+       data,
+       partition_by: nil,
+       mode: "error",
+       name: nil,
+       description: nil,
+       configuration: nil,
+       schema_mode: nil,
+       storage_options: nil,
+       predicate: nil,
+       target_file_size: nil
+     )
+       table, table_uri = try_get_table_and_table_uri(table_or_uri, storage_options)
+
+       if partition_by.is_a?(String)
+         partition_by = [partition_by]
+       end
+
+       if !table.nil? && mode == "ignore"
+         return
+       end
+
+       data = convert_data(data)
+
+       write_deltalake_rust(
+         table_uri,
+         data,
+         mode,
+         table&._table,
+         schema_mode,
+         partition_by,
+         predicate,
+         target_file_size,
+         name,
+         description,
+         configuration,
+         storage_options
+       )
+
+       if table
+         table.update_incremental
+       end
+     end
+
+     private
+
+     def try_get_table_and_table_uri(table_or_uri, storage_options)
+       if !table_or_uri.is_a?(String) && !table_or_uri.is_a?(Table)
+         raise ArgumentError, "table_or_uri must be a String or Table"
+       end
+
+       if table_or_uri.is_a?(String)
+         table = try_get_deltatable(table_or_uri, storage_options)
+         table_uri = table_or_uri.to_s
+       else
+         table = table_or_uri
+         table_uri = table._table.table_uri
+       end
+
+       [table, table_uri]
+     end
+
+     def try_get_deltatable(table_uri, storage_options)
+       Table.new(table_uri, storage_options: storage_options)
+     rescue TableNotFoundError
+       nil
+     end
+
+     def convert_data(data)
+       if data.respond_to?(:arrow_c_stream)
+         data.arrow_c_stream
+       else
+         raise TypeError, "Only objects implementing the Arrow C stream interface are valid inputs for data."
+       end
+     end
+   end
+ end
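Putting the pieces together: `write` accepts any object exposing `arrow_c_stream` (a `Polars::DataFrame`, for example), passes the resulting stream pointer to `write_deltalake_rust`, and refreshes the table if an existing one was passed in. An end-to-end sketch, assuming a local path `./events`:

    require "deltalake-rb"
    require "polars-df"

    df = Polars::DataFrame.new({"id" => [1, 2, 3], "value" => ["a", "b", "c"]})

    DeltaLake.write("./events", df, mode: "error") # creates the table at version 0

    dt = DeltaLake::Table.new("./events")
    dt.version   # => 0
    dt.to_polars # => the rows written above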