iceberg 0.10.1 → 0.10.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -12,8 +12,6 @@ Add this line to your application’s Gemfile:
12
12
  gem "iceberg"
13
13
  ```
14
14
 
15
- It can take around 5 minutes to compile the gem.
16
-
17
15
  ## Getting Started
18
16
 
19
17
  Create a client for an Iceberg catalog
@@ -84,9 +82,7 @@ Iceberg::MemoryCatalog.new(
84
82
  )
85
83
  ```
86
84
 
87
- ## Reference
88
-
89
- ### Namespaces
85
+ ## Namespaces
90
86
 
91
87
  List namespaces
92
88
 
@@ -124,7 +120,7 @@ Drop a namespace
124
120
  catalog.drop_namespace("main")
125
121
  ```
126
122
 
127
- ### Tables
123
+ ## Tables
128
124
 
129
125
  List tables
130
126
 
@@ -171,7 +167,7 @@ Drop a table
171
167
  catalog.drop_table("main.events")
172
168
  ```
173
169
 
174
- ### Static Tables
170
+ ## Static Tables
175
171
 
176
172
  Load a static table
177
173
 
@@ -1,10 +1,10 @@
1
1
  [package]
2
2
  name = "iceberg"
3
- version = "0.10.1"
3
+ version = "0.10.3"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2024"
7
- rust-version = "1.86"
7
+ rust-version = "1.87"
8
8
  publish = false
9
9
 
10
10
  [lib]
@@ -13,13 +13,13 @@ crate-type = ["cdylib"]
13
13
  [dependencies]
14
14
  arrow-array = { version = "55", features = ["ffi"] }
15
15
  arrow-schema = "55"
16
- datafusion = { version = "47", optional = true }
16
+ datafusion = { version = "48", optional = true }
17
17
  futures = "0.3"
18
- iceberg = "0.6"
19
- iceberg-catalog-glue = { version = "0.6", optional = true }
20
- iceberg-catalog-rest = { version = "0.6", optional = true }
21
- iceberg-catalog-sql = { version = "0.6", optional = true }
22
- iceberg-datafusion = { version = "0.6", optional = true }
18
+ iceberg = "0.7"
19
+ iceberg-catalog-glue = { version = "0.7", optional = true }
20
+ iceberg-catalog-rest = { version = "0.7", optional = true }
21
+ iceberg-catalog-sql = { version = "0.7", optional = true }
22
+ iceberg-datafusion = { version = "0.7", optional = true }
23
23
  magnus = "0.8"
24
24
  parquet = "55"
25
25
  sqlx = { version = "0.8", features = ["postgres", "runtime-tokio", "sqlite"], default-features = false, optional = true }
@@ -1,12 +1,15 @@
1
1
  #[cfg(feature = "datafusion")]
2
2
  use datafusion::execution::context::SessionContext;
3
- use iceberg::io::{FileIO, FileIOBuilder};
3
+ use iceberg::io::FileIO;
4
+ use iceberg::memory::{MEMORY_CATALOG_WAREHOUSE, MemoryCatalogBuilder};
4
5
  use iceberg::spec::Schema;
5
- use iceberg::{Catalog, MemoryCatalog, NamespaceIdent, TableCreation, TableIdent};
6
+ use iceberg::{Catalog, CatalogBuilder, MemoryCatalog, NamespaceIdent, TableCreation, TableIdent};
6
7
  #[cfg(feature = "glue")]
7
- use iceberg_catalog_glue::{GlueCatalog, GlueCatalogConfig};
8
+ use iceberg_catalog_glue::{GLUE_CATALOG_PROP_WAREHOUSE, GlueCatalog, GlueCatalogBuilder};
8
9
  #[cfg(feature = "rest")]
9
- use iceberg_catalog_rest::{RestCatalog, RestCatalogConfig};
10
+ use iceberg_catalog_rest::{
11
+ REST_CATALOG_PROP_URI, REST_CATALOG_PROP_WAREHOUSE, RestCatalog, RestCatalogBuilder,
12
+ };
10
13
  #[cfg(feature = "sql")]
11
14
  use iceberg_catalog_sql::{SqlBindStyle, SqlCatalog, SqlCatalogConfig};
12
15
  #[cfg(feature = "datafusion")]
@@ -65,9 +68,9 @@ pub struct RbCatalog {
65
68
  impl RbCatalog {
66
69
  #[cfg(feature = "glue")]
67
70
  pub fn new_glue(warehouse: String) -> RbResult<Self> {
68
- let config = GlueCatalogConfig::builder().warehouse(warehouse).build();
71
+ let props = HashMap::from([(GLUE_CATALOG_PROP_WAREHOUSE.to_string(), warehouse)]);
69
72
  let catalog = runtime()
70
- .block_on(GlueCatalog::new(config))
73
+ .block_on(GlueCatalogBuilder::default().load("glue", props))
71
74
  .map_err(to_rb_err)?;
72
75
  Ok(Self {
73
76
  catalog: RbCatalogType::Glue(catalog.into()).into(),
@@ -75,14 +78,13 @@ impl RbCatalog {
75
78
  }
76
79
 
77
80
  pub fn new_memory(warehouse: Option<String>) -> RbResult<Self> {
78
- let file_io = match warehouse {
79
- Some(ref v) => FileIO::from_path(v)
80
- .map_err(to_rb_err)?
81
- .build()
82
- .map_err(to_rb_err)?,
83
- None => FileIOBuilder::new_fs_io().build().map_err(to_rb_err)?,
84
- };
85
- let catalog = MemoryCatalog::new(file_io, warehouse);
81
+ let mut props = HashMap::new();
82
+ if let Some(v) = warehouse {
83
+ props.insert(MEMORY_CATALOG_WAREHOUSE.to_string(), v);
84
+ }
85
+ let catalog = runtime()
86
+ .block_on(MemoryCatalogBuilder::default().load("memory", props))
87
+ .map_err(to_rb_err)?;
86
88
  Ok(Self {
87
89
  catalog: RbCatalogType::Memory(catalog.into()).into(),
88
90
  })
@@ -93,16 +95,18 @@ impl RbCatalog {
93
95
  uri: String,
94
96
  warehouse: Option<String>,
95
97
  props: HashMap<String, String>,
96
- ) -> Self {
97
- let config = RestCatalogConfig::builder()
98
- .uri(uri)
99
- .warehouse_opt(warehouse)
100
- .props(props)
101
- .build();
102
- let catalog = RestCatalog::new(config);
103
- Self {
104
- catalog: RbCatalogType::Rest(catalog.into()).into(),
98
+ ) -> RbResult<Self> {
99
+ let mut props = props;
100
+ props.insert(REST_CATALOG_PROP_URI.to_string(), uri);
101
+ if let Some(v) = warehouse {
102
+ props.insert(REST_CATALOG_PROP_WAREHOUSE.to_string(), v);
105
103
  }
104
+ let catalog = runtime()
105
+ .block_on(RestCatalogBuilder::default().load("rest", props))
106
+ .map_err(to_rb_err)?;
107
+ Ok(Self {
108
+ catalog: RbCatalogType::Rest(catalog.into()).into(),
109
+ })
106
110
  }
107
111
 
108
112
  #[cfg(feature = "sql")]
@@ -122,6 +122,7 @@ fn init(ruby: &Ruby) -> RbResult<()> {
122
122
 
123
123
  let class = module.define_class("RbTableScan", ruby.class_object())?;
124
124
  class.define_method("plan_files", method!(RbTableScan::plan_files, 0))?;
125
+ class.define_method("snapshot", method!(RbTableScan::snapshot, 0))?;
125
126
 
126
127
  Ok(())
127
128
  }
@@ -1,11 +1,12 @@
1
1
  use futures::TryStreamExt;
2
2
  use iceberg::scan::TableScan;
3
- use magnus::{RArray, Ruby};
3
+ use magnus::{RArray, Ruby, Value};
4
4
  use std::cell::RefCell;
5
5
 
6
6
  use crate::RbResult;
7
7
  use crate::error::to_rb_err;
8
8
  use crate::runtime::runtime;
9
+ use crate::utils::rb_snapshot;
9
10
 
10
11
  #[magnus::wrap(class = "Iceberg::RbTableScan")]
11
12
  pub struct RbTableScan {
@@ -44,4 +45,11 @@ impl RbTableScan {
44
45
  }
45
46
  Ok(files)
46
47
  }
48
+
49
+ pub fn snapshot(ruby: &Ruby, rb_self: &Self) -> RbResult<Option<Value>> {
50
+ match rb_self.scan.borrow().snapshot() {
51
+ Some(s) => Ok(Some(rb_snapshot(ruby, s)?)),
52
+ None => Ok(None),
53
+ }
54
+ }
47
55
  }
@@ -72,6 +72,7 @@ impl RbTable {
72
72
  let parquet_writer_builder = ParquetWriterBuilder::new(
73
73
  WriterProperties::default(),
74
74
  table.metadata().current_schema().clone(),
75
+ None,
75
76
  table.file_io().clone(),
76
77
  location_generator.clone(),
77
78
  file_name_generator.clone(),
@@ -145,21 +146,21 @@ impl RbTable {
145
146
  pub fn schemas(ruby: &Ruby, rb_self: &Self) -> RbResult<RArray> {
146
147
  let schemas = ruby.ary_new();
147
148
  for s in rb_self.table.borrow().metadata().schemas_iter() {
148
- schemas.push(rb_schema(s)?)?;
149
+ schemas.push(rb_schema(ruby, s)?)?;
149
150
  }
150
151
  Ok(schemas)
151
152
  }
152
153
 
153
- pub fn schema_by_id(&self, schema_id: i32) -> RbResult<Option<Value>> {
154
- let schema = match self.table.borrow().metadata().schema_by_id(schema_id) {
155
- Some(s) => Some(rb_schema(s)?),
154
+ pub fn schema_by_id(ruby: &Ruby, rb_self: &Self, schema_id: i32) -> RbResult<Option<Value>> {
155
+ let schema = match rb_self.table.borrow().metadata().schema_by_id(schema_id) {
156
+ Some(s) => Some(rb_schema(ruby, s)?),
156
157
  None => None,
157
158
  };
158
159
  Ok(schema)
159
160
  }
160
161
 
161
- pub fn current_schema(&self) -> RbResult<Value> {
162
- rb_schema(self.table.borrow().metadata().current_schema())
162
+ pub fn current_schema(ruby: &Ruby, rb_self: &Self) -> RbResult<Value> {
163
+ rb_schema(ruby, rb_self.table.borrow().metadata().current_schema())
163
164
  }
164
165
 
165
166
  pub fn current_schema_id(&self) -> i32 {
@@ -198,14 +199,23 @@ impl RbTable {
198
199
  pub fn snapshots(ruby: &Ruby, rb_self: &Self) -> RbResult<RArray> {
199
200
  let snapshots = ruby.ary_new();
200
201
  for s in rb_self.table.borrow().metadata().snapshots() {
201
- snapshots.push(rb_snapshot(s)?)?;
202
+ snapshots.push(rb_snapshot(ruby, s)?)?;
202
203
  }
203
204
  Ok(snapshots)
204
205
  }
205
206
 
206
- pub fn snapshot_by_id(&self, snapshot_id: i64) -> RbResult<Option<Value>> {
207
- let snapshot = match self.table.borrow().metadata().snapshot_by_id(snapshot_id) {
208
- Some(s) => Some(rb_snapshot(s)?),
207
+ pub fn snapshot_by_id(
208
+ ruby: &Ruby,
209
+ rb_self: &Self,
210
+ snapshot_id: i64,
211
+ ) -> RbResult<Option<Value>> {
212
+ let snapshot = match rb_self
213
+ .table
214
+ .borrow()
215
+ .metadata()
216
+ .snapshot_by_id(snapshot_id)
217
+ {
218
+ Some(s) => Some(rb_snapshot(ruby, s)?),
209
219
  None => None,
210
220
  };
211
221
  Ok(snapshot)
@@ -236,9 +246,9 @@ impl RbTable {
236
246
  Ok(metadata_logs)
237
247
  }
238
248
 
239
- pub fn current_snapshot(&self) -> RbResult<Option<Value>> {
240
- let snapshot = match self.table.borrow().metadata().current_snapshot() {
241
- Some(s) => Some(rb_snapshot(s)?),
249
+ pub fn current_snapshot(ruby: &Ruby, rb_self: &Self) -> RbResult<Option<Value>> {
250
+ let snapshot = match rb_self.table.borrow().metadata().current_snapshot() {
251
+ Some(s) => Some(rb_snapshot(ruby, s)?),
242
252
  None => None,
243
253
  };
244
254
  Ok(snapshot)
@@ -248,9 +258,18 @@ impl RbTable {
248
258
  self.table.borrow().metadata().current_snapshot_id()
249
259
  }
250
260
 
251
- pub fn snapshot_for_ref(&self, ref_name: String) -> RbResult<Option<Value>> {
252
- let snapshot = match self.table.borrow().metadata().snapshot_for_ref(&ref_name) {
253
- Some(s) => Some(rb_snapshot(s)?),
261
+ pub fn snapshot_for_ref(
262
+ ruby: &Ruby,
263
+ rb_self: &Self,
264
+ ref_name: String,
265
+ ) -> RbResult<Option<Value>> {
266
+ let snapshot = match rb_self
267
+ .table
268
+ .borrow()
269
+ .metadata()
270
+ .snapshot_for_ref(&ref_name)
271
+ {
272
+ Some(s) => Some(rb_snapshot(ruby, s)?),
254
273
  None => None,
255
274
  };
256
275
  Ok(snapshot)
@@ -146,8 +146,7 @@ fn default_value(ob: Value, field_type: &Type) -> RbResult<Option<Literal>> {
146
146
  Ok(Some(lit))
147
147
  }
148
148
 
149
- pub fn rb_schema(schema: &Schema) -> RbResult<Value> {
150
- let ruby = Ruby::get().unwrap();
149
+ pub fn rb_schema(ruby: &Ruby, schema: &Schema) -> RbResult<Value> {
151
150
  let fields = ruby.ary_new();
152
151
  for f in schema.as_struct().fields() {
153
152
  let field = ruby.hash_new();
@@ -182,10 +181,10 @@ pub fn rb_schema(schema: &Schema) -> RbResult<Value> {
182
181
 
183
182
  field.aset(ruby.to_symbol("required"), f.required)?;
184
183
 
185
- let initial_default = f.initial_default.as_ref().map(rb_literal);
184
+ let initial_default = f.initial_default.as_ref().map(|v| rb_literal(ruby, v));
186
185
  field.aset(ruby.to_symbol("initial_default"), initial_default)?;
187
186
 
188
- let write_default = f.write_default.as_ref().map(rb_literal);
187
+ let write_default = f.write_default.as_ref().map(|v| rb_literal(ruby, v));
189
188
  field.aset(ruby.to_symbol("write_default"), write_default)?;
190
189
 
191
190
  field.aset(
@@ -205,8 +204,20 @@ pub fn rb_schema(schema: &Schema) -> RbResult<Value> {
205
204
  .funcall("new", (fields, kwargs!("schema_id" => schema_id)))
206
205
  }
207
206
 
208
- pub fn rb_snapshot(_snapshot: &Snapshot) -> RbResult<Value> {
209
- todo!();
207
+ pub fn rb_snapshot(ruby: &Ruby, snapshot: &Snapshot) -> RbResult<Value> {
208
+ let rb_snapshot = ruby.hash_new();
209
+ rb_snapshot.aset(ruby.to_symbol("snapshot_id"), snapshot.snapshot_id())?;
210
+ rb_snapshot.aset(
211
+ ruby.to_symbol("parent_snapshot_id"),
212
+ snapshot.parent_snapshot_id(),
213
+ )?;
214
+ rb_snapshot.aset(
215
+ ruby.to_symbol("sequence_number"),
216
+ snapshot.sequence_number(),
217
+ )?;
218
+ rb_snapshot.aset(ruby.to_symbol("manifest_list"), snapshot.manifest_list())?;
219
+ rb_snapshot.aset(ruby.to_symbol("schema_id"), snapshot.schema_id())?;
220
+ Ok(rb_snapshot.as_value())
210
221
  }
211
222
 
212
223
  pub fn rb_partition_spec(_partition_spec: &PartitionSpec) -> RbResult<Value> {
@@ -227,15 +238,14 @@ pub fn rb_partition_statistics_file(
227
238
  todo!();
228
239
  }
229
240
 
230
- pub fn rb_literal(literal: &Literal) -> Value {
231
- let ruby = Ruby::get().unwrap();
241
+ pub fn rb_literal(ruby: &Ruby, literal: &Literal) -> Value {
232
242
  match literal {
233
243
  Literal::Primitive(pl) => match pl {
234
- PrimitiveLiteral::Boolean(v) => v.into_value_with(&ruby),
235
- PrimitiveLiteral::Int(v) => v.into_value_with(&ruby),
236
- PrimitiveLiteral::Long(v) => v.into_value_with(&ruby),
237
- PrimitiveLiteral::Float(v) => v.into_value_with(&ruby),
238
- PrimitiveLiteral::Double(v) => v.into_value_with(&ruby),
244
+ PrimitiveLiteral::Boolean(v) => v.into_value_with(ruby),
245
+ PrimitiveLiteral::Int(v) => v.into_value_with(ruby),
246
+ PrimitiveLiteral::Long(v) => v.into_value_with(ruby),
247
+ PrimitiveLiteral::Float(v) => v.into_value_with(ruby),
248
+ PrimitiveLiteral::Double(v) => v.into_value_with(ruby),
239
249
  PrimitiveLiteral::String(v) => ruby.str_new(v).as_value(),
240
250
  PrimitiveLiteral::Binary(v) => ruby.str_from_slice(v).as_value(),
241
251
  _ => todo!(),
data/lib/iceberg/table.rb CHANGED
@@ -83,14 +83,28 @@ module Iceberg
83
83
  @table.properties
84
84
  end
85
85
 
86
- def to_polars(snapshot_id: nil, storage_options: nil)
86
+ def scan(snapshot_id: nil)
87
+ TableScan.new(@table.scan(snapshot_id), self)
88
+ end
89
+
90
+ def to_polars(snapshot_id: nil, storage_options: nil, _schema_changes: false)
87
91
  require "polars-df"
88
92
 
89
- files = @table.scan(snapshot_id).plan_files
93
+ # TODO always take this path in 0.2.0
94
+ if _schema_changes
95
+ return Polars.scan_iceberg(self, snapshot_id:, storage_options:)
96
+ end
97
+
98
+ scan = scan(snapshot_id:)
99
+ files = scan.plan_files
100
+
90
101
  if files.empty?
102
+ snapshot = scan.snapshot
103
+ scan_schema = snapshot ? schema_by_id(snapshot[:schema_id]) : current_schema
104
+
91
105
  # TODO improve
92
106
  schema =
93
- current_schema.fields.to_h do |field|
107
+ scan_schema.fields.to_h do |field|
94
108
  dtype =
95
109
  case field[:type]
96
110
  when "int"
@@ -121,16 +135,12 @@ module Iceberg
121
135
  .to_h { |v, i| [i, v[:deletes].map { |d| d[:file_path] }] }
122
136
  ]
123
137
 
124
- Polars.scan_parquet(
125
- sources,
138
+ scan_options = {
126
139
  storage_options: storage_options,
127
- # TODO
128
- # cast_options: Polars::ScanCastOptions._default_iceberg,
129
- # allow_missing_columns: true,
130
- # extra_columns: "ignore",
131
- # _column_mapping: column_mapping,
132
- _deletion_files: deletion_files
133
- )
140
+ _deletion_files: deletion_files,
141
+ }
142
+
143
+ Polars.scan_parquet(sources, **scan_options)
134
144
  end
135
145
  end
136
146
 
@@ -0,0 +1,18 @@
1
+ module Iceberg
2
+ class TableScan
3
+ attr_reader :table
4
+
5
+ def initialize(scan, table)
6
+ @scan = scan
7
+ @table = table
8
+ end
9
+
10
+ def plan_files
11
+ @scan.plan_files
12
+ end
13
+
14
+ def snapshot
15
+ @scan.snapshot
16
+ end
17
+ end
18
+ end
@@ -1,3 +1,3 @@
1
1
  module Iceberg
2
- VERSION = "0.10.1"
2
+ VERSION = "0.10.3"
3
3
  end
data/lib/iceberg.rb CHANGED
@@ -9,6 +9,7 @@ end
9
9
  require_relative "iceberg/catalog"
10
10
  require_relative "iceberg/schema"
11
11
  require_relative "iceberg/table"
12
+ require_relative "iceberg/table_scan"
12
13
  require_relative "iceberg/static_table"
13
14
  require_relative "iceberg/table_definition"
14
15
  require_relative "iceberg/version"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iceberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.1
4
+ version: 0.10.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
@@ -55,6 +55,7 @@ files:
55
55
  - lib/iceberg/static_table.rb
56
56
  - lib/iceberg/table.rb
57
57
  - lib/iceberg/table_definition.rb
58
+ - lib/iceberg/table_scan.rb
58
59
  - lib/iceberg/version.rb
59
60
  homepage: https://github.com/ankane/iceberg-ruby
60
61
  licenses: