iceberg 0.10.3 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -2,6 +2,8 @@
2
2
 
3
3
  [Apache Iceberg](https://iceberg.apache.org/) for Ruby
4
4
 
5
+ :duck: Also check out [SeaDuck](https://github.com/ankane/seaduck)
6
+
5
7
  [![Build Status](https://github.com/ankane/iceberg-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/iceberg-ruby/actions)
6
8
 
7
9
  ## Installation
@@ -35,7 +37,7 @@ catalog.create_table("main.events") do |t|
35
37
  end
36
38
  ```
37
39
 
38
- Or
40
+ Or with [Polars](https://github.com/ankane/ruby-polars)
39
41
 
40
42
  ```ruby
41
43
  df = Polars::DataFrame.new({"id" => [1, 2], "value" => [3.0, 4.0]})
@@ -1,27 +1,29 @@
1
1
  [package]
2
- name = "iceberg"
3
- version = "0.10.3"
2
+ name = "iceberg-ruby"
3
+ version = "0.11.0"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2024"
7
- rust-version = "1.87"
7
+ rust-version = "1.88"
8
8
  publish = false
9
9
 
10
10
  [lib]
11
+ name = "iceberg"
11
12
  crate-type = ["cdylib"]
12
13
 
13
14
  [dependencies]
14
- arrow-array = { version = "55", features = ["ffi"] }
15
- arrow-schema = "55"
16
- datafusion = { version = "48", optional = true }
15
+ arrow-array = { version = "57", features = ["ffi"] }
16
+ arrow-schema = "57"
17
+ datafusion = { version = "51", optional = true }
17
18
  futures = "0.3"
18
- iceberg = "0.7"
19
- iceberg-catalog-glue = { version = "0.7", optional = true }
20
- iceberg-catalog-rest = { version = "0.7", optional = true }
21
- iceberg-catalog-sql = { version = "0.7", optional = true }
22
- iceberg-datafusion = { version = "0.7", optional = true }
19
+ iceberg = "0.8"
20
+ iceberg-catalog-glue = { version = "0.8", optional = true }
21
+ iceberg-catalog-rest = { version = "0.8", optional = true }
22
+ iceberg-catalog-s3tables = { version = "0.8", optional = true }
23
+ iceberg-catalog-sql = { version = "0.8", optional = true }
24
+ iceberg-datafusion = { version = "0.8", optional = true }
23
25
  magnus = "0.8"
24
- parquet = "55"
26
+ parquet = "57"
25
27
  sqlx = { version = "0.8", features = ["postgres", "runtime-tokio", "sqlite"], default-features = false, optional = true }
26
28
  tokio = { version = "1", features = ["rt-multi-thread"] }
27
29
  uuid = { version = "1", features = ["v4"] }
@@ -31,4 +33,5 @@ default = ["rest", "sql"]
31
33
  datafusion = ["dep:datafusion", "dep:iceberg-datafusion"]
32
34
  glue = ["dep:iceberg-catalog-glue"]
33
35
  rest = ["dep:iceberg-catalog-rest"]
36
+ s3tables = ["dep:iceberg-catalog-s3tables"]
34
37
  sql = ["dep:iceberg-catalog-sql", "dep:sqlx"]
@@ -1,6 +1,5 @@
1
1
  #[cfg(feature = "datafusion")]
2
2
  use datafusion::execution::context::SessionContext;
3
- use iceberg::io::FileIO;
4
3
  use iceberg::memory::{MEMORY_CATALOG_WAREHOUSE, MemoryCatalogBuilder};
5
4
  use iceberg::spec::Schema;
6
5
  use iceberg::{Catalog, CatalogBuilder, MemoryCatalog, NamespaceIdent, TableCreation, TableIdent};
@@ -10,8 +9,15 @@ use iceberg_catalog_glue::{GLUE_CATALOG_PROP_WAREHOUSE, GlueCatalog, GlueCatalog
10
9
  use iceberg_catalog_rest::{
11
10
  REST_CATALOG_PROP_URI, REST_CATALOG_PROP_WAREHOUSE, RestCatalog, RestCatalogBuilder,
12
11
  };
12
+ #[cfg(feature = "s3tables")]
13
+ use iceberg_catalog_s3tables::{
14
+ S3TABLES_CATALOG_PROP_TABLE_BUCKET_ARN, S3TablesCatalog, S3TablesCatalogBuilder,
15
+ };
13
16
  #[cfg(feature = "sql")]
14
- use iceberg_catalog_sql::{SqlBindStyle, SqlCatalog, SqlCatalogConfig};
17
+ use iceberg_catalog_sql::{
18
+ SQL_CATALOG_PROP_BIND_STYLE, SQL_CATALOG_PROP_URI, SQL_CATALOG_PROP_WAREHOUSE, SqlBindStyle,
19
+ SqlCatalog, SqlCatalogBuilder,
20
+ };
15
21
  #[cfg(feature = "datafusion")]
16
22
  use iceberg_datafusion::IcebergCatalogProvider;
17
23
  use std::cell::RefCell;
@@ -29,6 +35,8 @@ pub enum RbCatalogType {
29
35
  Memory(Arc<MemoryCatalog>),
30
36
  #[cfg(feature = "rest")]
31
37
  Rest(Arc<RestCatalog>),
38
+ #[cfg(feature = "s3tables")]
39
+ S3Tables(Arc<S3TablesCatalog>),
32
40
  #[cfg(feature = "sql")]
33
41
  Sql(Arc<SqlCatalog>),
34
42
  }
@@ -41,6 +49,8 @@ impl RbCatalogType {
41
49
  RbCatalogType::Memory(v) => v.as_ref(),
42
50
  #[cfg(feature = "rest")]
43
51
  RbCatalogType::Rest(v) => v.as_ref(),
52
+ #[cfg(feature = "s3tables")]
53
+ RbCatalogType::S3Tables(v) => v.as_ref(),
44
54
  #[cfg(feature = "sql")]
45
55
  RbCatalogType::Sql(v) => v.as_ref(),
46
56
  }
@@ -54,6 +64,8 @@ impl RbCatalogType {
54
64
  RbCatalogType::Memory(v) => v.clone(),
55
65
  #[cfg(feature = "rest")]
56
66
  RbCatalogType::Rest(v) => v.clone(),
67
+ #[cfg(feature = "s3tables")]
68
+ RbCatalogType::S3Tables(v) => v.clone(),
57
69
  #[cfg(feature = "sql")]
58
70
  RbCatalogType::Sql(v) => v.clone(),
59
71
  }
@@ -109,6 +121,18 @@ impl RbCatalog {
109
121
  })
110
122
  }
111
123
 
124
+ #[cfg(feature = "s3tables")]
125
+ pub fn new_s3tables(arn: String) -> RbResult<Self> {
126
+ let mut props = HashMap::new();
127
+ props.insert(S3TABLES_CATALOG_PROP_TABLE_BUCKET_ARN.to_string(), arn);
128
+ let catalog = runtime()
129
+ .block_on(S3TablesCatalogBuilder::default().load("s3tables", props))
130
+ .map_err(to_rb_err)?;
131
+ Ok(Self {
132
+ catalog: RbCatalogType::S3Tables(catalog.into()).into(),
133
+ })
134
+ }
135
+
112
136
  #[cfg(feature = "sql")]
113
137
  pub fn new_sql(
114
138
  uri: String,
@@ -116,20 +140,15 @@ impl RbCatalog {
116
140
  name: String,
117
141
  props: HashMap<String, String>,
118
142
  ) -> RbResult<Self> {
119
- let file_io = FileIO::from_path(&warehouse)
120
- .map_err(to_rb_err)?
121
- .build()
122
- .map_err(to_rb_err)?;
123
- let config = SqlCatalogConfig::builder()
124
- .uri(uri)
125
- .warehouse_location(warehouse)
126
- .name(name)
127
- .file_io(file_io)
128
- .sql_bind_style(SqlBindStyle::DollarNumeric)
129
- .props(props)
130
- .build();
143
+ let mut props = props;
144
+ props.insert(SQL_CATALOG_PROP_URI.to_string(), uri);
145
+ props.insert(SQL_CATALOG_PROP_WAREHOUSE.to_string(), warehouse);
146
+ props.insert(
147
+ SQL_CATALOG_PROP_BIND_STYLE.to_string(),
148
+ SqlBindStyle::DollarNumeric.to_string(),
149
+ );
131
150
  let catalog = runtime()
132
- .block_on(SqlCatalog::new(config))
151
+ .block_on(SqlCatalogBuilder::default().load(name, props))
133
152
  .map_err(to_rb_err)?;
134
153
  Ok(Self {
135
154
  catalog: RbCatalogType::Sql(catalog.into()).into(),
@@ -297,7 +316,7 @@ impl RbCatalog {
297
316
  }
298
317
 
299
318
  #[cfg(feature = "datafusion")]
300
- pub fn query(&self, sql: String) -> RbResult<()> {
319
+ pub fn sql(&self, sql: String) -> RbResult<()> {
301
320
  let runtime = runtime();
302
321
 
303
322
  // TODO only create context once
@@ -311,8 +330,7 @@ impl RbCatalog {
311
330
  let df = runtime.block_on(ctx.sql(&sql)).unwrap();
312
331
  let _results = runtime.block_on(df.collect()).unwrap();
313
332
 
314
- // println!("{:?}", df.schema().fields());
315
- // println!("{:?}", results);
333
+ // println!("{:?}", _results);
316
334
 
317
335
  Ok(())
318
336
  }
@@ -14,7 +14,7 @@ use crate::table::RbTable;
14
14
 
15
15
  type RbResult<T> = Result<T, RbErr>;
16
16
 
17
- #[magnus::init]
17
+ #[magnus::init(name = "iceberg")]
18
18
  fn init(ruby: &Ruby) -> RbResult<()> {
19
19
  let module = ruby.define_module("Iceberg")?;
20
20
 
@@ -24,6 +24,8 @@ fn init(ruby: &Ruby) -> RbResult<()> {
24
24
  class.define_singleton_method("new_memory", function!(RbCatalog::new_memory, 1))?;
25
25
  #[cfg(feature = "rest")]
26
26
  class.define_singleton_method("new_rest", function!(RbCatalog::new_rest, 3))?;
27
+ #[cfg(feature = "s3tables")]
28
+ class.define_singleton_method("new_s3tables", function!(RbCatalog::new_s3tables, 1))?;
27
29
  #[cfg(feature = "sql")]
28
30
  class.define_singleton_method("new_sql", function!(RbCatalog::new_sql, 4))?;
29
31
  class.define_method("list_namespaces", method!(RbCatalog::list_namespaces, 1))?;
@@ -43,7 +45,7 @@ fn init(ruby: &Ruby) -> RbResult<()> {
43
45
  class.define_method("rename_table", method!(RbCatalog::rename_table, 2))?;
44
46
  class.define_method("register_table", method!(RbCatalog::register_table, 2))?;
45
47
  #[cfg(feature = "datafusion")]
46
- class.define_method("query", method!(RbCatalog::query, 1))?;
48
+ class.define_method("sql", method!(RbCatalog::sql, 1))?;
47
49
 
48
50
  let class = module.define_class("RbTable", ruby.class_object())?;
49
51
  class.define_method("scan", method!(RbTable::scan, 1))?;
@@ -9,6 +9,7 @@ use iceberg::writer::file_writer::ParquetWriterBuilder;
9
9
  use iceberg::writer::file_writer::location_generator::{
10
10
  DefaultFileNameGenerator, DefaultLocationGenerator,
11
11
  };
12
+ use iceberg::writer::file_writer::rolling_writer::RollingFileWriterBuilder;
12
13
  use iceberg::writer::{IcebergWriter, IcebergWriterBuilder};
13
14
  use magnus::{Error as RbErr, RArray, Ruby, Value};
14
15
  use parquet::file::properties::WriterProperties;
@@ -72,14 +73,16 @@ impl RbTable {
72
73
  let parquet_writer_builder = ParquetWriterBuilder::new(
73
74
  WriterProperties::default(),
74
75
  table.metadata().current_schema().clone(),
75
- None,
76
+ );
77
+ let rolling_file_writer_builder = RollingFileWriterBuilder::new_with_default_file_size(
78
+ parquet_writer_builder,
76
79
  table.file_io().clone(),
77
80
  location_generator.clone(),
78
81
  file_name_generator.clone(),
79
82
  );
80
- let data_file_writer_builder = DataFileWriterBuilder::new(parquet_writer_builder, None, 0);
83
+ let data_file_writer_builder = DataFileWriterBuilder::new(rolling_file_writer_builder);
81
84
  let mut data_file_writer = runtime
82
- .block_on(data_file_writer_builder.build())
85
+ .block_on(data_file_writer_builder.build(None))
83
86
  .map_err(to_rb_err)?;
84
87
 
85
88
  for batch in data.0 {
@@ -112,6 +115,7 @@ impl RbTable {
112
115
  match self.table.borrow().metadata().format_version() {
113
116
  FormatVersion::V1 => 1,
114
117
  FormatVersion::V2 => 2,
118
+ FormatVersion::V3 => 3,
115
119
  }
116
120
  }
117
121
 
@@ -357,18 +361,18 @@ impl RbTable {
357
361
 
358
362
  pub fn encryption_keys(ruby: &Ruby, rb_self: &Self) -> RbResult<RArray> {
359
363
  let encryption_keys = ruby.ary_new();
360
- for (k, v) in rb_self.table.borrow().metadata().encryption_keys_iter() {
361
- encryption_keys.push((ruby.str_new(k), ruby.str_new(v)))?;
364
+ for k in rb_self.table.borrow().metadata().encryption_keys_iter() {
365
+ encryption_keys.push(rb_encrypted_key(k)?)?;
362
366
  }
363
367
  Ok(encryption_keys)
364
368
  }
365
369
 
366
- pub fn encryption_key(&self, key_id: String) -> Option<String> {
367
- self.table
368
- .borrow()
369
- .metadata()
370
- .encryption_key(&key_id)
371
- .cloned()
370
+ pub fn encryption_key(&self, key_id: String) -> RbResult<Option<Value>> {
371
+ let key = match self.table.borrow().metadata().encryption_key(&key_id) {
372
+ Some(k) => Some(rb_encrypted_key(k)?),
373
+ None => None,
374
+ };
375
+ Ok(key)
372
376
  }
373
377
 
374
378
  pub fn from_metadata_file(location: String) -> RbResult<Self> {
@@ -1,6 +1,6 @@
1
1
  use iceberg::spec::{
2
- Literal, NestedField, PartitionSpec, PartitionStatisticsFile, PrimitiveLiteral, PrimitiveType,
3
- Schema, Snapshot, SortOrder, StatisticsFile, Type,
2
+ EncryptedKey, Literal, NestedField, PartitionSpec, PartitionStatisticsFile, PrimitiveLiteral,
3
+ PrimitiveType, Schema, Snapshot, SortOrder, StatisticsFile, Type,
4
4
  };
5
5
  use iceberg::{NamespaceIdent, TableIdent};
6
6
  use magnus::{
@@ -238,6 +238,10 @@ pub fn rb_partition_statistics_file(
238
238
  todo!();
239
239
  }
240
240
 
241
+ pub fn rb_encrypted_key(_encrypted_key: &EncryptedKey) -> RbResult<Value> {
242
+ todo!();
243
+ }
244
+
241
245
  pub fn rb_literal(ruby: &Ruby, literal: &Literal) -> Value {
242
246
  match literal {
243
247
  Literal::Primitive(pl) => match pl {
@@ -7,6 +7,7 @@ module Iceberg
7
7
  def create_namespace(namespace, properties: {}, if_not_exists: nil)
8
8
  @catalog.create_namespace(namespace, properties)
9
9
  rescue Error => e
10
+ # ideally all catalogs would use NamespaceAlreadyExistsError
10
11
  if !if_not_exists || (e.message != "Cannot create namespace" && !e.message.include?("already exists"))
11
12
  raise e
12
13
  end
@@ -28,7 +29,8 @@ module Iceberg
28
29
  def drop_namespace(namespace, if_exists: nil)
29
30
  @catalog.drop_namespace(namespace)
30
31
  rescue Error => e
31
- if !if_exists || (e.message != "Tried to drop a namespace that does not exist" && !e.message.include?("No such namespace"))
32
+ # ideally all catalogs would use NamespaceNotFoundError
33
+ if !if_exists || (e.message != "Tried to drop a namespace that does not exist" && !e.message.include?("No such namespace") && !e.message.include?("The specified namespace does not exist") && !e.message.include?("not found"))
32
34
  raise e
33
35
  end
34
36
  nil
@@ -47,9 +49,9 @@ module Iceberg
47
49
  table_definition = TableDefinition.new
48
50
  yield table_definition
49
51
  schema = Schema.new(table_definition.fields)
50
- elsif schema.is_a?(Hash)
52
+ elsif schema.is_a?(Hash) || (defined?(Polars::Schema) && schema.is_a?(Polars::Schema))
51
53
  fields =
52
- schema.map.with_index do |(k, v), i|
54
+ schema.to_h.map.with_index do |(k, v), i|
53
55
  {
54
56
  id: i + 1,
55
57
  name: k.is_a?(Symbol) ? k.to_s : k,
@@ -72,7 +74,8 @@ module Iceberg
72
74
  def drop_table(table_name, if_exists: nil)
73
75
  @catalog.drop_table(table_name)
74
76
  rescue Error => e
75
- if !if_exists || (e.message != "Tried to drop a table that does not exist" && !e.message.include?("No such table"))
77
+ # ideally all catalogs would use TableNotFoundError
78
+ if !if_exists || (e.message != "Tried to drop a table that does not exist" && !e.message.include?("No such table") && !e.message.include?("The specified table does not exist") && !e.message.include?("not found"))
76
79
  raise e
77
80
  end
78
81
  nil
@@ -92,11 +95,11 @@ module Iceberg
92
95
  @catalog.register_table(table_name, metadata_location)
93
96
  end
94
97
 
95
- def query(sql)
98
+ def sql(sql)
96
99
  # requires datafusion feature
97
- raise Todo unless @catalog.respond_to?(:query)
100
+ raise Todo unless @catalog.respond_to?(:sql)
98
101
 
99
- @catalog.query(sql)
102
+ @catalog.sql(sql)
100
103
  end
101
104
 
102
105
  # hide internal state
@@ -2,9 +2,6 @@ module Iceberg
2
2
  class GlueCatalog < Catalog
3
3
  # warehouse is URI of S3 storage bucket
4
4
  def initialize(warehouse:)
5
- # requires glue feature
6
- raise Error, "Feature not enabled" unless RbCatalog.respond_to?(:new_glue)
7
-
8
5
  @catalog = RbCatalog.new_glue(warehouse)
9
6
  end
10
7
  end
@@ -0,0 +1,7 @@
1
+ module Iceberg
2
+ class S3TablesCatalog < Catalog
3
+ def initialize(arn:)
4
+ @catalog = RbCatalog.new_s3tables(arn)
5
+ end
6
+ end
7
+ end
data/lib/iceberg/table.rb CHANGED
@@ -87,61 +87,14 @@ module Iceberg
87
87
  TableScan.new(@table.scan(snapshot_id), self)
88
88
  end
89
89
 
90
- def to_polars(snapshot_id: nil, storage_options: nil, _schema_changes: false)
90
+ def to_polars(snapshot_id: nil, storage_options: nil)
91
91
  require "polars-df"
92
92
 
93
- # TODO always take this path in 0.2.0
94
- if _schema_changes
95
- return Polars.scan_iceberg(self, snapshot_id:, storage_options:)
93
+ if Gem::Version.new(Polars::VERSION) < Gem::Version.new("0.23")
94
+ raise "Requires polars-df >= 0.23"
96
95
  end
97
96
 
98
- scan = scan(snapshot_id:)
99
- files = scan.plan_files
100
-
101
- if files.empty?
102
- snapshot = scan.snapshot
103
- scan_schema = snapshot ? schema_by_id(snapshot[:schema_id]) : current_schema
104
-
105
- # TODO improve
106
- schema =
107
- scan_schema.fields.to_h do |field|
108
- dtype =
109
- case field[:type]
110
- when "int"
111
- Polars::Int32
112
- when "long"
113
- Polars::Int64
114
- when "double"
115
- Polars::Float64
116
- when "string"
117
- Polars::String
118
- when "timestamp"
119
- Polars::Datetime
120
- else
121
- raise Todo
122
- end
123
-
124
- [field[:name], dtype]
125
- end
126
-
127
- Polars::LazyFrame.new(schema: schema)
128
- else
129
- sources = files.map { |v| v[:data_file_path] }
130
-
131
- deletion_files = [
132
- "iceberg-position-delete",
133
- files.map.with_index
134
- .select { |v, i| v[:deletes].any? }
135
- .to_h { |v, i| [i, v[:deletes].map { |d| d[:file_path] }] }
136
- ]
137
-
138
- scan_options = {
139
- storage_options: storage_options,
140
- _deletion_files: deletion_files,
141
- }
142
-
143
- Polars.scan_parquet(sources, **scan_options)
144
- end
97
+ Polars.scan_iceberg(self, snapshot_id:, storage_options:)
145
98
  end
146
99
 
147
100
  def append(df)
@@ -1,3 +1,3 @@
1
1
  module Iceberg
2
- VERSION = "0.10.3"
2
+ VERSION = "0.11.0"
3
3
  end
data/lib/iceberg.rb CHANGED
@@ -18,6 +18,7 @@ require_relative "iceberg/version"
18
18
  require_relative "iceberg/glue_catalog"
19
19
  require_relative "iceberg/memory_catalog"
20
20
  require_relative "iceberg/rest_catalog"
21
+ require_relative "iceberg/s3_tables_catalog"
21
22
  require_relative "iceberg/sql_catalog"
22
23
 
23
24
  module Iceberg
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iceberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.3
4
+ version: 0.11.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
@@ -50,6 +50,7 @@ files:
50
50
  - lib/iceberg/glue_catalog.rb
51
51
  - lib/iceberg/memory_catalog.rb
52
52
  - lib/iceberg/rest_catalog.rb
53
+ - lib/iceberg/s3_tables_catalog.rb
53
54
  - lib/iceberg/schema.rb
54
55
  - lib/iceberg/sql_catalog.rb
55
56
  - lib/iceberg/static_table.rb
@@ -68,7 +69,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
68
69
  requirements:
69
70
  - - ">="
70
71
  - !ruby/object:Gem::Version
71
- version: '3.2'
72
+ version: '3.3'
72
73
  required_rubygems_version: !ruby/object:Gem::Requirement
73
74
  requirements:
74
75
  - - ">="