seaduck 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 7b1b2081ef2387c773e8d54e81f82b02573efdc8f6be7fb45083c359cba6ce18
4
+ data.tar.gz: 915186c80411757817991858b892f6dba31c353339a8eea6317299f8a1389683
5
+ SHA512:
6
+ metadata.gz: df40e3b4113ac02773e22db0bd66afe3b97ab93cf9c2bd2dedec4ef5390ae70931a09be48291d07bd263ea336839b04307f7c32ce46a77e9676d4c6ffab2ebff
7
+ data.tar.gz: 87a754ad9eaedc3cec9a71c7c02bca8544a96ea6767ae601c3fa7efc0e4ac850771f2220c1164bfeaaf6447fcbdcc6254718e02f3436308f109d6a313983d37b
data/CHANGELOG.md ADDED
@@ -0,0 +1,3 @@
1
+ ## 0.1.0 (2025-09-30)
2
+
3
+ - First release
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2025 Andrew Kane
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,163 @@
1
+ # SeaDuck
2
+
3
+ [Apache Iceberg](https://iceberg.apache.org/) for Ruby, powered by libduckdb
4
+
5
+ [![Build Status](https://github.com/ankane/seaduck/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/seaduck/actions)
6
+
7
+ ## Installation
8
+
9
+ First, install libduckdb. For Homebrew, use:
10
+
11
+ ```sh
12
+ brew install duckdb
13
+ ```
14
+
15
+ Then add this line to your application’s Gemfile:
16
+
17
+ ```ruby
18
+ gem "seaduck"
19
+ ```
20
+
21
+ ## Getting Started
22
+
23
+ Create a client for an Iceberg catalog
24
+
25
+ ```ruby
26
+ catalog = SeaDuck::S3TablesCatalog.new(arn: "arn:aws:s3tables:...")
27
+ ```
28
+
29
+ Note: SeaDuck requires a default namespace, which is `main` by default. This namespace is created if it does not exist. Pass `default_namespace` to use a different one.
30
+
31
+ Create a table
32
+
33
+ ```ruby
34
+ catalog.sql("CREATE TABLE events (id bigint, name text)")
35
+ ```
36
+
37
+ Load data from a file
38
+
39
+ ```ruby
40
+ catalog.sql("COPY events FROM 'data.csv'")
41
+ ```
42
+
43
+ You can also load data directly from other [data sources](https://duckdb.org/docs/stable/data/data_sources)
44
+
45
+ ```ruby
46
+ catalog.attach("blog", "postgres://localhost:5432/blog")
47
+ catalog.sql("INSERT INTO events SELECT * FROM blog.ahoy_events")
48
+ ```
49
+
50
+ Query the data
51
+
52
+ ```ruby
53
+ catalog.sql("SELECT COUNT(*) FROM events").to_a
54
+ ```
55
+
56
+ ## Namespaces
57
+
58
+ List namespaces
59
+
60
+ ```ruby
61
+ catalog.list_namespaces
62
+ ```
63
+
64
+ Create a namespace
65
+
66
+ ```ruby
67
+ catalog.create_namespace("main")
68
+ ```
69
+
70
+ Check if a namespace exists
71
+
72
+ ```ruby
73
+ catalog.namespace_exists?("main")
74
+ ```
75
+
76
+ Drop a namespace
77
+
78
+ ```ruby
79
+ catalog.drop_namespace("main")
80
+ ```
81
+
82
+ ## Tables
83
+
84
+ List tables
85
+
86
+ ```ruby
87
+ catalog.list_tables
88
+ ```
89
+
90
+ Check if a table exists
91
+
92
+ ```ruby
93
+ catalog.table_exists?("events")
94
+ ```
95
+
96
+ Drop a table
97
+
98
+ ```ruby
99
+ catalog.drop_table("events")
100
+ ```
101
+
102
+ ## Snapshots
103
+
104
+ Get snapshots for a table
105
+
106
+ ```ruby
107
+ catalog.snapshots("events")
108
+ ```
109
+
110
+ Query the data at a specific snapshot version or time
111
+
112
+ ```ruby
113
+ catalog.sql("SELECT * FROM events AT (VERSION => ?)", [3])
114
+ # or
115
+ catalog.sql("SELECT * FROM events AT (TIMESTAMP => ?)", [Date.today - 7])
116
+ ```
117
+
118
+ ## SQL Safety
119
+
120
+ Use parameterized queries when possible
121
+
122
+ ```ruby
123
+ catalog.sql("SELECT * FROM events WHERE id = ?", [1])
124
+ ```
125
+
126
+ For places that do not support parameters, use `quote` or `quote_identifier`
127
+
128
+ ```ruby
129
+ quoted_table = catalog.quote_identifier("events")
130
+ quoted_file = catalog.quote("path/to/data.csv")
131
+ catalog.sql("COPY #{quoted_table} FROM #{quoted_file}")
132
+ ```
133
+
134
+ ## History
135
+
136
+ View the [changelog](https://github.com/ankane/seaduck/blob/master/CHANGELOG.md)
137
+
138
+ ## Contributing
139
+
140
+ Everyone is encouraged to help improve this project. Here are a few ways you can help:
141
+
142
+ - [Report bugs](https://github.com/ankane/seaduck/issues)
143
+ - Fix bugs and [submit pull requests](https://github.com/ankane/seaduck/pulls)
144
+ - Write, clarify, or fix documentation
145
+ - Suggest or add new features
146
+
147
+ To get started with development:
148
+
149
+ ```sh
150
+ git clone https://github.com/ankane/seaduck.git
151
+ cd seaduck
152
+ bundle install
153
+
154
+ # REST catalog
155
+ docker compose up
156
+ bundle exec rake test:rest
157
+
158
+ # S3 Tables catalog
159
+ bundle exec rake test:s3tables
160
+
161
+ # Glue catalog
162
+ bundle exec rake test:glue
163
+ ```
@@ -0,0 +1,255 @@
1
module SeaDuck
  # Base class for Iceberg catalogs accessed through a DuckDB connection.
  #
  # Subclasses define +initialize+ and delegate to +_initialize+ with the
  # attach/secret options specific to their catalog type.
  class Catalog
    # Shared constructor logic for subclasses.
    #
    # Opens a DuckDB database and connection, installs the "iceberg"
    # extension plus any extra +extensions+, optionally creates a secret,
    # attaches the catalog at +url+ under the alias "iceberg", and switches
    # to +default_namespace+. If switching fails with a SeaDuck::Error, the
    # namespace is created (IF NOT EXISTS) and the switch is retried.
    def _initialize(url, default_namespace:, attach_options:, secret_options: nil, extensions: [])
      @catalog = "iceberg"
      @default_namespace = default_namespace

      @db = DuckDB::Database.open
      @conn = @db.connect

      install_extension("iceberg")
      extensions.each do |extension|
        install_extension(extension)
      end
      create_secret(secret_options) if secret_options
      attach_with_options(@catalog, url, {type: "iceberg"}.merge(attach_options))

      begin
        use_namespace(@default_namespace)
      rescue Error
        create_namespace(@default_namespace, if_not_exists: true)
        use_namespace(@default_namespace)
      end
      # detach the database named "memory" now that the catalog is in use
      execute("DETACH memory")
    end

    # Returns the rows of schema names registered under this catalog.
    def list_namespaces
      execute("SELECT schema_name FROM information_schema.schemata WHERE catalog_name = ?", [@catalog]).rows
    end

    # Creates +namespace+ in the catalog. Pass +if_not_exists: true+ to
    # add IF NOT EXISTS to the statement. Returns nil.
    def create_namespace(namespace, if_not_exists: nil)
      execute("CREATE SCHEMA#{" IF NOT EXISTS" if if_not_exists} #{quote_namespace(namespace)}")
      nil
    end

    # Returns true when a schema named +namespace+ exists in the catalog.
    def namespace_exists?(namespace)
      execute("SELECT 1 FROM information_schema.schemata WHERE catalog_name = ? AND schema_name = ?", [@catalog, namespace]).any?
    end

    # Drops +namespace+. Pass +if_exists: true+ to add IF EXISTS.
    # Returns nil.
    #
    # CASCADE not implemented for Iceberg yet
    def drop_namespace(namespace, if_exists: nil)
      execute("DROP SCHEMA#{" IF EXISTS" if if_exists} #{quote_namespace(namespace)}")
      nil
    end

    # Returns rows of [table_schema, table_name] for the catalog,
    # restricted to +namespace+ when one is given.
    def list_tables(namespace = nil)
      sql = +"SELECT table_schema, table_name FROM information_schema.tables WHERE table_catalog = ?"
      params = [@catalog]

      if namespace
        sql << " AND table_schema = ?"
        params << namespace
      end

      execute(sql, params).rows
    end

    # Returns true when the table exists. +table_name+ is either a name
    # (looked up in the default namespace) or a [namespace, name] pair.
    def table_exists?(table_name)
      namespace, table_name = split_table(table_name)
      execute("SELECT 1 FROM information_schema.tables WHERE table_catalog = ? AND table_schema = ? AND table_name = ?", [@catalog, namespace, table_name]).any?
    end

    # Drops the table. Pass +if_exists: true+ to add IF EXISTS.
    # Returns nil, matching create_namespace, drop_namespace, and detach.
    def drop_table(table_name, if_exists: nil)
      execute("DROP TABLE#{" IF EXISTS" if if_exists} #{quote_table(table_name)}")
      nil
    end

    # Returns the rows of iceberg_snapshots() for the table as an array of
    # hashes with symbol keys.
    def snapshots(table_name)
      symbolize_keys execute("SELECT * FROM iceberg_snapshots(#{quote_table(table_name)})")
    end

    # Runs a single SQL statement with optional positional +params+ and
    # returns a SeaDuck::Result.
    def sql(sql, params = [])
      execute(sql, params)
    end

    # Runs the block between BEGIN and COMMIT. Any StandardError raised by
    # the block (or by COMMIT) triggers ROLLBACK and is re-raised, except
    # SeaDuck::Rollback, which rolls back silently.
    def transaction
      execute("BEGIN")
      begin
        yield
        execute("COMMIT")
      rescue => e
        execute("ROLLBACK")
        raise e unless e.is_a?(Rollback)
      end
    end

    # Attaches an external data source read-only under +alias_+.
    # Only postgres:// and postgresql:// URLs are supported.
    #
    # Raises ArgumentError for any other URL scheme.
    def attach(alias_, url)
      uri = URI.parse(url)
      type, extension =
        case uri.scheme
        when "postgres", "postgresql"
          ["postgres", "postgres"]
        else
          raise ArgumentError, "Unsupported data source type: #{uri.scheme}"
        end

      install_extension(extension)

      options = {
        type: type,
        read_only: true
      }
      attach_with_options(alias_, url, options)
    end

    # Detaches the database attached as +alias_+. Returns nil.
    def detach(alias_)
      execute("DETACH #{quote_identifier(alias_)}")
      nil
    end

    # Returns +value+ wrapped in double quotes, with embedded double quotes
    # doubled, for use as a SQL identifier.
    #
    # libduckdb does not provide a function for this
    # https://duckdb.org/docs/stable/sql/dialect/keywords_and_identifiers.html
    def quote_identifier(value)
      "\"#{encoded(value).gsub('"', '""')}\""
    end

    # Returns +value+ rendered as a SQL literal: NULL, true/false, a bare
    # number, or a single-quoted string with embedded quotes doubled.
    # Time, DateTime, and Date values are formatted before quoting.
    #
    # Raises TypeError or ArgumentError (via +encoded+) for values that
    # cannot be treated as a UTF-8/US-ASCII string.
    #
    # libduckdb does not provide a function for this
    # TODO support more types
    def quote(value)
      if value.nil?
        "NULL"
      elsif value == true
        "true"
      elsif value == false
        "false"
      elsif defined?(BigDecimal) && value.is_a?(BigDecimal)
        value.to_s("F")
      elsif value.is_a?(Numeric)
        value.to_s
      else
        if value.is_a?(Time)
          # getutc returns a new Time; Time#utc would convert the caller's
          # object in place (and raise FrozenError on a frozen Time)
          value = value.getutc.iso8601(9)
        elsif value.is_a?(DateTime)
          # DateTime is checked before Date since it is a subclass of Date
          value = value.iso8601(9)
        elsif value.is_a?(Date)
          value = value.strftime("%Y-%m-%d")
        end
        "'#{encoded(value).gsub("'", "''")}'"
      end
    end

    # hide internal state
    def inspect
      to_s
    end

    private

    # Prepares +sql+, binds +params+ positionally (1-based), executes it,
    # and wraps the outcome in a SeaDuck::Result. DuckDB::Error is
    # translated to a SeaDuck error with the original cause suppressed.
    def execute(sql, params = [])
      # use prepare instead of query to prevent multiple statements at once
      result =
        @conn.prepare(sql) do |stmt|
          params.each_with_index do |v, i|
            stmt.bind(i + 1, v)
          end
          stmt.execute
        end

      # TODO add column types
      Result.new(result.columns.map(&:name), result.to_a)
    rescue DuckDB::Error => e
      raise map_error(e), cause: nil
    end

    # Memoized table of DuckDB message prefixes to SeaDuck error classes.
    def error_mapping
      @error_mapping ||= {
        "Binder Error: " => BinderError,
        "Catalog Error: " => CatalogError,
        "Conversion Error: " => ConversionError,
        "Invalid Configuration Error: " => InvalidConfigurationError,
        "Invalid Input Error: " => InvalidInputError,
        "IO Error: " => IOError,
        "Not implemented Error: " => NotImplementedError,
        "Permission Error: " => PermissionError,
        "TransactionContext Error: " => TransactionContextError
      }
    end

    # Builds the SeaDuck error matching the message prefix of +e+ (with the
    # prefix stripped), falling back to a plain SeaDuck::Error.
    #
    # not ideal to base on prefix, but do not see a better way at the moment
    def map_error(e)
      error_mapping.each do |prefix, cls|
        if e.message&.start_with?(prefix)
          return cls.new(e.message.delete_prefix(prefix))
        end
      end
      Error.new(e.message)
    end

    # Runs INSTALL for the named DuckDB extension.
    def install_extension(extension)
      execute("INSTALL #{quote_identifier(extension)}")
    end

    # Runs CREATE SECRET with the given option hash.
    def create_secret(options)
      execute("CREATE SECRET (#{options_args(options)})")
    end

    # Runs ATTACH for +url+ under +alias_+ with the given option hash.
    def attach_with_options(alias_, url, options)
      execute("ATTACH #{quote(url)} AS #{quote_identifier(alias_)} (#{options_args(options)})")
    end

    # Renders an option hash as "NAME value, NAME value" with quoted values.
    def options_args(options)
      options.map { |k, v| "#{option_name(k)} #{quote(v)}" }.join(", ")
    end

    # Upcases an option key and verifies it only contains A-Z and "_",
    # since it is interpolated into SQL unquoted.
    def option_name(k)
      name = k.to_s.upcase
      # should never contain user input, but just to be safe
      unless name.match?(/\A[A-Z_]+\z/)
        raise "Invalid option name"
      end
      name
    end

    # Converts each row hash of +result+ to use symbol keys.
    def symbolize_keys(result)
      result.map { |v| v.transform_keys(&:to_sym) }
    end

    # Runs USE for +namespace+ within this catalog.
    def use_namespace(namespace)
      execute("USE #{quote_namespace(namespace)}")
    end

    # Returns the quoted "catalog"."namespace" identifier.
    def quote_namespace(value)
      "#{quote_identifier(@catalog)}.#{quote_identifier(value)}"
    end

    # Normalizes a table reference to [namespace, table_name]. A
    # two-element array is returned as is; any other value is paired with
    # the default namespace.
    #
    # Raises ArgumentError for arrays that do not have exactly two elements.
    def split_table(value)
      if value.is_a?(Array)
        if value.size == 2
          value
        else
          raise ArgumentError, "Invalid table identifier"
        end
      else
        [@default_namespace, value]
      end
    end

    # Returns the quoted "catalog"."namespace"."table" identifier.
    def quote_table(value)
      namespace, table_name = split_table(value)
      "#{quote_namespace(namespace)}.#{quote_identifier(table_name)}"
    end

    # Returns +value+ as a String after checking it is safe to embed in
    # SQL text: symbols are converted, string-like objects are converted
    # with to_str, and only valid UTF-8/US-ASCII content is accepted.
    #
    # Raises TypeError for values that are not string-like and
    # ArgumentError for unsupported or invalid encodings.
    def encoded(value)
      value = value.to_s if value.is_a?(Symbol)
      if !value.respond_to?(:to_str)
        raise TypeError, "no implicit conversion of #{value.class.name} into String"
      end
      # string-like objects may not implement #encoding themselves
      value = value.to_str
      if ![Encoding::UTF_8, Encoding::US_ASCII].include?(value.encoding) || !value.valid_encoding?
        raise ArgumentError, "Unsupported encoding"
      end
      value
    end
  end
end
@@ -0,0 +1,20 @@
1
module SeaDuck
  # Iceberg catalog reached through the "glue" endpoint type, using an S3
  # secret with the credential_chain provider.
  #
  # https://duckdb.org/docs/stable/core_extensions/iceberg/amazon_sagemaker_lakehouse
  class GlueCatalog < Catalog
    # warehouse         - passed to the base class as the value to attach
    # default_namespace - namespace selected after attaching ("main" by default)
    def initialize(warehouse:, default_namespace: "main")
      _initialize(
        warehouse,
        default_namespace: default_namespace,
        attach_options: {endpoint_type: "glue"},
        secret_options: {type: "s3", provider: "credential_chain"}
      )
    end
  end
end
@@ -0,0 +1,16 @@
1
module SeaDuck
  # Iceberg catalog reached through a REST endpoint with
  # authorization_type "none".
  class RestCatalog < Catalog
    # uri               - REST endpoint, passed as the ENDPOINT attach option
    # warehouse         - value to attach; nil is converted to an empty string
    # default_namespace - namespace selected after attaching ("main" by default)
    # _secret_options   - optional hash forwarded to the base class as secret options
    def initialize(uri:, warehouse: nil, default_namespace: "main", _secret_options: nil)
      rest_options = {endpoint: uri, authorization_type: "none"}

      _initialize(
        warehouse.to_s,
        default_namespace: default_namespace,
        attach_options: rest_options,
        secret_options: _secret_options
      )
    end
  end
end
@@ -0,0 +1,22 @@
1
module SeaDuck
  # Holds the outcome of a statement: the column names and the raw rows.
  #
  # Enumerating a Result yields one Hash per row, keyed by column name.
  class Result
    include Enumerable

    # columns - array of column names
    # rows    - array of rows, each an array of values in column order
    attr_reader :columns, :rows

    def initialize(columns, rows)
      @columns = columns
      @rows = rows
    end

    # Yields each row as a Hash of column name => value.
    #
    # Returns a sized Enumerator when called without a block, so that
    # external iteration (result.each.next, each.with_index, ...) works
    # instead of failing with LocalJumpError.
    def each
      return enum_for(:each) { @rows.size } unless block_given?

      @rows.each do |row|
        yield @columns.zip(row).to_h
      end
    end

    # Returns true when the result has no rows.
    def empty?
      rows.empty?
    end
  end
end
@@ -0,0 +1,21 @@
1
module SeaDuck
  # Iceberg catalog reached through the "s3_tables" endpoint type, using an
  # S3 secret with the credential_chain provider. The "aws" and "httpfs"
  # extensions are installed in addition to "iceberg".
  #
  # https://duckdb.org/docs/stable/core_extensions/iceberg/amazon_s3_tables
  class S3TablesCatalog < Catalog
    # arn               - passed to the base class as the value to attach
    # default_namespace - namespace selected after attaching ("main" by default)
    def initialize(arn:, default_namespace: "main")
      _initialize(
        arn,
        default_namespace: default_namespace,
        attach_options: {endpoint_type: "s3_tables"},
        secret_options: {type: "s3", provider: "credential_chain"},
        extensions: %w[aws httpfs]
      )
    end
  end
end
@@ -0,0 +1,3 @@
1
module SeaDuck
  # Version string of the gem.
  VERSION = "0.1.0"
end
data/lib/seaduck.rb ADDED
@@ -0,0 +1,29 @@
1
+ # dependencies
2
+ require "duckdb"
3
+
4
+ # stdlib
5
+ require "uri"
6
+
7
+ # modules
8
+ require_relative "seaduck/catalog"
9
+ require_relative "seaduck/result"
10
+ require_relative "seaduck/version"
11
+
12
+ # catalogs
13
+ require_relative "seaduck/glue_catalog"
14
+ require_relative "seaduck/rest_catalog"
15
+ require_relative "seaduck/s3_tables_catalog"
16
+
17
module SeaDuck
  # Root of the error hierarchy; every error below inherits from it.
  class Error < StandardError
  end

  class BinderError < Error
  end

  class CatalogError < Error
  end

  class ConversionError < Error
  end

  class InvalidConfigurationError < Error
  end

  class InvalidInputError < Error
  end

  # Inside the SeaDuck namespace this takes precedence over Ruby's ::IOError.
  class IOError < Error
  end

  # Inside the SeaDuck namespace this takes precedence over Ruby's
  # ::NotImplementedError.
  class NotImplementedError < Error
  end

  class PermissionError < Error
  end

  # Raised inside Catalog#transaction to roll back without propagating.
  class Rollback < Error
  end

  class TransactionContextError < Error
  end
end
metadata ADDED
@@ -0,0 +1,62 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: seaduck
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Andrew Kane
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: duckdb
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '0'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: '0'
26
+ email: andrew@ankane.org
27
+ executables: []
28
+ extensions: []
29
+ extra_rdoc_files: []
30
+ files:
31
+ - CHANGELOG.md
32
+ - LICENSE.txt
33
+ - README.md
34
+ - lib/seaduck.rb
35
+ - lib/seaduck/catalog.rb
36
+ - lib/seaduck/glue_catalog.rb
37
+ - lib/seaduck/rest_catalog.rb
38
+ - lib/seaduck/result.rb
39
+ - lib/seaduck/s3_tables_catalog.rb
40
+ - lib/seaduck/version.rb
41
+ homepage: https://github.com/ankane/seaduck
42
+ licenses:
43
+ - MIT
44
+ metadata: {}
45
+ rdoc_options: []
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: '3.2'
53
+ required_rubygems_version: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ version: '0'
58
+ requirements: []
59
+ rubygems_version: 3.6.9
60
+ specification_version: 4
61
+ summary: Apache Iceberg for Ruby, powered by libduckdb
62
+ test_files: []