athena-udf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: be808b4f155b8cdfa5c52bcc3c486d77cb9d056099f592b304bc8aa6426694b4
4
+ data.tar.gz: c41565a48ec8702756c48f01112b6d7dc881723a26a521c09cbb8bf506666a24
5
+ SHA512:
6
+ metadata.gz: 971c50fb5eee0df1c0f4522b19a1583fca1ad94144f38766f0a6ff42b82151ee8501d7b303cc361f70a151c4e29e57aaca360bdd2e57646fb8509c5d4fbd74bd
7
+ data.tar.gz: af0deed14deb22d1e382adfe9bc4097f07dff0cea835f9188e847df7bd27017e91b21c8c84bdf56ebbf6ac4dc729a3e6fe9dfbff43847f6eca1ee423389d3824
data/.dockerignore ADDED
@@ -0,0 +1,2 @@
1
+ .git
2
+ Dockerfile.example
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,36 @@
1
+ AllCops:
2
+ TargetRubyVersion: 3.2
3
+
4
+ Style/StringLiterals:
5
+ Enabled: true
6
+ EnforcedStyle: single_quotes
7
+
8
+ Style/StringLiteralsInInterpolation:
9
+ Enabled: true
10
+ EnforcedStyle: single_quotes
11
+
12
+ Layout/LineLength:
13
+ Max: 120
14
+
15
+ Style/Documentation:
16
+ Enabled: false
17
+
18
+ Metrics/MethodLength:
19
+ Max: 30
20
+
21
+ Metrics/AbcSize:
22
+ Max: 40
23
+
24
+ Metrics/BlockLength:
25
+ Exclude:
26
+ - "spec/**/*"
27
+
28
+ Style/TrailingCommaInArguments:
29
+ EnforcedStyleForMultiline: comma
30
+
31
+ Style/TrailingCommaInArrayLiteral:
32
+ EnforcedStyleForMultiline: comma
33
+
34
+ Style/TrailingCommaInHashLiteral:
35
+ EnforcedStyleForMultiline: comma
36
+
@@ -0,0 +1,31 @@
1
+ FROM public.ecr.aws/lambda/ruby:3.2
2
+
3
+ # General packages
4
+ RUN yum update -y \
5
+ && yum install -y \
6
+ amazon-linux-extras \
7
+ gcc-c++ \
8
+ make \
9
+ git \
10
+ && amazon-linux-extras install -y epel \
11
+ && yum clean all
12
+
13
+ # Apache Arrow
14
+ RUN yum update -y \
15
+ && yum install -y https://apache.jfrog.io/artifactory/arrow/amazon-linux/2/apache-arrow-release-latest.rpm \
16
+ && yum install -y --enablerepo=epel arrow-devel arrow-glib-devel arrow-dataset-devel arrow-dataset-glib-devel \
17
+ && yum clean all
18
+
19
+ # Update bundler
20
+ RUN gem update bundler
21
+
22
+ # Install gems (a multi-source COPY needs a directory destination ending in "/")
23
+ COPY Gemfile Gemfile.lock athena-udf.gemspec ${LAMBDA_TASK_ROOT}/
24
+ COPY lib/athena-udf/version.rb ${LAMBDA_TASK_ROOT}/lib/athena-udf/version.rb
25
+ ENV GEM_HOME=${LAMBDA_TASK_ROOT}
26
+ RUN bundle config set --local without development \
27
+ && bundle install
28
+
29
+ COPY . ${LAMBDA_TASK_ROOT}
30
+
31
+ CMD ["example.SimpleVarcharUDF.lambda_handler"]
data/Gemfile ADDED
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ source 'https://rubygems.org'
4
+
5
+ gemspec
6
+
7
+ group :development, :test do
8
+ gem 'rake', '~> 13.0'
9
+
10
+ gem 'rspec', '~> 3.0'
11
+
12
+ gem 'rubocop', '~> 1.21'
13
+
14
+ gem 'gem-release', '~> 2.2'
15
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,90 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ athena-udf (0.1.0)
5
+ base64
6
+ csv
7
+ red-arrow (~> 12.0.1)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ ast (2.4.2)
13
+ base64 (0.2.0)
14
+ bigdecimal (3.1.8)
15
+ csv (3.3.0)
16
+ diff-lcs (1.5.1)
17
+ extpp (0.1.1)
18
+ fiddle (1.1.2)
19
+ gem-release (2.2.2)
20
+ gio2 (4.2.2)
21
+ fiddle
22
+ gobject-introspection (= 4.2.2)
23
+ glib2 (4.2.2)
24
+ native-package-installer (>= 1.0.3)
25
+ pkg-config (>= 1.3.5)
26
+ gobject-introspection (4.2.2)
27
+ glib2 (= 4.2.2)
28
+ json (2.7.2)
29
+ language_server-protocol (3.17.0.3)
30
+ native-package-installer (1.1.9)
31
+ parallel (1.26.3)
32
+ parser (3.3.4.2)
33
+ ast (~> 2.4.1)
34
+ racc
35
+ pkg-config (1.5.6)
36
+ racc (1.8.1)
37
+ rainbow (3.1.1)
38
+ rake (13.2.1)
39
+ red-arrow (12.0.1)
40
+ bigdecimal (>= 3.1.0)
41
+ extpp (>= 0.1.1)
42
+ gio2 (>= 3.5.0)
43
+ native-package-installer
44
+ pkg-config
45
+ regexp_parser (2.9.2)
46
+ rexml (3.3.5)
47
+ strscan
48
+ rspec (3.13.0)
49
+ rspec-core (~> 3.13.0)
50
+ rspec-expectations (~> 3.13.0)
51
+ rspec-mocks (~> 3.13.0)
52
+ rspec-core (3.13.0)
53
+ rspec-support (~> 3.13.0)
54
+ rspec-expectations (3.13.1)
55
+ diff-lcs (>= 1.2.0, < 2.0)
56
+ rspec-support (~> 3.13.0)
57
+ rspec-mocks (3.13.1)
58
+ diff-lcs (>= 1.2.0, < 2.0)
59
+ rspec-support (~> 3.13.0)
60
+ rspec-support (3.13.1)
61
+ rubocop (1.65.1)
62
+ json (~> 2.3)
63
+ language_server-protocol (>= 3.17.0)
64
+ parallel (~> 1.10)
65
+ parser (>= 3.3.0.2)
66
+ rainbow (>= 2.2.2, < 4.0)
67
+ regexp_parser (>= 2.4, < 3.0)
68
+ rexml (>= 3.2.5, < 4.0)
69
+ rubocop-ast (>= 1.31.1, < 2.0)
70
+ ruby-progressbar (~> 1.7)
71
+ unicode-display_width (>= 2.4.0, < 3.0)
72
+ rubocop-ast (1.32.1)
73
+ parser (>= 3.3.1.0)
74
+ ruby-progressbar (1.13.0)
75
+ strscan (3.1.0)
76
+ unicode-display_width (2.5.0)
77
+
78
+ PLATFORMS
79
+ arm64-darwin-23
80
+ ruby
81
+
82
+ DEPENDENCIES
83
+ athena-udf!
84
+ gem-release (~> 2.2)
85
+ rake (~> 13.0)
86
+ rspec (~> 3.0)
87
+ rubocop (~> 1.21)
88
+
89
+ BUNDLED WITH
90
+ 2.5.17
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2024 Daisuke Taniwaki
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,90 @@
1
+ # AthenaUDF
2
+
3
+ Ruby-version Athena User Defined Function (UDF).
4
+
5
+ This gem is highly inspired by [the Python-version Athena UDF](https://github.com/dmarkey/python-athena-udf).
6
+
7
+ See [an official example implementation](https://github.com/awslabs/aws-athena-query-federation/blob/fc2e4e9cdcb71ec7f7c7d44cbda7f56c5835811e/athena-federation-sdk/src/main/java/com/amazonaws/athena/connector/lambda/handlers/UserDefinedFunctionHandler.java) for more details on how a Lambda function for Athena UDFs works.
8
+
9
+ ## Installation
10
+
11
+ Install the gem and add to the application's Gemfile by executing:
12
+
13
+ ```sh
14
+ $ bundle add athena-udf
15
+ ```
16
+
17
+ If bundler is not being used to manage dependencies, install the gem by executing:
18
+
19
+ ```sh
20
+ $ gem install athena-udf
21
+ ```
22
+
23
+ ## Usage
24
+
25
+ Create a subclass of `AthenaUDF::BaseUDF` and implement the function logic in `handle_athena_record`.
26
+
27
+ ```rb
28
+ require "athena_udf"
29
+
30
+ class SimpleVarcharUDF < AthenaUDF::BaseUDF
31
+ def self.handle_athena_record(_input_schema, _output_schema, record)
32
+ [record[0].downcase]
33
+ end
34
+ end
35
+ ```
36
+
37
+ Then, it can be called as `SimpleVarcharUDF.lambda_handler` in your lambda function for Athena UDF workloads.
38
+
39
+ After pushing an image to Amazon ECR and creating the Lambda function (see Deployment), you can call the function with SQL like the following.
40
+
41
+ ```sql
42
+ USING EXTERNAL FUNCTION my_udf(col1 varchar) RETURNS varchar LAMBDA 'athena-udf-simple-varchar'
43
+
44
+ SELECT my_udf('FooBar');
45
+ ```
46
+
47
+ See [the official document](https://docs.aws.amazon.com/athena/latest/ug/querying-udf.html) for the UDF usage.
48
+
49
+ ## Development
50
+
51
+ To contribute to this library, first check out the code. Then, install the dependent gems.
52
+
53
+ ```sh
54
+ $ bundle install
55
+ ```
56
+
57
+ To run the tests:
58
+
59
+ ```sh
60
+ $ bundle exec rspec
61
+ ```
62
+
63
+ ## Deployment
64
+
65
+ You can try the example with the following steps.
66
+
67
+ First, push a container image to Amazon ECR:
68
+
69
+ ```sh
70
+ $ aws ecr get-login-password | docker login --username AWS --password-stdin https://<ACCOUNT_ID>.dkr.ecr.<AWS_REGION>.amazonaws.com
71
+ $ docker build --platform=linux/amd64 -t <ACCOUNT_ID>.dkr.ecr.<AWS_REGION>.amazonaws.com/athena-udf-test -f Dockerfile.example .
72
+ $ docker push <ACCOUNT_ID>.dkr.ecr.<AWS_REGION>.amazonaws.com/athena-udf-test
73
+ ```
74
+
75
+ Then, create a lambda function with the CLI:
76
+
77
+ ```sh
78
+ $ aws iam create-role --role-name athena-udf-simple-varchar --assume-role-policy-document '{"Version": "2012-10-17","Statement": [{ "Effect": "Allow", "Principal": {"Service": "lambda.amazonaws.com"}, "Action": "sts:AssumeRole"}]}'
79
+ $ aws iam attach-role-policy --role-name athena-udf-simple-varchar --policy-arn arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole
80
+ $ aws lambda create-function --function-name athena-udf-simple-varchar --package-type Image --role arn:aws:iam::<ACCOUNT_ID>:role/athena-udf-simple-varchar --code ImageUri=<ACCOUNT_ID>.dkr.ecr.<AWS_REGION>.amazonaws.com/athena-udf-test:latest --publish
81
+ ```
82
+
83
+ ## Contributing
84
+
85
+ Bug reports and pull requests are welcome on GitHub at https://github.com/dtaniwaki/ruby-athena-udf.
86
+
87
+ ## License
88
+
89
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
90
+
data/Rakefile ADDED
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rspec/core/rake_task'
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ require 'rubocop/rake_task'
9
+
10
+ RuboCop::RakeTask.new
11
+
12
+ task default: %i[spec rubocop]
data/example.rb ADDED
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/athena_udf'
4
+
5
+ class SimpleVarcharUDF < AthenaUDF::BaseUDF # example UDF referenced by the CMD in Dockerfile.example
6
+ def self.handle_athena_record(_input_schema, _output_schema, record)
7
+ [record[0].downcase] # returns a one-element array: record[0] lowercased
8
+ end
9
+ end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'securerandom'
4
+ require 'base64'
5
+ require 'tempfile'
6
+ require 'arrow'
7
+ require_relative 'utils'
8
+
9
+ module AthenaUDF
10
+ class BaseUDF
11
+ extend AthenaUDF::Utils
12
+
13
+ def self.lambda_handler(event:, context:) # rubocop:disable Lint/UnusedMethodArgument
14
+ incoming_type = event['@type']
15
+ if incoming_type == 'PingRequest'
16
+ return handle_ping(event)
17
+ elsif incoming_type == 'UserDefinedFunctionRequest'
18
+ return handle_udf_request(event)
19
+ end
20
+
21
+ raise "Unknown event type #{incoming_type} from Athena"
22
+ end
23
+
24
+ def self.handle_ping(event)
25
+ {
26
+ '@type' => 'PingResponse',
27
+ 'catalogName' => 'event',
28
+ 'queryId' => event['queryId'],
29
+ 'sourceType' => 'athena_udf',
30
+ 'capabilities' => capabilities,
31
+ }
32
+ end
33
+
34
+ def self.handle_udf_request(event)
35
+ # Cannot find a way to write Arrow::RecordBatch to a buffer directly in Ruby.
36
+
37
+ output_schema = read_schema(Base64.decode64(event['outputSchema']['schema']))
38
+ output_builder = Arrow::RecordBatchBuilder.new(output_schema)
39
+
40
+ input_schema_data = Base64.decode64(event['inputRecords']['schema'])
41
+ input_records_data = Base64.decode64(event['inputRecords']['records'])
42
+ read_record_batches(input_schema_data, input_records_data) do |input_schema, record_batch|
43
+ output_builder.append_records(
44
+ record_batch.each_record.map do |record|
45
+ handle_athena_record(input_schema, output_schema, record)
46
+ end,
47
+ )
48
+ end
49
+
50
+ output_record_batch = output_builder.flush
51
+ output_records_bytes = get_record_batch_bytes(output_schema, output_record_batch)
52
+
53
+ {
54
+ '@type' => 'UserDefinedFunctionResponse',
55
+ 'methodName' => event['methodName'],
56
+ 'records' => {
57
+ 'aId' => SecureRandom.uuid,
58
+ 'schema' => event['outputSchema']['schema'],
59
+ 'records' => Base64.strict_encode64(output_records_bytes),
60
+ },
61
+ }
62
+ end
63
+
64
+ # Value sent as 'capabilities' in the PingResponse. About capabilities: https://github.com/awslabs/aws-athena-query-federation/blob/f52d929a109099a1e7180fa242e26331137ed84c/athena-federation-sdk/src/main/java/com/amazonaws/athena/connector/lambda/handlers/FederationCapabilities.java#L29-L32
65
+ def self.capabilities
66
+ 1
67
+ end
68
+
69
+ def self.handle_athena_record(input_schema, output_schema, records) # hook that subclasses must override
70
+ raise NotImplementedError # handle_udf_request calls this once per input record
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,82 @@
1
+ # frozen_string_literal: true
2
+
3
+ module AthenaUDF
4
+ module Utils
5
+ def read_record_batches(schema_data, record_batch_data)
6
+ Tempfile.create do |in_f|
7
+ in_f.write(schema_data)
8
+ in_f.write(record_batch_data)
9
+ in_f.flush
10
+
11
+ Arrow::MemoryMappedInputStream.open(in_f.path) do |inp|
12
+ reader = Arrow::RecordBatchStreamReader.new(inp)
13
+ input_schema = reader.schema
14
+ reader.each do |record_batch|
15
+ yield input_schema, record_batch
16
+ end
17
+ end
18
+ end
19
+ end
20
+
21
+ def read_schema(schema_data)
22
+ # schema_buf = Arrow::Buffer.try_convert(schema_data)
23
+ Tempfile.create do |f|
24
+ f.write(schema_data)
25
+ f.flush
26
+
27
+ Arrow::MemoryMappedInputStream.open(f.path) do |inp|
28
+ reader = Arrow::RecordBatchStreamReader.new(inp)
29
+ reader.schema
30
+ end
31
+ end
32
+ end
33
+
34
+ def get_schema_bytes(schema, record_batch)
35
+ Tempfile.create do |f|
36
+ Arrow::FileOutputStream.open(f.path, false) do |oup|
37
+ Arrow::RecordBatchStreamWriter.open(oup, schema) do |writer|
38
+ writer.write_record_batch(record_batch)
39
+ end
40
+ end
41
+ f.flush
42
+
43
+ data = File.binread(f.path)
44
+ start_index = get_record_batch_index(data)
45
+ data.bytes[4..start_index - 5].pack('C*')
46
+ end
47
+ end
48
+
49
+ def get_record_batch_bytes(schema, record_batch)
50
+ Tempfile.create do |f|
51
+ Arrow::FileOutputStream.open(f.path, false) do |oup|
52
+ Arrow::RecordBatchStreamWriter.open(oup, schema) do |writer|
53
+ writer.write_record_batch(record_batch)
54
+ end
55
+ end
56
+ f.flush
57
+
58
+ data = File.binread(f.path)
59
+ start_index = get_record_batch_index(data)
60
+ data.bytes[start_index..].pack('C*')
61
+ end
62
+ end
63
+
64
+ def get_record_batch_index(raw)
65
+ size = raw.bytes.size
66
+ found_count = 0
67
+ start_index = 0
68
+ 0.upto(size - 4).each do |i|
69
+ has_ffff = 4.times.all? do |n|
70
+ raw.bytes[i + n] == 255
71
+ end
72
+ found_count += 1 if has_ffff
73
+ if found_count == 2
74
+ start_index = i + 4
75
+ break
76
+ end
77
+ end
78
+
79
+ start_index
80
+ end
81
+ end
82
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module AthenaUDF
4
+ VERSION = '0.1.0'
5
+ end
data/lib/athena_udf.rb ADDED
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'athena-udf/version'
4
+ require_relative 'athena-udf/base_udf'
5
+
6
+ module AthenaUDF
7
+ end
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: athena-udf
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Daisuke Taniwaki
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2024-08-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: base64
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: csv
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: red-arrow
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 12.0.1
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 12.0.1
55
+ description: ''
56
+ email:
57
+ - daisuketaniwaki@gmail.com
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - ".dockerignore"
63
+ - ".rspec"
64
+ - ".rubocop.yml"
65
+ - Dockerfile.example
66
+ - Gemfile
67
+ - Gemfile.lock
68
+ - LICENSE.txt
69
+ - README.md
70
+ - Rakefile
71
+ - example.rb
72
+ - lib/athena-udf/base_udf.rb
73
+ - lib/athena-udf/utils.rb
74
+ - lib/athena-udf/version.rb
75
+ - lib/athena_udf.rb
76
+ homepage: https://github.com/dtaniwaki/ruby-athena-udf
77
+ licenses:
78
+ - MIT
79
+ metadata:
80
+ homepage_uri: https://github.com/dtaniwaki/ruby-athena-udf
81
+ source_code_uri: https://github.com/dtaniwaki/ruby-athena-udf
82
+ post_install_message:
83
+ rdoc_options: []
84
+ require_paths:
85
+ - lib
86
+ required_ruby_version: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: 3.2.0
91
+ required_rubygems_version: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: '0'
96
+ requirements: []
97
+ rubygems_version: 3.5.11
98
+ signing_key:
99
+ specification_version: 4
100
+ summary: Ruby-version Athena UDF
101
+ test_files: []