athena-udf 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: be808b4f155b8cdfa5c52bcc3c486d77cb9d056099f592b304bc8aa6426694b4
4
+ data.tar.gz: c41565a48ec8702756c48f01112b6d7dc881723a26a521c09cbb8bf506666a24
5
+ SHA512:
6
+ metadata.gz: 971c50fb5eee0df1c0f4522b19a1583fca1ad94144f38766f0a6ff42b82151ee8501d7b303cc361f70a151c4e29e57aaca360bdd2e57646fb8509c5d4fbd74bd
7
+ data.tar.gz: af0deed14deb22d1e382adfe9bc4097f07dff0cea835f9188e847df7bd27017e91b21c8c84bdf56ebbf6ac4dc729a3e6fe9dfbff43847f6eca1ee423389d3824
data/.dockerignore ADDED
@@ -0,0 +1,2 @@
1
+ .git
2
+ Dockerfile.example
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,36 @@
# RuboCop configuration for the athena-udf gem.
# Cop settings must be nested under their cop name; a flat layout would turn
# every setting into an unrelated top-level key.
AllCops:
  TargetRubyVersion: 3.2

Style/StringLiterals:
  Enabled: true
  EnforcedStyle: single_quotes

Style/StringLiteralsInInterpolation:
  Enabled: true
  EnforcedStyle: single_quotes

Layout/LineLength:
  Max: 120

Style/Documentation:
  Enabled: false

Metrics/MethodLength:
  Max: 30

Metrics/AbcSize:
  Max: 40

# Spec files are built from long describe/context blocks.
Metrics/BlockLength:
  Exclude:
    - "spec/**/*"

Style/TrailingCommaInArguments:
  EnforcedStyleForMultiline: comma

Style/TrailingCommaInArrayLiteral:
  EnforcedStyleForMultiline: comma

Style/TrailingCommaInHashLiteral:
  EnforcedStyleForMultiline: comma
@@ -0,0 +1,31 @@
# Example AWS Lambda container image that serves example.rb's SimpleVarcharUDF.
FROM public.ecr.aws/lambda/ruby:3.2

# General packages
RUN yum update -y \
  && yum install -y \
    amazon-linux-extras \
    gcc-c++ \
    make \
    git \
  && amazon-linux-extras install -y epel \
  && yum clean all

# Apache Arrow
RUN yum update -y \
  && yum install -y https://apache.jfrog.io/artifactory/arrow/amazon-linux/2/apache-arrow-release-latest.rpm \
  && yum install -y --enablerepo=epel arrow-devel arrow-glib-devel arrow-dataset-devel arrow-dataset-glib-devel \
  && yum clean all

# Update bundler
RUN gem update bundler

# Install gems
# With several sources, the COPY destination must be a directory ending in "/".
COPY Gemfile Gemfile.lock athena-udf.gemspec ${LAMBDA_TASK_ROOT}/
# The gemspec is copied before the full source tree, so the version file it
# presumably loads has to be present for `bundle install`.
COPY lib/athena-udf/version.rb ${LAMBDA_TASK_ROOT}/lib/athena-udf/version.rb
ENV GEM_HOME=${LAMBDA_TASK_ROOT}
# The Gemfile declares its tooling gems in both :development and :test; a gem is
# skipped only when every group it belongs to is excluded, so list both.
RUN bundle config set --local without 'development test' \
  && bundle install

COPY . ${LAMBDA_TASK_ROOT}

# Handler format: <file>.<class>.<method>
CMD ["example.SimpleVarcharUDF.lambda_handler"]
data/Gemfile ADDED
@@ -0,0 +1,15 @@
# frozen_string_literal: true

source 'https://rubygems.org'

# Runtime dependencies are declared in the gemspec.
gemspec

# Tooling used only for developing and testing the gem.
group(:development, :test) do
  gem 'rake', '~> 13.0'
  gem 'rspec', '~> 3.0'
  gem 'rubocop', '~> 1.21'
  gem 'gem-release', '~> 2.2'
end
data/Gemfile.lock ADDED
@@ -0,0 +1,90 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ athena-udf (0.1.0)
5
+ base64
6
+ csv
7
+ red-arrow (~> 12.0.1)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ ast (2.4.2)
13
+ base64 (0.2.0)
14
+ bigdecimal (3.1.8)
15
+ csv (3.3.0)
16
+ diff-lcs (1.5.1)
17
+ extpp (0.1.1)
18
+ fiddle (1.1.2)
19
+ gem-release (2.2.2)
20
+ gio2 (4.2.2)
21
+ fiddle
22
+ gobject-introspection (= 4.2.2)
23
+ glib2 (4.2.2)
24
+ native-package-installer (>= 1.0.3)
25
+ pkg-config (>= 1.3.5)
26
+ gobject-introspection (4.2.2)
27
+ glib2 (= 4.2.2)
28
+ json (2.7.2)
29
+ language_server-protocol (3.17.0.3)
30
+ native-package-installer (1.1.9)
31
+ parallel (1.26.3)
32
+ parser (3.3.4.2)
33
+ ast (~> 2.4.1)
34
+ racc
35
+ pkg-config (1.5.6)
36
+ racc (1.8.1)
37
+ rainbow (3.1.1)
38
+ rake (13.2.1)
39
+ red-arrow (12.0.1)
40
+ bigdecimal (>= 3.1.0)
41
+ extpp (>= 0.1.1)
42
+ gio2 (>= 3.5.0)
43
+ native-package-installer
44
+ pkg-config
45
+ regexp_parser (2.9.2)
46
+ rexml (3.3.5)
47
+ strscan
48
+ rspec (3.13.0)
49
+ rspec-core (~> 3.13.0)
50
+ rspec-expectations (~> 3.13.0)
51
+ rspec-mocks (~> 3.13.0)
52
+ rspec-core (3.13.0)
53
+ rspec-support (~> 3.13.0)
54
+ rspec-expectations (3.13.1)
55
+ diff-lcs (>= 1.2.0, < 2.0)
56
+ rspec-support (~> 3.13.0)
57
+ rspec-mocks (3.13.1)
58
+ diff-lcs (>= 1.2.0, < 2.0)
59
+ rspec-support (~> 3.13.0)
60
+ rspec-support (3.13.1)
61
+ rubocop (1.65.1)
62
+ json (~> 2.3)
63
+ language_server-protocol (>= 3.17.0)
64
+ parallel (~> 1.10)
65
+ parser (>= 3.3.0.2)
66
+ rainbow (>= 2.2.2, < 4.0)
67
+ regexp_parser (>= 2.4, < 3.0)
68
+ rexml (>= 3.2.5, < 4.0)
69
+ rubocop-ast (>= 1.31.1, < 2.0)
70
+ ruby-progressbar (~> 1.7)
71
+ unicode-display_width (>= 2.4.0, < 3.0)
72
+ rubocop-ast (1.32.1)
73
+ parser (>= 3.3.1.0)
74
+ ruby-progressbar (1.13.0)
75
+ strscan (3.1.0)
76
+ unicode-display_width (2.5.0)
77
+
78
+ PLATFORMS
79
+ arm64-darwin-23
80
+ ruby
81
+
82
+ DEPENDENCIES
83
+ athena-udf!
84
+ gem-release (~> 2.2)
85
+ rake (~> 13.0)
86
+ rspec (~> 3.0)
87
+ rubocop (~> 1.21)
88
+
89
+ BUNDLED WITH
90
+ 2.5.17
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2024 Daisuke Taniwaki
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,90 @@
1
+ # AthenaUDF
2
+
3
+ Ruby-version Athena User Defined Function (UDF).
4
+
5
+ This gem is highly inspired by [the Python-version Athena UDF](https://github.com/dmarkey/python-athena-udf).
6
+
7
+ See [an official example implementation](https://github.com/awslabs/aws-athena-query-federation/blob/fc2e4e9cdcb71ec7f7c7d44cbda7f56c5835811e/athena-federation-sdk/src/main/java/com/amazonaws/athena/connector/lambda/handlers/UserDefinedFunctionHandler.java) for more details on implementing a Lambda function for Athena UDFs.
8
+
9
+ ## Installation
10
+
11
+ Install the gem and add to the application's Gemfile by executing:
12
+
13
+ ```sh
14
+ $ bundle add athena-udf
15
+ ```
16
+
17
+ If bundler is not being used to manage dependencies, install the gem by executing:
18
+
19
+ ```sh
20
+ $ gem install athena-udf
21
+ ```
22
+
23
+ ## Usage
24
+
25
+ Create a subclass of `AthenaUDF::BaseUDF` and implement `handle_athena_record` with your function's logic.
26
+
27
+ ```rb
28
+ require "athena_udf"
29
+
30
+ class SimpleVarcharUDF < AthenaUDF::BaseUDF
31
+ def self.handle_athena_record(_input_schema, _output_schema, record)
32
+ [record[0].downcase]
33
+ end
34
+ end
35
+ ```
36
+
37
+ Then, it can be called as `SimpleVarcharUDF.lambda_handler` in your lambda function for Athena UDF workloads.
38
+
39
+ After pushing an image to Amazon ECR, you can call the function like the following SQL.
40
+
41
+ ```sql
42
+ USING EXTERNAL FUNCTION my_udf(col1 varchar) RETURNS varchar LAMBDA 'athena-udf-simple-varchar'
43
+
44
+ SELECT my_udf('FooBar');
45
+ ```
46
+
47
+ See [the official document](https://docs.aws.amazon.com/athena/latest/ug/querying-udf.html) for the UDF usage.
48
+
49
+ ## Development
50
+
51
+ To contribute to this library, first check out the code. Then, install the dependent gems.
52
+
53
+ ```sh
54
+ $ bundle install
55
+ ```
56
+
57
+ To run the tests:
58
+
59
+ ```sh
60
+ $ bundle exec rspec
61
+ ```
62
+
63
+ ## Deployment
64
+
65
+ You can try the example with the following steps.
66
+
67
+ First, push a container image to Amazon ECR:
68
+
69
+ ```sh
70
+ $ aws ecr get-login-password | docker login --username AWS --password-stdin https://<ACCOUNT_ID>.dkr.ecr.<AWS_REGION>.amazonaws.com
71
+ $ docker build --platform=linux/amd64 -t <ACCOUNT_ID>.dkr.ecr.<AWS_REGION>.amazonaws.com/athena-udf-test -f Dockerfile.example .
72
+ $ docker push <ACCOUNT_ID>.dkr.ecr.<AWS_REGION>.amazonaws.com/athena-udf-test
73
+ ```
74
+
75
+ Then, create a lambda function with the CLI:
76
+
77
+ ```sh
78
+ $ aws iam create-role --role-name athena-udf-simple-varchar --assume-role-policy-document '{"Version": "2012-10-17","Statement": [{ "Effect": "Allow", "Principal": {"Service": "lambda.amazonaws.com"}, "Action": "sts:AssumeRole"}]}'
79
+ $ aws iam attach-role-policy --role-name athena-udf-simple-varchar --policy-arn arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole
80
+ $ aws lambda create-function --function-name athena-udf-simple-varchar --package-type Image --role arn:aws:iam::<ACCOUNT_ID>:role/athena-udf-simple-varchar --code ImageUri=<ACCOUNT_ID>.dkr.ecr.<AWS_REGION>.amazonaws.com/athena-udf-test:latest --publish
81
+ ```
82
+
83
+ ## Contributing
84
+
85
+ Bug reports and pull requests are welcome on GitHub at https://github.com/dtaniwaki/ruby-athena-udf.
86
+
87
+ ## License
88
+
89
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
90
+
data/Rakefile ADDED
@@ -0,0 +1,12 @@
# frozen_string_literal: true

# Task libraries: gem packaging/release tasks, the RSpec runner and RuboCop.
require 'bundler/gem_tasks'
require 'rspec/core/rake_task'
require 'rubocop/rake_task'

# Defines the `spec` task.
RSpec::Core::RakeTask.new(:spec)

# Defines the `rubocop` task (the task library's default name).
RuboCop::RakeTask.new

# A bare `rake` runs the specs and then the linter.
task(default: %i[spec rubocop])
data/example.rb ADDED
@@ -0,0 +1,9 @@
# frozen_string_literal: true

require_relative 'lib/athena_udf'

# Example UDF: lower-cases the first column of every input record.
class SimpleVarcharUDF < AthenaUDF::BaseUDF
  # Builds the output row for one input record.
  #
  # @param _input_schema unused
  # @param _output_schema unused
  # @param record the input record; only column 0 is read
  # @return [Array] a one-element row holding the lower-cased value, or nil
  #   when the input value is nil
  def self.handle_athena_record(_input_schema, _output_schema, record)
    # Safe navigation passes a nil (NULL) input through as nil instead of
    # raising NoMethodError and failing the whole request.
    [record[0]&.downcase]
  end
end
@@ -0,0 +1,73 @@
# frozen_string_literal: true

require 'securerandom'
require 'base64'
require 'tempfile'
require 'arrow'
require_relative 'utils'

module AthenaUDF
  # Base class for Athena UDF Lambda handlers. Subclasses override
  # `handle_athena_record`; `lambda_handler` is the Lambda entry point.
  class BaseUDF
    extend AthenaUDF::Utils

    class << self
      # Lambda entry point: dispatches on the event's '@type' field.
      #
      # @param event [Hash] the request payload
      # @param context the Lambda context (unused)
      # @return [Hash] the response payload
      # @raise [RuntimeError] when '@type' is not a known request type
      def lambda_handler(event:, context:) # rubocop:disable Lint/UnusedMethodArgument
        request_type = event['@type']
        case request_type
        when 'PingRequest' then handle_ping(event)
        when 'UserDefinedFunctionRequest' then handle_udf_request(event)
        else raise "Unknown event type #{request_type} from Athena"
        end
      end

      # Builds the PingResponse, echoing the request's queryId.
      def handle_ping(event)
        {
          '@type' => 'PingResponse',
          'catalogName' => 'event',
          'queryId' => event['queryId'],
          'sourceType' => 'athena_udf',
          'capabilities' => capabilities,
        }
      end

      # Decodes the Base64 Arrow payloads, maps every input record through
      # `handle_athena_record`, and returns the Base64-encoded result batch.
      def handle_udf_request(event)
        encoded_output_schema = event['outputSchema']['schema']
        output_schema = read_schema(Base64.decode64(encoded_output_schema))
        builder = Arrow::RecordBatchBuilder.new(output_schema)

        input = event['inputRecords']
        schema_bytes = Base64.decode64(input['schema'])
        records_bytes = Base64.decode64(input['records'])
        read_record_batches(schema_bytes, records_bytes) do |input_schema, batch|
          rows = batch.each_record.map do |record|
            handle_athena_record(input_schema, output_schema, record)
          end
          builder.append_records(rows)
        end

        # All appended rows are flushed into a single output batch.
        # Cannot find a way to write Arrow::RecordBatch to a buffer directly in Ruby,
        # so the batch is serialized through a Utils helper.
        result_bytes = get_record_batch_bytes(output_schema, builder.flush)

        {
          '@type' => 'UserDefinedFunctionResponse',
          'methodName' => event['methodName'],
          'records' => {
            'aId' => SecureRandom.uuid,
            # The output schema is echoed back exactly as it was received.
            'schema' => encoded_output_schema,
            'records' => Base64.strict_encode64(result_bytes),
          },
        }
      end

      # About capabilities: https://github.com/awslabs/aws-athena-query-federation/blob/f52d929a109099a1e7180fa242e26331137ed84c/athena-federation-sdk/src/main/java/com/amazonaws/athena/connector/lambda/handlers/FederationCapabilities.java#L29-L32
      def capabilities
        1
      end

      # Hook for subclasses: turn one input record into one output row.
      # `handle_udf_request` passes a single record as the third argument.
      #
      # @raise [NotImplementedError] always, in the base class
      def handle_athena_record(input_schema, output_schema, records)
        raise NotImplementedError
      end
    end
  end
end
@@ -0,0 +1,82 @@
# frozen_string_literal: true

module AthenaUDF
  # Helpers that move Arrow schemas and record batches between raw byte
  # strings and Arrow objects by way of temporary files.
  #
  # The methods rely on `Tempfile` and the `Arrow` bindings being loaded by
  # the code that mixes this module in.
  module Utils
    # Four 0xFF bytes. The serialized stream is split on this sequence;
    # it appears to be the marker that precedes each message in the stream
    # (TODO confirm against the Arrow IPC format).
    CONTINUATION_MARKER = "\xFF\xFF\xFF\xFF".b.freeze

    # Reads record batches from a schema message followed by record data.
    #
    # @param schema_data [String] serialized schema bytes
    # @param record_batch_data [String] serialized record batch bytes
    # @yield [input_schema, record_batch] once per batch in the stream
    def read_record_batches(schema_data, record_batch_data)
      Tempfile.create do |in_f|
        # Raw Arrow bytes: make sure no newline/encoding translation applies.
        in_f.binmode
        in_f.write(schema_data)
        in_f.write(record_batch_data)
        # Arrow opens the file by path, so the bytes must be on disk first.
        in_f.flush

        Arrow::MemoryMappedInputStream.open(in_f.path) do |inp|
          reader = Arrow::RecordBatchStreamReader.new(inp)
          input_schema = reader.schema
          reader.each do |record_batch|
            yield input_schema, record_batch
          end
        end
      end
    end

    # Parses serialized schema bytes into the reader's schema object.
    #
    # @param schema_data [String] serialized schema bytes
    # @return the schema reported by Arrow::RecordBatchStreamReader
    def read_schema(schema_data)
      Tempfile.create do |f|
        f.binmode
        f.write(schema_data)
        f.flush

        Arrow::MemoryMappedInputStream.open(f.path) do |inp|
          Arrow::RecordBatchStreamReader.new(inp).schema
        end
      end
    end

    # Serializes the batch and returns the bytes between the first marker
    # and the second one (both markers excluded).
    #
    # @return [String] binary string
    def get_schema_bytes(schema, record_batch)
      data = serialize_stream(schema, record_batch)
      start_index = get_record_batch_index(data)
      data.byteslice(4..start_index - 5)
    end

    # Serializes the batch and returns everything after the second marker.
    #
    # @return [String] binary string
    def get_record_batch_bytes(schema, record_batch)
      data = serialize_stream(schema, record_batch)
      start_index = get_record_batch_index(data)
      data.byteslice(start_index..)
    end

    # Finds the offset just past the second occurrence of four consecutive
    # 0xFF bytes. Occurrences may overlap (five 0xFF bytes count as two).
    #
    # @param raw [String] bytes to scan
    # @return [Integer] the offset, or 0 when there are fewer than two matches
    def get_record_batch_index(raw)
      # Work on a binary view so offsets are byte offsets; String#index scans
      # in place instead of materializing a byte array per position.
      bytes = raw.b
      first = bytes.index(CONTINUATION_MARKER)
      return 0 if first.nil?

      # Resume one byte later so overlapping occurrences are still counted.
      second = bytes.index(CONTINUATION_MARKER, first + 1)
      second.nil? ? 0 : second + CONTINUATION_MARKER.bytesize
    end

    private

    # Writes the batch as an Arrow stream to a temp file and returns the
    # file's raw bytes.
    def serialize_stream(schema, record_batch)
      Tempfile.create do |f|
        # `false` is the stream's second argument (append, per the Arrow
        # bindings — TODO confirm); the file is written from its start.
        Arrow::FileOutputStream.open(f.path, false) do |oup|
          Arrow::RecordBatchStreamWriter.open(oup, schema) do |writer|
            writer.write_record_batch(record_batch)
          end
        end

        File.binread(f.path)
      end
    end
  end
end
@@ -0,0 +1,5 @@
# frozen_string_literal: true

# Top-level namespace of the athena-udf gem.
module AthenaUDF
  # Version string of this gem release.
  VERSION = '0.1.0'
end
data/lib/athena_udf.rb ADDED
@@ -0,0 +1,7 @@
# frozen_string_literal: true

# Library entry point: loads the version constant and the UDF base class.
require_relative 'athena-udf/version'
require_relative 'athena-udf/base_udf'

# Top-level namespace of the athena-udf gem; its members are defined in the
# files required above.
module AthenaUDF
end
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: athena-udf
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Daisuke Taniwaki
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2024-08-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: base64
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: csv
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: red-arrow
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 12.0.1
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 12.0.1
55
+ description: ''
56
+ email:
57
+ - daisuketaniwaki@gmail.com
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - ".dockerignore"
63
+ - ".rspec"
64
+ - ".rubocop.yml"
65
+ - Dockerfile.example
66
+ - Gemfile
67
+ - Gemfile.lock
68
+ - LICENSE.txt
69
+ - README.md
70
+ - Rakefile
71
+ - example.rb
72
+ - lib/athena-udf/base_udf.rb
73
+ - lib/athena-udf/utils.rb
74
+ - lib/athena-udf/version.rb
75
+ - lib/athena_udf.rb
76
+ homepage: https://github.com/dtaniwaki/ruby-athena-udf
77
+ licenses:
78
+ - MIT
79
+ metadata:
80
+ homepage_uri: https://github.com/dtaniwaki/ruby-athena-udf
81
+ source_code_uri: https://github.com/dtaniwaki/ruby-athena-udf
82
+ post_install_message:
83
+ rdoc_options: []
84
+ require_paths:
85
+ - lib
86
+ required_ruby_version: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: 3.2.0
91
+ required_rubygems_version: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: '0'
96
+ requirements: []
97
+ rubygems_version: 3.5.11
98
+ signing_key:
99
+ specification_version: 4
100
+ summary: Ruby-version Athena UDF
101
+ test_files: []