athena-udf 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +5 -0
- data/Dockerfile.dev +28 -0
- data/Gemfile +1 -0
- data/Gemfile.lock +5 -7
- data/README.md +9 -0
- data/lib/athena-udf/base_udf.rb +20 -19
- data/lib/athena-udf/utils.rb +2 -2
- data/lib/athena-udf/version.rb +1 -1
- data/scripts/benchmark.rb +95 -0
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9cc2a9e47dee420f0b11442039d1873aca9ad04ecf12775dbe8ea49df3153f02
|
4
|
+
data.tar.gz: 54d15b93ffc2fb58793aad53d03c69608e607c00d0f8da3f5210b47860a5aecc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b0f36011d582e681d575fe8d6f33a6cf9935519ed9bb21953dac980aa1c479c7d943992bbdd0e992f053f32eee6b228f473d772e4e698c3517370009fe49d3b6
|
7
|
+
data.tar.gz: 318b411aa1949b998c1a34176ca049cbccea541861a89931b1a3f71730ab45fec383e03549ec941f7a48305d4ad1e9c0de419d9a46b8e5469205467772df8598
|
data/.rubocop.yml
CHANGED
@@ -24,6 +24,7 @@ Metrics/AbcSize:
|
|
24
24
|
Metrics/BlockLength:
|
25
25
|
Exclude:
|
26
26
|
- "spec/**/*"
|
27
|
+
- "scripts/benchmark.rb"
|
27
28
|
|
28
29
|
Style/TrailingCommaInArguments:
|
29
30
|
EnforcedStyleForMultiline: comma
|
@@ -37,3 +38,7 @@ Style/TrailingCommaInHashLiteral:
|
|
37
38
|
Style/ClassVars:
|
38
39
|
Enabled: false
|
39
40
|
|
41
|
+
Naming/VariableNumber:
|
42
|
+
Exclude:
|
43
|
+
- "scripts/benchmark.rb"
|
44
|
+
|
data/Dockerfile.dev
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
FROM ruby:3.2
|
2
|
+
|
3
|
+
# General packages
|
4
|
+
RUN apt-get update -y \
|
5
|
+
&& apt-get install -y \
|
6
|
+
build-essential \
|
7
|
+
ca-certificates \
|
8
|
+
lsb-release \
|
9
|
+
wget \
|
10
|
+
vim \
|
11
|
+
&& apt-get clean
|
12
|
+
|
13
|
+
# Apache Arrow
|
14
|
+
RUN apt-get update -y \
|
15
|
+
&& wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb \
|
16
|
+
&& apt install -y ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb \
|
17
|
+
&& apt-get update -y \
|
18
|
+
&& apt-get install -y \
|
19
|
+
libarrow-dev \
|
20
|
+
libarrow-glib-dev \
|
21
|
+
&& apt-get clean
|
22
|
+
|
23
|
+
# Update bundler
|
24
|
+
RUN gem update bundler
|
25
|
+
|
26
|
+
WORKDIR /src
|
27
|
+
|
28
|
+
CMD ["/bin/bash"]
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -11,6 +11,7 @@ GEM
|
|
11
11
|
specs:
|
12
12
|
ast (2.4.2)
|
13
13
|
base64 (0.2.0)
|
14
|
+
benchmark (0.3.0)
|
14
15
|
bigdecimal (3.1.8)
|
15
16
|
csv (3.3.0)
|
16
17
|
diff-lcs (1.5.1)
|
@@ -44,8 +45,6 @@ GEM
|
|
44
45
|
native-package-installer
|
45
46
|
pkg-config
|
46
47
|
regexp_parser (2.9.2)
|
47
|
-
rexml (3.3.5)
|
48
|
-
strscan
|
49
48
|
rspec (3.13.0)
|
50
49
|
rspec-core (~> 3.13.0)
|
51
50
|
rspec-expectations (~> 3.13.0)
|
@@ -59,18 +58,17 @@ GEM
|
|
59
58
|
diff-lcs (>= 1.2.0, < 2.0)
|
60
59
|
rspec-support (~> 3.13.0)
|
61
60
|
rspec-support (3.13.1)
|
62
|
-
rubocop (1.
|
61
|
+
rubocop (1.66.0)
|
63
62
|
json (~> 2.3)
|
64
63
|
language_server-protocol (>= 3.17.0)
|
65
64
|
parallel (~> 1.10)
|
66
65
|
parser (>= 3.3.0.2)
|
67
66
|
rainbow (>= 2.2.2, < 4.0)
|
68
67
|
regexp_parser (>= 2.4, < 3.0)
|
69
|
-
|
70
|
-
rubocop-ast (>= 1.31.1, < 2.0)
|
68
|
+
rubocop-ast (>= 1.32.1, < 2.0)
|
71
69
|
ruby-progressbar (~> 1.7)
|
72
70
|
unicode-display_width (>= 2.4.0, < 3.0)
|
73
|
-
rubocop-ast (1.32.
|
71
|
+
rubocop-ast (1.32.2)
|
74
72
|
parser (>= 3.3.1.0)
|
75
73
|
ruby-progressbar (1.13.0)
|
76
74
|
simplecov (0.22.0)
|
@@ -80,7 +78,6 @@ GEM
|
|
80
78
|
simplecov-html (0.12.3)
|
81
79
|
simplecov-lcov (0.8.0)
|
82
80
|
simplecov_json_formatter (0.1.4)
|
83
|
-
strscan (3.1.0)
|
84
81
|
unicode-display_width (2.5.0)
|
85
82
|
|
86
83
|
PLATFORMS
|
@@ -89,6 +86,7 @@ PLATFORMS
|
|
89
86
|
|
90
87
|
DEPENDENCIES
|
91
88
|
athena-udf!
|
89
|
+
benchmark (~> 0.3.0)
|
92
90
|
gem-release (~> 2.2)
|
93
91
|
rake (~> 13.0)
|
94
92
|
rspec (~> 3.0)
|
data/README.md
CHANGED
@@ -83,6 +83,15 @@ $ aws iam attach-role-policy --role-name athena-udf-simple-varchar --policy-arn
|
|
83
83
|
$ aws lambda create-function --function-name athena-udf-simple-varchar --package-type Image --role arn:aws:iam::<ACCOUNT_ID>:role/athena-udf-simple-varchar --code ImageUri=<ACCOUNT_ID>.dkr.ecr.<AWS_REGION>.amazonaws.com/athena-udf-test:latest --publish
|
84
84
|
```
|
85
85
|
|
86
|
+
## Development
|
87
|
+
|
88
|
+
You can use the dev container image, which includes necessary packages, to develop this library.
|
89
|
+
|
90
|
+
```sh
|
91
|
+
$ docker build -t ruby-athena-udf-dev -f Dockerfile.dev .
|
92
|
+
$ docker run -v $PWD:/src -it ruby-athena-udf-dev
|
93
|
+
```
|
94
|
+
|
86
95
|
## Contributing
|
87
96
|
|
88
97
|
Bug reports and pull requests are welcome on GitHub at https://github.com/dtaniwaki/ruby-athena-udf.
|
data/lib/athena-udf/base_udf.rb
CHANGED
@@ -9,33 +9,43 @@ require_relative 'utils'
|
|
9
9
|
|
10
10
|
module AthenaUDF
|
11
11
|
class BaseUDF
|
12
|
-
|
12
|
+
include AthenaUDF::Utils
|
13
13
|
|
14
|
-
|
15
|
-
@@logger.level = Logger.const_get(ENV.fetch('LOG_LEVEL', 'WARN').upcase)
|
14
|
+
attr_reader :logger
|
16
15
|
|
17
|
-
def self.lambda_handler(event:, context:)
|
16
|
+
def self.lambda_handler(event:, context:)
|
17
|
+
instance = new(event:, context:)
|
18
18
|
incoming_type = event['@type']
|
19
19
|
if incoming_type == 'PingRequest'
|
20
|
-
return handle_ping(event)
|
20
|
+
return instance.handle_ping(event)
|
21
21
|
elsif incoming_type == 'UserDefinedFunctionRequest'
|
22
|
-
return handle_udf_request(event)
|
22
|
+
return instance.handle_udf_request(event)
|
23
23
|
end
|
24
24
|
|
25
25
|
raise "Unknown event type #{incoming_type} from Athena"
|
26
26
|
end
|
27
27
|
|
28
|
-
|
28
|
+
# About capabilities: https://github.com/awslabs/aws-athena-query-federation/blob/f52d929a109099a1e7180fa242e26331137ed84c/athena-federation-sdk/src/main/java/com/amazonaws/athena/connector/lambda/handlers/FederationCapabilities.java#L29-L32
|
29
|
+
def self.capabilities
|
30
|
+
1
|
31
|
+
end
|
32
|
+
|
33
|
+
def initialize(event:, context:) # rubocop:disable Lint/UnusedMethodArgument
|
34
|
+
@logger = Logger.new($stdout)
|
35
|
+
@logger.level = Logger.const_get(ENV.fetch('LOG_LEVEL', 'WARN').upcase)
|
36
|
+
end
|
37
|
+
|
38
|
+
def handle_ping(event)
|
29
39
|
{
|
30
40
|
'@type' => 'PingResponse',
|
31
41
|
'catalogName' => 'event',
|
32
42
|
'queryId' => event['queryId'],
|
33
43
|
'sourceType' => 'athena_udf',
|
34
|
-
'capabilities' => capabilities,
|
44
|
+
'capabilities' => self.class.capabilities,
|
35
45
|
}
|
36
46
|
end
|
37
47
|
|
38
|
-
def self.handle_udf_request(event)
|
48
|
+
def handle_udf_request(event)
|
39
49
|
# Cannot find a way to write Arrow::RecordBatch to a buffer directly in Ruby.
|
40
50
|
|
41
51
|
output_schema = read_schema(Base64.decode64(event['outputSchema']['schema']))
|
@@ -66,17 +76,8 @@ module AthenaUDF
|
|
66
76
|
}
|
67
77
|
end
|
68
78
|
|
69
|
-
|
70
|
-
def self.capabilities
|
71
|
-
1
|
72
|
-
end
|
73
|
-
|
74
|
-
def self.handle_athena_record(input_schema, output_schema, records)
|
79
|
+
def handle_athena_record(input_schema, output_schema, records)
|
75
80
|
raise NotImplementedError
|
76
81
|
end
|
77
|
-
|
78
|
-
def self.logger
|
79
|
-
@@logger
|
80
|
-
end
|
81
82
|
end
|
82
83
|
end
|
data/lib/athena-udf/utils.rb
CHANGED
@@ -30,11 +30,11 @@ module AthenaUDF
|
|
30
30
|
end
|
31
31
|
end
|
32
32
|
|
33
|
-
def get_schema_bytes(schema
|
33
|
+
def get_schema_bytes(schema)
|
34
34
|
buffer = Arrow::ResizableBuffer.new(0)
|
35
35
|
Arrow::BufferOutputStream.open(buffer) do |output|
|
36
36
|
Arrow::RecordBatchStreamWriter.open(output, schema) do |writer|
|
37
|
-
|
37
|
+
# noop
|
38
38
|
end
|
39
39
|
|
40
40
|
bytes = buffer.data.to_s
|
data/lib/athena-udf/version.rb
CHANGED
data/scripts/benchmark.rb
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'benchmark'
|
4
|
+
require 'athena_udf'
|
5
|
+
|
6
|
+
Benchmark.bm 10 do |r|
|
7
|
+
include AthenaUDF::Utils
|
8
|
+
|
9
|
+
instance = Class.new(AthenaUDF::BaseUDF) do
|
10
|
+
def handle_athena_record(_input_schema, _output_schema, record)
|
11
|
+
record.to_a
|
12
|
+
end
|
13
|
+
end.new(event: {}, context: {})
|
14
|
+
|
15
|
+
input_schema_1 = Arrow::Schema.new("0": :string)
|
16
|
+
input_schema_bytes_1 = get_schema_bytes(input_schema_1)
|
17
|
+
Arrow::Schema.new(0.upto(100).map { |n| [n.to_s, :string] }.to_h)
|
18
|
+
input_schema_bytes_100 = get_schema_bytes(input_schema_1)
|
19
|
+
|
20
|
+
input_table1_1 = Arrow::Table.new(input_schema_1, [['FooBar']])
|
21
|
+
input_records_bytes_1_1 = get_record_batch_bytes(input_schema_1, input_table1_1.each_record_batch.first)
|
22
|
+
event_1_1 = {
|
23
|
+
'@type' => 'UserDefinedFunctionRequest',
|
24
|
+
'inputRecords' => {
|
25
|
+
'schema' => Base64.strict_encode64(input_schema_bytes_1),
|
26
|
+
'records' => Base64.strict_encode64(input_records_bytes_1_1),
|
27
|
+
},
|
28
|
+
'methodName' => 'lower',
|
29
|
+
'outputSchema' => {
|
30
|
+
'schema' => Base64.strict_encode64(input_schema_bytes_1),
|
31
|
+
},
|
32
|
+
'functionType' => 'SCALAR',
|
33
|
+
}
|
34
|
+
|
35
|
+
r.report '1 record 1 column' do
|
36
|
+
instance.handle_udf_request(event_1_1)
|
37
|
+
end
|
38
|
+
|
39
|
+
input_table100_1 = Arrow::Table.new(input_schema_1, [['FooBar']] * 100)
|
40
|
+
input_records_bytes_100_1 = get_record_batch_bytes(input_schema_1, input_table100_1.each_record_batch.first)
|
41
|
+
event_100 = {
|
42
|
+
'@type' => 'UserDefinedFunctionRequest',
|
43
|
+
'inputRecords' => {
|
44
|
+
'schema' => Base64.strict_encode64(input_schema_bytes_1),
|
45
|
+
'records' => Base64.strict_encode64(input_records_bytes_100_1),
|
46
|
+
},
|
47
|
+
'methodName' => 'lower',
|
48
|
+
'outputSchema' => {
|
49
|
+
'schema' => Base64.strict_encode64(input_schema_bytes_1),
|
50
|
+
},
|
51
|
+
'functionType' => 'SCALAR',
|
52
|
+
}
|
53
|
+
|
54
|
+
r.report '100 records 1 column' do
|
55
|
+
instance.handle_udf_request(event_100)
|
56
|
+
end
|
57
|
+
|
58
|
+
input_table1_100 = Arrow::Table.new(input_schema_1, [['FooBar']])
|
59
|
+
input_records_bytes_1_100 = get_record_batch_bytes(input_schema_1, input_table1_100.each_record_batch.first)
|
60
|
+
event_1_100 = {
|
61
|
+
'@type' => 'UserDefinedFunctionRequest',
|
62
|
+
'inputRecords' => {
|
63
|
+
'schema' => Base64.strict_encode64(input_schema_bytes_100),
|
64
|
+
'records' => Base64.strict_encode64(input_records_bytes_1_100),
|
65
|
+
},
|
66
|
+
'methodName' => 'lower',
|
67
|
+
'outputSchema' => {
|
68
|
+
'schema' => Base64.strict_encode64(input_schema_bytes_100),
|
69
|
+
},
|
70
|
+
'functionType' => 'SCALAR',
|
71
|
+
}
|
72
|
+
|
73
|
+
r.report '1 record 100 column' do
|
74
|
+
instance.handle_udf_request(event_1_100)
|
75
|
+
end
|
76
|
+
|
77
|
+
input_table_100_100 = Arrow::Table.new(input_schema_1, [['FooBar']])
|
78
|
+
input_records_bytes_100_100 = get_record_batch_bytes(input_schema_1, input_table_100_100.each_record_batch.first)
|
79
|
+
event_100_100 = {
|
80
|
+
'@type' => 'UserDefinedFunctionRequest',
|
81
|
+
'inputRecords' => {
|
82
|
+
'schema' => Base64.strict_encode64(input_schema_bytes_100),
|
83
|
+
'records' => Base64.strict_encode64(input_records_bytes_100_100),
|
84
|
+
},
|
85
|
+
'methodName' => 'lower',
|
86
|
+
'outputSchema' => {
|
87
|
+
'schema' => Base64.strict_encode64(input_schema_bytes_100),
|
88
|
+
},
|
89
|
+
'functionType' => 'SCALAR',
|
90
|
+
}
|
91
|
+
|
92
|
+
r.report '100 record 100 column' do
|
93
|
+
instance.handle_udf_request(event_100_100)
|
94
|
+
end
|
95
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: athena-udf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Daisuke Taniwaki
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-09-
|
11
|
+
date: 2024-09-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: base64
|
@@ -62,6 +62,7 @@ files:
|
|
62
62
|
- ".dockerignore"
|
63
63
|
- ".rspec"
|
64
64
|
- ".rubocop.yml"
|
65
|
+
- Dockerfile.dev
|
65
66
|
- Dockerfile.example
|
66
67
|
- Gemfile
|
67
68
|
- Gemfile.lock
|
@@ -73,6 +74,7 @@ files:
|
|
73
74
|
- lib/athena-udf/utils.rb
|
74
75
|
- lib/athena-udf/version.rb
|
75
76
|
- lib/athena_udf.rb
|
77
|
+
- scripts/benchmark.rb
|
76
78
|
homepage: https://github.com/dtaniwaki/ruby-athena-udf
|
77
79
|
licenses:
|
78
80
|
- MIT
|
@@ -94,7 +96,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
94
96
|
- !ruby/object:Gem::Version
|
95
97
|
version: '0'
|
96
98
|
requirements: []
|
97
|
-
rubygems_version: 3.
|
99
|
+
rubygems_version: 3.4.19
|
98
100
|
signing_key:
|
99
101
|
specification_version: 4
|
100
102
|
summary: Ruby-version Athena UDF
|