athena-udf 0.1.2 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +5 -0
- data/Dockerfile.dev +28 -0
- data/Gemfile +1 -0
- data/Gemfile.lock +6 -8
- data/README.md +9 -0
- data/lib/athena-udf/base_udf.rb +20 -19
- data/lib/athena-udf/utils.rb +8 -7
- data/lib/athena-udf/version.rb +1 -1
- data/scripts/benchmark.rb +95 -0
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9cc2a9e47dee420f0b11442039d1873aca9ad04ecf12775dbe8ea49df3153f02
|
4
|
+
data.tar.gz: 54d15b93ffc2fb58793aad53d03c69608e607c00d0f8da3f5210b47860a5aecc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b0f36011d582e681d575fe8d6f33a6cf9935519ed9bb21953dac980aa1c479c7d943992bbdd0e992f053f32eee6b228f473d772e4e698c3517370009fe49d3b6
|
7
|
+
data.tar.gz: 318b411aa1949b998c1a34176ca049cbccea541861a89931b1a3f71730ab45fec383e03549ec941f7a48305d4ad1e9c0de419d9a46b8e5469205467772df8598
|
data/.rubocop.yml
CHANGED
@@ -24,6 +24,7 @@ Metrics/AbcSize:
|
|
24
24
|
Metrics/BlockLength:
|
25
25
|
Exclude:
|
26
26
|
- "spec/**/*"
|
27
|
+
- "scripts/benchmark.rb"
|
27
28
|
|
28
29
|
Style/TrailingCommaInArguments:
|
29
30
|
EnforcedStyleForMultiline: comma
|
@@ -37,3 +38,7 @@ Style/TrailingCommaInHashLiteral:
|
|
37
38
|
Style/ClassVars:
|
38
39
|
Enabled: false
|
39
40
|
|
41
|
+
Naming/VariableNumber:
|
42
|
+
Exclude:
|
43
|
+
- "scripts/benchmark.rb"
|
44
|
+
|
data/Dockerfile.dev
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
FROM ruby:3.2
|
2
|
+
|
3
|
+
# General packages
|
4
|
+
RUN apt-get update -y \
|
5
|
+
&& apt-get install -y \
|
6
|
+
build-essential \
|
7
|
+
ca-certificates \
|
8
|
+
lsb-release \
|
9
|
+
wget \
|
10
|
+
vim \
|
11
|
+
&& apt-get clean
|
12
|
+
|
13
|
+
# Apache Arrow
|
14
|
+
RUN apt-get update -y \
|
15
|
+
&& wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb \
|
16
|
+
&& apt install -y ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb \
|
17
|
+
&& apt-get update -y \
|
18
|
+
&& apt-get install -y \
|
19
|
+
libarrow-dev \
|
20
|
+
libarrow-glib-dev \
|
21
|
+
&& apt-get clean
|
22
|
+
|
23
|
+
# Update bundler
|
24
|
+
RUN gem update bundler
|
25
|
+
|
26
|
+
WORKDIR /src
|
27
|
+
|
28
|
+
CMD ["/bin/bash"]
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
athena-udf (0.1.
|
4
|
+
athena-udf (0.1.3)
|
5
5
|
base64
|
6
6
|
csv
|
7
7
|
red-arrow (~> 12.0.1)
|
@@ -11,6 +11,7 @@ GEM
|
|
11
11
|
specs:
|
12
12
|
ast (2.4.2)
|
13
13
|
base64 (0.2.0)
|
14
|
+
benchmark (0.3.0)
|
14
15
|
bigdecimal (3.1.8)
|
15
16
|
csv (3.3.0)
|
16
17
|
diff-lcs (1.5.1)
|
@@ -44,8 +45,6 @@ GEM
|
|
44
45
|
native-package-installer
|
45
46
|
pkg-config
|
46
47
|
regexp_parser (2.9.2)
|
47
|
-
rexml (3.3.5)
|
48
|
-
strscan
|
49
48
|
rspec (3.13.0)
|
50
49
|
rspec-core (~> 3.13.0)
|
51
50
|
rspec-expectations (~> 3.13.0)
|
@@ -59,18 +58,17 @@ GEM
|
|
59
58
|
diff-lcs (>= 1.2.0, < 2.0)
|
60
59
|
rspec-support (~> 3.13.0)
|
61
60
|
rspec-support (3.13.1)
|
62
|
-
rubocop (1.
|
61
|
+
rubocop (1.66.0)
|
63
62
|
json (~> 2.3)
|
64
63
|
language_server-protocol (>= 3.17.0)
|
65
64
|
parallel (~> 1.10)
|
66
65
|
parser (>= 3.3.0.2)
|
67
66
|
rainbow (>= 2.2.2, < 4.0)
|
68
67
|
regexp_parser (>= 2.4, < 3.0)
|
69
|
-
|
70
|
-
rubocop-ast (>= 1.31.1, < 2.0)
|
68
|
+
rubocop-ast (>= 1.32.1, < 2.0)
|
71
69
|
ruby-progressbar (~> 1.7)
|
72
70
|
unicode-display_width (>= 2.4.0, < 3.0)
|
73
|
-
rubocop-ast (1.32.
|
71
|
+
rubocop-ast (1.32.2)
|
74
72
|
parser (>= 3.3.1.0)
|
75
73
|
ruby-progressbar (1.13.0)
|
76
74
|
simplecov (0.22.0)
|
@@ -80,7 +78,6 @@ GEM
|
|
80
78
|
simplecov-html (0.12.3)
|
81
79
|
simplecov-lcov (0.8.0)
|
82
80
|
simplecov_json_formatter (0.1.4)
|
83
|
-
strscan (3.1.0)
|
84
81
|
unicode-display_width (2.5.0)
|
85
82
|
|
86
83
|
PLATFORMS
|
@@ -89,6 +86,7 @@ PLATFORMS
|
|
89
86
|
|
90
87
|
DEPENDENCIES
|
91
88
|
athena-udf!
|
89
|
+
benchmark (~> 0.3.0)
|
92
90
|
gem-release (~> 2.2)
|
93
91
|
rake (~> 13.0)
|
94
92
|
rspec (~> 3.0)
|
data/README.md
CHANGED
@@ -83,6 +83,15 @@ $ aws iam attach-role-policy --role-name athena-udf-simple-varchar --policy-arn
|
|
83
83
|
$ aws lambda create-function --function-name athena-udf-simple-varchar --package-type Image --role arn:aws:iam::<ACCOUNT_ID>:role/athena-udf-simple-varchar --code ImageUri=<ACCOUNT_ID>.dkr.ecr.<AWS_REGION>.amazonaws.com/athena-udf-test:latest --publish
|
84
84
|
```
|
85
85
|
|
86
|
+
## Development
|
87
|
+
|
88
|
+
You can use the dev container image, which includes necessary packages, to develop this library.
|
89
|
+
|
90
|
+
```sh
|
91
|
+
$ docker build -t ruby-athena-udf-dev -f Dockerfile.dev .
|
92
|
+
$ docker run -v $PWD:/src -it ruby-athena-udf-dev
|
93
|
+
```
|
94
|
+
|
86
95
|
## Contributing
|
87
96
|
|
88
97
|
Bug reports and pull requests are welcome on GitHub at https://github.com/dtaniwaki/ruby-athena-udf.
|
data/lib/athena-udf/base_udf.rb
CHANGED
@@ -9,33 +9,43 @@ require_relative 'utils'
|
|
9
9
|
|
10
10
|
module AthenaUDF
|
11
11
|
class BaseUDF
|
12
|
-
|
12
|
+
include AthenaUDF::Utils
|
13
13
|
|
14
|
-
|
15
|
-
@@logger.level = Logger.const_get(ENV.fetch('LOG_LEVEL', 'WARN').upcase)
|
14
|
+
attr_reader :logger
|
16
15
|
|
17
|
-
def self.lambda_handler(event:, context:)
|
16
|
+
def self.lambda_handler(event:, context:)
|
17
|
+
instance = new(event:, context:)
|
18
18
|
incoming_type = event['@type']
|
19
19
|
if incoming_type == 'PingRequest'
|
20
|
-
return handle_ping(event)
|
20
|
+
return instance.handle_ping(event)
|
21
21
|
elsif incoming_type == 'UserDefinedFunctionRequest'
|
22
|
-
return handle_udf_request(event)
|
22
|
+
return instance.handle_udf_request(event)
|
23
23
|
end
|
24
24
|
|
25
25
|
raise "Unknown event type #{incoming_type} from Athena"
|
26
26
|
end
|
27
27
|
|
28
|
-
|
28
|
+
# About capabilities: https://github.com/awslabs/aws-athena-query-federation/blob/f52d929a109099a1e7180fa242e26331137ed84c/athena-federation-sdk/src/main/java/com/amazonaws/athena/connector/lambda/handlers/FederationCapabilities.java#L29-L32
|
29
|
+
def self.capabilities
|
30
|
+
1
|
31
|
+
end
|
32
|
+
|
33
|
+
def initialize(event:, context:) # rubocop:disable Lint/UnusedMethodArgument
|
34
|
+
@logger = Logger.new($stdout)
|
35
|
+
@logger.level = Logger.const_get(ENV.fetch('LOG_LEVEL', 'WARN').upcase)
|
36
|
+
end
|
37
|
+
|
38
|
+
def handle_ping(event)
|
29
39
|
{
|
30
40
|
'@type' => 'PingResponse',
|
31
41
|
'catalogName' => 'event',
|
32
42
|
'queryId' => event['queryId'],
|
33
43
|
'sourceType' => 'athena_udf',
|
34
|
-
'capabilities' => capabilities,
|
44
|
+
'capabilities' => self.class.capabilities,
|
35
45
|
}
|
36
46
|
end
|
37
47
|
|
38
|
-
def self.handle_udf_request(event)
|
48
|
+
def handle_udf_request(event)
|
39
49
|
# Cannot find a way to write Arrow::RecordBatch to a buffer directly in Ruby.
|
40
50
|
|
41
51
|
output_schema = read_schema(Base64.decode64(event['outputSchema']['schema']))
|
@@ -66,17 +76,8 @@ module AthenaUDF
|
|
66
76
|
}
|
67
77
|
end
|
68
78
|
|
69
|
-
|
70
|
-
def self.capabilities
|
71
|
-
1
|
72
|
-
end
|
73
|
-
|
74
|
-
def self.handle_athena_record(input_schema, output_schema, records)
|
79
|
+
def handle_athena_record(input_schema, output_schema, records)
|
75
80
|
raise NotImplementedError
|
76
81
|
end
|
77
|
-
|
78
|
-
def self.logger
|
79
|
-
@@logger
|
80
|
-
end
|
81
82
|
end
|
82
83
|
end
|
data/lib/athena-udf/utils.rb
CHANGED
@@ -30,11 +30,11 @@ module AthenaUDF
|
|
30
30
|
end
|
31
31
|
end
|
32
32
|
|
33
|
-
def get_schema_bytes(schema
|
33
|
+
def get_schema_bytes(schema)
|
34
34
|
buffer = Arrow::ResizableBuffer.new(0)
|
35
35
|
Arrow::BufferOutputStream.open(buffer) do |output|
|
36
36
|
Arrow::RecordBatchStreamWriter.open(output, schema) do |writer|
|
37
|
-
|
37
|
+
# noop
|
38
38
|
end
|
39
39
|
|
40
40
|
bytes = buffer.data.to_s
|
@@ -61,12 +61,13 @@ module AthenaUDF
|
|
61
61
|
found_count = 0
|
62
62
|
start_index = 0
|
63
63
|
0.upto(size - 4).each do |i|
|
64
|
-
has_ffff = bytes.slice(i, 4) ==
|
64
|
+
has_ffff = bytes.slice(i, 4) == "\xFF\xFF\xFF\xFF".b
|
65
|
+
|
65
66
|
found_count += 1 if has_ffff
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
67
|
+
next unless found_count == 2
|
68
|
+
|
69
|
+
start_index = i + 4
|
70
|
+
break
|
70
71
|
end
|
71
72
|
|
72
73
|
start_index
|
data/lib/athena-udf/version.rb
CHANGED
data/scripts/benchmark.rb
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'benchmark'
|
4
|
+
require 'athena_udf'
|
5
|
+
|
6
|
+
Benchmark.bm 10 do |r|
|
7
|
+
include AthenaUDF::Utils
|
8
|
+
|
9
|
+
instance = Class.new(AthenaUDF::BaseUDF) do
|
10
|
+
def handle_athena_record(_input_schema, _output_schema, record)
|
11
|
+
record.to_a
|
12
|
+
end
|
13
|
+
end.new(event: {}, context: {})
|
14
|
+
|
15
|
+
input_schema_1 = Arrow::Schema.new("0": :string)
|
16
|
+
input_schema_bytes_1 = get_schema_bytes(input_schema_1)
|
17
|
+
Arrow::Schema.new(0.upto(100).map { |n| [n.to_s, :string] }.to_h)
|
18
|
+
input_schema_bytes_100 = get_schema_bytes(input_schema_1)
|
19
|
+
|
20
|
+
input_table1_1 = Arrow::Table.new(input_schema_1, [['FooBar']])
|
21
|
+
input_records_bytes_1_1 = get_record_batch_bytes(input_schema_1, input_table1_1.each_record_batch.first)
|
22
|
+
event_1_1 = {
|
23
|
+
'@type' => 'UserDefinedFunctionRequest',
|
24
|
+
'inputRecords' => {
|
25
|
+
'schema' => Base64.strict_encode64(input_schema_bytes_1),
|
26
|
+
'records' => Base64.strict_encode64(input_records_bytes_1_1),
|
27
|
+
},
|
28
|
+
'methodName' => 'lower',
|
29
|
+
'outputSchema' => {
|
30
|
+
'schema' => Base64.strict_encode64(input_schema_bytes_1),
|
31
|
+
},
|
32
|
+
'functionType' => 'SCALAR',
|
33
|
+
}
|
34
|
+
|
35
|
+
r.report '1 record 1 column' do
|
36
|
+
instance.handle_udf_request(event_1_1)
|
37
|
+
end
|
38
|
+
|
39
|
+
input_table100_1 = Arrow::Table.new(input_schema_1, [['FooBar']] * 100)
|
40
|
+
input_records_bytes_100_1 = get_record_batch_bytes(input_schema_1, input_table100_1.each_record_batch.first)
|
41
|
+
event_100 = {
|
42
|
+
'@type' => 'UserDefinedFunctionRequest',
|
43
|
+
'inputRecords' => {
|
44
|
+
'schema' => Base64.strict_encode64(input_schema_bytes_1),
|
45
|
+
'records' => Base64.strict_encode64(input_records_bytes_100_1),
|
46
|
+
},
|
47
|
+
'methodName' => 'lower',
|
48
|
+
'outputSchema' => {
|
49
|
+
'schema' => Base64.strict_encode64(input_schema_bytes_1),
|
50
|
+
},
|
51
|
+
'functionType' => 'SCALAR',
|
52
|
+
}
|
53
|
+
|
54
|
+
r.report '100 records 1 column' do
|
55
|
+
instance.handle_udf_request(event_100)
|
56
|
+
end
|
57
|
+
|
58
|
+
input_table1_100 = Arrow::Table.new(input_schema_1, [['FooBar']])
|
59
|
+
input_records_bytes_1_100 = get_record_batch_bytes(input_schema_1, input_table1_100.each_record_batch.first)
|
60
|
+
event_1_100 = {
|
61
|
+
'@type' => 'UserDefinedFunctionRequest',
|
62
|
+
'inputRecords' => {
|
63
|
+
'schema' => Base64.strict_encode64(input_schema_bytes_100),
|
64
|
+
'records' => Base64.strict_encode64(input_records_bytes_1_100),
|
65
|
+
},
|
66
|
+
'methodName' => 'lower',
|
67
|
+
'outputSchema' => {
|
68
|
+
'schema' => Base64.strict_encode64(input_schema_bytes_100),
|
69
|
+
},
|
70
|
+
'functionType' => 'SCALAR',
|
71
|
+
}
|
72
|
+
|
73
|
+
r.report '1 record 100 column' do
|
74
|
+
instance.handle_udf_request(event_1_100)
|
75
|
+
end
|
76
|
+
|
77
|
+
input_table_100_100 = Arrow::Table.new(input_schema_1, [['FooBar']])
|
78
|
+
input_records_bytes_100_100 = get_record_batch_bytes(input_schema_1, input_table_100_100.each_record_batch.first)
|
79
|
+
event_100_100 = {
|
80
|
+
'@type' => 'UserDefinedFunctionRequest',
|
81
|
+
'inputRecords' => {
|
82
|
+
'schema' => Base64.strict_encode64(input_schema_bytes_100),
|
83
|
+
'records' => Base64.strict_encode64(input_records_bytes_100_100),
|
84
|
+
},
|
85
|
+
'methodName' => 'lower',
|
86
|
+
'outputSchema' => {
|
87
|
+
'schema' => Base64.strict_encode64(input_schema_bytes_100),
|
88
|
+
},
|
89
|
+
'functionType' => 'SCALAR',
|
90
|
+
}
|
91
|
+
|
92
|
+
r.report '100 record 100 column' do
|
93
|
+
instance.handle_udf_request(event_100_100)
|
94
|
+
end
|
95
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: athena-udf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Daisuke Taniwaki
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-09-
|
11
|
+
date: 2024-09-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: base64
|
@@ -62,6 +62,7 @@ files:
|
|
62
62
|
- ".dockerignore"
|
63
63
|
- ".rspec"
|
64
64
|
- ".rubocop.yml"
|
65
|
+
- Dockerfile.dev
|
65
66
|
- Dockerfile.example
|
66
67
|
- Gemfile
|
67
68
|
- Gemfile.lock
|
@@ -73,6 +74,7 @@ files:
|
|
73
74
|
- lib/athena-udf/utils.rb
|
74
75
|
- lib/athena-udf/version.rb
|
75
76
|
- lib/athena_udf.rb
|
77
|
+
- scripts/benchmark.rb
|
76
78
|
homepage: https://github.com/dtaniwaki/ruby-athena-udf
|
77
79
|
licenses:
|
78
80
|
- MIT
|
@@ -94,7 +96,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
94
96
|
- !ruby/object:Gem::Version
|
95
97
|
version: '0'
|
96
98
|
requirements: []
|
97
|
-
rubygems_version: 3.
|
99
|
+
rubygems_version: 3.4.19
|
98
100
|
signing_key:
|
99
101
|
specification_version: 4
|
100
102
|
summary: Ruby-version Athena UDF
|