athena-udf 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Dockerfile.dev +4 -0
- data/Dockerfile.example +29 -12
- data/Gemfile.lock +33 -29
- data/README.md +1 -1
- data/lib/athena-udf/utils.rb +6 -20
- data/lib/athena-udf/version.rb +1 -1
- data/scripts/benchmark.rb +26 -16
- metadata +13 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9e5ff4cb6e2f166491b1bb18cc2be70b9de602852ff9535eb7b888de86d46569
|
4
|
+
data.tar.gz: df66a5f63ea2ac6a036d4f47befd0dc55ae0f8487de80a8d7d1da8d38606307d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 18e43fce15698f0fc1dd0160cb0098887cf78085ca13774e72819cc89369b3fd466b6bf27a216e1ef1ff863caa412881f0fab4ba5df9af62b6cc0ddfa77f2e85
|
7
|
+
data.tar.gz: eba9d3b48c2adabe2c02c6a22686e0f8b7a4ee35f535738d9a72a7384948765c14a4d978e3061b5d43e75610bab7df8580dd8cbcb9a270bd025a7bb8f0392d5a
|
data/Dockerfile.dev
CHANGED
data/Dockerfile.example
CHANGED
@@ -1,20 +1,36 @@
|
|
1
|
-
|
1
|
+
# Can NOT install apache-arrow on the amazonlinux:2023-minimal image,
|
2
|
+
# so install the ruby directly on amazonlinux:2023.
|
3
|
+
FROM public.ecr.aws/amazonlinux/amazonlinux:2023 AS builder
|
4
|
+
|
5
|
+
# Apache Arrow
|
6
|
+
RUN dnf upgrade -y --releasever=latest \
|
7
|
+
&& dnf install -y https://apache.jfrog.io/artifactory/arrow/amazon-linux/$(cut -d: -f6 /etc/system-release-cpe)/apache-arrow-release-latest.rpm \
|
8
|
+
&& dnf install -y arrow-devel arrow-glib-devel arrow-dataset-devel arrow-dataset-glib-devel
|
9
|
+
|
10
|
+
############
|
11
|
+
|
12
|
+
FROM public.ecr.aws/lambda/ruby:3.3
|
13
|
+
|
14
|
+
COPY --from=builder /usr/include/parquet/arrow/ /usr/include/parquet/arrow/
|
15
|
+
COPY --from=builder /usr/include/arrow* /usr/include/
|
16
|
+
COPY --from=builder /usr/share/gdb/auto-load/usr/lib64/libarrow.so* /usr/share/gdb/auto-load/usr/lib64/
|
17
|
+
COPY --from=builder /usr/share/licenses/arrow* /usr/share/licenses/
|
18
|
+
COPY --from=builder /usr/share/doc/arrow* /usr/share/doc/
|
19
|
+
COPY --from=builder /usr/share/arrow* /usr/share/
|
20
|
+
COPY --from=builder /usr/share/vala/vapi/arrow* /usr/share/vala/vapi/
|
21
|
+
COPY --from=builder /usr/lib64/libarrow* /usr/lib64/
|
22
|
+
COPY --from=builder /usr/lib64/pkgconfig/arrow* /usr/lib64/pkgconfig/
|
23
|
+
COPY --from=builder /usr/lib64/libarrow* /usr/lib64/libarrow/
|
24
|
+
COPY --from=builder /usr/lib64/cmake/Arrow/ /usr/lib64/cmake/Arrow/
|
2
25
|
|
3
26
|
# General packages
|
4
|
-
RUN
|
5
|
-
&&
|
6
|
-
amazon-linux-extras \
|
27
|
+
RUN dnf upgrade -y --releasever=latest \
|
28
|
+
&& dnf install -y \
|
7
29
|
gcc-c++ \
|
30
|
+
glib2-devel \
|
8
31
|
make \
|
9
32
|
git \
|
10
|
-
&&
|
11
|
-
&& yum clean all
|
12
|
-
|
13
|
-
# Apache Arrow
|
14
|
-
RUN yum update -y \
|
15
|
-
&& yum install -y https://apache.jfrog.io/artifactory/arrow/amazon-linux/2/apache-arrow-release-latest.rpm \
|
16
|
-
&& yum install -y --enablerepo=epel arrow-devel arrow-glib-devel arrow-dataset-devel arrow-dataset-glib-devel \
|
17
|
-
&& yum clean all
|
33
|
+
&& dnf clean all
|
18
34
|
|
19
35
|
# Update bundler
|
20
36
|
RUN gem update bundler
|
@@ -29,3 +45,4 @@ RUN bundle config set --local without development \
|
|
29
45
|
COPY . ${LAMBDA_TASK_ROOT}
|
30
46
|
|
31
47
|
CMD ["example.SimpleVarcharUDF.lambda_handler"]
|
48
|
+
|
data/Gemfile.lock
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
athena-udf (0.
|
4
|
+
athena-udf (0.2.0)
|
5
5
|
base64
|
6
6
|
csv
|
7
|
-
red-arrow (
|
7
|
+
red-arrow (< 20.0.0)
|
8
8
|
|
9
9
|
GEM
|
10
10
|
remote: https://rubygems.org/
|
@@ -12,29 +12,30 @@ GEM
|
|
12
12
|
ast (2.4.2)
|
13
13
|
base64 (0.2.0)
|
14
14
|
benchmark (0.3.0)
|
15
|
-
bigdecimal (3.1.
|
16
|
-
csv (3.3.
|
17
|
-
diff-lcs (1.
|
15
|
+
bigdecimal (3.1.9)
|
16
|
+
csv (3.3.2)
|
17
|
+
diff-lcs (1.6.0)
|
18
18
|
docile (1.4.1)
|
19
19
|
extpp (0.1.1)
|
20
|
-
fiddle (1.1.
|
21
|
-
gem-release (2.2.
|
22
|
-
gio2 (4.2.
|
20
|
+
fiddle (1.1.6)
|
21
|
+
gem-release (2.2.4)
|
22
|
+
gio2 (4.2.7)
|
23
23
|
fiddle
|
24
|
-
gobject-introspection (= 4.2.
|
25
|
-
glib2 (4.2.
|
24
|
+
gobject-introspection (= 4.2.7)
|
25
|
+
glib2 (4.2.7)
|
26
26
|
native-package-installer (>= 1.0.3)
|
27
27
|
pkg-config (>= 1.3.5)
|
28
|
-
gobject-introspection (4.2.
|
29
|
-
glib2 (= 4.2.
|
30
|
-
json (2.
|
31
|
-
language_server-protocol (3.17.0.
|
28
|
+
gobject-introspection (4.2.7)
|
29
|
+
glib2 (= 4.2.7)
|
30
|
+
json (2.10.2)
|
31
|
+
language_server-protocol (3.17.0.4)
|
32
|
+
lint_roller (1.1.0)
|
32
33
|
native-package-installer (1.1.9)
|
33
34
|
parallel (1.26.3)
|
34
|
-
parser (3.3.
|
35
|
+
parser (3.3.7.1)
|
35
36
|
ast (~> 2.4.1)
|
36
37
|
racc
|
37
|
-
pkg-config (1.
|
38
|
+
pkg-config (1.6.0)
|
38
39
|
racc (1.8.1)
|
39
40
|
rainbow (3.1.1)
|
40
41
|
rake (13.2.1)
|
@@ -44,41 +45,44 @@ GEM
|
|
44
45
|
gio2 (>= 3.5.0)
|
45
46
|
native-package-installer
|
46
47
|
pkg-config
|
47
|
-
regexp_parser (2.
|
48
|
+
regexp_parser (2.10.0)
|
48
49
|
rspec (3.13.0)
|
49
50
|
rspec-core (~> 3.13.0)
|
50
51
|
rspec-expectations (~> 3.13.0)
|
51
52
|
rspec-mocks (~> 3.13.0)
|
52
|
-
rspec-core (3.13.
|
53
|
+
rspec-core (3.13.3)
|
53
54
|
rspec-support (~> 3.13.0)
|
54
|
-
rspec-expectations (3.13.
|
55
|
+
rspec-expectations (3.13.3)
|
55
56
|
diff-lcs (>= 1.2.0, < 2.0)
|
56
57
|
rspec-support (~> 3.13.0)
|
57
|
-
rspec-mocks (3.13.
|
58
|
+
rspec-mocks (3.13.2)
|
58
59
|
diff-lcs (>= 1.2.0, < 2.0)
|
59
60
|
rspec-support (~> 3.13.0)
|
60
|
-
rspec-support (3.13.
|
61
|
-
rubocop (1.
|
61
|
+
rspec-support (3.13.2)
|
62
|
+
rubocop (1.74.0)
|
62
63
|
json (~> 2.3)
|
63
|
-
language_server-protocol (
|
64
|
+
language_server-protocol (~> 3.17.0.2)
|
65
|
+
lint_roller (~> 1.1.0)
|
64
66
|
parallel (~> 1.10)
|
65
67
|
parser (>= 3.3.0.2)
|
66
68
|
rainbow (>= 2.2.2, < 4.0)
|
67
|
-
regexp_parser (>= 2.
|
68
|
-
rubocop-ast (>= 1.
|
69
|
+
regexp_parser (>= 2.9.3, < 3.0)
|
70
|
+
rubocop-ast (>= 1.38.0, < 2.0)
|
69
71
|
ruby-progressbar (~> 1.7)
|
70
|
-
unicode-display_width (>= 2.4.0, <
|
71
|
-
rubocop-ast (1.
|
72
|
+
unicode-display_width (>= 2.4.0, < 4.0)
|
73
|
+
rubocop-ast (1.38.1)
|
72
74
|
parser (>= 3.3.1.0)
|
73
75
|
ruby-progressbar (1.13.0)
|
74
76
|
simplecov (0.22.0)
|
75
77
|
docile (~> 1.1)
|
76
78
|
simplecov-html (~> 0.11)
|
77
79
|
simplecov_json_formatter (~> 0.1)
|
78
|
-
simplecov-html (0.
|
80
|
+
simplecov-html (0.13.1)
|
79
81
|
simplecov-lcov (0.8.0)
|
80
82
|
simplecov_json_formatter (0.1.4)
|
81
|
-
unicode-display_width (
|
83
|
+
unicode-display_width (3.1.4)
|
84
|
+
unicode-emoji (~> 4.0, >= 4.0.4)
|
85
|
+
unicode-emoji (4.0.4)
|
82
86
|
|
83
87
|
PLATFORMS
|
84
88
|
arm64-darwin-23
|
data/README.md
CHANGED
@@ -31,7 +31,7 @@ Just make a subclass of `AthenaUDF::BaseUDF` and implement a concrete function l
|
|
31
31
|
require "athena-udf"
|
32
32
|
|
33
33
|
class SimpleVarcharUDF < AthenaUDF::BaseUDF
|
34
|
-
def
|
34
|
+
def handle_athena_record(_input_schema, _output_schema, record)
|
35
35
|
[record[0].downcase]
|
36
36
|
end
|
37
37
|
end
|
data/lib/athena-udf/utils.rb
CHANGED
@@ -2,6 +2,9 @@
|
|
2
2
|
|
3
3
|
module AthenaUDF
|
4
4
|
module Utils
|
5
|
+
SEPARATOR = "\xFF\xFF\xFF\xFF".b
|
6
|
+
SEPARATOR_SIZE = SEPARATOR.bytesize
|
7
|
+
|
5
8
|
def read_record_batches(schema_data, record_batch_data)
|
6
9
|
buffer = Arrow::ResizableBuffer.new(schema_data.bytes.size + record_batch_data.bytes.size)
|
7
10
|
Arrow::BufferOutputStream.open(buffer) do |output|
|
@@ -38,8 +41,8 @@ module AthenaUDF
|
|
38
41
|
end
|
39
42
|
|
40
43
|
bytes = buffer.data.to_s
|
41
|
-
|
42
|
-
bytes[
|
44
|
+
last_index = bytes.index(SEPARATOR, SEPARATOR_SIZE)
|
45
|
+
bytes[SEPARATOR_SIZE...last_index]
|
43
46
|
end
|
44
47
|
end
|
45
48
|
|
@@ -51,26 +54,9 @@ module AthenaUDF
|
|
51
54
|
end
|
52
55
|
|
53
56
|
bytes = buffer.data.to_s
|
54
|
-
start_index =
|
57
|
+
start_index = bytes.index(SEPARATOR, SEPARATOR_SIZE) + SEPARATOR_SIZE
|
55
58
|
bytes[start_index..]
|
56
59
|
end
|
57
60
|
end
|
58
|
-
|
59
|
-
def get_record_batch_index(bytes)
|
60
|
-
size = bytes.size
|
61
|
-
found_count = 0
|
62
|
-
start_index = 0
|
63
|
-
0.upto(size - 4).each do |i|
|
64
|
-
has_ffff = bytes.slice(i, 4) == "\xFF\xFF\xFF\xFF".b
|
65
|
-
|
66
|
-
found_count += 1 if has_ffff
|
67
|
-
next unless found_count == 2
|
68
|
-
|
69
|
-
start_index = i + 4
|
70
|
-
break
|
71
|
-
end
|
72
|
-
|
73
|
-
start_index
|
74
|
-
end
|
75
61
|
end
|
76
62
|
end
|
data/lib/athena-udf/version.rb
CHANGED
data/scripts/benchmark.rb
CHANGED
@@ -3,19 +3,21 @@
|
|
3
3
|
require 'benchmark'
|
4
4
|
require 'athena_udf'
|
5
5
|
|
6
|
-
Benchmark.bm
|
6
|
+
Benchmark.bm 20 do |r|
|
7
7
|
include AthenaUDF::Utils
|
8
8
|
|
9
9
|
instance = Class.new(AthenaUDF::BaseUDF) do
|
10
10
|
def handle_athena_record(_input_schema, _output_schema, record)
|
11
|
-
record
|
11
|
+
[record[0]]
|
12
12
|
end
|
13
13
|
end.new(event: {}, context: {})
|
14
14
|
|
15
15
|
input_schema_1 = Arrow::Schema.new("0": :string)
|
16
16
|
input_schema_bytes_1 = get_schema_bytes(input_schema_1)
|
17
|
-
Arrow::Schema.new(0.upto(100).map { |n| [n.to_s, :string] }.to_h)
|
18
|
-
input_schema_bytes_100 = get_schema_bytes(
|
17
|
+
input_schema_100 = Arrow::Schema.new(0.upto(100).map { |n| [n.to_s, :string] }.to_h)
|
18
|
+
input_schema_bytes_100 = get_schema_bytes(input_schema_100)
|
19
|
+
output_schema = Arrow::Schema.new("0": :string)
|
20
|
+
output_schema_bytes = get_schema_bytes(output_schema)
|
19
21
|
|
20
22
|
input_table1_1 = Arrow::Table.new(input_schema_1, [['FooBar']])
|
21
23
|
input_records_bytes_1_1 = get_record_batch_bytes(input_schema_1, input_table1_1.each_record_batch.first)
|
@@ -27,13 +29,15 @@ Benchmark.bm 10 do |r|
|
|
27
29
|
},
|
28
30
|
'methodName' => 'lower',
|
29
31
|
'outputSchema' => {
|
30
|
-
'schema' => Base64.strict_encode64(
|
32
|
+
'schema' => Base64.strict_encode64(output_schema_bytes),
|
31
33
|
},
|
32
34
|
'functionType' => 'SCALAR',
|
33
35
|
}
|
34
36
|
|
35
37
|
r.report '1 record 1 column' do
|
36
|
-
|
38
|
+
100.times do
|
39
|
+
instance.handle_udf_request(event_1_1)
|
40
|
+
end
|
37
41
|
end
|
38
42
|
|
39
43
|
input_table100_1 = Arrow::Table.new(input_schema_1, [['FooBar']] * 100)
|
@@ -46,17 +50,19 @@ Benchmark.bm 10 do |r|
|
|
46
50
|
},
|
47
51
|
'methodName' => 'lower',
|
48
52
|
'outputSchema' => {
|
49
|
-
'schema' => Base64.strict_encode64(
|
53
|
+
'schema' => Base64.strict_encode64(output_schema_bytes),
|
50
54
|
},
|
51
55
|
'functionType' => 'SCALAR',
|
52
56
|
}
|
53
57
|
|
54
58
|
r.report '100 records 1 column' do
|
55
|
-
|
59
|
+
100.times do
|
60
|
+
instance.handle_udf_request(event_100)
|
61
|
+
end
|
56
62
|
end
|
57
63
|
|
58
|
-
input_table1_100 = Arrow::Table.new(
|
59
|
-
input_records_bytes_1_100 = get_record_batch_bytes(
|
64
|
+
input_table1_100 = Arrow::Table.new(input_schema_100, [['FooBar'] * 100])
|
65
|
+
input_records_bytes_1_100 = get_record_batch_bytes(input_schema_100, input_table1_100.each_record_batch.first)
|
60
66
|
event_1_100 = {
|
61
67
|
'@type' => 'UserDefinedFunctionRequest',
|
62
68
|
'inputRecords' => {
|
@@ -65,17 +71,19 @@ Benchmark.bm 10 do |r|
|
|
65
71
|
},
|
66
72
|
'methodName' => 'lower',
|
67
73
|
'outputSchema' => {
|
68
|
-
'schema' => Base64.strict_encode64(
|
74
|
+
'schema' => Base64.strict_encode64(output_schema_bytes),
|
69
75
|
},
|
70
76
|
'functionType' => 'SCALAR',
|
71
77
|
}
|
72
78
|
|
73
79
|
r.report '1 record 100 column' do
|
74
|
-
|
80
|
+
100.times do
|
81
|
+
instance.handle_udf_request(event_1_100)
|
82
|
+
end
|
75
83
|
end
|
76
84
|
|
77
|
-
input_table_100_100 = Arrow::Table.new(
|
78
|
-
input_records_bytes_100_100 = get_record_batch_bytes(
|
85
|
+
input_table_100_100 = Arrow::Table.new(input_schema_100, [['FooBar'] * 100] * 100)
|
86
|
+
input_records_bytes_100_100 = get_record_batch_bytes(input_schema_100, input_table_100_100.each_record_batch.first)
|
79
87
|
event_100_100 = {
|
80
88
|
'@type' => 'UserDefinedFunctionRequest',
|
81
89
|
'inputRecords' => {
|
@@ -84,12 +92,14 @@ Benchmark.bm 10 do |r|
|
|
84
92
|
},
|
85
93
|
'methodName' => 'lower',
|
86
94
|
'outputSchema' => {
|
87
|
-
'schema' => Base64.strict_encode64(
|
95
|
+
'schema' => Base64.strict_encode64(output_schema_bytes),
|
88
96
|
},
|
89
97
|
'functionType' => 'SCALAR',
|
90
98
|
}
|
91
99
|
|
92
100
|
r.report '100 record 100 column' do
|
93
|
-
|
101
|
+
100.times do
|
102
|
+
instance.handle_udf_request(event_100_100)
|
103
|
+
end
|
94
104
|
end
|
95
105
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: athena-udf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Daisuke Taniwaki
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2025-03-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: base64
|
@@ -42,16 +42,22 @@ dependencies:
|
|
42
42
|
name: red-arrow
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - "
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 12.0.0
|
48
|
+
- - "<"
|
46
49
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
50
|
+
version: 20.0.0
|
48
51
|
type: :runtime
|
49
52
|
prerelease: false
|
50
53
|
version_requirements: !ruby/object:Gem::Requirement
|
51
54
|
requirements:
|
52
|
-
- - "
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: 12.0.0
|
58
|
+
- - "<"
|
53
59
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
60
|
+
version: 20.0.0
|
55
61
|
description: ''
|
56
62
|
email:
|
57
63
|
- daisuketaniwaki@gmail.com
|
@@ -96,7 +102,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
96
102
|
- !ruby/object:Gem::Version
|
97
103
|
version: '0'
|
98
104
|
requirements: []
|
99
|
-
rubygems_version: 3.
|
105
|
+
rubygems_version: 3.5.11
|
100
106
|
signing_key:
|
101
107
|
specification_version: 4
|
102
108
|
summary: Ruby-version Athena UDF
|