athena-udf 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9cc2a9e47dee420f0b11442039d1873aca9ad04ecf12775dbe8ea49df3153f02
4
- data.tar.gz: 54d15b93ffc2fb58793aad53d03c69608e607c00d0f8da3f5210b47860a5aecc
3
+ metadata.gz: 9e5ff4cb6e2f166491b1bb18cc2be70b9de602852ff9535eb7b888de86d46569
4
+ data.tar.gz: df66a5f63ea2ac6a036d4f47befd0dc55ae0f8487de80a8d7d1da8d38606307d
5
5
  SHA512:
6
- metadata.gz: b0f36011d582e681d575fe8d6f33a6cf9935519ed9bb21953dac980aa1c479c7d943992bbdd0e992f053f32eee6b228f473d772e4e698c3517370009fe49d3b6
7
- data.tar.gz: 318b411aa1949b998c1a34176ca049cbccea541861a89931b1a3f71730ab45fec383e03549ec941f7a48305d4ad1e9c0de419d9a46b8e5469205467772df8598
6
+ metadata.gz: 18e43fce15698f0fc1dd0160cb0098887cf78085ca13774e72819cc89369b3fd466b6bf27a216e1ef1ff863caa412881f0fab4ba5df9af62b6cc0ddfa77f2e85
7
+ data.tar.gz: eba9d3b48c2adabe2c02c6a22686e0f8b7a4ee35f535738d9a72a7384948765c14a4d978e3061b5d43e75610bab7df8580dd8cbcb9a270bd025a7bb8f0392d5a
data/Dockerfile.dev CHANGED
@@ -25,4 +25,8 @@ RUN gem update bundler
25
25
 
26
26
  WORKDIR /src
27
27
 
28
+ COPY Gemfile Gemfile.lock athena-udf.gemspec /src
29
+ COPY lib/athena-udf/version.rb /src/lib/athena-udf/version.rb
30
+ RUN bundle install
31
+
28
32
  CMD ["/bin/bash"]
data/Dockerfile.example CHANGED
@@ -1,20 +1,36 @@
1
- FROM public.ecr.aws/lambda/ruby:3.2
1
+ # Can NOT install apache-arrow on the amazonlinux:2023-minimal image,
2
+ # so install the ruby directly on amazonlinux:2023.
3
+ FROM public.ecr.aws/amazonlinux/amazonlinux:2023 AS builder
4
+
5
+ # Apache Arrow
6
+ RUN dnf upgrade -y --releasever=latest \
7
+ && dnf install -y https://apache.jfrog.io/artifactory/arrow/amazon-linux/$(cut -d: -f6 /etc/system-release-cpe)/apache-arrow-release-latest.rpm \
8
+ && dnf install -y arrow-devel arrow-glib-devel arrow-dataset-devel arrow-dataset-glib-devel
9
+
10
+ ############
11
+
12
+ FROM public.ecr.aws/lambda/ruby:3.3
13
+
14
+ COPY --from=builder /usr/include/parquet/arrow/ /usr/include/parquet/arrow/
15
+ COPY --from=builder /usr/include/arrow* /usr/include/
16
+ COPY --from=builder /usr/share/gdb/auto-load/usr/lib64/libarrow.so* /usr/share/gdb/auto-load/usr/lib64/
17
+ COPY --from=builder /usr/share/licenses/arrow* /usr/share/licenses/
18
+ COPY --from=builder /usr/share/doc/arrow* /usr/share/doc/
19
+ COPY --from=builder /usr/share/arrow* /usr/share/
20
+ COPY --from=builder /usr/share/vala/vapi/arrow* /usr/share/vala/vapi/
21
+ COPY --from=builder /usr/lib64/libarrow* /usr/lib64/
22
+ COPY --from=builder /usr/lib64/pkgconfig/arrow* /usr/lib64/pkgconfig/
23
+ COPY --from=builder /usr/lib64/libarrow* /usr/lib64/libarrow/
24
+ COPY --from=builder /usr/lib64/cmake/Arrow/ /usr/lib64/cmake/Arrow/
2
25
 
3
26
  # General packages
4
- RUN yum update -y \
5
- && yum install -y \
6
- amazon-linux-extras \
27
+ RUN dnf upgrade -y --releasever=latest \
28
+ && dnf install -y \
7
29
  gcc-c++ \
30
+ glib2-devel \
8
31
  make \
9
32
  git \
10
- && amazon-linux-extras install -y epel \
11
- && yum clean all
12
-
13
- # Apache Arrow
14
- RUN yum update -y \
15
- && yum install -y https://apache.jfrog.io/artifactory/arrow/amazon-linux/2/apache-arrow-release-latest.rpm \
16
- && yum install -y --enablerepo=epel arrow-devel arrow-glib-devel arrow-dataset-devel arrow-dataset-glib-devel \
17
- && yum clean all
33
+ && dnf clean all
18
34
 
19
35
  # Update bundler
20
36
  RUN gem update bundler
@@ -29,3 +45,4 @@ RUN bundle config set --local without development \
29
45
  COPY . ${LAMBDA_TASK_ROOT}
30
46
 
31
47
  CMD ["example.SimpleVarcharUDF.lambda_handler"]
48
+
data/Gemfile.lock CHANGED
@@ -1,10 +1,10 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- athena-udf (0.1.3)
4
+ athena-udf (0.2.0)
5
5
  base64
6
6
  csv
7
- red-arrow (~> 12.0.1)
7
+ red-arrow (< 20.0.0)
8
8
 
9
9
  GEM
10
10
  remote: https://rubygems.org/
@@ -12,29 +12,30 @@ GEM
12
12
  ast (2.4.2)
13
13
  base64 (0.2.0)
14
14
  benchmark (0.3.0)
15
- bigdecimal (3.1.8)
16
- csv (3.3.0)
17
- diff-lcs (1.5.1)
15
+ bigdecimal (3.1.9)
16
+ csv (3.3.2)
17
+ diff-lcs (1.6.0)
18
18
  docile (1.4.1)
19
19
  extpp (0.1.1)
20
- fiddle (1.1.2)
21
- gem-release (2.2.2)
22
- gio2 (4.2.2)
20
+ fiddle (1.1.6)
21
+ gem-release (2.2.4)
22
+ gio2 (4.2.7)
23
23
  fiddle
24
- gobject-introspection (= 4.2.2)
25
- glib2 (4.2.2)
24
+ gobject-introspection (= 4.2.7)
25
+ glib2 (4.2.7)
26
26
  native-package-installer (>= 1.0.3)
27
27
  pkg-config (>= 1.3.5)
28
- gobject-introspection (4.2.2)
29
- glib2 (= 4.2.2)
30
- json (2.7.2)
31
- language_server-protocol (3.17.0.3)
28
+ gobject-introspection (4.2.7)
29
+ glib2 (= 4.2.7)
30
+ json (2.10.2)
31
+ language_server-protocol (3.17.0.4)
32
+ lint_roller (1.1.0)
32
33
  native-package-installer (1.1.9)
33
34
  parallel (1.26.3)
34
- parser (3.3.4.2)
35
+ parser (3.3.7.1)
35
36
  ast (~> 2.4.1)
36
37
  racc
37
- pkg-config (1.5.6)
38
+ pkg-config (1.6.0)
38
39
  racc (1.8.1)
39
40
  rainbow (3.1.1)
40
41
  rake (13.2.1)
@@ -44,41 +45,44 @@ GEM
44
45
  gio2 (>= 3.5.0)
45
46
  native-package-installer
46
47
  pkg-config
47
- regexp_parser (2.9.2)
48
+ regexp_parser (2.10.0)
48
49
  rspec (3.13.0)
49
50
  rspec-core (~> 3.13.0)
50
51
  rspec-expectations (~> 3.13.0)
51
52
  rspec-mocks (~> 3.13.0)
52
- rspec-core (3.13.0)
53
+ rspec-core (3.13.3)
53
54
  rspec-support (~> 3.13.0)
54
- rspec-expectations (3.13.1)
55
+ rspec-expectations (3.13.3)
55
56
  diff-lcs (>= 1.2.0, < 2.0)
56
57
  rspec-support (~> 3.13.0)
57
- rspec-mocks (3.13.1)
58
+ rspec-mocks (3.13.2)
58
59
  diff-lcs (>= 1.2.0, < 2.0)
59
60
  rspec-support (~> 3.13.0)
60
- rspec-support (3.13.1)
61
- rubocop (1.66.0)
61
+ rspec-support (3.13.2)
62
+ rubocop (1.74.0)
62
63
  json (~> 2.3)
63
- language_server-protocol (>= 3.17.0)
64
+ language_server-protocol (~> 3.17.0.2)
65
+ lint_roller (~> 1.1.0)
64
66
  parallel (~> 1.10)
65
67
  parser (>= 3.3.0.2)
66
68
  rainbow (>= 2.2.2, < 4.0)
67
- regexp_parser (>= 2.4, < 3.0)
68
- rubocop-ast (>= 1.32.1, < 2.0)
69
+ regexp_parser (>= 2.9.3, < 3.0)
70
+ rubocop-ast (>= 1.38.0, < 2.0)
69
71
  ruby-progressbar (~> 1.7)
70
- unicode-display_width (>= 2.4.0, < 3.0)
71
- rubocop-ast (1.32.2)
72
+ unicode-display_width (>= 2.4.0, < 4.0)
73
+ rubocop-ast (1.38.1)
72
74
  parser (>= 3.3.1.0)
73
75
  ruby-progressbar (1.13.0)
74
76
  simplecov (0.22.0)
75
77
  docile (~> 1.1)
76
78
  simplecov-html (~> 0.11)
77
79
  simplecov_json_formatter (~> 0.1)
78
- simplecov-html (0.12.3)
80
+ simplecov-html (0.13.1)
79
81
  simplecov-lcov (0.8.0)
80
82
  simplecov_json_formatter (0.1.4)
81
- unicode-display_width (2.5.0)
83
+ unicode-display_width (3.1.4)
84
+ unicode-emoji (~> 4.0, >= 4.0.4)
85
+ unicode-emoji (4.0.4)
82
86
 
83
87
  PLATFORMS
84
88
  arm64-darwin-23
data/README.md CHANGED
@@ -31,7 +31,7 @@ Just make a subclass of `AthenaUDF::BaseUDF` and implement a concrete function l
31
31
  require "athena-udf"
32
32
 
33
33
  class SimpleVarcharUDF < AthenaUDF::BaseUDF
34
- def self.handle_athena_record(_input_schema, _output_schema, record)
34
+ def handle_athena_record(_input_schema, _output_schema, record)
35
35
  [record[0].downcase]
36
36
  end
37
37
  end
@@ -2,6 +2,9 @@
2
2
 
3
3
  module AthenaUDF
4
4
  module Utils
5
+ SEPARATOR = "\xFF\xFF\xFF\xFF".b
6
+ SEPARATOR_SIZE = SEPARATOR.bytesize
7
+
5
8
  def read_record_batches(schema_data, record_batch_data)
6
9
  buffer = Arrow::ResizableBuffer.new(schema_data.bytes.size + record_batch_data.bytes.size)
7
10
  Arrow::BufferOutputStream.open(buffer) do |output|
@@ -38,8 +41,8 @@ module AthenaUDF
38
41
  end
39
42
 
40
43
  bytes = buffer.data.to_s
41
- start_index = get_record_batch_index(bytes)
42
- bytes[4..start_index - 5]
44
+ last_index = bytes.index(SEPARATOR, SEPARATOR_SIZE)
45
+ bytes[SEPARATOR_SIZE...last_index]
43
46
  end
44
47
  end
45
48
 
@@ -51,26 +54,9 @@ module AthenaUDF
51
54
  end
52
55
 
53
56
  bytes = buffer.data.to_s
54
- start_index = get_record_batch_index(bytes)
57
+ start_index = bytes.index(SEPARATOR, SEPARATOR_SIZE) + SEPARATOR_SIZE
55
58
  bytes[start_index..]
56
59
  end
57
60
  end
58
-
59
- def get_record_batch_index(bytes)
60
- size = bytes.size
61
- found_count = 0
62
- start_index = 0
63
- 0.upto(size - 4).each do |i|
64
- has_ffff = bytes.slice(i, 4) == "\xFF\xFF\xFF\xFF".b
65
-
66
- found_count += 1 if has_ffff
67
- next unless found_count == 2
68
-
69
- start_index = i + 4
70
- break
71
- end
72
-
73
- start_index
74
- end
75
61
  end
76
62
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module AthenaUDF
4
- VERSION = '0.2.0'
4
+ VERSION = '0.2.1'
5
5
  end
data/scripts/benchmark.rb CHANGED
@@ -3,19 +3,21 @@
3
3
  require 'benchmark'
4
4
  require 'athena_udf'
5
5
 
6
- Benchmark.bm 10 do |r|
6
+ Benchmark.bm 20 do |r|
7
7
  include AthenaUDF::Utils
8
8
 
9
9
  instance = Class.new(AthenaUDF::BaseUDF) do
10
10
  def handle_athena_record(_input_schema, _output_schema, record)
11
- record.to_a
11
+ [record[0]]
12
12
  end
13
13
  end.new(event: {}, context: {})
14
14
 
15
15
  input_schema_1 = Arrow::Schema.new("0": :string)
16
16
  input_schema_bytes_1 = get_schema_bytes(input_schema_1)
17
- Arrow::Schema.new(0.upto(100).map { |n| [n.to_s, :string] }.to_h)
18
- input_schema_bytes_100 = get_schema_bytes(input_schema_1)
17
+ input_schema_100 = Arrow::Schema.new(0.upto(100).map { |n| [n.to_s, :string] }.to_h)
18
+ input_schema_bytes_100 = get_schema_bytes(input_schema_100)
19
+ output_schema = Arrow::Schema.new("0": :string)
20
+ output_schema_bytes = get_schema_bytes(output_schema)
19
21
 
20
22
  input_table1_1 = Arrow::Table.new(input_schema_1, [['FooBar']])
21
23
  input_records_bytes_1_1 = get_record_batch_bytes(input_schema_1, input_table1_1.each_record_batch.first)
@@ -27,13 +29,15 @@ Benchmark.bm 10 do |r|
27
29
  },
28
30
  'methodName' => 'lower',
29
31
  'outputSchema' => {
30
- 'schema' => Base64.strict_encode64(input_schema_bytes_1),
32
+ 'schema' => Base64.strict_encode64(output_schema_bytes),
31
33
  },
32
34
  'functionType' => 'SCALAR',
33
35
  }
34
36
 
35
37
  r.report '1 record 1 column' do
36
- instance.handle_udf_request(event_1_1)
38
+ 100.times do
39
+ instance.handle_udf_request(event_1_1)
40
+ end
37
41
  end
38
42
 
39
43
  input_table100_1 = Arrow::Table.new(input_schema_1, [['FooBar']] * 100)
@@ -46,17 +50,19 @@ Benchmark.bm 10 do |r|
46
50
  },
47
51
  'methodName' => 'lower',
48
52
  'outputSchema' => {
49
- 'schema' => Base64.strict_encode64(input_schema_bytes_1),
53
+ 'schema' => Base64.strict_encode64(output_schema_bytes),
50
54
  },
51
55
  'functionType' => 'SCALAR',
52
56
  }
53
57
 
54
58
  r.report '100 records 1 column' do
55
- instance.handle_udf_request(event_100)
59
+ 100.times do
60
+ instance.handle_udf_request(event_100)
61
+ end
56
62
  end
57
63
 
58
- input_table1_100 = Arrow::Table.new(input_schema_1, [['FooBar']])
59
- input_records_bytes_1_100 = get_record_batch_bytes(input_schema_1, input_table1_100.each_record_batch.first)
64
+ input_table1_100 = Arrow::Table.new(input_schema_100, [['FooBar'] * 100])
65
+ input_records_bytes_1_100 = get_record_batch_bytes(input_schema_100, input_table1_100.each_record_batch.first)
60
66
  event_1_100 = {
61
67
  '@type' => 'UserDefinedFunctionRequest',
62
68
  'inputRecords' => {
@@ -65,17 +71,19 @@ Benchmark.bm 10 do |r|
65
71
  },
66
72
  'methodName' => 'lower',
67
73
  'outputSchema' => {
68
- 'schema' => Base64.strict_encode64(input_schema_bytes_100),
74
+ 'schema' => Base64.strict_encode64(output_schema_bytes),
69
75
  },
70
76
  'functionType' => 'SCALAR',
71
77
  }
72
78
 
73
79
  r.report '1 record 100 column' do
74
- instance.handle_udf_request(event_1_100)
80
+ 100.times do
81
+ instance.handle_udf_request(event_1_100)
82
+ end
75
83
  end
76
84
 
77
- input_table_100_100 = Arrow::Table.new(input_schema_1, [['FooBar']])
78
- input_records_bytes_100_100 = get_record_batch_bytes(input_schema_1, input_table_100_100.each_record_batch.first)
85
+ input_table_100_100 = Arrow::Table.new(input_schema_100, [['FooBar'] * 100] * 100)
86
+ input_records_bytes_100_100 = get_record_batch_bytes(input_schema_100, input_table_100_100.each_record_batch.first)
79
87
  event_100_100 = {
80
88
  '@type' => 'UserDefinedFunctionRequest',
81
89
  'inputRecords' => {
@@ -84,12 +92,14 @@ Benchmark.bm 10 do |r|
84
92
  },
85
93
  'methodName' => 'lower',
86
94
  'outputSchema' => {
87
- 'schema' => Base64.strict_encode64(input_schema_bytes_100),
95
+ 'schema' => Base64.strict_encode64(output_schema_bytes),
88
96
  },
89
97
  'functionType' => 'SCALAR',
90
98
  }
91
99
 
92
100
  r.report '100 record 100 column' do
93
- instance.handle_udf_request(event_100_100)
101
+ 100.times do
102
+ instance.handle_udf_request(event_100_100)
103
+ end
94
104
  end
95
105
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: athena-udf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Daisuke Taniwaki
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-09-07 00:00:00.000000000 Z
11
+ date: 2025-03-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: base64
@@ -42,16 +42,22 @@ dependencies:
42
42
  name: red-arrow
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - "~>"
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: 12.0.0
48
+ - - "<"
46
49
  - !ruby/object:Gem::Version
47
- version: 12.0.1
50
+ version: 20.0.0
48
51
  type: :runtime
49
52
  prerelease: false
50
53
  version_requirements: !ruby/object:Gem::Requirement
51
54
  requirements:
52
- - - "~>"
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ version: 12.0.0
58
+ - - "<"
53
59
  - !ruby/object:Gem::Version
54
- version: 12.0.1
60
+ version: 20.0.0
55
61
  description: ''
56
62
  email:
57
63
  - daisuketaniwaki@gmail.com
@@ -96,7 +102,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
96
102
  - !ruby/object:Gem::Version
97
103
  version: '0'
98
104
  requirements: []
99
- rubygems_version: 3.4.19
105
+ rubygems_version: 3.5.11
100
106
  signing_key:
101
107
  specification_version: 4
102
108
  summary: Ruby-version Athena UDF