athena-udf 0.1.3 → 0.2.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ab79db7e49eadd3d4448779185ecfc592f497e0a81209964ae7c2b8e4fdbeed5
4
- data.tar.gz: 3c2b577e2506f67b9c0e455d8e3988b844fb5762dba69fa91c508d75d5de18fe
3
+ metadata.gz: 9cc2a9e47dee420f0b11442039d1873aca9ad04ecf12775dbe8ea49df3153f02
4
+ data.tar.gz: 54d15b93ffc2fb58793aad53d03c69608e607c00d0f8da3f5210b47860a5aecc
5
5
  SHA512:
6
- metadata.gz: 03b34a9703a30047eaeaf03c2bf8b975f190bf1f7ad08c30f1f06f8dcf2b6a7eca90b29621437699ce9acd22579bc44f59d03941493fb27adcb173caa7488f7b
7
- data.tar.gz: c9dc25db956961249e40e4909ec284b5234622c05933c0fa78347431344afd5e5ca09cc6c36fa6b480ff2b5e4893d095e94369e6a07fc72917d8df279d9791f8
6
+ metadata.gz: b0f36011d582e681d575fe8d6f33a6cf9935519ed9bb21953dac980aa1c479c7d943992bbdd0e992f053f32eee6b228f473d772e4e698c3517370009fe49d3b6
7
+ data.tar.gz: 318b411aa1949b998c1a34176ca049cbccea541861a89931b1a3f71730ab45fec383e03549ec941f7a48305d4ad1e9c0de419d9a46b8e5469205467772df8598
data/.rubocop.yml CHANGED
@@ -24,6 +24,7 @@ Metrics/AbcSize:
24
24
  Metrics/BlockLength:
25
25
  Exclude:
26
26
  - "spec/**/*"
27
+ - "scripts/benchmark.rb"
27
28
 
28
29
  Style/TrailingCommaInArguments:
29
30
  EnforcedStyleForMultiline: comma
@@ -37,3 +38,7 @@ Style/TrailingCommaInHashLiteral:
37
38
  Style/ClassVars:
38
39
  Enabled: false
39
40
 
41
+ Naming/VariableNumber:
42
+ Exclude:
43
+ - "scripts/benchmark.rb"
44
+
data/Dockerfile.dev ADDED
@@ -0,0 +1,28 @@
1
+ FROM ruby:3.2
2
+
3
+ # General packages
4
+ RUN apt-get update -y \
5
+ && apt-get install -y \
6
+ build-essential \
7
+ ca-certificates \
8
+ lsb-release \
9
+ wget \
10
+ vim \
11
+ && apt-get clean
12
+
13
+ # Apache Arrow
14
+ RUN apt-get update -y \
15
+ && wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb \
16
+ && apt install -y ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb \
17
+ && apt-get update -y \
18
+ && apt-get install -y \
19
+ libarrow-dev \
20
+ libarrow-glib-dev \
21
+ && apt-get clean
22
+
23
+ # Update bundler
24
+ RUN gem update bundler
25
+
26
+ WORKDIR /src
27
+
28
+ CMD ["/bin/bash"]
data/Gemfile CHANGED
@@ -5,6 +5,7 @@ source 'https://rubygems.org'
5
5
  gemspec
6
6
 
7
7
  group :development, :test do
8
+ gem 'benchmark', '~> 0.3.0'
8
9
  gem 'gem-release', '~> 2.2'
9
10
  gem 'rake', '~> 13.0'
10
11
  gem 'rspec', '~> 3.0'
data/Gemfile.lock CHANGED
@@ -11,6 +11,7 @@ GEM
11
11
  specs:
12
12
  ast (2.4.2)
13
13
  base64 (0.2.0)
14
+ benchmark (0.3.0)
14
15
  bigdecimal (3.1.8)
15
16
  csv (3.3.0)
16
17
  diff-lcs (1.5.1)
@@ -44,8 +45,6 @@ GEM
44
45
  native-package-installer
45
46
  pkg-config
46
47
  regexp_parser (2.9.2)
47
- rexml (3.3.5)
48
- strscan
49
48
  rspec (3.13.0)
50
49
  rspec-core (~> 3.13.0)
51
50
  rspec-expectations (~> 3.13.0)
@@ -59,18 +58,17 @@ GEM
59
58
  diff-lcs (>= 1.2.0, < 2.0)
60
59
  rspec-support (~> 3.13.0)
61
60
  rspec-support (3.13.1)
62
- rubocop (1.65.1)
61
+ rubocop (1.66.0)
63
62
  json (~> 2.3)
64
63
  language_server-protocol (>= 3.17.0)
65
64
  parallel (~> 1.10)
66
65
  parser (>= 3.3.0.2)
67
66
  rainbow (>= 2.2.2, < 4.0)
68
67
  regexp_parser (>= 2.4, < 3.0)
69
- rexml (>= 3.2.5, < 4.0)
70
- rubocop-ast (>= 1.31.1, < 2.0)
68
+ rubocop-ast (>= 1.32.1, < 2.0)
71
69
  ruby-progressbar (~> 1.7)
72
70
  unicode-display_width (>= 2.4.0, < 3.0)
73
- rubocop-ast (1.32.1)
71
+ rubocop-ast (1.32.2)
74
72
  parser (>= 3.3.1.0)
75
73
  ruby-progressbar (1.13.0)
76
74
  simplecov (0.22.0)
@@ -80,7 +78,6 @@ GEM
80
78
  simplecov-html (0.12.3)
81
79
  simplecov-lcov (0.8.0)
82
80
  simplecov_json_formatter (0.1.4)
83
- strscan (3.1.0)
84
81
  unicode-display_width (2.5.0)
85
82
 
86
83
  PLATFORMS
@@ -89,6 +86,7 @@ PLATFORMS
89
86
 
90
87
  DEPENDENCIES
91
88
  athena-udf!
89
+ benchmark (~> 0.3.0)
92
90
  gem-release (~> 2.2)
93
91
  rake (~> 13.0)
94
92
  rspec (~> 3.0)
data/README.md CHANGED
@@ -83,6 +83,15 @@ $ aws iam attach-role-policy --role-name athena-udf-simple-varchar --policy-arn
83
83
  $ aws lambda create-function --function-name athena-udf-simple-varchar --package-type Image --role arn:aws:iam::<ACCOUNT_ID>:role/athena-udf-simple-varchar --code ImageUri=<ACCOUNT_ID>.dkr.ecr.<AWS_REGION>.amazonaws.com/athena-udf-test:latest --publish
84
84
  ```
85
85
 
86
+ ## Development
87
+
88
+ You can develop this library using the dev container image, which includes the necessary packages.
89
+
90
+ ```sh
91
+ $ docker build -t ruby-athena-udf-dev -f Dockerfile.dev .
92
+ $ docker run -v $PWD:/src -it ruby-athena-udf-dev
93
+ ```
94
+
86
95
  ## Contributing
87
96
 
88
97
  Bug reports and pull requests are welcome on GitHub at https://github.com/dtaniwaki/ruby-athena-udf.
@@ -9,33 +9,43 @@ require_relative 'utils'
9
9
 
10
10
  module AthenaUDF
11
11
  class BaseUDF
12
- extend AthenaUDF::Utils
12
+ include AthenaUDF::Utils
13
13
 
14
- @@logger = Logger.new($stdout)
15
- @@logger.level = Logger.const_get(ENV.fetch('LOG_LEVEL', 'WARN').upcase)
14
+ attr_reader :logger
16
15
 
17
- def self.lambda_handler(event:, context:) # rubocop:disable Lint/UnusedMethodArgument
16
+ def self.lambda_handler(event:, context:)
17
+ instance = new(event:, context:)
18
18
  incoming_type = event['@type']
19
19
  if incoming_type == 'PingRequest'
20
- return handle_ping(event)
20
+ return instance.handle_ping(event)
21
21
  elsif incoming_type == 'UserDefinedFunctionRequest'
22
- return handle_udf_request(event)
22
+ return instance.handle_udf_request(event)
23
23
  end
24
24
 
25
25
  raise "Unknown event type #{incoming_type} from Athena"
26
26
  end
27
27
 
28
- def self.handle_ping(event)
28
+ # About capabilities: https://github.com/awslabs/aws-athena-query-federation/blob/f52d929a109099a1e7180fa242e26331137ed84c/athena-federation-sdk/src/main/java/com/amazonaws/athena/connector/lambda/handlers/FederationCapabilities.java#L29-L32
29
+ def self.capabilities
30
+ 1
31
+ end
32
+
33
+ def initialize(event:, context:) # rubocop:disable Lint/UnusedMethodArgument
34
+ @logger = Logger.new($stdout)
35
+ @logger.level = Logger.const_get(ENV.fetch('LOG_LEVEL', 'WARN').upcase)
36
+ end
37
+
38
+ def handle_ping(event)
29
39
  {
30
40
  '@type' => 'PingResponse',
31
41
  'catalogName' => 'event',
32
42
  'queryId' => event['queryId'],
33
43
  'sourceType' => 'athena_udf',
34
- 'capabilities' => capabilities,
44
+ 'capabilities' => self.class.capabilities,
35
45
  }
36
46
  end
37
47
 
38
- def self.handle_udf_request(event)
48
+ def handle_udf_request(event)
39
49
  # Cannot find a way to write Arrow::RecordBatch to a buffer directly in Ruby.
40
50
 
41
51
  output_schema = read_schema(Base64.decode64(event['outputSchema']['schema']))
@@ -66,17 +76,8 @@ module AthenaUDF
66
76
  }
67
77
  end
68
78
 
69
- # About capabilities: https://github.com/awslabs/aws-athena-query-federation/blob/f52d929a109099a1e7180fa242e26331137ed84c/athena-federation-sdk/src/main/java/com/amazonaws/athena/connector/lambda/handlers/FederationCapabilities.java#L29-L32
70
- def self.capabilities
71
- 1
72
- end
73
-
74
- def self.handle_athena_record(input_schema, output_schema, records)
79
+ def handle_athena_record(input_schema, output_schema, records)
75
80
  raise NotImplementedError
76
81
  end
77
-
78
- def self.logger
79
- @@logger
80
- end
81
82
  end
82
83
  end
@@ -30,11 +30,11 @@ module AthenaUDF
30
30
  end
31
31
  end
32
32
 
33
- def get_schema_bytes(schema, record_batch)
33
+ def get_schema_bytes(schema)
34
34
  buffer = Arrow::ResizableBuffer.new(0)
35
35
  Arrow::BufferOutputStream.open(buffer) do |output|
36
36
  Arrow::RecordBatchStreamWriter.open(output, schema) do |writer|
37
- writer.write_record_batch(record_batch)
37
+ # noop
38
38
  end
39
39
 
40
40
  bytes = buffer.data.to_s
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module AthenaUDF
4
- VERSION = '0.1.3'
4
+ VERSION = '0.2.0'
5
5
  end
@@ -0,0 +1,95 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'benchmark'
4
+ require 'athena_udf'
5
+
6
+ Benchmark.bm 10 do |r|
7
+ include AthenaUDF::Utils
8
+
9
+ instance = Class.new(AthenaUDF::BaseUDF) do
10
+ def handle_athena_record(_input_schema, _output_schema, record)
11
+ record.to_a
12
+ end
13
+ end.new(event: {}, context: {})
14
+
15
+ input_schema_1 = Arrow::Schema.new("0": :string)
16
+ input_schema_bytes_1 = get_schema_bytes(input_schema_1)
17
+ Arrow::Schema.new(0.upto(100).map { |n| [n.to_s, :string] }.to_h)
18
+ input_schema_bytes_100 = get_schema_bytes(input_schema_1)
19
+
20
+ input_table1_1 = Arrow::Table.new(input_schema_1, [['FooBar']])
21
+ input_records_bytes_1_1 = get_record_batch_bytes(input_schema_1, input_table1_1.each_record_batch.first)
22
+ event_1_1 = {
23
+ '@type' => 'UserDefinedFunctionRequest',
24
+ 'inputRecords' => {
25
+ 'schema' => Base64.strict_encode64(input_schema_bytes_1),
26
+ 'records' => Base64.strict_encode64(input_records_bytes_1_1),
27
+ },
28
+ 'methodName' => 'lower',
29
+ 'outputSchema' => {
30
+ 'schema' => Base64.strict_encode64(input_schema_bytes_1),
31
+ },
32
+ 'functionType' => 'SCALAR',
33
+ }
34
+
35
+ r.report '1 record 1 column' do
36
+ instance.handle_udf_request(event_1_1)
37
+ end
38
+
39
+ input_table100_1 = Arrow::Table.new(input_schema_1, [['FooBar']] * 100)
40
+ input_records_bytes_100_1 = get_record_batch_bytes(input_schema_1, input_table100_1.each_record_batch.first)
41
+ event_100 = {
42
+ '@type' => 'UserDefinedFunctionRequest',
43
+ 'inputRecords' => {
44
+ 'schema' => Base64.strict_encode64(input_schema_bytes_1),
45
+ 'records' => Base64.strict_encode64(input_records_bytes_100_1),
46
+ },
47
+ 'methodName' => 'lower',
48
+ 'outputSchema' => {
49
+ 'schema' => Base64.strict_encode64(input_schema_bytes_1),
50
+ },
51
+ 'functionType' => 'SCALAR',
52
+ }
53
+
54
+ r.report '100 records 1 column' do
55
+ instance.handle_udf_request(event_100)
56
+ end
57
+
58
+ input_table1_100 = Arrow::Table.new(input_schema_1, [['FooBar']])
59
+ input_records_bytes_1_100 = get_record_batch_bytes(input_schema_1, input_table1_100.each_record_batch.first)
60
+ event_1_100 = {
61
+ '@type' => 'UserDefinedFunctionRequest',
62
+ 'inputRecords' => {
63
+ 'schema' => Base64.strict_encode64(input_schema_bytes_100),
64
+ 'records' => Base64.strict_encode64(input_records_bytes_1_100),
65
+ },
66
+ 'methodName' => 'lower',
67
+ 'outputSchema' => {
68
+ 'schema' => Base64.strict_encode64(input_schema_bytes_100),
69
+ },
70
+ 'functionType' => 'SCALAR',
71
+ }
72
+
73
+ r.report '1 record 100 column' do
74
+ instance.handle_udf_request(event_1_100)
75
+ end
76
+
77
+ input_table_100_100 = Arrow::Table.new(input_schema_1, [['FooBar']])
78
+ input_records_bytes_100_100 = get_record_batch_bytes(input_schema_1, input_table_100_100.each_record_batch.first)
79
+ event_100_100 = {
80
+ '@type' => 'UserDefinedFunctionRequest',
81
+ 'inputRecords' => {
82
+ 'schema' => Base64.strict_encode64(input_schema_bytes_100),
83
+ 'records' => Base64.strict_encode64(input_records_bytes_100_100),
84
+ },
85
+ 'methodName' => 'lower',
86
+ 'outputSchema' => {
87
+ 'schema' => Base64.strict_encode64(input_schema_bytes_100),
88
+ },
89
+ 'functionType' => 'SCALAR',
90
+ }
91
+
92
+ r.report '100 record 100 column' do
93
+ instance.handle_udf_request(event_100_100)
94
+ end
95
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: athena-udf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Daisuke Taniwaki
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-09-03 00:00:00.000000000 Z
11
+ date: 2024-09-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: base64
@@ -62,6 +62,7 @@ files:
62
62
  - ".dockerignore"
63
63
  - ".rspec"
64
64
  - ".rubocop.yml"
65
+ - Dockerfile.dev
65
66
  - Dockerfile.example
66
67
  - Gemfile
67
68
  - Gemfile.lock
@@ -73,6 +74,7 @@ files:
73
74
  - lib/athena-udf/utils.rb
74
75
  - lib/athena-udf/version.rb
75
76
  - lib/athena_udf.rb
77
+ - scripts/benchmark.rb
76
78
  homepage: https://github.com/dtaniwaki/ruby-athena-udf
77
79
  licenses:
78
80
  - MIT
@@ -94,7 +96,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
94
96
  - !ruby/object:Gem::Version
95
97
  version: '0'
96
98
  requirements: []
97
- rubygems_version: 3.5.11
99
+ rubygems_version: 3.4.19
98
100
  signing_key:
99
101
  specification_version: 4
100
102
  summary: Ruby-version Athena UDF