athena-udf 0.1.2 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1fca59aa26a814ceb98c86febbff75b38a5de2cb888c7291c483ed49aafe2301
4
- data.tar.gz: 1a1020ab4893a1689771c5a35049eeca8ec778d7e452de5de660790c8394f600
3
+ metadata.gz: 9cc2a9e47dee420f0b11442039d1873aca9ad04ecf12775dbe8ea49df3153f02
4
+ data.tar.gz: 54d15b93ffc2fb58793aad53d03c69608e607c00d0f8da3f5210b47860a5aecc
5
5
  SHA512:
6
- metadata.gz: 9f2f349f806c505d4957674ac6c18bf326b8c2cae08d259d30501d550b4f5fa8f73f5e79c5ec5eecab8da250abc91a3bb4bf0cddcc3c04e7288a2df2df79f76a
7
- data.tar.gz: efe4807a051325233719ffa97aeec66564ad5c02a2703f70ebccb725a464f0be7d8142be42fae0073706f8b56edddf94a58fa7e15bbb3734c0bd2242a9f14475
6
+ metadata.gz: b0f36011d582e681d575fe8d6f33a6cf9935519ed9bb21953dac980aa1c479c7d943992bbdd0e992f053f32eee6b228f473d772e4e698c3517370009fe49d3b6
7
+ data.tar.gz: 318b411aa1949b998c1a34176ca049cbccea541861a89931b1a3f71730ab45fec383e03549ec941f7a48305d4ad1e9c0de419d9a46b8e5469205467772df8598
data/.rubocop.yml CHANGED
@@ -24,6 +24,7 @@ Metrics/AbcSize:
24
24
  Metrics/BlockLength:
25
25
  Exclude:
26
26
  - "spec/**/*"
27
+ - "scripts/benchmark.rb"
27
28
 
28
29
  Style/TrailingCommaInArguments:
29
30
  EnforcedStyleForMultiline: comma
@@ -37,3 +38,7 @@ Style/TrailingCommaInHashLiteral:
37
38
  Style/ClassVars:
38
39
  Enabled: false
39
40
 
41
+ Naming/VariableNumber:
42
+ Exclude:
43
+ - "scripts/benchmark.rb"
44
+
data/Dockerfile.dev ADDED
@@ -0,0 +1,28 @@
1
+ FROM ruby:3.2
2
+
3
+ # General packages
4
+ RUN apt-get update -y \
5
+ && apt-get install -y \
6
+ build-essential \
7
+ ca-certificates \
8
+ lsb-release \
9
+ wget \
10
+ vim \
11
+ && apt-get clean
12
+
13
+ # Apache Arrow
14
+ RUN apt-get update -y \
15
+ && wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb \
16
+ && apt install -y ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb \
17
+ && apt-get update -y \
18
+ && apt-get install -y \
19
+ libarrow-dev \
20
+ libarrow-glib-dev \
21
+ && apt-get clean
22
+
23
+ # Update bundler
24
+ RUN gem update bundler
25
+
26
+ WORKDIR /src
27
+
28
+ CMD ["/bin/bash"]
data/Gemfile CHANGED
@@ -5,6 +5,7 @@ source 'https://rubygems.org'
5
5
  gemspec
6
6
 
7
7
  group :development, :test do
8
+ gem 'benchmark', '~> 0.3.0'
8
9
  gem 'gem-release', '~> 2.2'
9
10
  gem 'rake', '~> 13.0'
10
11
  gem 'rspec', '~> 3.0'
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- athena-udf (0.1.2)
4
+ athena-udf (0.1.3)
5
5
  base64
6
6
  csv
7
7
  red-arrow (~> 12.0.1)
@@ -11,6 +11,7 @@ GEM
11
11
  specs:
12
12
  ast (2.4.2)
13
13
  base64 (0.2.0)
14
+ benchmark (0.3.0)
14
15
  bigdecimal (3.1.8)
15
16
  csv (3.3.0)
16
17
  diff-lcs (1.5.1)
@@ -44,8 +45,6 @@ GEM
44
45
  native-package-installer
45
46
  pkg-config
46
47
  regexp_parser (2.9.2)
47
- rexml (3.3.5)
48
- strscan
49
48
  rspec (3.13.0)
50
49
  rspec-core (~> 3.13.0)
51
50
  rspec-expectations (~> 3.13.0)
@@ -59,18 +58,17 @@ GEM
59
58
  diff-lcs (>= 1.2.0, < 2.0)
60
59
  rspec-support (~> 3.13.0)
61
60
  rspec-support (3.13.1)
62
- rubocop (1.65.1)
61
+ rubocop (1.66.0)
63
62
  json (~> 2.3)
64
63
  language_server-protocol (>= 3.17.0)
65
64
  parallel (~> 1.10)
66
65
  parser (>= 3.3.0.2)
67
66
  rainbow (>= 2.2.2, < 4.0)
68
67
  regexp_parser (>= 2.4, < 3.0)
69
- rexml (>= 3.2.5, < 4.0)
70
- rubocop-ast (>= 1.31.1, < 2.0)
68
+ rubocop-ast (>= 1.32.1, < 2.0)
71
69
  ruby-progressbar (~> 1.7)
72
70
  unicode-display_width (>= 2.4.0, < 3.0)
73
- rubocop-ast (1.32.1)
71
+ rubocop-ast (1.32.2)
74
72
  parser (>= 3.3.1.0)
75
73
  ruby-progressbar (1.13.0)
76
74
  simplecov (0.22.0)
@@ -80,7 +78,6 @@ GEM
80
78
  simplecov-html (0.12.3)
81
79
  simplecov-lcov (0.8.0)
82
80
  simplecov_json_formatter (0.1.4)
83
- strscan (3.1.0)
84
81
  unicode-display_width (2.5.0)
85
82
 
86
83
  PLATFORMS
@@ -89,6 +86,7 @@ PLATFORMS
89
86
 
90
87
  DEPENDENCIES
91
88
  athena-udf!
89
+ benchmark (~> 0.3.0)
92
90
  gem-release (~> 2.2)
93
91
  rake (~> 13.0)
94
92
  rspec (~> 3.0)
data/README.md CHANGED
@@ -83,6 +83,15 @@ $ aws iam attach-role-policy --role-name athena-udf-simple-varchar --policy-arn
83
83
  $ aws lambda create-function --function-name athena-udf-simple-varchar --package-type Image --role arn:aws:iam::<ACCOUNT_ID>:role/athena-udf-simple-varchar --code ImageUri=<ACCOUNT_ID>.dkr.ecr.<AWS_REGION>.amazonaws.com/athena-udf-test:latest --publish
84
84
  ```
85
85
 
86
+ ## Development
87
+
88
+ You can use the dev container image, which includes necessary packages, to develop this library.
89
+
90
+ ```sh
91
+ $ docker build -t ruby-athena-udf-dev -f Dockerfile.dev .
92
+ $ docker run -v $PWD:/src -it ruby-athena-udf-dev
93
+ ```
94
+
86
95
  ## Contributing
87
96
 
88
97
  Bug reports and pull requests are welcome on GitHub at https://github.com/dtaniwaki/ruby-athena-udf.
@@ -9,33 +9,43 @@ require_relative 'utils'
9
9
 
10
10
  module AthenaUDF
11
11
  class BaseUDF
12
- extend AthenaUDF::Utils
12
+ include AthenaUDF::Utils
13
13
 
14
- @@logger = Logger.new($stdout)
15
- @@logger.level = Logger.const_get(ENV.fetch('LOG_LEVEL', 'WARN').upcase)
14
+ attr_reader :logger
16
15
 
17
- def self.lambda_handler(event:, context:) # rubocop:disable Lint/UnusedMethodArgument
16
+ def self.lambda_handler(event:, context:)
17
+ instance = new(event:, context:)
18
18
  incoming_type = event['@type']
19
19
  if incoming_type == 'PingRequest'
20
- return handle_ping(event)
20
+ return instance.handle_ping(event)
21
21
  elsif incoming_type == 'UserDefinedFunctionRequest'
22
- return handle_udf_request(event)
22
+ return instance.handle_udf_request(event)
23
23
  end
24
24
 
25
25
  raise "Unknown event type #{incoming_type} from Athena"
26
26
  end
27
27
 
28
- def self.handle_ping(event)
28
+ # About capabilities: https://github.com/awslabs/aws-athena-query-federation/blob/f52d929a109099a1e7180fa242e26331137ed84c/athena-federation-sdk/src/main/java/com/amazonaws/athena/connector/lambda/handlers/FederationCapabilities.java#L29-L32
29
+ def self.capabilities
30
+ 1
31
+ end
32
+
33
+ def initialize(event:, context:) # rubocop:disable Lint/UnusedMethodArgument
34
+ @logger = Logger.new($stdout)
35
+ @logger.level = Logger.const_get(ENV.fetch('LOG_LEVEL', 'WARN').upcase)
36
+ end
37
+
38
+ def handle_ping(event)
29
39
  {
30
40
  '@type' => 'PingResponse',
31
41
  'catalogName' => 'event',
32
42
  'queryId' => event['queryId'],
33
43
  'sourceType' => 'athena_udf',
34
- 'capabilities' => capabilities,
44
+ 'capabilities' => self.class.capabilities,
35
45
  }
36
46
  end
37
47
 
38
- def self.handle_udf_request(event)
48
+ def handle_udf_request(event)
39
49
  # Cannot find a way to write Arrow::RecordBatch to a buffer directly in Ruby.
40
50
 
41
51
  output_schema = read_schema(Base64.decode64(event['outputSchema']['schema']))
@@ -66,17 +76,8 @@ module AthenaUDF
66
76
  }
67
77
  end
68
78
 
69
- # About capabilities: https://github.com/awslabs/aws-athena-query-federation/blob/f52d929a109099a1e7180fa242e26331137ed84c/athena-federation-sdk/src/main/java/com/amazonaws/athena/connector/lambda/handlers/FederationCapabilities.java#L29-L32
70
- def self.capabilities
71
- 1
72
- end
73
-
74
- def self.handle_athena_record(input_schema, output_schema, records)
79
+ def handle_athena_record(input_schema, output_schema, records)
75
80
  raise NotImplementedError
76
81
  end
77
-
78
- def self.logger
79
- @@logger
80
- end
81
82
  end
82
83
  end
@@ -30,11 +30,11 @@ module AthenaUDF
30
30
  end
31
31
  end
32
32
 
33
- def get_schema_bytes(schema, record_batch)
33
+ def get_schema_bytes(schema)
34
34
  buffer = Arrow::ResizableBuffer.new(0)
35
35
  Arrow::BufferOutputStream.open(buffer) do |output|
36
36
  Arrow::RecordBatchStreamWriter.open(output, schema) do |writer|
37
- writer.write_record_batch(record_batch)
37
+ # noop
38
38
  end
39
39
 
40
40
  bytes = buffer.data.to_s
@@ -61,12 +61,13 @@ module AthenaUDF
61
61
  found_count = 0
62
62
  start_index = 0
63
63
  0.upto(size - 4).each do |i|
64
- has_ffff = bytes.slice(i, 4) == [255, 255, 255, 255]
64
+ has_ffff = bytes.slice(i, 4) == "\xFF\xFF\xFF\xFF".b
65
+
65
66
  found_count += 1 if has_ffff
66
- if found_count == 2
67
- start_index = i + 4
68
- break
69
- end
67
+ next unless found_count == 2
68
+
69
+ start_index = i + 4
70
+ break
70
71
  end
71
72
 
72
73
  start_index
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module AthenaUDF
4
- VERSION = '0.1.2'
4
+ VERSION = '0.2.0'
5
5
  end
@@ -0,0 +1,95 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'benchmark'
4
+ require 'athena_udf'
5
+
6
+ Benchmark.bm 10 do |r|
7
+ include AthenaUDF::Utils
8
+
9
+ instance = Class.new(AthenaUDF::BaseUDF) do
10
+ def handle_athena_record(_input_schema, _output_schema, record)
11
+ record.to_a
12
+ end
13
+ end.new(event: {}, context: {})
14
+
15
+ input_schema_1 = Arrow::Schema.new("0": :string)
16
+ input_schema_bytes_1 = get_schema_bytes(input_schema_1)
17
+ Arrow::Schema.new(0.upto(100).map { |n| [n.to_s, :string] }.to_h)
18
+ input_schema_bytes_100 = get_schema_bytes(input_schema_1)
19
+
20
+ input_table1_1 = Arrow::Table.new(input_schema_1, [['FooBar']])
21
+ input_records_bytes_1_1 = get_record_batch_bytes(input_schema_1, input_table1_1.each_record_batch.first)
22
+ event_1_1 = {
23
+ '@type' => 'UserDefinedFunctionRequest',
24
+ 'inputRecords' => {
25
+ 'schema' => Base64.strict_encode64(input_schema_bytes_1),
26
+ 'records' => Base64.strict_encode64(input_records_bytes_1_1),
27
+ },
28
+ 'methodName' => 'lower',
29
+ 'outputSchema' => {
30
+ 'schema' => Base64.strict_encode64(input_schema_bytes_1),
31
+ },
32
+ 'functionType' => 'SCALAR',
33
+ }
34
+
35
+ r.report '1 record 1 column' do
36
+ instance.handle_udf_request(event_1_1)
37
+ end
38
+
39
+ input_table100_1 = Arrow::Table.new(input_schema_1, [['FooBar']] * 100)
40
+ input_records_bytes_100_1 = get_record_batch_bytes(input_schema_1, input_table100_1.each_record_batch.first)
41
+ event_100 = {
42
+ '@type' => 'UserDefinedFunctionRequest',
43
+ 'inputRecords' => {
44
+ 'schema' => Base64.strict_encode64(input_schema_bytes_1),
45
+ 'records' => Base64.strict_encode64(input_records_bytes_100_1),
46
+ },
47
+ 'methodName' => 'lower',
48
+ 'outputSchema' => {
49
+ 'schema' => Base64.strict_encode64(input_schema_bytes_1),
50
+ },
51
+ 'functionType' => 'SCALAR',
52
+ }
53
+
54
+ r.report '100 records 1 column' do
55
+ instance.handle_udf_request(event_100)
56
+ end
57
+
58
+ input_table1_100 = Arrow::Table.new(input_schema_1, [['FooBar']])
59
+ input_records_bytes_1_100 = get_record_batch_bytes(input_schema_1, input_table1_100.each_record_batch.first)
60
+ event_1_100 = {
61
+ '@type' => 'UserDefinedFunctionRequest',
62
+ 'inputRecords' => {
63
+ 'schema' => Base64.strict_encode64(input_schema_bytes_100),
64
+ 'records' => Base64.strict_encode64(input_records_bytes_1_100),
65
+ },
66
+ 'methodName' => 'lower',
67
+ 'outputSchema' => {
68
+ 'schema' => Base64.strict_encode64(input_schema_bytes_100),
69
+ },
70
+ 'functionType' => 'SCALAR',
71
+ }
72
+
73
+ r.report '1 record 100 column' do
74
+ instance.handle_udf_request(event_1_100)
75
+ end
76
+
77
+ input_table_100_100 = Arrow::Table.new(input_schema_1, [['FooBar']])
78
+ input_records_bytes_100_100 = get_record_batch_bytes(input_schema_1, input_table_100_100.each_record_batch.first)
79
+ event_100_100 = {
80
+ '@type' => 'UserDefinedFunctionRequest',
81
+ 'inputRecords' => {
82
+ 'schema' => Base64.strict_encode64(input_schema_bytes_100),
83
+ 'records' => Base64.strict_encode64(input_records_bytes_100_100),
84
+ },
85
+ 'methodName' => 'lower',
86
+ 'outputSchema' => {
87
+ 'schema' => Base64.strict_encode64(input_schema_bytes_100),
88
+ },
89
+ 'functionType' => 'SCALAR',
90
+ }
91
+
92
+ r.report '100 record 100 column' do
93
+ instance.handle_udf_request(event_100_100)
94
+ end
95
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: athena-udf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Daisuke Taniwaki
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-09-03 00:00:00.000000000 Z
11
+ date: 2024-09-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: base64
@@ -62,6 +62,7 @@ files:
62
62
  - ".dockerignore"
63
63
  - ".rspec"
64
64
  - ".rubocop.yml"
65
+ - Dockerfile.dev
65
66
  - Dockerfile.example
66
67
  - Gemfile
67
68
  - Gemfile.lock
@@ -73,6 +74,7 @@ files:
73
74
  - lib/athena-udf/utils.rb
74
75
  - lib/athena-udf/version.rb
75
76
  - lib/athena_udf.rb
77
+ - scripts/benchmark.rb
76
78
  homepage: https://github.com/dtaniwaki/ruby-athena-udf
77
79
  licenses:
78
80
  - MIT
@@ -94,7 +96,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
94
96
  - !ruby/object:Gem::Version
95
97
  version: '0'
96
98
  requirements: []
97
- rubygems_version: 3.5.11
99
+ rubygems_version: 3.4.19
98
100
  signing_key:
99
101
  specification_version: 4
100
102
  summary: Ruby-version Athena UDF