athena-udf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.dockerignore +2 -0
- data/.rspec +3 -0
- data/.rubocop.yml +36 -0
- data/Dockerfile.example +31 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +90 -0
- data/LICENSE.txt +21 -0
- data/README.md +90 -0
- data/Rakefile +12 -0
- data/example.rb +9 -0
- data/lib/athena-udf/base_udf.rb +73 -0
- data/lib/athena-udf/utils.rb +82 -0
- data/lib/athena-udf/version.rb +5 -0
- data/lib/athena_udf.rb +7 -0
- metadata +101 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: be808b4f155b8cdfa5c52bcc3c486d77cb9d056099f592b304bc8aa6426694b4
|
|
4
|
+
data.tar.gz: c41565a48ec8702756c48f01112b6d7dc881723a26a521c09cbb8bf506666a24
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 971c50fb5eee0df1c0f4522b19a1583fca1ad94144f38766f0a6ff42b82151ee8501d7b303cc361f70a151c4e29e57aaca360bdd2e57646fb8509c5d4fbd74bd
|
|
7
|
+
data.tar.gz: af0deed14deb22d1e382adfe9bc4097f07dff0cea835f9188e847df7bd27017e91b21c8c84bdf56ebbf6ac4dc729a3e6fe9dfbff43847f6eca1ee423389d3824
|
data/.dockerignore
ADDED
data/.rspec
ADDED
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
---
# RuboCop configuration for the gem.

AllCops:
  # Lint against the Ruby 3.2 syntax and feature set.
  TargetRubyVersion: 3.2

# Plain string literals use single quotes.
Style/StringLiterals:
  Enabled: true
  EnforcedStyle: single_quotes

# String literals nested inside interpolation use single quotes as well.
Style/StringLiteralsInInterpolation:
  Enabled: true
  EnforcedStyle: single_quotes

Layout/LineLength:
  Max: 120

# Top-level class/module documentation comments are not enforced.
Style/Documentation:
  Enabled: false

Metrics/MethodLength:
  Max: 30

Metrics/AbcSize:
  Max: 40

# Block length is not checked for anything under spec/.
Metrics/BlockLength:
  Exclude:
    - 'spec/**/*'

# Multi-line argument lists, array literals and hash literals all end with a
# trailing comma.
Style/TrailingCommaInArguments:
  EnforcedStyleForMultiline: comma

Style/TrailingCommaInArrayLiteral:
  EnforcedStyleForMultiline: comma

Style/TrailingCommaInHashLiteral:
  EnforcedStyleForMultiline: comma
|
|
36
|
+
|
data/Dockerfile.example
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Example image that packages an Athena UDF handler for AWS Lambda.
FROM public.ecr.aws/lambda/ruby:3.2

# General packages
RUN yum update -y \
  && yum install -y \
    amazon-linux-extras \
    gcc-c++ \
    make \
    git \
  && amazon-linux-extras install -y epel \
  && yum clean all

# Apache Arrow
# The release RPM registers the Apache Arrow yum repository; the -devel
# packages are then installed with the EPEL repository enabled.
RUN yum update -y \
  && yum install -y https://apache.jfrog.io/artifactory/arrow/amazon-linux/2/apache-arrow-release-latest.rpm \
  && yum install -y --enablerepo=epel arrow-devel arrow-glib-devel arrow-dataset-devel arrow-dataset-glib-devel \
  && yum clean all

# Update bundler
RUN gem update bundler

# Install gems
# Only the dependency manifests (and version.rb, which is copied next to the
# gemspec) are copied before `bundle install`, ahead of the full source copy.
# With several sources, COPY requires the destination to be a directory path
# ending in "/".
COPY Gemfile Gemfile.lock athena-udf.gemspec ${LAMBDA_TASK_ROOT}/
COPY lib/athena-udf/version.rb ${LAMBDA_TASK_ROOT}/lib/athena-udf/version.rb
# Gems are installed into the Lambda task root.
ENV GEM_HOME=${LAMBDA_TASK_ROOT}
RUN bundle config set --local without development \
  && bundle install

COPY . ${LAMBDA_TASK_ROOT}

# Handler format: <file>.<class>.<method>
CMD ["example.SimpleVarcharUDF.lambda_handler"]
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
PATH
|
|
2
|
+
remote: .
|
|
3
|
+
specs:
|
|
4
|
+
athena-udf (0.1.0)
|
|
5
|
+
base64
|
|
6
|
+
csv
|
|
7
|
+
red-arrow (~> 12.0.1)
|
|
8
|
+
|
|
9
|
+
GEM
|
|
10
|
+
remote: https://rubygems.org/
|
|
11
|
+
specs:
|
|
12
|
+
ast (2.4.2)
|
|
13
|
+
base64 (0.2.0)
|
|
14
|
+
bigdecimal (3.1.8)
|
|
15
|
+
csv (3.3.0)
|
|
16
|
+
diff-lcs (1.5.1)
|
|
17
|
+
extpp (0.1.1)
|
|
18
|
+
fiddle (1.1.2)
|
|
19
|
+
gem-release (2.2.2)
|
|
20
|
+
gio2 (4.2.2)
|
|
21
|
+
fiddle
|
|
22
|
+
gobject-introspection (= 4.2.2)
|
|
23
|
+
glib2 (4.2.2)
|
|
24
|
+
native-package-installer (>= 1.0.3)
|
|
25
|
+
pkg-config (>= 1.3.5)
|
|
26
|
+
gobject-introspection (4.2.2)
|
|
27
|
+
glib2 (= 4.2.2)
|
|
28
|
+
json (2.7.2)
|
|
29
|
+
language_server-protocol (3.17.0.3)
|
|
30
|
+
native-package-installer (1.1.9)
|
|
31
|
+
parallel (1.26.3)
|
|
32
|
+
parser (3.3.4.2)
|
|
33
|
+
ast (~> 2.4.1)
|
|
34
|
+
racc
|
|
35
|
+
pkg-config (1.5.6)
|
|
36
|
+
racc (1.8.1)
|
|
37
|
+
rainbow (3.1.1)
|
|
38
|
+
rake (13.2.1)
|
|
39
|
+
red-arrow (12.0.1)
|
|
40
|
+
bigdecimal (>= 3.1.0)
|
|
41
|
+
extpp (>= 0.1.1)
|
|
42
|
+
gio2 (>= 3.5.0)
|
|
43
|
+
native-package-installer
|
|
44
|
+
pkg-config
|
|
45
|
+
regexp_parser (2.9.2)
|
|
46
|
+
rexml (3.3.5)
|
|
47
|
+
strscan
|
|
48
|
+
rspec (3.13.0)
|
|
49
|
+
rspec-core (~> 3.13.0)
|
|
50
|
+
rspec-expectations (~> 3.13.0)
|
|
51
|
+
rspec-mocks (~> 3.13.0)
|
|
52
|
+
rspec-core (3.13.0)
|
|
53
|
+
rspec-support (~> 3.13.0)
|
|
54
|
+
rspec-expectations (3.13.1)
|
|
55
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
56
|
+
rspec-support (~> 3.13.0)
|
|
57
|
+
rspec-mocks (3.13.1)
|
|
58
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
59
|
+
rspec-support (~> 3.13.0)
|
|
60
|
+
rspec-support (3.13.1)
|
|
61
|
+
rubocop (1.65.1)
|
|
62
|
+
json (~> 2.3)
|
|
63
|
+
language_server-protocol (>= 3.17.0)
|
|
64
|
+
parallel (~> 1.10)
|
|
65
|
+
parser (>= 3.3.0.2)
|
|
66
|
+
rainbow (>= 2.2.2, < 4.0)
|
|
67
|
+
regexp_parser (>= 2.4, < 3.0)
|
|
68
|
+
rexml (>= 3.2.5, < 4.0)
|
|
69
|
+
rubocop-ast (>= 1.31.1, < 2.0)
|
|
70
|
+
ruby-progressbar (~> 1.7)
|
|
71
|
+
unicode-display_width (>= 2.4.0, < 3.0)
|
|
72
|
+
rubocop-ast (1.32.1)
|
|
73
|
+
parser (>= 3.3.1.0)
|
|
74
|
+
ruby-progressbar (1.13.0)
|
|
75
|
+
strscan (3.1.0)
|
|
76
|
+
unicode-display_width (2.5.0)
|
|
77
|
+
|
|
78
|
+
PLATFORMS
|
|
79
|
+
arm64-darwin-23
|
|
80
|
+
ruby
|
|
81
|
+
|
|
82
|
+
DEPENDENCIES
|
|
83
|
+
athena-udf!
|
|
84
|
+
gem-release (~> 2.2)
|
|
85
|
+
rake (~> 13.0)
|
|
86
|
+
rspec (~> 3.0)
|
|
87
|
+
rubocop (~> 1.21)
|
|
88
|
+
|
|
89
|
+
BUNDLED WITH
|
|
90
|
+
2.5.17
|
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Daisuke Taniwaki
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# AthenaUDF
|
|
2
|
+
|
|
3
|
+
A Ruby implementation of Amazon Athena User Defined Functions (UDFs).
|
|
4
|
+
|
|
5
|
+
This gem is heavily inspired by [the Python version of Athena UDF](https://github.com/dmarkey/python-athena-udf).
|
|
6
|
+
|
|
7
|
+
See [the official example implementation](https://github.com/awslabs/aws-athena-query-federation/blob/fc2e4e9cdcb71ec7f7c7d44cbda7f56c5835811e/athena-federation-sdk/src/main/java/com/amazonaws/athena/connector/lambda/handlers/UserDefinedFunctionHandler.java) for more details on how a Lambda function for Athena UDFs works.
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
Install the gem and add to the application's Gemfile by executing:
|
|
12
|
+
|
|
13
|
+
```sh
|
|
14
|
+
$ bundle add athena-udf
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
If bundler is not being used to manage dependencies, install the gem by executing:
|
|
18
|
+
|
|
19
|
+
```sh
|
|
20
|
+
$ gem install athena-udf
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Usage
|
|
24
|
+
|
|
25
|
+
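Create a subclass of `AthenaUDF::BaseUDF` and implement `handle_athena_record` with your function's logic. It is called once per input record and must return the output row as an array.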
|
|
26
|
+
|
|
27
|
+
```rb
|
|
28
|
+
require "athena_udf"
|
|
29
|
+
|
|
30
|
+
class SimpleVarcharUDF < AthenaUDF::BaseUDF
|
|
31
|
+
def self.handle_athena_record(_input_schema, _output_schema, record)
|
|
32
|
+
[record[0].downcase]
|
|
33
|
+
end
|
|
34
|
+
end
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Then, it can be called as `SimpleVarcharUDF.lambda_handler` in your lambda function for Athena UDF workloads.
|
|
38
|
+
|
|
39
|
+
After pushing an image to Amazon ECR and creating the Lambda function, you can call the function with SQL like the following:
|
|
40
|
+
|
|
41
|
+
```sql
|
|
42
|
+
USING EXTERNAL FUNCTION my_udf(col1 varchar) RETURNS varchar LAMBDA 'athena-udf-simple-varchar'
|
|
43
|
+
|
|
44
|
+
SELECT my_udf('FooBar');
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
See [the official documentation](https://docs.aws.amazon.com/athena/latest/ug/querying-udf.html) for details on using UDFs.
|
|
48
|
+
|
|
49
|
+
## Development
|
|
50
|
+
|
|
51
|
+
To contribute to this library, first check out the code. Then install the dependencies:
|
|
52
|
+
|
|
53
|
+
```sh
|
|
54
|
+
$ bundle install
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
To run the tests:
|
|
58
|
+
|
|
59
|
+
```sh
|
|
60
|
+
$ bundle exec rspec
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Deployment
|
|
64
|
+
|
|
65
|
+
You can try the example with the following steps.
|
|
66
|
+
|
|
67
|
+
First, push a container image to Amazon ECR:
|
|
68
|
+
|
|
69
|
+
```sh
|
|
70
|
+
$ aws ecr get-login-password | docker login --username AWS --password-stdin https://<ACCOUNT_ID>.dkr.ecr.<AWS_REGION>.amazonaws.com
|
|
71
|
+
$ docker build --platform=linux/amd64 -t <ACCOUNT_ID>.dkr.ecr.<AWS_REGION>.amazonaws.com/athena-udf-test -f Dockerfile.example .
|
|
72
|
+
$ docker push <ACCOUNT_ID>.dkr.ecr.<AWS_REGION>.amazonaws.com/athena-udf-test
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Then, create a lambda function with the CLI:
|
|
76
|
+
|
|
77
|
+
```sh
|
|
78
|
+
$ aws iam create-role --role-name athena-udf-simple-varchar --assume-role-policy-document '{"Version": "2012-10-17","Statement": [{ "Effect": "Allow", "Principal": {"Service": "lambda.amazonaws.com"}, "Action": "sts:AssumeRole"}]}'
|
|
79
|
+
$ aws iam attach-role-policy --role-name athena-udf-simple-varchar --policy-arn arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole
|
|
80
|
+
$ aws lambda create-function --function-name athena-udf-simple-varchar --package-type Image --role arn:aws:iam::<ACCOUNT_ID>:role/athena-udf-simple-varchar --code ImageUri=<ACCOUNT_ID>.dkr.ecr.<AWS_REGION>.amazonaws.com/athena-udf-test:latest --publish
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Contributing
|
|
84
|
+
|
|
85
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/dtaniwaki/ruby-athena-udf.
|
|
86
|
+
|
|
87
|
+
## License
|
|
88
|
+
|
|
89
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
|
90
|
+
|
data/Rakefile
ADDED
data/example.rb
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'securerandom'
|
|
4
|
+
require 'base64'
|
|
5
|
+
require 'tempfile'
|
|
6
|
+
require 'arrow'
|
|
7
|
+
require_relative 'utils'
|
|
8
|
+
|
|
9
|
+
module AthenaUDF
  # Base class for Athena UDF Lambda handlers.
  #
  # Subclasses override +handle_athena_record+; +lambda_handler+ is the method
  # to register as the Lambda entry point.
  class BaseUDF
    extend AthenaUDF::Utils

    # Lambda entry point. Routes the event by its '@type' field.
    #
    # @param event [Hash] the decoded request sent by Athena
    # @param context [Object] Lambda context (unused)
    # @return [Hash] the response payload
    # @raise [RuntimeError] when '@type' is neither a ping nor a UDF request
    def self.lambda_handler(event:, context:) # rubocop:disable Lint/UnusedMethodArgument
      request_type = event['@type']
      return handle_ping(event) if request_type == 'PingRequest'
      return handle_udf_request(event) if request_type == 'UserDefinedFunctionRequest'

      raise "Unknown event type #{request_type} from Athena"
    end

    # Builds the response to a 'PingRequest', echoing back the query id.
    #
    # @param event [Hash] the ping request
    # @return [Hash] the 'PingResponse' payload
    def self.handle_ping(event)
      response = {
        '@type' => 'PingResponse',
        'catalogName' => 'event',
        'queryId' => event['queryId'],
        'sourceType' => 'athena_udf',
        'capabilities' => capabilities,
      }
      response
    end

    # Decodes the Base64 input records, runs +handle_athena_record+ on every
    # record, and returns the results re-encoded as Base64.
    #
    # @param event [Hash] the 'UserDefinedFunctionRequest' payload
    # @return [Hash] the 'UserDefinedFunctionResponse' payload
    def self.handle_udf_request(event)
      encoded_output_schema = event['outputSchema']['schema']
      output_schema = read_schema(Base64.decode64(encoded_output_schema))
      builder = Arrow::RecordBatchBuilder.new(output_schema)

      input = event['inputRecords']
      schema_bytes = Base64.decode64(input['schema'])
      records_bytes = Base64.decode64(input['records'])
      read_record_batches(schema_bytes, records_bytes) do |input_schema, batch|
        # One output row per input record, appended batch by batch.
        rows = batch.each_record.map { |record| handle_athena_record(input_schema, output_schema, record) }
        builder.append_records(rows)
      end

      # The batch goes through get_record_batch_bytes (a temp-file round trip);
      # the original author found no way to write an Arrow::RecordBatch to a
      # buffer directly in Ruby.
      encoded_records = Base64.strict_encode64(get_record_batch_bytes(output_schema, builder.flush))

      {
        '@type' => 'UserDefinedFunctionResponse',
        'methodName' => event['methodName'],
        'records' => {
          'aId' => SecureRandom.uuid,
          # The output schema is returned exactly as it was received.
          'schema' => encoded_output_schema,
          'records' => encoded_records,
        },
      }
    end

    # Value reported in the 'capabilities' field of the ping response.
    # About capabilities: https://github.com/awslabs/aws-athena-query-federation/blob/f52d929a109099a1e7180fa242e26331137ed84c/athena-federation-sdk/src/main/java/com/amazonaws/athena/connector/lambda/handlers/FederationCapabilities.java#L29-L32
    def self.capabilities
      1
    end

    # Hook to be overridden by subclasses. Called once per input record; the
    # return value is appended to the output batch as one row.
    #
    # @raise [NotImplementedError] always, in this base class
    def self.handle_athena_record(input_schema, output_schema, records)
      raise NotImplementedError
    end
  end
end
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module AthenaUDF
  # Helpers that move Arrow stream data in and out of raw byte strings by
  # round-tripping through temporary files.
  module Utils
    # Four 0xFF bytes. get_record_batch_index looks for the second occurrence
    # of this sequence to find where the record batch starts in a serialized
    # stream (this matches the Arrow IPC continuation marker).
    STREAM_MARKER = "\xFF\xFF\xFF\xFF".b.freeze

    # Reads the record batches contained in +schema_data+ followed by
    # +record_batch_data+.
    #
    # @param schema_data [String] raw schema bytes
    # @param record_batch_data [String] raw record batch bytes
    # @yield [input_schema, record_batch] once per batch found by the reader
    def read_record_batches(schema_data, record_batch_data)
      Tempfile.create do |in_f|
        # The payload is binary; make sure no newline/encoding translation happens.
        in_f.binmode
        in_f.write(schema_data)
        in_f.write(record_batch_data)
        in_f.flush

        Arrow::MemoryMappedInputStream.open(in_f.path) do |inp|
          reader = Arrow::RecordBatchStreamReader.new(inp)
          input_schema = reader.schema
          reader.each do |record_batch|
            yield input_schema, record_batch
          end
        end
      end
    end

    # Parses raw schema bytes and returns the schema reported by the stream reader.
    #
    # @param schema_data [String] raw schema bytes
    # @return the value of Arrow::RecordBatchStreamReader#schema
    def read_schema(schema_data)
      Tempfile.create do |f|
        f.binmode
        f.write(schema_data)
        f.flush

        Arrow::MemoryMappedInputStream.open(f.path) do |inp|
          Arrow::RecordBatchStreamReader.new(inp).schema
        end
      end
    end

    # Serializes +record_batch+ and returns the bytes between the first marker
    # and the second marker (byte 4 up to the byte before the second marker).
    #
    # NOTE(review): when fewer than two markers are found the index is 0 and
    # the slice is taken with a negative end index, as in the original code.
    #
    # @return [String] binary string
    def get_schema_bytes(schema, record_batch)
      data = serialize_record_batch_stream(schema, record_batch)
      start_index = get_record_batch_index(data)
      data.byteslice(4..start_index - 5)
    end

    # Serializes +record_batch+ and returns everything after the second marker.
    #
    # @return [String] binary string
    def get_record_batch_bytes(schema, record_batch)
      data = serialize_record_batch_stream(schema, record_batch)
      start_index = get_record_batch_index(data)
      data.byteslice(start_index..)
    end

    # Returns the byte offset just past the second occurrence of four
    # consecutive 0xFF bytes in +raw+, or 0 when there are fewer than two.
    # Occurrences may overlap (five 0xFF bytes in a row count as two).
    #
    # @param raw [String] serialized stream bytes
    # @return [Integer]
    def get_record_batch_index(raw)
      data = raw.b
      first = data.index(STREAM_MARKER)
      return 0 if first.nil?

      # Resume one byte after the first hit so overlapping runs still count.
      second = data.index(STREAM_MARKER, first + 1)
      second.nil? ? 0 : second + 4
    end

    private

    # Writes +record_batch+ as a record batch stream to a temporary file and
    # returns the file's full binary content.
    def serialize_record_batch_stream(schema, record_batch)
      Tempfile.create do |f|
        Arrow::FileOutputStream.open(f.path, false) do |oup|
          Arrow::RecordBatchStreamWriter.open(oup, schema) do |writer|
            writer.write_record_batch(record_batch)
          end
        end
        f.flush

        File.binread(f.path)
      end
    end
  end
end
|
data/lib/athena_udf.rb
ADDED
metadata
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: athena-udf
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Daisuke Taniwaki
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: exe
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2024-08-23 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: base64
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - ">="
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '0'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - ">="
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '0'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: csv
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - ">="
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: '0'
|
|
34
|
+
type: :runtime
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - ">="
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: '0'
|
|
41
|
+
- !ruby/object:Gem::Dependency
|
|
42
|
+
name: red-arrow
|
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - "~>"
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: 12.0.1
|
|
48
|
+
type: :runtime
|
|
49
|
+
prerelease: false
|
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - "~>"
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: 12.0.1
|
|
55
|
+
description: ''
|
|
56
|
+
email:
|
|
57
|
+
- daisuketaniwaki@gmail.com
|
|
58
|
+
executables: []
|
|
59
|
+
extensions: []
|
|
60
|
+
extra_rdoc_files: []
|
|
61
|
+
files:
|
|
62
|
+
- ".dockerignore"
|
|
63
|
+
- ".rspec"
|
|
64
|
+
- ".rubocop.yml"
|
|
65
|
+
- Dockerfile.example
|
|
66
|
+
- Gemfile
|
|
67
|
+
- Gemfile.lock
|
|
68
|
+
- LICENSE.txt
|
|
69
|
+
- README.md
|
|
70
|
+
- Rakefile
|
|
71
|
+
- example.rb
|
|
72
|
+
- lib/athena-udf/base_udf.rb
|
|
73
|
+
- lib/athena-udf/utils.rb
|
|
74
|
+
- lib/athena-udf/version.rb
|
|
75
|
+
- lib/athena_udf.rb
|
|
76
|
+
homepage: https://github.com/dtaniwaki/ruby-athena-udf
|
|
77
|
+
licenses:
|
|
78
|
+
- MIT
|
|
79
|
+
metadata:
|
|
80
|
+
homepage_uri: https://github.com/dtaniwaki/ruby-athena-udf
|
|
81
|
+
source_code_uri: https://github.com/dtaniwaki/ruby-athena-udf
|
|
82
|
+
post_install_message:
|
|
83
|
+
rdoc_options: []
|
|
84
|
+
require_paths:
|
|
85
|
+
- lib
|
|
86
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
87
|
+
requirements:
|
|
88
|
+
- - ">="
|
|
89
|
+
- !ruby/object:Gem::Version
|
|
90
|
+
version: 3.2.0
|
|
91
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
92
|
+
requirements:
|
|
93
|
+
- - ">="
|
|
94
|
+
- !ruby/object:Gem::Version
|
|
95
|
+
version: '0'
|
|
96
|
+
requirements: []
|
|
97
|
+
rubygems_version: 3.5.11
|
|
98
|
+
signing_key:
|
|
99
|
+
specification_version: 4
|
|
100
|
+
summary: Ruby-version Athena UDF
|
|
101
|
+
test_files: []
|