athena-udf 0.1.0 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: be808b4f155b8cdfa5c52bcc3c486d77cb9d056099f592b304bc8aa6426694b4
4
- data.tar.gz: c41565a48ec8702756c48f01112b6d7dc881723a26a521c09cbb8bf506666a24
3
+ metadata.gz: 1fca59aa26a814ceb98c86febbff75b38a5de2cb888c7291c483ed49aafe2301
4
+ data.tar.gz: 1a1020ab4893a1689771c5a35049eeca8ec778d7e452de5de660790c8394f600
5
5
  SHA512:
6
- metadata.gz: 971c50fb5eee0df1c0f4522b19a1583fca1ad94144f38766f0a6ff42b82151ee8501d7b303cc361f70a151c4e29e57aaca360bdd2e57646fb8509c5d4fbd74bd
7
- data.tar.gz: af0deed14deb22d1e382adfe9bc4097f07dff0cea835f9188e847df7bd27017e91b21c8c84bdf56ebbf6ac4dc729a3e6fe9dfbff43847f6eca1ee423389d3824
6
+ metadata.gz: 9f2f349f806c505d4957674ac6c18bf326b8c2cae08d259d30501d550b4f5fa8f73f5e79c5ec5eecab8da250abc91a3bb4bf0cddcc3c04e7288a2df2df79f76a
7
+ data.tar.gz: efe4807a051325233719ffa97aeec66564ad5c02a2703f70ebccb725a464f0be7d8142be42fae0073706f8b56edddf94a58fa7e15bbb3734c0bd2242a9f14475
data/.rubocop.yml CHANGED
@@ -34,3 +34,6 @@ Style/TrailingCommaInArrayLiteral:
34
34
  Style/TrailingCommaInHashLiteral:
35
35
  EnforcedStyleForMultiline: comma
36
36
 
37
+ Style/ClassVars:
38
+ Enabled: false
39
+
data/Gemfile CHANGED
@@ -5,11 +5,10 @@ source 'https://rubygems.org'
5
5
  gemspec
6
6
 
7
7
  group :development, :test do
8
+ gem 'gem-release', '~> 2.2'
8
9
  gem 'rake', '~> 13.0'
9
-
10
10
  gem 'rspec', '~> 3.0'
11
-
12
11
  gem 'rubocop', '~> 1.21'
13
-
14
- gem 'gem-release', '~> 2.2'
12
+ gem 'simplecov', '~> 0.22'
13
+ gem 'simplecov-lcov', '~> 0.8.0'
15
14
  end
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- athena-udf (0.1.0)
4
+ athena-udf (0.1.2)
5
5
  base64
6
6
  csv
7
7
  red-arrow (~> 12.0.1)
@@ -14,6 +14,7 @@ GEM
14
14
  bigdecimal (3.1.8)
15
15
  csv (3.3.0)
16
16
  diff-lcs (1.5.1)
17
+ docile (1.4.1)
17
18
  extpp (0.1.1)
18
19
  fiddle (1.1.2)
19
20
  gem-release (2.2.2)
@@ -72,6 +73,13 @@ GEM
72
73
  rubocop-ast (1.32.1)
73
74
  parser (>= 3.3.1.0)
74
75
  ruby-progressbar (1.13.0)
76
+ simplecov (0.22.0)
77
+ docile (~> 1.1)
78
+ simplecov-html (~> 0.11)
79
+ simplecov_json_formatter (~> 0.1)
80
+ simplecov-html (0.12.3)
81
+ simplecov-lcov (0.8.0)
82
+ simplecov_json_formatter (0.1.4)
75
83
  strscan (3.1.0)
76
84
  unicode-display_width (2.5.0)
77
85
 
@@ -85,6 +93,8 @@ DEPENDENCIES
85
93
  rake (~> 13.0)
86
94
  rspec (~> 3.0)
87
95
  rubocop (~> 1.21)
96
+ simplecov (~> 0.22)
97
+ simplecov-lcov (~> 0.8.0)
88
98
 
89
99
  BUNDLED WITH
90
100
  2.5.17
data/README.md CHANGED
@@ -1,5 +1,8 @@
1
1
  # AthenaUDF
2
2
 
3
+ [![Gem Version][gem-image]][gem-link]
4
+ [![Coverage Status][cov-image]][cov-link]
5
+
3
6
  Ruby-version Athena User Defined Function (UDF).
4
7
 
5
8
  This gem is highly inspired by [the Python-version Athena UDF](https://github.com/dmarkey/python-athena-udf).
@@ -88,3 +91,7 @@ Bug reports and pull requests are welcome on GitHub at https://github.com/dtaniw
88
91
 
89
92
  The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
90
93
 
94
+ [gem-image]: https://badge.fury.io/rb/athena-udf.svg
95
+ [gem-link]: http://badge.fury.io/rb/athena-udf
96
+ [cov-image]: https://coveralls.io/repos/dtaniwaki/ruby-athena-udf/badge.png
97
+ [cov-link]: https://coveralls.io/r/dtaniwaki/ruby-athena-udf
@@ -4,12 +4,16 @@ require 'securerandom'
4
4
  require 'base64'
5
5
  require 'tempfile'
6
6
  require 'arrow'
7
+ require 'logger'
7
8
  require_relative 'utils'
8
9
 
9
10
  module AthenaUDF
10
11
  class BaseUDF
11
12
  extend AthenaUDF::Utils
12
13
 
14
+ @@logger = Logger.new($stdout)
15
+ @@logger.level = Logger.const_get(ENV.fetch('LOG_LEVEL', 'WARN').upcase)
16
+
13
17
  def self.lambda_handler(event:, context:) # rubocop:disable Lint/UnusedMethodArgument
14
18
  incoming_type = event['@type']
15
19
  if incoming_type == 'PingRequest'
@@ -40,6 +44,7 @@ module AthenaUDF
40
44
  input_schema_data = Base64.decode64(event['inputRecords']['schema'])
41
45
  input_records_data = Base64.decode64(event['inputRecords']['records'])
42
46
  read_record_batches(input_schema_data, input_records_data) do |input_schema, record_batch|
47
+ logger.info("Processing #{record_batch.size} records")
43
48
  output_builder.append_records(
44
49
  record_batch.each_record.map do |record|
45
50
  handle_athena_record(input_schema, output_schema, record)
@@ -69,5 +74,9 @@ module AthenaUDF
69
74
  def self.handle_athena_record(input_schema, output_schema, records)
70
75
  raise NotImplementedError
71
76
  end
77
+
78
+ def self.logger
79
+ @@logger
80
+ end
72
81
  end
73
82
  end
@@ -3,13 +3,13 @@
3
3
  module AthenaUDF
4
4
  module Utils
5
5
  def read_record_batches(schema_data, record_batch_data)
6
- Tempfile.create do |in_f|
7
- in_f.write(schema_data)
8
- in_f.write(record_batch_data)
9
- in_f.flush
6
+ buffer = Arrow::ResizableBuffer.new(schema_data.bytes.size + record_batch_data.bytes.size)
7
+ Arrow::BufferOutputStream.open(buffer) do |output|
8
+ output.write(schema_data)
9
+ output.write(record_batch_data)
10
10
 
11
- Arrow::MemoryMappedInputStream.open(in_f.path) do |inp|
12
- reader = Arrow::RecordBatchStreamReader.new(inp)
11
+ Arrow::BufferInputStream.open(buffer) do |input|
12
+ reader = Arrow::RecordBatchStreamReader.new(input)
13
13
  input_schema = reader.schema
14
14
  reader.each do |record_batch|
15
15
  yield input_schema, record_batch
@@ -19,56 +19,49 @@ module AthenaUDF
19
19
  end
20
20
 
21
21
  def read_schema(schema_data)
22
- # schema_buf = Arrow::Buffer.try_convert(schema_data)
23
- Tempfile.create do |f|
24
- f.write(schema_data)
25
- f.flush
22
+ buffer = Arrow::ResizableBuffer.new(schema_data.bytes.size)
23
+ Arrow::BufferOutputStream.open(buffer) do |output|
24
+ output.write(schema_data)
26
25
 
27
- Arrow::MemoryMappedInputStream.open(f.path) do |inp|
28
- reader = Arrow::RecordBatchStreamReader.new(inp)
26
+ Arrow::BufferInputStream.open(buffer) do |input|
27
+ reader = Arrow::RecordBatchStreamReader.new(input)
29
28
  reader.schema
30
29
  end
31
30
  end
32
31
  end
33
32
 
34
33
  def get_schema_bytes(schema, record_batch)
35
- Tempfile.create do |f|
36
- Arrow::FileOutputStream.open(f.path, false) do |oup|
37
- Arrow::RecordBatchStreamWriter.open(oup, schema) do |writer|
38
- writer.write_record_batch(record_batch)
39
- end
34
+ buffer = Arrow::ResizableBuffer.new(0)
35
+ Arrow::BufferOutputStream.open(buffer) do |output|
36
+ Arrow::RecordBatchStreamWriter.open(output, schema) do |writer|
37
+ writer.write_record_batch(record_batch)
40
38
  end
41
- f.flush
42
39
 
43
- data = File.binread(f.path)
44
- start_index = get_record_batch_index(data)
45
- data.bytes[4..start_index - 5].pack('C*')
40
+ bytes = buffer.data.to_s
41
+ start_index = get_record_batch_index(bytes)
42
+ bytes[4..start_index - 5]
46
43
  end
47
44
  end
48
45
 
49
46
  def get_record_batch_bytes(schema, record_batch)
50
- Tempfile.create do |f|
51
- Arrow::FileOutputStream.open(f.path, false) do |oup|
52
- Arrow::RecordBatchStreamWriter.open(oup, schema) do |writer|
53
- writer.write_record_batch(record_batch)
54
- end
47
+ buffer = Arrow::ResizableBuffer.new(0)
48
+ Arrow::BufferOutputStream.open(buffer) do |output|
49
+ Arrow::RecordBatchStreamWriter.open(output, schema) do |writer|
50
+ writer.write_record_batch(record_batch)
55
51
  end
56
- f.flush
57
52
 
58
- data = File.binread(f.path)
59
- start_index = get_record_batch_index(data)
60
- data.bytes[start_index..].pack('C*')
53
+ bytes = buffer.data.to_s
54
+ start_index = get_record_batch_index(bytes)
55
+ bytes[start_index..]
61
56
  end
62
57
  end
63
58
 
64
- def get_record_batch_index(raw)
65
- size = raw.bytes.size
59
+ def get_record_batch_index(bytes)
60
+ size = bytes.size
66
61
  found_count = 0
67
62
  start_index = 0
68
63
  0.upto(size - 4).each do |i|
69
- has_ffff = 4.times.all? do |n|
70
- raw.bytes[i + n] == 255
71
- end
64
+ has_ffff = bytes.slice(i, 4) == [255, 255, 255, 255]
72
65
  found_count += 1 if has_ffff
73
66
  if found_count == 2
74
67
  start_index = i + 4
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module AthenaUDF
4
- VERSION = '0.1.0'
4
+ VERSION = '0.1.2'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: athena-udf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Daisuke Taniwaki
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-08-23 00:00:00.000000000 Z
11
+ date: 2024-09-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: base64