athena-udf 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: be808b4f155b8cdfa5c52bcc3c486d77cb9d056099f592b304bc8aa6426694b4
4
- data.tar.gz: c41565a48ec8702756c48f01112b6d7dc881723a26a521c09cbb8bf506666a24
3
+ metadata.gz: 1fca59aa26a814ceb98c86febbff75b38a5de2cb888c7291c483ed49aafe2301
4
+ data.tar.gz: 1a1020ab4893a1689771c5a35049eeca8ec778d7e452de5de660790c8394f600
5
5
  SHA512:
6
- metadata.gz: 971c50fb5eee0df1c0f4522b19a1583fca1ad94144f38766f0a6ff42b82151ee8501d7b303cc361f70a151c4e29e57aaca360bdd2e57646fb8509c5d4fbd74bd
7
- data.tar.gz: af0deed14deb22d1e382adfe9bc4097f07dff0cea835f9188e847df7bd27017e91b21c8c84bdf56ebbf6ac4dc729a3e6fe9dfbff43847f6eca1ee423389d3824
6
+ metadata.gz: 9f2f349f806c505d4957674ac6c18bf326b8c2cae08d259d30501d550b4f5fa8f73f5e79c5ec5eecab8da250abc91a3bb4bf0cddcc3c04e7288a2df2df79f76a
7
+ data.tar.gz: efe4807a051325233719ffa97aeec66564ad5c02a2703f70ebccb725a464f0be7d8142be42fae0073706f8b56edddf94a58fa7e15bbb3734c0bd2242a9f14475
data/.rubocop.yml CHANGED
@@ -34,3 +34,6 @@ Style/TrailingCommaInArrayLiteral:
34
34
  Style/TrailingCommaInHashLiteral:
35
35
  EnforcedStyleForMultiline: comma
36
36
 
37
+ Style/ClassVars:
38
+ Enabled: false
39
+
data/Gemfile CHANGED
@@ -5,11 +5,10 @@ source 'https://rubygems.org'
5
5
  gemspec
6
6
 
7
7
  group :development, :test do
8
+ gem 'gem-release', '~> 2.2'
8
9
  gem 'rake', '~> 13.0'
9
-
10
10
  gem 'rspec', '~> 3.0'
11
-
12
11
  gem 'rubocop', '~> 1.21'
13
-
14
- gem 'gem-release', '~> 2.2'
12
+ gem 'simplecov', '~> 0.22'
13
+ gem 'simplecov-lcov', '~> 0.8.0'
15
14
  end
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- athena-udf (0.1.0)
4
+ athena-udf (0.1.2)
5
5
  base64
6
6
  csv
7
7
  red-arrow (~> 12.0.1)
@@ -14,6 +14,7 @@ GEM
14
14
  bigdecimal (3.1.8)
15
15
  csv (3.3.0)
16
16
  diff-lcs (1.5.1)
17
+ docile (1.4.1)
17
18
  extpp (0.1.1)
18
19
  fiddle (1.1.2)
19
20
  gem-release (2.2.2)
@@ -72,6 +73,13 @@ GEM
72
73
  rubocop-ast (1.32.1)
73
74
  parser (>= 3.3.1.0)
74
75
  ruby-progressbar (1.13.0)
76
+ simplecov (0.22.0)
77
+ docile (~> 1.1)
78
+ simplecov-html (~> 0.11)
79
+ simplecov_json_formatter (~> 0.1)
80
+ simplecov-html (0.12.3)
81
+ simplecov-lcov (0.8.0)
82
+ simplecov_json_formatter (0.1.4)
75
83
  strscan (3.1.0)
76
84
  unicode-display_width (2.5.0)
77
85
 
@@ -85,6 +93,8 @@ DEPENDENCIES
85
93
  rake (~> 13.0)
86
94
  rspec (~> 3.0)
87
95
  rubocop (~> 1.21)
96
+ simplecov (~> 0.22)
97
+ simplecov-lcov (~> 0.8.0)
88
98
 
89
99
  BUNDLED WITH
90
100
  2.5.17
data/README.md CHANGED
@@ -1,5 +1,8 @@
1
1
  # AthenaUDF
2
2
 
3
+ [![Gem Version][gem-image]][gem-link]
4
+ [![Coverage Status][cov-image]][cov-link]
5
+
3
6
  Ruby-version Athena User Defined Function (UDF).
4
7
 
5
8
  This gem is highly inspired by [the Python-version Athena UDF](https://github.com/dmarkey/python-athena-udf).
@@ -88,3 +91,7 @@ Bug reports and pull requests are welcome on GitHub at https://github.com/dtaniw
88
91
 
89
92
  The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
90
93
 
94
+ [gem-image]: https://badge.fury.io/rb/athena-udf.svg
95
+ [gem-link]: http://badge.fury.io/rb/athena-udf
96
+ [cov-image]: https://coveralls.io/repos/dtaniwaki/ruby-athena-udf/badge.png
97
+ [cov-link]: https://coveralls.io/r/dtaniwaki/ruby-athena-udf
@@ -4,12 +4,16 @@ require 'securerandom'
4
4
  require 'base64'
5
5
  require 'tempfile'
6
6
  require 'arrow'
7
+ require 'logger'
7
8
  require_relative 'utils'
8
9
 
9
10
  module AthenaUDF
10
11
  class BaseUDF
11
12
  extend AthenaUDF::Utils
12
13
 
14
+ @@logger = Logger.new($stdout)
15
+ @@logger.level = Logger.const_get(ENV.fetch('LOG_LEVEL', 'WARN').upcase)
16
+
13
17
  def self.lambda_handler(event:, context:) # rubocop:disable Lint/UnusedMethodArgument
14
18
  incoming_type = event['@type']
15
19
  if incoming_type == 'PingRequest'
@@ -40,6 +44,7 @@ module AthenaUDF
40
44
  input_schema_data = Base64.decode64(event['inputRecords']['schema'])
41
45
  input_records_data = Base64.decode64(event['inputRecords']['records'])
42
46
  read_record_batches(input_schema_data, input_records_data) do |input_schema, record_batch|
47
+ logger.info("Processing #{record_batch.size} records")
43
48
  output_builder.append_records(
44
49
  record_batch.each_record.map do |record|
45
50
  handle_athena_record(input_schema, output_schema, record)
@@ -69,5 +74,9 @@ module AthenaUDF
69
74
  def self.handle_athena_record(input_schema, output_schema, records)
70
75
  raise NotImplementedError
71
76
  end
77
+
78
+ def self.logger
79
+ @@logger
80
+ end
72
81
  end
73
82
  end
@@ -3,13 +3,13 @@
3
3
  module AthenaUDF
4
4
  module Utils
5
5
  def read_record_batches(schema_data, record_batch_data)
6
- Tempfile.create do |in_f|
7
- in_f.write(schema_data)
8
- in_f.write(record_batch_data)
9
- in_f.flush
6
+ buffer = Arrow::ResizableBuffer.new(schema_data.bytes.size + record_batch_data.bytes.size)
7
+ Arrow::BufferOutputStream.open(buffer) do |output|
8
+ output.write(schema_data)
9
+ output.write(record_batch_data)
10
10
 
11
- Arrow::MemoryMappedInputStream.open(in_f.path) do |inp|
12
- reader = Arrow::RecordBatchStreamReader.new(inp)
11
+ Arrow::BufferInputStream.open(buffer) do |input|
12
+ reader = Arrow::RecordBatchStreamReader.new(input)
13
13
  input_schema = reader.schema
14
14
  reader.each do |record_batch|
15
15
  yield input_schema, record_batch
@@ -19,56 +19,49 @@ module AthenaUDF
19
19
  end
20
20
 
21
21
  def read_schema(schema_data)
22
- # schema_buf = Arrow::Buffer.try_convert(schema_data)
23
- Tempfile.create do |f|
24
- f.write(schema_data)
25
- f.flush
22
+ buffer = Arrow::ResizableBuffer.new(schema_data.bytes.size)
23
+ Arrow::BufferOutputStream.open(buffer) do |output|
24
+ output.write(schema_data)
26
25
 
27
- Arrow::MemoryMappedInputStream.open(f.path) do |inp|
28
- reader = Arrow::RecordBatchStreamReader.new(inp)
26
+ Arrow::BufferInputStream.open(buffer) do |input|
27
+ reader = Arrow::RecordBatchStreamReader.new(input)
29
28
  reader.schema
30
29
  end
31
30
  end
32
31
  end
33
32
 
34
33
  def get_schema_bytes(schema, record_batch)
35
- Tempfile.create do |f|
36
- Arrow::FileOutputStream.open(f.path, false) do |oup|
37
- Arrow::RecordBatchStreamWriter.open(oup, schema) do |writer|
38
- writer.write_record_batch(record_batch)
39
- end
34
+ buffer = Arrow::ResizableBuffer.new(0)
35
+ Arrow::BufferOutputStream.open(buffer) do |output|
36
+ Arrow::RecordBatchStreamWriter.open(output, schema) do |writer|
37
+ writer.write_record_batch(record_batch)
40
38
  end
41
- f.flush
42
39
 
43
- data = File.binread(f.path)
44
- start_index = get_record_batch_index(data)
45
- data.bytes[4..start_index - 5].pack('C*')
40
+ bytes = buffer.data.to_s
41
+ start_index = get_record_batch_index(bytes)
42
+ bytes[4..start_index - 5]
46
43
  end
47
44
  end
48
45
 
49
46
  def get_record_batch_bytes(schema, record_batch)
50
- Tempfile.create do |f|
51
- Arrow::FileOutputStream.open(f.path, false) do |oup|
52
- Arrow::RecordBatchStreamWriter.open(oup, schema) do |writer|
53
- writer.write_record_batch(record_batch)
54
- end
47
+ buffer = Arrow::ResizableBuffer.new(0)
48
+ Arrow::BufferOutputStream.open(buffer) do |output|
49
+ Arrow::RecordBatchStreamWriter.open(output, schema) do |writer|
50
+ writer.write_record_batch(record_batch)
55
51
  end
56
- f.flush
57
52
 
58
- data = File.binread(f.path)
59
- start_index = get_record_batch_index(data)
60
- data.bytes[start_index..].pack('C*')
53
+ bytes = buffer.data.to_s
54
+ start_index = get_record_batch_index(bytes)
55
+ bytes[start_index..]
61
56
  end
62
57
  end
63
58
 
64
- def get_record_batch_index(raw)
65
- size = raw.bytes.size
59
+ def get_record_batch_index(bytes)
60
+ size = bytes.size
66
61
  found_count = 0
67
62
  start_index = 0
68
63
  0.upto(size - 4).each do |i|
69
- has_ffff = 4.times.all? do |n|
70
- raw.bytes[i + n] == 255
71
- end
64
+ has_ffff = bytes.slice(i, 4) == [255, 255, 255, 255]
72
65
  found_count += 1 if has_ffff
73
66
  if found_count == 2
74
67
  start_index = i + 4
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module AthenaUDF
4
- VERSION = '0.1.0'
4
+ VERSION = '0.1.2'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: athena-udf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Daisuke Taniwaki
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-08-23 00:00:00.000000000 Z
11
+ date: 2024-09-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: base64