fluent-plugin-s3-arrow 0.1.0

checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 8d4d40af870606fbd11cf131fa371e29e45013281f7b23deae6e30b4f260ecf4
4
+ data.tar.gz: dc911408cac26e521a32eb50a56e544d18b1f36e2ea83b20fbf7d08c357b7a3e
5
+ SHA512:
6
+ metadata.gz: 26a33661cab8c5d524d160c727eab8da58c43973ea708d8e1aad45eade08a220612a1620404f9647628d9e6e81cb94e8ccb2a08a59019317d1a582e5254831fc
7
+ data.tar.gz: c24341e83a739023ab2bfc9b151ab9d1d6cedbba1f25bce63ae1455c426d724e20807cbed89a6663d18840f96196b266446772946176a0a5fdd7d4629f31b028
data/.github/workflows/bench.yml ADDED
@@ -0,0 +1,20 @@
1
+ name: "bench"
2
+ on:
3
+ pull_request:
4
+
5
+ jobs:
6
+ test:
7
+ runs-on: ubuntu-latest
8
+ steps:
9
+ - uses: actions/checkout@v2
10
+ - name: 'build with docker'
11
+ run: |
12
+ docker build -t fluent-plugin-s3-arrow .
13
+ - name: 'bench'
14
+ id: 'bench'
15
+ run: |
16
+ docker run fluent-plugin-s3-arrow /bin/bash -c "benchmark-driver benchmark/compress.yml"
17
+ - name: 'bench memory'
18
+ id: 'bench_mem'
19
+ run: |
20
+ docker run fluent-plugin-s3-arrow /bin/bash -c "benchmark-driver benchmark/compress.yml --runner memory"
data/.github/workflows/test.yml ADDED
@@ -0,0 +1,13 @@
1
+ name: "test"
2
+ on:
3
+ pull_request:
4
+
5
+ jobs:
6
+ test:
7
+ runs-on: ubuntu-latest
8
+ steps:
9
+ - uses: actions/checkout@v2
10
+ - name: 'Test with docker'
11
+ run: |
12
+ docker build -t fluent-plugin-s3-arrow .
13
+ docker run fluent-plugin-s3-arrow /bin/bash -c "bundle exec rake test"
data/.gitignore ADDED
@@ -0,0 +1,56 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/examples.txt
9
+ /test/tmp/
10
+ /test/version_tmp/
11
+ /tmp/
12
+
13
+ # Used by dotenv library to load environment variables.
14
+ # .env
15
+
16
+ # Ignore Byebug command history file.
17
+ .byebug_history
18
+
19
+ ## Specific to RubyMotion:
20
+ .dat*
21
+ .repl_history
22
+ build/
23
+ *.bridgesupport
24
+ build-iPhoneOS/
25
+ build-iPhoneSimulator/
26
+
27
+ ## Specific to RubyMotion (use of CocoaPods):
28
+ #
29
+ # We recommend against adding the Pods directory to your .gitignore. However
30
+ # you should judge for yourself, the pros and cons are mentioned at:
31
+ # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
32
+ #
33
+ # vendor/Pods/
34
+
35
+ ## Documentation cache and generated files:
36
+ /.yardoc/
37
+ /_yardoc/
38
+ /doc/
39
+ /rdoc/
40
+
41
+ ## Environment normalization:
42
+ /.bundle/
43
+ /vendor/bundle
44
+ /lib/bundler/man/
45
+
46
+ # for a library or gem, you might want to ignore these files since the code is
47
+ # intended to run in multiple environments; otherwise, check them in:
48
+ # Gemfile.lock
49
+ # .ruby-version
50
+ # .ruby-gemset
51
+
52
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
53
+ .rvmrc
54
+
55
+ # Used by RuboCop. Remote config files pulled in from inherit_from directive.
56
+ # .rubocop-https?--*
data/Dockerfile ADDED
@@ -0,0 +1,22 @@
1
+ FROM ruby:2.7-buster
2
+
3
+ RUN apt update && \
4
+ apt install -y -V ca-certificates lsb-release wget time gzip && \
5
+ wget https://apache.bintray.com/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-archive-keyring-latest-$(lsb_release --codename --short).deb && \
6
+ apt install -y -V ./apache-arrow-archive-keyring-latest-$(lsb_release --codename --short).deb
7
+
8
+ RUN apt update && \
9
+ apt install -y -V libparquet-glib-dev
10
+
11
+ RUN cd /var/tmp/ && \
12
+ curl -L -O https://github.com/reproio/columnify/releases/download/v0.1.0/columnify_0.1.0_Linux_x86_64.tar.gz && \
13
+ tar xvfz columnify_0.1.0_Linux_x86_64.tar.gz && \
14
+ chmod +x columnify && \
15
+ mv columnify /usr/local/bin/ && \
16
+ rm -rf /var/tmp/* && \
17
+ which columnify
18
+
19
+ RUN mkdir /app
20
+ WORKDIR /app
21
+ COPY . /app
22
+ RUN apt update && bundle install
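The CI workflows above build and exercise this same image. A sketch of the equivalent local commands, taken directly from `.github/workflows/test.yml` and `.github/workflows/bench.yml`:

```
$ docker build -t fluent-plugin-s3-arrow .
$ docker run fluent-plugin-s3-arrow /bin/bash -c "bundle exec rake test"
$ docker run fluent-plugin-s3-arrow /bin/bash -c "benchmark-driver benchmark/compress.yml"
```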
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,202 @@
1
+
2
+ Apache License
3
+ Version 2.0, January 2004
4
+ http://www.apache.org/licenses/
5
+
6
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
+
8
+ 1. Definitions.
9
+
10
+ "License" shall mean the terms and conditions for use, reproduction,
11
+ and distribution as defined by Sections 1 through 9 of this document.
12
+
13
+ "Licensor" shall mean the copyright owner or entity authorized by
14
+ the copyright owner that is granting the License.
15
+
16
+ "Legal Entity" shall mean the union of the acting entity and all
17
+ other entities that control, are controlled by, or are under common
18
+ control with that entity. For the purposes of this definition,
19
+ "control" means (i) the power, direct or indirect, to cause the
20
+ direction or management of such entity, whether by contract or
21
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
22
+ outstanding shares, or (iii) beneficial ownership of such entity.
23
+
24
+ "You" (or "Your") shall mean an individual or Legal Entity
25
+ exercising permissions granted by this License.
26
+
27
+ "Source" form shall mean the preferred form for making modifications,
28
+ including but not limited to software source code, documentation
29
+ source, and configuration files.
30
+
31
+ "Object" form shall mean any form resulting from mechanical
32
+ transformation or translation of a Source form, including but
33
+ not limited to compiled object code, generated documentation,
34
+ and conversions to other media types.
35
+
36
+ "Work" shall mean the work of authorship, whether in Source or
37
+ Object form, made available under the License, as indicated by a
38
+ copyright notice that is included in or attached to the work
39
+ (an example is provided in the Appendix below).
40
+
41
+ "Derivative Works" shall mean any work, whether in Source or Object
42
+ form, that is based on (or derived from) the Work and for which the
43
+ editorial revisions, annotations, elaborations, or other modifications
44
+ represent, as a whole, an original work of authorship. For the purposes
45
+ of this License, Derivative Works shall not include works that remain
46
+ separable from, or merely link (or bind by name) to the interfaces of,
47
+ the Work and Derivative Works thereof.
48
+
49
+ "Contribution" shall mean any work of authorship, including
50
+ the original version of the Work and any modifications or additions
51
+ to that Work or Derivative Works thereof, that is intentionally
52
+ submitted to Licensor for inclusion in the Work by the copyright owner
53
+ or by an individual or Legal Entity authorized to submit on behalf of
54
+ the copyright owner. For the purposes of this definition, "submitted"
55
+ means any form of electronic, verbal, or written communication sent
56
+ to the Licensor or its representatives, including but not limited to
57
+ communication on electronic mailing lists, source code control systems,
58
+ and issue tracking systems that are managed by, or on behalf of, the
59
+ Licensor for the purpose of discussing and improving the Work, but
60
+ excluding communication that is conspicuously marked or otherwise
61
+ designated in writing by the copyright owner as "Not a Contribution."
62
+
63
+ "Contributor" shall mean Licensor and any individual or Legal Entity
64
+ on behalf of whom a Contribution has been received by Licensor and
65
+ subsequently incorporated within the Work.
66
+
67
+ 2. Grant of Copyright License. Subject to the terms and conditions of
68
+ this License, each Contributor hereby grants to You a perpetual,
69
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70
+ copyright license to reproduce, prepare Derivative Works of,
71
+ publicly display, publicly perform, sublicense, and distribute the
72
+ Work and such Derivative Works in Source or Object form.
73
+
74
+ 3. Grant of Patent License. Subject to the terms and conditions of
75
+ this License, each Contributor hereby grants to You a perpetual,
76
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
+ (except as stated in this section) patent license to make, have made,
78
+ use, offer to sell, sell, import, and otherwise transfer the Work,
79
+ where such license applies only to those patent claims licensable
80
+ by such Contributor that are necessarily infringed by their
81
+ Contribution(s) alone or by combination of their Contribution(s)
82
+ with the Work to which such Contribution(s) was submitted. If You
83
+ institute patent litigation against any entity (including a
84
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
85
+ or a Contribution incorporated within the Work constitutes direct
86
+ or contributory patent infringement, then any patent licenses
87
+ granted to You under this License for that Work shall terminate
88
+ as of the date such litigation is filed.
89
+
90
+ 4. Redistribution. You may reproduce and distribute copies of the
91
+ Work or Derivative Works thereof in any medium, with or without
92
+ modifications, and in Source or Object form, provided that You
93
+ meet the following conditions:
94
+
95
+ (a) You must give any other recipients of the Work or
96
+ Derivative Works a copy of this License; and
97
+
98
+ (b) You must cause any modified files to carry prominent notices
99
+ stating that You changed the files; and
100
+
101
+ (c) You must retain, in the Source form of any Derivative Works
102
+ that You distribute, all copyright, patent, trademark, and
103
+ attribution notices from the Source form of the Work,
104
+ excluding those notices that do not pertain to any part of
105
+ the Derivative Works; and
106
+
107
+ (d) If the Work includes a "NOTICE" text file as part of its
108
+ distribution, then any Derivative Works that You distribute must
109
+ include a readable copy of the attribution notices contained
110
+ within such NOTICE file, excluding those notices that do not
111
+ pertain to any part of the Derivative Works, in at least one
112
+ of the following places: within a NOTICE text file distributed
113
+ as part of the Derivative Works; within the Source form or
114
+ documentation, if provided along with the Derivative Works; or,
115
+ within a display generated by the Derivative Works, if and
116
+ wherever such third-party notices normally appear. The contents
117
+ of the NOTICE file are for informational purposes only and
118
+ do not modify the License. You may add Your own attribution
119
+ notices within Derivative Works that You distribute, alongside
120
+ or as an addendum to the NOTICE text from the Work, provided
121
+ that such additional attribution notices cannot be construed
122
+ as modifying the License.
123
+
124
+ You may add Your own copyright statement to Your modifications and
125
+ may provide additional or different license terms and conditions
126
+ for use, reproduction, or distribution of Your modifications, or
127
+ for any such Derivative Works as a whole, provided Your use,
128
+ reproduction, and distribution of the Work otherwise complies with
129
+ the conditions stated in this License.
130
+
131
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
132
+ any Contribution intentionally submitted for inclusion in the Work
133
+ by You to the Licensor shall be under the terms and conditions of
134
+ this License, without any additional terms or conditions.
135
+ Notwithstanding the above, nothing herein shall supersede or modify
136
+ the terms of any separate license agreement you may have executed
137
+ with Licensor regarding such Contributions.
138
+
139
+ 6. Trademarks. This License does not grant permission to use the trade
140
+ names, trademarks, service marks, or product names of the Licensor,
141
+ except as required for reasonable and customary use in describing the
142
+ origin of the Work and reproducing the content of the NOTICE file.
143
+
144
+ 7. Disclaimer of Warranty. Unless required by applicable law or
145
+ agreed to in writing, Licensor provides the Work (and each
146
+ Contributor provides its Contributions) on an "AS IS" BASIS,
147
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148
+ implied, including, without limitation, any warranties or conditions
149
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150
+ PARTICULAR PURPOSE. You are solely responsible for determining the
151
+ appropriateness of using or redistributing the Work and assume any
152
+ risks associated with Your exercise of permissions under this License.
153
+
154
+ 8. Limitation of Liability. In no event and under no legal theory,
155
+ whether in tort (including negligence), contract, or otherwise,
156
+ unless required by applicable law (such as deliberate and grossly
157
+ negligent acts) or agreed to in writing, shall any Contributor be
158
+ liable to You for damages, including any direct, indirect, special,
159
+ incidental, or consequential damages of any character arising as a
160
+ result of this License or out of the use or inability to use the
161
+ Work (including but not limited to damages for loss of goodwill,
162
+ work stoppage, computer failure or malfunction, or any and all
163
+ other commercial damages or losses), even if such Contributor
164
+ has been advised of the possibility of such damages.
165
+
166
+ 9. Accepting Warranty or Additional Liability. While redistributing
167
+ the Work or Derivative Works thereof, You may choose to offer,
168
+ and charge a fee for, acceptance of support, warranty, indemnity,
169
+ or other liability obligations and/or rights consistent with this
170
+ License. However, in accepting such obligations, You may act only
171
+ on Your own behalf and on Your sole responsibility, not on behalf
172
+ of any other Contributor, and only if You agree to indemnify,
173
+ defend, and hold each Contributor harmless for any liability
174
+ incurred by, or claims asserted against, such Contributor by reason
175
+ of your accepting any such warranty or additional liability.
176
+
177
+ END OF TERMS AND CONDITIONS
178
+
179
+ APPENDIX: How to apply the Apache License to your work.
180
+
181
+ To apply the Apache License to your work, attach the following
182
+ boilerplate notice, with the fields enclosed by brackets "[]"
183
+ replaced with your own identifying information. (Don't include
184
+ the brackets!) The text should be enclosed in the appropriate
185
+ comment syntax for the file format. We also recommend that a
186
+ file or class name and description of purpose be included on the
187
+ same "printed page" as the copyright notice for easier
188
+ identification within third-party archives.
189
+
190
+ Copyright [yyyy] [name of copyright owner]
191
+
192
+ Licensed under the Apache License, Version 2.0 (the "License");
193
+ you may not use this file except in compliance with the License.
194
+ You may obtain a copy of the License at
195
+
196
+ http://www.apache.org/licenses/LICENSE-2.0
197
+
198
+ Unless required by applicable law or agreed to in writing, software
199
+ distributed under the License is distributed on an "AS IS" BASIS,
200
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
+ See the License for the specific language governing permissions and
202
+ limitations under the License.
data/README.md ADDED
@@ -0,0 +1,60 @@
1
+ # fluent-plugin-s3-arrow
2
+
3
+ Extends the [fluent-plugin-s3](https://github.com/fluent/fluent-plugin-s3) (de)compression algorithm to enable red-arrow compression.
4
+
5
+ ## Installation
6
+
7
+ ### Requirements
8
+
9
+ - Apache Arrow GLib and Apache Parquet GLib
10
+ - See the Apache [Arrow install document](https://arrow.apache.org/install/) for details.
11
+ - [red-arrow](https://github.com/apache/arrow/tree/master/ruby/red-arrow)
12
+ - [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet)
13
+
14
+ ### RubyGems
15
+
16
+ ```
17
+ $ gem install fluent-plugin-s3-arrow
18
+ ```
19
+
20
+ ### Bundler
21
+
22
+ Add the following line to your Gemfile:
23
+
24
+ ```ruby
25
+ gem "fluent-plugin-s3-arrow"
26
+ ```
27
+
28
+ And then execute:
29
+
30
+ ```
31
+ $ bundle
32
+ ```
33
+
34
+ ## Configuration
35
+
36
+ An example fluent-plugin-s3 configuration that uses this plugin:
37
+
38
+ ```
39
+ <match pattern>
40
+ @type s3
41
+
42
+ # fluent-plugin-s3 configurations ...
43
+
44
+ <format>
45
+ @type json # This plugin currently supports only the json formatter.
46
+ </format>
47
+
48
+ store_as arrow
49
+ <arrow>
50
+ schema [
51
+ {"name": "test_string", "type": "string"},
52
+ {"name": "test_uint64", "type": "uint64"}
53
+ ]
54
+ </arrow>
55
+ </match>
56
+ ```
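In addition to `schema`, the `<arrow>` section (defined in `lib/fluent/plugin/s3_compressor_arrow.rb` below) accepts optional `format` (`arrow`, `feather`, or `parquet`; default `arrow`), `compression` (`gzip`, `snappy`, or `zstd`; default none), and `chunk_size` (default 1024) parameters. Not every combination is accepted; for example, `snappy` is rejected for the `arrow` format, and `gzip`/`snappy` for `feather`. A sketch that writes gzip-compressed Parquet, with an illustrative schema:

```
<arrow>
  format parquet
  compression gzip
  schema [
    {"name": "test_string", "type": "string"},
    {"name": "test_uint64", "type": "uint64"}
  ]
</arrow>
```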
57
+
58
+ ## License
59
+
60
+ Apache License, Version 2.0
data/Rakefile ADDED
@@ -0,0 +1,13 @@
1
+ require "bundler"
2
+ Bundler::GemHelper.install_tasks
3
+
4
+ require "rake/testtask"
5
+
6
+ Rake::TestTask.new(:test) do |t|
7
+ t.libs.push("lib", "test")
8
+ t.test_files = FileList["test/**/test_*.rb"]
9
+ t.verbose = true
10
+ t.warning = true
11
+ end
12
+
13
+ task default: [:test]
data/benchmark/compress.yml ADDED
@@ -0,0 +1,23 @@
1
+ prelude: |-
2
+ $LOAD_PATH.unshift(File.expand_path("benchmark"))
3
+ require 'prelude'
4
+ require 's3_compressor_parquet'
5
+
6
+ gzip = create_compressor GZIP_CONFIG
7
+ arrow = create_compressor ARROW_CONFIG
8
+ columnify = create_compressor COLUMNIFY_CONFIG
9
+ chunk = create_chunk
10
+
11
+ benchmark:
12
+ gzip: |-
13
+ Tempfile.create do |tmp|
14
+ gzip.compress(chunk, tmp)
15
+ end
16
+ arrow: |-
17
+ Tempfile.create do |tmp|
18
+ arrow.compress(chunk, tmp)
19
+ end
20
+ columnify: |-
21
+ Tempfile.create do |tmp|
22
+ columnify.compress(chunk, tmp)
23
+ end
data/benchmark/prelude.rb ADDED
@@ -0,0 +1,59 @@
1
+ $LOAD_PATH.unshift(File.expand_path("lib"))
2
+ require "fluent/test"
3
+ require "fluent/test/driver/output"
4
+ require "fluent/test/helpers"
5
+ require "fluent/plugin/out_s3"
6
+ require "fluent/plugin/s3_compressor_arrow"
7
+ require "json"
8
+ require "faker"
9
+
10
+ GZIP_CONFIG = %[
11
+ s3_bucket test_bucket
12
+ store_as gzip
13
+ ]
14
+
15
+ ARROW_CONFIG = %[
16
+ s3_bucket test_bucket
17
+ store_as arrow
18
+ <arrow>
19
+ format parquet
20
+ compression gzip
21
+ schema [
22
+ {"name": "test_string", "type": "string"},
23
+ {"name": "test_uint64", "type": "uint64"},
24
+ {"name": "test_boolean", "type": "boolean"}
25
+ ]
26
+ </arrow>
27
+ ]
28
+
29
+ COLUMNIFY_CONFIG = %[
30
+ s3_bucket test_bucket
31
+ store_as parquet
32
+ <compress>
33
+ schema_type bigquery
34
+ schema_file benchmark/schema.bq.json
35
+ record_type jsonl
36
+ parquet_compression_codec gzip
37
+ </compress>
38
+ ]
39
+
40
+ def create_compressor(conf = CONFIG)
41
+ Fluent::Test::Driver::Output.new(Fluent::Plugin::S3Output) do
42
+ end.configure(conf).instance.instance_variable_get(:@compressor)
43
+ end
44
+
45
+ state = ENV.fetch("FAKER_RANDOM_SEED", 17).to_i
46
+ Faker::Config.random = Random.new(state)
47
+
48
+ def create_chunk
49
+ chunk = Fluent::Plugin::Buffer::MemoryChunk.new(Object.new)
50
+ while chunk.bytesize < 8388608 do # build roughly 8 MiB of records
51
+ data = {
52
+ "test_string" => Faker::Name.name,
53
+ "test_uint64" => Faker::Number.number(digits: 11).to_i,
54
+ "test_boolean" => Faker::Boolean.boolean
55
+ }
56
+ chunk.append([data.to_json + "\n"])
57
+ end
58
+ return chunk
59
+ end
data/benchmark/s3_compressor_parquet.rb ADDED
@@ -0,0 +1,83 @@
1
+ require "open3"
2
+
3
+ module Fluent::Plugin
4
+ class S3Output
5
+ class ParquetCompressor < Compressor
6
+ S3Output.register_compressor("parquet", self)
7
+
8
+ config_section :compress, multi: false do
9
+ desc "parquet compression codec"
10
+ config_param :parquet_compression_codec, :enum, list: [:uncompressed, :snappy, :gzip, :lzo, :brotli, :lz4, :zstd], default: :snappy
11
+ desc "parquet file page size"
12
+ config_param :parquet_page_size, :size, default: 8192
13
+ desc "parquet file row group size"
14
+ config_param :parquet_row_group_size, :size, default: 128 * 1024 * 1024
15
+ desc "record data format type"
16
+ config_param :record_type, :enum, list: [:avro, :csv, :jsonl, :msgpack, :tsv, :json], default: :msgpack
17
+ desc "schema type"
18
+ config_param :schema_type, :enum, list: [:avro, :bigquery], default: :avro
19
+ desc "path to schema file"
20
+ config_param :schema_file, :string
21
+ end
22
+
23
+ def configure(conf)
24
+ super
25
+ check_command("columnify", "-h")
26
+
27
+ if [:lzo, :brotli, :lz4].include?(@compress.parquet_compression_codec)
28
+ raise Fluent::ConfigError, "unsupported compression codec: #{@compress.parquet_compression_codec}"
29
+ end
30
+
31
+ @parquet_compression_codec = @compress.parquet_compression_codec.to_s.upcase
32
+ if @compress.record_type == :json
33
+ @record_type = :jsonl
34
+ else
35
+ @record_type = @compress.record_type
36
+ end
37
+ end
38
+
39
+ def ext
40
+ "parquet".freeze
41
+ end
42
+
43
+ def content_type
44
+ "application/octet-stream".freeze
45
+ end
46
+
47
+ def compress(chunk, tmp)
48
+ chunk_is_file = @buffer_type == "file"
49
+ path = if chunk_is_file
50
+ chunk.path
51
+ else
52
+ w = Tempfile.new("chunk-parquet-tmp")
53
+ w.binmode
54
+ chunk.write_to(w)
55
+ w.close
56
+ w.path
57
+ end
58
+ stdout, stderr, status = columnify(path, tmp.path)
59
+ unless status.success?
60
+ raise "failed to execute columnify command. stdout=#{stdout} stderr=#{stderr} status=#{status.inspect}"
61
+ end
62
+ ensure
63
+ unless chunk_is_file
64
+ w.close(true) rescue nil
65
+ end
66
+ end
67
+
68
+ private
69
+
70
+ def columnify(src_path, dst_path)
71
+ Open3.capture3("columnify",
72
+ "-parquetCompressionCodec", @parquet_compression_codec,
73
+ "-parquetPageSize", @compress.parquet_page_size.to_s,
74
+ "-parquetRowGroupSize", @compress.parquet_row_group_size.to_s,
75
+ "-recordType", @record_type.to_s,
76
+ "-schemaType", @compress.schema_type.to_s,
77
+ "-schemaFile", @compress.schema_file,
78
+ "-output", dst_path,
79
+ src_path)
80
+ end
81
+ end
82
+ end
83
+ end
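For reference, the `columnify` method above shells out to the [columnify](https://github.com/reproio/columnify) CLI. With the defaults of this benchmark compressor and the `COLUMNIFY_CONFIG` from `benchmark/prelude.rb`, the invocation is roughly equivalent to the following (the input and output paths are illustrative):

```
$ columnify \
    -parquetCompressionCodec GZIP \
    -parquetPageSize 8192 \
    -parquetRowGroupSize 134217728 \
    -recordType jsonl \
    -schemaType bigquery \
    -schemaFile benchmark/schema.bq.json \
    -output chunk.parquet \
    chunk.jsonl
```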
data/benchmark/schema.bq.json ADDED
@@ -0,0 +1,17 @@
1
+ [
2
+ {
3
+ "name": "test_string",
4
+ "type": "STRING",
5
+ "mode": "REQUIRED"
6
+ },
7
+ {
8
+ "name": "test_uint64",
9
+ "type": "INTEGER",
10
+ "mode": "REQUIRED"
11
+ },
12
+ {
13
+ "name": "test_boolean",
14
+ "type": "BOOLEAN",
15
+ "mode": "REQUIRED"
16
+ }
17
+ ]
data/fluent-plugin-s3-arrow.gemspec ADDED
@@ -0,0 +1,31 @@
1
+ lib = File.expand_path("../lib", __FILE__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+
4
+ Gem::Specification.new do |spec|
5
+ spec.name = "fluent-plugin-s3-arrow"
6
+ spec.version = "0.1.0"
7
+ spec.authors = ["kanga333"]
8
+ spec.email = ["e411z7t40w@gmail.com"]
9
+
10
+ spec.summary = %q{Extends the fluent-plugin-s3 (de)compression algorithm to enable red-arrow compression.}
11
+ spec.description = %q{Extends the fluent-plugin-s3 (de)compression algorithm to enable red-arrow compression.}
12
+ spec.homepage = "https://github.com/red-data-tools/fluent-plugin-s3-arrow"
13
+ spec.license = "Apache-2.0"
14
+
15
+ test_files, files = `git ls-files -z`.split("\x0").partition do |f|
16
+ f.match(%r{^(test|spec|features)/})
17
+ end
18
+ spec.files = files
19
+ spec.executables = files.grep(%r{^bin/}) { |f| File.basename(f) }
20
+ spec.test_files = test_files
21
+ spec.require_paths = ["lib"]
22
+
23
+ spec.add_development_dependency "benchmark-driver"
24
+ spec.add_development_dependency "faker"
25
+ spec.add_development_dependency "rake", "~> 12.0"
26
+ spec.add_development_dependency "test-unit", "~> 3.0"
27
+ spec.add_runtime_dependency "fluentd", [">= 0.14.10", "< 2"]
28
+ spec.add_runtime_dependency "fluent-plugin-s3", ">= 1.0"
29
+ spec.add_runtime_dependency "red-arrow", ">= 1.0"
30
+ spec.add_runtime_dependency "red-parquet", ">= 1.0"
31
+ end
data/lib/fluent/plugin/s3_compressor_arrow.rb ADDED
@@ -0,0 +1,56 @@
1
+ require 'arrow'
2
+ require 'parquet'
3
+
4
+ module Fluent::Plugin
5
+ class S3Output
6
+ class ArrowCompressor < Compressor
7
+ S3Output.register_compressor('arrow', self)
8
+
9
+ INVALID_COMBINATIONS = {
10
+ :arrow => [:snappy],
11
+ :feather => [:gzip, :snappy],
12
+ }
13
+
14
+ config_section :arrow, multi: false do
15
+ config_param :schema, :array
16
+ config_param :format, :enum, list: [:arrow, :feather, :parquet], default: :arrow
17
+ SUPPORTED_COMPRESSION = [:gzip, :snappy, :zstd]
18
+ config_param :compression, :enum, list: SUPPORTED_COMPRESSION, default: nil
19
+ config_param :chunk_size, :integer, default: 1024
20
+ end
21
+
22
+ def configure(conf)
23
+ super
24
+
25
+ if INVALID_COMBINATIONS[@arrow.format]&.include? @arrow.compression
26
+ raise Fluent::ConfigError, "#{@arrow.compression} compression is not supported with the #{@arrow.format} format"
27
+ end
28
+
29
+ @schema = Arrow::Schema.new(@arrow.schema)
30
+ @options = Arrow::JSONReadOptions.new
31
+ @options.schema = @schema
32
+ @options.unexpected_field_behavior = :ignore
33
+ end
34
+
35
+ def ext
36
+ @arrow.format.freeze
37
+ end
38
+
39
+ def content_type
40
+ 'application/x-apache-arrow-file'.freeze
41
+ end
42
+
43
+ def compress(chunk, tmp)
44
+ buffer = Arrow::Buffer.new(chunk.read)
45
+ stream = Arrow::BufferInputStream.new(buffer)
46
+ table = Arrow::JSONReader.new(stream, @options)
47
+
48
+ table.read.save(tmp,
49
+ format: @arrow.format,
50
+ chunk_size: @arrow.chunk_size,
51
+ compression: @arrow.compression,
52
+ )
53
+ end
54
+ end
55
+ end
56
+ end
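The conversion performed by `ArrowCompressor#compress` can be reproduced outside Fluentd with red-arrow alone. A minimal sketch mirroring the calls above, with an illustrative schema, input, and output path:

```ruby
require "arrow"
require "parquet"

# Schema in the same hash form the plugin builds from its <arrow> section.
schema = Arrow::Schema.new([
  {"name" => "test_string", "type" => "string"},
  {"name" => "test_uint64", "type" => "uint64"},
])

options = Arrow::JSONReadOptions.new
options.schema = schema
options.unexpected_field_behavior = :ignore # drop fields not in the schema

# Newline-delimited JSON, as produced by the json formatter.
json = <<~JSONL
  {"test_string": "record1", "test_uint64": 1}
  {"test_string": "record2", "test_uint64": 2}
JSONL

buffer = Arrow::Buffer.new(json)
stream = Arrow::BufferInputStream.new(buffer)
table = Arrow::JSONReader.new(stream, options).read

# Same save call the compressor uses; "out.parquet" is an illustrative path.
table.save("out.parquet", format: :parquet, compression: :gzip)
```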
data/test/helper.rb ADDED
@@ -0,0 +1,8 @@
1
+ $LOAD_PATH.unshift(File.expand_path("../../", __FILE__))
2
+ require "test-unit"
3
+ require "fluent/test"
4
+ require "fluent/test/driver/output"
5
+ require "fluent/test/helpers"
6
+
7
+ Test::Unit::TestCase.include(Fluent::Test::Helpers)
8
+ Test::Unit::TestCase.extend(Fluent::Test::Helpers)
data/test/plugin/test_s3_compressor_arrow.rb ADDED
@@ -0,0 +1,132 @@
1
+ require "helper"
2
+ require 'json'
3
+ require "fluent/plugin/out_s3"
4
+ require "fluent/plugin/s3_compressor_arrow"
5
+ require "fluent/plugin/output"
6
+
7
+ class S3OutputTest < Test::Unit::TestCase
8
+ def setup
9
+ Fluent::Test.setup
10
+ end
11
+
12
+ S3_CONFIG = {"s3_bucket" => "test", "store_as" => "arrow"}
13
+ SCHEMA = [
14
+ {"name": "test_string", "type": "string"},
15
+ {"name": "test_uint64", "type": "uint64"},
16
+ ]
17
+ CONFIG = config_element("ROOT", "", S3_CONFIG, [config_element("arrow", "", {"schema" => SCHEMA})])
18
+
19
+ def test_configure
20
+ d = create_driver
21
+ c = d.instance.instance_variable_get(:@compressor)
22
+ assert_equal :arrow, c.ext
23
+ assert_equal 'application/x-apache-arrow-file', c.content_type
24
+ assert c.instance_variable_get(:@schema).is_a?(Arrow::Schema)
25
+ assert_equal 1024, c.instance_variable_get(:@arrow).chunk_size
26
+ end
27
+
28
+ data(
29
+ 'arrow_snappy': ['arrow', 'snappy'],
30
+ 'feather_gzip': ['feather', 'gzip'],
31
+ 'feather_snappy': ['feather', 'snappy'],
32
+ )
33
+ def test_invalid_configure
34
+ format, compression = data
35
+ arrow_config = config_element("arrow", "", { "schema" => SCHEMA,
36
+ "format" => format,
37
+ "compression" => compression,
38
+ })
39
+ config = config_element("ROOT", "", S3_CONFIG, [arrow_config])
40
+ assert_raise Fluent::ConfigError do
41
+ create_driver(config)
42
+ end
43
+ end
44
+
45
+ def test_compress
46
+ d = create_driver
47
+ c = d.instance.instance_variable_get(:@compressor)
48
+
49
+ chunk = Fluent::Plugin::Buffer::MemoryChunk.new(Object.new)
50
+ d1 = {"test_string" => 'record1', "test_uint64" => 1}
51
+ d2 = {"test_string" => 'record2', "test_uint64" => 2, "unexpected_field" => false}
52
+ expected_d2 = d2.dup
53
+ expected_d2.delete "unexpected_field"
54
+ chunk.append([d1.to_json + "\n", d2.to_json + "\n"])
55
+
56
+ Tempfile.create do |tmp|
57
+ c.compress(chunk, tmp)
58
+ Arrow::MemoryMappedInputStream.open(tmp.path) do |input|
59
+ reader = Arrow::RecordBatchFileReader.new(input)
60
+ reader.each do |record_batch|
61
+ assert_equal([d1, expected_d2], record_batch.collect(&:to_h))
62
+ end
63
+ end
64
+ end
65
+ end
66
+
67
+ data(gzip: "gzip", zstd: "zstd")
68
+ def test_compress_with_compression
69
+ arrow_config = config_element("arrow", "", { "schema" => SCHEMA,
70
+ "compression" => data,
71
+ })
72
+ config = config_element("ROOT", "", S3_CONFIG, [arrow_config])
73
+
74
+ d = create_driver(config)
75
+ c = d.instance.instance_variable_get(:@compressor)
76
+
77
+ chunk = Fluent::Plugin::Buffer::MemoryChunk.new(Object.new)
78
+ d1 = {"test_string" => 'record1', "test_uint64" => 1}
79
+ d2 = {"test_string" => 'record2', "test_uint64" => 2}
80
+ chunk.append([d1.to_json + "\n", d2.to_json + "\n"])
81
+ codec = Arrow::Codec.new(data.to_sym)
82
+
83
+ Tempfile.create do |tmp|
84
+ c.compress(chunk, tmp)
85
+ raw_input = Arrow::MemoryMappedInputStream.open(tmp.path)
86
+ Arrow::CompressedInputStream.new(codec,raw_input) do |input|
87
+ reader = Arrow::RecordBatchFileReader.new(input)
88
+ reader.each do |record_batch|
89
+ assert_equal([d1, d2], record_batch.collect(&:to_h))
90
+ end
91
+ end
92
+ end
93
+ end
94
+
95
+ data(
96
+ 'parquet_gzip': ['parquet', 'gzip'],
97
+ 'parquet_snappy': ['parquet', 'snappy'],
98
+ 'parquet_zstd': ['parquet', 'zstd'],
99
+ 'feather_zstd': ['feather', 'zstd'],
100
+ )
101
+ def test_compress_with_format
102
+ format, compression = data
103
+ arrow_config = config_element("arrow", "", { "schema" => SCHEMA,
104
+ "format" => format,
105
+ "compression" => compression,
106
+ })
107
+ config = config_element("ROOT", "", S3_CONFIG, [arrow_config])
108
+
109
+ d = create_driver(config)
110
+ c = d.instance.instance_variable_get(:@compressor)
111
+
112
+ chunk = Fluent::Plugin::Buffer::MemoryChunk.new(Object.new)
113
+ d1 = {"test_string" => 'record1', "test_uint64" => 1}
114
+ d2 = {"test_string" => 'record2', "test_uint64" => 2}
115
+ chunk.append([d1.to_json + "\n", d2.to_json + "\n"])
116
+
117
+ Tempfile.create do |tmp|
118
+ c.compress(chunk, tmp)
119
+ table = Arrow::Table.load(tmp.path, format: format.to_sym, compress: compression.to_sym)
120
+ table.each_record_batch do |record_batch|
121
+ assert_equal([d1, d2], record_batch.collect(&:to_h))
122
+ end
123
+ end
124
+ end
125
+
126
+ private
127
+
128
+ def create_driver(conf = CONFIG)
129
+ Fluent::Test::Driver::Output.new(Fluent::Plugin::S3Output) do
130
+ end.configure(conf)
131
+ end
132
+ end
metadata ADDED
@@ -0,0 +1,181 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: fluent-plugin-s3-arrow
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - kanga333
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-09-30 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: benchmark-driver
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: faker
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '12.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '12.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: test-unit
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: fluentd
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: 0.14.10
76
+ - - "<"
77
+ - !ruby/object:Gem::Version
78
+ version: '2'
79
+ type: :runtime
80
+ prerelease: false
81
+ version_requirements: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: 0.14.10
86
+ - - "<"
87
+ - !ruby/object:Gem::Version
88
+ version: '2'
89
+ - !ruby/object:Gem::Dependency
90
+ name: fluent-plugin-s3
91
+ requirement: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: '1.0'
96
+ type: :runtime
97
+ prerelease: false
98
+ version_requirements: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: '1.0'
103
+ - !ruby/object:Gem::Dependency
104
+ name: red-arrow
105
+ requirement: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: '1.0'
110
+ type: :runtime
111
+ prerelease: false
112
+ version_requirements: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: '1.0'
117
+ - !ruby/object:Gem::Dependency
118
+ name: red-parquet
119
+ requirement: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: '1.0'
124
+ type: :runtime
125
+ prerelease: false
126
+ version_requirements: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '1.0'
131
+ description: Extends the fluent-plugin-s3 (de)compression algorithm to enable red-arrow
132
+ compression.
133
+ email:
134
+ - e411z7t40w@gmail.com
135
+ executables: []
136
+ extensions: []
137
+ extra_rdoc_files: []
138
+ files:
139
+ - ".github/workflows/bench.yml"
140
+ - ".github/workflows/test.yml"
141
+ - ".gitignore"
142
+ - Dockerfile
143
+ - Gemfile
144
+ - LICENSE
145
+ - README.md
146
+ - Rakefile
147
+ - benchmark/compress.yml
148
+ - benchmark/prelude.rb
149
+ - benchmark/s3_compressor_parquet.rb
150
+ - benchmark/schema.bq.json
151
+ - fluent-plugin-s3-arrow.gemspec
152
+ - lib/fluent/plugin/s3_compressor_arrow.rb
153
+ - test/helper.rb
154
+ - test/plugin/test_s3_compressor_arrow.rb
155
+ homepage: https://github.com/red-data-tools/fluent-plugin-s3-arrow
156
+ licenses:
157
+ - Apache-2.0
158
+ metadata: {}
159
+ post_install_message:
160
+ rdoc_options: []
161
+ require_paths:
162
+ - lib
163
+ required_ruby_version: !ruby/object:Gem::Requirement
164
+ requirements:
165
+ - - ">="
166
+ - !ruby/object:Gem::Version
167
+ version: '0'
168
+ required_rubygems_version: !ruby/object:Gem::Requirement
169
+ requirements:
170
+ - - ">="
171
+ - !ruby/object:Gem::Version
172
+ version: '0'
173
+ requirements: []
174
+ rubygems_version: 3.0.3
175
+ signing_key:
176
+ specification_version: 4
177
+ summary: Extends the fluent-plugin-s3 (de)compression algorithm to enable red-arrow
178
+ compression.
179
+ test_files:
180
+ - test/helper.rb
181
+ - test/plugin/test_s3_compressor_arrow.rb