fluent-plugin-arrow 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/fluent-plugin-arrow.gemspec +1 -1
- data/lib/fluent/plugin/buf_arrow_file.rb +62 -0
- data/lib/fluent/plugin/buffer/arrow_buffer_string_builder.rb +44 -0
- data/lib/fluent/plugin/buffer/arrow_file_chunk.rb +63 -0
- data/lib/fluent/plugin/buffer/arrow_memory_chunk.rb +5 -29
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9325c65f679d81a53d202a11a842a6c3a82b1cc3d600203064fc1329eeb3ded2
|
4
|
+
data.tar.gz: 134a1bb09054b3feeed480efb0d7df558ad8a8131dbe3951d73395511ed2e843
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c72b02f95969d7dd8cdbfe525b579afc23a16f8fae1fdd72ac095479f94514c890287f7b3c41822888965a4f9554b33b5ed49e23d92401cefbfee3f8f8feb9b0
|
7
|
+
data.tar.gz: 18eaeb2135ecc0339e8986db79545a772aeeef0a2499119bfd3b2194beb125c5be518d6bb86dd0b5caaae1c1d842f98926557219c8fd6586fa04e0ac51b34e2c
|
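data/fluent-plugin-arrow.gemspec
CHANGED
[hunk not captured in this extract; +1 -1 per the file list above]
data/lib/fluent/plugin/buf_arrow_file.rb
ADDED
@@ -0,0 +1,62 @@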
|
|
1
|
+
|
2
|
+
#
|
3
|
+
# Copyright 2018- joker1007
|
4
|
+
#
|
5
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require "arrow"
|
18
|
+
require 'fluent/plugin/buf_file'
|
19
|
+
require 'fluent/plugin/buffer/arrow_file_chunk'
|
20
|
+
require 'fluent/plugin/arrow/field_wrapper'
|
21
|
+
|
22
|
+
module Fluent
|
23
|
+
module Plugin
|
24
|
+
class ArrowFileBuffer < Fluent::Plugin::FileBuffer
|
25
|
+
Plugin.register_buffer('arrow_file', self)
|
26
|
+
|
27
|
+
config_param :schema, :array
|
28
|
+
config_param :arrow_format, :enum, list: [:arrow, :parquet], default: :arrow
|
29
|
+
config_param :row_group_chunk_size, :integer, default: 1024
|
30
|
+
|
31
|
+
attr_reader :arrow_schema
|
32
|
+
|
33
|
+
def configure(conf)
|
34
|
+
super
|
35
|
+
|
36
|
+
# [{"name" => foo1, "type" => "uint64"}, {"name" => foo2, "type" => "struct", "fields" => [{"name" => bar1, "type" => "string"}]}
|
37
|
+
@field_wrappers = @schema.each_with_object({}) do |field, h|
|
38
|
+
h[field["name"]] = Fluent::Plugin::Arrow::FieldWrapper.build(field)
|
39
|
+
end
|
40
|
+
|
41
|
+
@arrow_schema = ::Arrow::Schema.new(@field_wrappers.values.map(&:arrow_field))
|
42
|
+
end
|
43
|
+
|
44
|
+
def resume
|
45
|
+
return {}, []
|
46
|
+
end
|
47
|
+
|
48
|
+
def generate_chunk(metadata)
|
49
|
+
# FileChunk generates real path with unique_id
|
50
|
+
if @file_permission
|
51
|
+
chunk = Fluent::Plugin::Buffer::ArrowFileChunk.new(metadata, @path, :create, @arrow_schema, @field_wrappers, perm: @file_permission, chunk_size: @row_group_chunk_size, format: @arrow_format)
|
52
|
+
else
|
53
|
+
chunk = Fluent::Plugin::Buffer::ArrowFileChunk.new(metadata, @path, :create, @arrow_schema, @field_wrappers, chunk_size: @row_group_chunk_size, format: @arrow_format)
|
54
|
+
end
|
55
|
+
|
56
|
+
log.debug "Created new chunk", chunk_id: dump_unique_id_hex(chunk.unique_id), metadata: metadata
|
57
|
+
|
58
|
+
return chunk
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
data/lib/fluent/plugin/buffer/arrow_buffer_string_builder.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
module Fluent
|
2
|
+
module Plugin
|
3
|
+
class Buffer
|
4
|
+
module ArrowBufferStringBuilder
|
5
|
+
|
6
|
+
private
|
7
|
+
|
8
|
+
def each_record(&block)
|
9
|
+
raise NotImplementedError
|
10
|
+
end
|
11
|
+
|
12
|
+
def build_arrow_buffer_string
|
13
|
+
count = 0
|
14
|
+
each_record do |record|
|
15
|
+
count += 1
|
16
|
+
record.each do |k, v|
|
17
|
+
@field_wrappers[k].append(v)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
arrow_buf = ::Arrow::ResizableBuffer.new(bytesize * 1.2)
|
21
|
+
|
22
|
+
::Arrow::BufferOutputStream.open(arrow_buf) do |output|
|
23
|
+
if @format == :parquet
|
24
|
+
Parquet::ArrowFileWriter.open(@schema, output) do |writer|
|
25
|
+
columns = @schema.fields.map do |f|
|
26
|
+
::Arrow::Column.new(f, @field_wrappers[f.name].finish)
|
27
|
+
end
|
28
|
+
table = ::Arrow::Table.new(@schema, columns)
|
29
|
+
writer.write_table(table, @chunk_size)
|
30
|
+
end
|
31
|
+
else
|
32
|
+
::Arrow::RecordBatchFileWriter.open(output, @schema) do |writer|
|
33
|
+
record_batch = ::Arrow::RecordBatch.new(@schema, count, @field_wrappers.values.map(&:finish))
|
34
|
+
writer.write_record_batch(record_batch)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
arrow_buf.data.to_s
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
data/lib/fluent/plugin/buffer/arrow_file_chunk.rb
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
#
|
2
|
+
# Copyright 2018- joker1007
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
# you may not use this file except in compliance with the License.
|
6
|
+
# You may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
# See the License for the specific language governing permissions and
|
14
|
+
# limitations under the License.
|
15
|
+
|
16
|
+
require 'arrow'
|
17
|
+
require 'parquet'
|
18
|
+
require 'fluent/msgpack_factory'
|
19
|
+
require 'fluent/plugin/buffer/chunk'
|
20
|
+
require 'fluent/plugin/buffer/file_chunk'
|
21
|
+
require 'fluent/plugin/buffer/arrow_buffer_string_builder'
|
22
|
+
require 'fluent/plugin/arrow/field_wrapper'
|
23
|
+
|
24
|
+
module Fluent
|
25
|
+
module Plugin
|
26
|
+
class Buffer
|
27
|
+
class ArrowFileChunk < FileChunk
|
28
|
+
include ArrowBufferStringBuilder
|
29
|
+
|
30
|
+
def initialize(metadata, path, mode, schema, field_wrappers, perm: system_config.file_permission || FILE_PERMISSION, chunk_size: 1024, format: :arrow)
|
31
|
+
super(metadata, path, mode, perm: perm, compress: :text)
|
32
|
+
@schema = schema
|
33
|
+
@field_wrappers = field_wrappers
|
34
|
+
@chunk_size = chunk_size
|
35
|
+
@format = format
|
36
|
+
end
|
37
|
+
|
38
|
+
def read(**kwargs)
|
39
|
+
@chunk.seek(0, IO::SEEK_SET)
|
40
|
+
build_arrow_buffer_string
|
41
|
+
end
|
42
|
+
|
43
|
+
def open(**kwargs, &block)
|
44
|
+
@chunk.seek(0, IO::SEEK_SET)
|
45
|
+
val = StringIO.open(build_arrow_buffer_string, &block)
|
46
|
+
@chunk.seek(0, IO::SEEK_END) if self.staged?
|
47
|
+
val
|
48
|
+
end
|
49
|
+
|
50
|
+
def write_to(io, **kwargs)
|
51
|
+
@chunk.seek(0, IO::SEEK_SET)
|
52
|
+
io.write build_arrow_buffer_string
|
53
|
+
end
|
54
|
+
|
55
|
+
private
|
56
|
+
|
57
|
+
def each_record(&block)
|
58
|
+
Fluent::MessagePackFactory.engine_factory.unpacker(@chunk).each(&block)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
data/lib/fluent/plugin/buffer/arrow_memory_chunk.rb
CHANGED
@@ -18,19 +18,21 @@ require 'parquet'
|
|
18
18
|
require 'fluent/msgpack_factory'
|
19
19
|
require 'fluent/plugin/buffer/chunk'
|
20
20
|
require 'fluent/plugin/buffer/memory_chunk'
|
21
|
+
require 'fluent/plugin/buffer/arrow_buffer_string_builder'
|
21
22
|
require 'fluent/plugin/arrow/field_wrapper'
|
22
23
|
|
23
24
|
module Fluent
|
24
25
|
module Plugin
|
25
26
|
class Buffer
|
26
27
|
class ArrowMemoryChunk < MemoryChunk
|
28
|
+
include ArrowBufferStringBuilder
|
29
|
+
|
27
30
|
def initialize(metadata, schema, field_wrappers, chunk_size: 1024, format: :arrow)
|
28
31
|
super(metadata, compress: :text)
|
29
32
|
@schema = schema
|
30
33
|
@field_wrappers = field_wrappers
|
31
34
|
@chunk_size = chunk_size
|
32
35
|
@format = format
|
33
|
-
@unpacker = Fluent::MessagePackFactory.engine_factory.unpacker
|
34
36
|
end
|
35
37
|
|
36
38
|
def read(**kwargs)
|
@@ -48,34 +50,8 @@ module Fluent
|
|
48
50
|
|
49
51
|
private
|
50
52
|
|
51
|
-
def build_arrow_buffer_string
|
52
|
-
count = 0
|
53
|
-
@unpacker.feed_each(@chunk) do |record|
|
54
|
-
count += 1
|
55
|
-
record.each do |k, v|
|
56
|
-
@field_wrappers[k].append(v)
|
57
|
-
end
|
58
|
-
end
|
59
|
-
arrow_buf = ::Arrow::ResizableBuffer.new(@chunk_bytes * 1.2)
|
60
|
-
|
61
|
-
::Arrow::BufferOutputStream.open(arrow_buf) do |output|
|
62
|
-
if @format == :parquet
|
63
|
-
Parquet::ArrowFileWriter.open(@schema, output) do |writer|
|
64
|
-
columns = @schema.fields.map do |f|
|
65
|
-
::Arrow::Column.new(f, @field_wrappers[f.name].finish)
|
66
|
-
end
|
67
|
-
table = ::Arrow::Table.new(@schema, columns)
|
68
|
-
writer.write_table(table, @chunk_size)
|
69
|
-
end
|
70
|
-
else
|
71
|
-
::Arrow::RecordBatchFileWriter.open(output, @schema) do |writer|
|
72
|
-
record_batch = ::Arrow::RecordBatch.new(@schema, count, @field_wrappers.values.map(&:finish))
|
73
|
-
writer.write_record_batch(record_batch)
|
74
|
-
end
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
arrow_buf.data.to_s
|
53
|
+
def each_record(&block)
|
54
|
+
Fluent::MessagePackFactory.engine_factory.unpacker.feed_each(@chunk, &block)
|
79
55
|
end
|
80
56
|
end
|
81
57
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fluent-plugin-arrow
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- joker1007
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-12-
|
11
|
+
date: 2018-12-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -114,7 +114,10 @@ files:
|
|
114
114
|
- Rakefile
|
115
115
|
- fluent-plugin-arrow.gemspec
|
116
116
|
- lib/fluent/plugin/arrow/field_wrapper.rb
|
117
|
+
- lib/fluent/plugin/buf_arrow_file.rb
|
117
118
|
- lib/fluent/plugin/buf_arrow_memory.rb
|
119
|
+
- lib/fluent/plugin/buffer/arrow_buffer_string_builder.rb
|
120
|
+
- lib/fluent/plugin/buffer/arrow_file_chunk.rb
|
118
121
|
- lib/fluent/plugin/buffer/arrow_memory_chunk.rb
|
119
122
|
- lib/fluent/plugin/formatter_arrow.rb
|
120
123
|
- test/helper.rb
|