fluent-plugin-arrow 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a1ed6b1b40e55501097a3a0455b2f2be93b3cf55d7e8cb831d4dc742f4007b97
4
- data.tar.gz: fdcaed4462a919076c29f5947c3c81744b887e0099d2e5ad7a317e6845391da1
3
+ metadata.gz: 9325c65f679d81a53d202a11a842a6c3a82b1cc3d600203064fc1329eeb3ded2
4
+ data.tar.gz: 134a1bb09054b3feeed480efb0d7df558ad8a8131dbe3951d73395511ed2e843
5
5
  SHA512:
6
- metadata.gz: 5f219215b1f8b8a57b2f9d79a6d1fdec39e10652c6d674a8d1390f7f3ce75d8752c15e89b268e36fb781bed44cfed2e128fd840b0978eb508c75dec1fc2b82a1
7
- data.tar.gz: 2d473fbead6b6c12e361b045e7de325652b36c22d94685be289bc8a076c03f4dcb8b2c5270d922f76c0f462475d6846c96176abf4085abf24affaad0bc4b04fd
6
+ metadata.gz: c72b02f95969d7dd8cdbfe525b579afc23a16f8fae1fdd72ac095479f94514c890287f7b3c41822888965a4f9554b33b5ed49e23d92401cefbfee3f8f8feb9b0
7
+ data.tar.gz: 18eaeb2135ecc0339e8986db79545a772aeeef0a2499119bfd3b2194beb125c5be518d6bb86dd0b5caaae1c1d842f98926557219c8fd6586fa04e0ac51b34e2c
@@ -3,7 +3,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
3
 
4
4
  Gem::Specification.new do |spec|
5
5
  spec.name = "fluent-plugin-arrow"
6
- spec.version = "0.0.2"
6
+ spec.version = "0.0.3"
7
7
  spec.authors = ["joker1007"]
8
8
  spec.email = ["kakyoin.hierophant@gmail.com"]
9
9
 
@@ -0,0 +1,62 @@
1
+
2
+ #
3
+ # Copyright 2018- joker1007
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ require "arrow"
18
+ require 'fluent/plugin/buf_file'
19
+ require 'fluent/plugin/buffer/arrow_file_chunk'
20
+ require 'fluent/plugin/arrow/field_wrapper'
21
+
22
+ module Fluent
23
+ module Plugin
24
+ class ArrowFileBuffer < Fluent::Plugin::FileBuffer
25
+ Plugin.register_buffer('arrow_file', self)
26
+
27
+ config_param :schema, :array
28
+ config_param :arrow_format, :enum, list: [:arrow, :parquet], default: :arrow
29
+ config_param :row_group_chunk_size, :integer, default: 1024
30
+
31
+ attr_reader :arrow_schema
32
+
33
+ def configure(conf)
34
+ super
35
+
36
+ # [{"name" => foo1, "type" => "uint64"}, {"name" => foo2, "type" => "struct", "fields" => [{"name" => bar1, "type" => "string"}]}]
37
+ @field_wrappers = @schema.each_with_object({}) do |field, h|
38
+ h[field["name"]] = Fluent::Plugin::Arrow::FieldWrapper.build(field)
39
+ end
40
+
41
+ @arrow_schema = ::Arrow::Schema.new(@field_wrappers.values.map(&:arrow_field))
42
+ end
43
+
44
+ def resume
45
+ return {}, []
46
+ end
47
+
48
+ def generate_chunk(metadata)
49
+ # FileChunk generates real path with unique_id
50
+ if @file_permission
51
+ chunk = Fluent::Plugin::Buffer::ArrowFileChunk.new(metadata, @path, :create, @arrow_schema, @field_wrappers, perm: @file_permission, chunk_size: @row_group_chunk_size, format: @arrow_format)
52
+ else
53
+ chunk = Fluent::Plugin::Buffer::ArrowFileChunk.new(metadata, @path, :create, @arrow_schema, @field_wrappers, chunk_size: @row_group_chunk_size, format: @arrow_format)
54
+ end
55
+
56
+ log.debug "Created new chunk", chunk_id: dump_unique_id_hex(chunk.unique_id), metadata: metadata
57
+
58
+ return chunk
59
+ end
60
+ end
61
+ end
62
+ end
@@ -0,0 +1,44 @@
1
+ module Fluent
2
+ module Plugin
3
+ class Buffer
4
+ module ArrowBufferStringBuilder
5
+
6
+ private
7
+
8
+ def each_record(&block)
9
+ raise NotImplementedError
10
+ end
11
+
12
+ def build_arrow_buffer_string
13
+ count = 0
14
+ each_record do |record|
15
+ count += 1
16
+ record.each do |k, v|
17
+ @field_wrappers[k].append(v)
18
+ end
19
+ end
20
+ arrow_buf = ::Arrow::ResizableBuffer.new(bytesize * 1.2)
21
+
22
+ ::Arrow::BufferOutputStream.open(arrow_buf) do |output|
23
+ if @format == :parquet
24
+ Parquet::ArrowFileWriter.open(@schema, output) do |writer|
25
+ columns = @schema.fields.map do |f|
26
+ ::Arrow::Column.new(f, @field_wrappers[f.name].finish)
27
+ end
28
+ table = ::Arrow::Table.new(@schema, columns)
29
+ writer.write_table(table, @chunk_size)
30
+ end
31
+ else
32
+ ::Arrow::RecordBatchFileWriter.open(output, @schema) do |writer|
33
+ record_batch = ::Arrow::RecordBatch.new(@schema, count, @field_wrappers.values.map(&:finish))
34
+ writer.write_record_batch(record_batch)
35
+ end
36
+ end
37
+ end
38
+
39
+ arrow_buf.data.to_s
40
+ end
41
+ end
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,63 @@
1
+ #
2
+ # Copyright 2018- joker1007
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ require 'arrow'
17
+ require 'parquet'
18
+ require 'fluent/msgpack_factory'
19
+ require 'fluent/plugin/buffer/chunk'
20
+ require 'fluent/plugin/buffer/file_chunk'
21
+ require 'fluent/plugin/buffer/arrow_buffer_string_builder'
22
+ require 'fluent/plugin/arrow/field_wrapper'
23
+
24
+ module Fluent
25
+ module Plugin
26
+ class Buffer
27
+ class ArrowFileChunk < FileChunk
28
+ include ArrowBufferStringBuilder
29
+
30
+ def initialize(metadata, path, mode, schema, field_wrappers, perm: system_config.file_permission || FILE_PERMISSION, chunk_size: 1024, format: :arrow)
31
+ super(metadata, path, mode, perm: perm, compress: :text)
32
+ @schema = schema
33
+ @field_wrappers = field_wrappers
34
+ @chunk_size = chunk_size
35
+ @format = format
36
+ end
37
+
38
+ def read(**kwargs)
39
+ @chunk.seek(0, IO::SEEK_SET)
40
+ build_arrow_buffer_string
41
+ end
42
+
43
+ def open(**kwargs, &block)
44
+ @chunk.seek(0, IO::SEEK_SET)
45
+ val = StringIO.open(build_arrow_buffer_string, &block)
46
+ @chunk.seek(0, IO::SEEK_END) if self.staged?
47
+ val
48
+ end
49
+
50
+ def write_to(io, **kwargs)
51
+ @chunk.seek(0, IO::SEEK_SET)
52
+ io.write build_arrow_buffer_string
53
+ end
54
+
55
+ private
56
+
57
+ def each_record(&block)
58
+ Fluent::MessagePackFactory.engine_factory.unpacker(@chunk).each(&block)
59
+ end
60
+ end
61
+ end
62
+ end
63
+ end
@@ -18,19 +18,21 @@ require 'parquet'
18
18
  require 'fluent/msgpack_factory'
19
19
  require 'fluent/plugin/buffer/chunk'
20
20
  require 'fluent/plugin/buffer/memory_chunk'
21
+ require 'fluent/plugin/buffer/arrow_buffer_string_builder'
21
22
  require 'fluent/plugin/arrow/field_wrapper'
22
23
 
23
24
  module Fluent
24
25
  module Plugin
25
26
  class Buffer
26
27
  class ArrowMemoryChunk < MemoryChunk
28
+ include ArrowBufferStringBuilder
29
+
27
30
  def initialize(metadata, schema, field_wrappers, chunk_size: 1024, format: :arrow)
28
31
  super(metadata, compress: :text)
29
32
  @schema = schema
30
33
  @field_wrappers = field_wrappers
31
34
  @chunk_size = chunk_size
32
35
  @format = format
33
- @unpacker = Fluent::MessagePackFactory.engine_factory.unpacker
34
36
  end
35
37
 
36
38
  def read(**kwargs)
@@ -48,34 +50,8 @@ module Fluent
48
50
 
49
51
  private
50
52
 
51
- def build_arrow_buffer_string
52
- count = 0
53
- @unpacker.feed_each(@chunk) do |record|
54
- count += 1
55
- record.each do |k, v|
56
- @field_wrappers[k].append(v)
57
- end
58
- end
59
- arrow_buf = ::Arrow::ResizableBuffer.new(@chunk_bytes * 1.2)
60
-
61
- ::Arrow::BufferOutputStream.open(arrow_buf) do |output|
62
- if @format == :parquet
63
- Parquet::ArrowFileWriter.open(@schema, output) do |writer|
64
- columns = @schema.fields.map do |f|
65
- ::Arrow::Column.new(f, @field_wrappers[f.name].finish)
66
- end
67
- table = ::Arrow::Table.new(@schema, columns)
68
- writer.write_table(table, @chunk_size)
69
- end
70
- else
71
- ::Arrow::RecordBatchFileWriter.open(output, @schema) do |writer|
72
- record_batch = ::Arrow::RecordBatch.new(@schema, count, @field_wrappers.values.map(&:finish))
73
- writer.write_record_batch(record_batch)
74
- end
75
- end
76
- end
77
-
78
- arrow_buf.data.to_s
53
+ def each_record(&block)
54
+ Fluent::MessagePackFactory.engine_factory.unpacker.feed_each(@chunk, &block)
79
55
  end
80
56
  end
81
57
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fluent-plugin-arrow
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - joker1007
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-12-01 00:00:00.000000000 Z
11
+ date: 2018-12-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -114,7 +114,10 @@ files:
114
114
  - Rakefile
115
115
  - fluent-plugin-arrow.gemspec
116
116
  - lib/fluent/plugin/arrow/field_wrapper.rb
117
+ - lib/fluent/plugin/buf_arrow_file.rb
117
118
  - lib/fluent/plugin/buf_arrow_memory.rb
119
+ - lib/fluent/plugin/buffer/arrow_buffer_string_builder.rb
120
+ - lib/fluent/plugin/buffer/arrow_file_chunk.rb
118
121
  - lib/fluent/plugin/buffer/arrow_memory_chunk.rb
119
122
  - lib/fluent/plugin/formatter_arrow.rb
120
123
  - test/helper.rb