fluent-plugin-arrow 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a1ed6b1b40e55501097a3a0455b2f2be93b3cf55d7e8cb831d4dc742f4007b97
4
- data.tar.gz: fdcaed4462a919076c29f5947c3c81744b887e0099d2e5ad7a317e6845391da1
3
+ metadata.gz: 9325c65f679d81a53d202a11a842a6c3a82b1cc3d600203064fc1329eeb3ded2
4
+ data.tar.gz: 134a1bb09054b3feeed480efb0d7df558ad8a8131dbe3951d73395511ed2e843
5
5
  SHA512:
6
- metadata.gz: 5f219215b1f8b8a57b2f9d79a6d1fdec39e10652c6d674a8d1390f7f3ce75d8752c15e89b268e36fb781bed44cfed2e128fd840b0978eb508c75dec1fc2b82a1
7
- data.tar.gz: 2d473fbead6b6c12e361b045e7de325652b36c22d94685be289bc8a076c03f4dcb8b2c5270d922f76c0f462475d6846c96176abf4085abf24affaad0bc4b04fd
6
+ metadata.gz: c72b02f95969d7dd8cdbfe525b579afc23a16f8fae1fdd72ac095479f94514c890287f7b3c41822888965a4f9554b33b5ed49e23d92401cefbfee3f8f8feb9b0
7
+ data.tar.gz: 18eaeb2135ecc0339e8986db79545a772aeeef0a2499119bfd3b2194beb125c5be518d6bb86dd0b5caaae1c1d842f98926557219c8fd6586fa04e0ac51b34e2c
@@ -3,7 +3,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
3
 
4
4
  Gem::Specification.new do |spec|
5
5
  spec.name = "fluent-plugin-arrow"
6
- spec.version = "0.0.2"
6
+ spec.version = "0.0.3"
7
7
  spec.authors = ["joker1007"]
8
8
  spec.email = ["kakyoin.hierophant@gmail.com"]
9
9
 
@@ -0,0 +1,62 @@
1
+
2
+ #
3
+ # Copyright 2018- joker1007
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ require "arrow"
18
+ require 'fluent/plugin/buf_file'
19
+ require 'fluent/plugin/buffer/arrow_file_chunk'
20
+ require 'fluent/plugin/arrow/field_wrapper'
21
+
22
module Fluent
  module Plugin
    # File buffer plugin, registered under the name 'arrow_file', that hands out
    # ArrowFileChunk instances wired to an Arrow schema built from the
    # +schema+ parameter.
    class ArrowFileBuffer < Fluent::Plugin::FileBuffer
      Plugin.register_buffer('arrow_file', self)

      # Array of field definitions; each element is passed to FieldWrapper.build.
      config_param :schema, :array
      # Forwarded to every generated chunk as its +format:+ option.
      config_param :arrow_format, :enum, list: [:arrow, :parquet], default: :arrow
      # Forwarded to every generated chunk as its +chunk_size:+ option.
      config_param :row_group_chunk_size, :integer, default: 1024

      attr_reader :arrow_schema

      # Builds one field wrapper per schema entry (keyed by the entry's "name")
      # and derives the Arrow schema from those wrappers.
      #
      # Example of the expected +schema+ value:
      #   [{"name" => foo1, "type" => "uint64"},
      #    {"name" => foo2, "type" => "struct", "fields" => [{"name" => bar1, "type" => "string"}]}]
      def configure(conf)
        super

        wrapper_pairs = @schema.map do |field|
          [field["name"], Fluent::Plugin::Arrow::FieldWrapper.build(field)]
        end
        @field_wrappers = wrapper_pairs.to_h

        arrow_fields = @field_wrappers.each_value.map(&:arrow_field)
        @arrow_schema = ::Arrow::Schema.new(arrow_fields)
      end

      # Always reports an empty pair (empty hash, empty array); nothing is
      # read back from disk here.
      def resume
        [{}, []]
      end

      # Creates a new ArrowFileChunk for +metadata+ and returns it.
      def generate_chunk(metadata)
        # FileChunk generates real path with unique_id
        chunk_options = { chunk_size: @row_group_chunk_size, format: @arrow_format }
        # Only pass perm: when configured, so the chunk's own default applies otherwise.
        chunk_options[:perm] = @file_permission if @file_permission

        chunk = Fluent::Plugin::Buffer::ArrowFileChunk.new(
          metadata, @path, :create, @arrow_schema, @field_wrappers, **chunk_options
        )

        log.debug "Created new chunk", chunk_id: dump_unique_id_hex(chunk.unique_id), metadata: metadata

        chunk
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
module Fluent
  module Plugin
    class Buffer
      # Mixin that serializes a chunk's records into an Arrow or Parquet byte
      # string.
      #
      # The including class must provide #each_record and #bytesize, and set
      # the instance variables @schema, @field_wrappers, @format and
      # @chunk_size, all of which are read here.
      module ArrowBufferStringBuilder

        private

        # Yields every record of the chunk to the given block. Including
        # classes override this; the default only signals the missing override.
        #
        # @raise [NotImplementedError] always
        def each_record(&block)
          raise NotImplementedError
        end

        # Feeds all records into the field wrappers, then writes them out in
        # the configured format.
        #
        # @return [String] the contents of the Arrow buffer after writing
        def build_arrow_buffer_string
          count = 0
          each_record do |record|
            count += 1
            append_record_values(record)
          end

          # Initial capacity is 1.2x the chunk's bytesize (presumably headroom
          # for format overhead; the buffer is resizable either way).
          arrow_buf = ::Arrow::ResizableBuffer.new(bytesize * 1.2)

          ::Arrow::BufferOutputStream.open(arrow_buf) do |output|
            if @format == :parquet
              write_parquet_table(output)
            else
              write_arrow_record_batch(output, count)
            end
          end

          arrow_buf.data.to_s
        end

        # Appends each value of +record+ to the wrapper registered for its key.
        # Keys without a wrapper have no column in the schema and are skipped;
        # previously they crashed with NoMethodError on nil.
        #
        # NOTE(review): a record that lacks a schema field still appends
        # nothing for that column; confirm whether wrappers accept nil before
        # padding missing fields here.
        def append_record_values(record)
          record.each do |name, value|
            wrapper = @field_wrappers[name]
            next unless wrapper

            wrapper.append(value)
          end
        end

        # Writes the accumulated columns as a single table through the Parquet
        # writer, passing @chunk_size as the second argument of write_table.
        def write_parquet_table(output)
          Parquet::ArrowFileWriter.open(@schema, output) do |writer|
            columns = @schema.fields.map do |field|
              ::Arrow::Column.new(field, @field_wrappers[field.name].finish)
            end
            writer.write_table(::Arrow::Table.new(@schema, columns), @chunk_size)
          end
        end

        # Writes the accumulated columns as one record batch of +count+ rows
        # through the Arrow file writer.
        def write_arrow_record_batch(output, count)
          ::Arrow::RecordBatchFileWriter.open(output, @schema) do |writer|
            arrays = @field_wrappers.values.map(&:finish)
            writer.write_record_batch(::Arrow::RecordBatch.new(@schema, count, arrays))
          end
        end
      end
    end
  end
end
@@ -0,0 +1,63 @@
1
+ #
2
+ # Copyright 2018- joker1007
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ require 'arrow'
17
+ require 'parquet'
18
+ require 'fluent/msgpack_factory'
19
+ require 'fluent/plugin/buffer/chunk'
20
+ require 'fluent/plugin/buffer/file_chunk'
21
+ require 'fluent/plugin/buffer/arrow_buffer_string_builder'
22
+ require 'fluent/plugin/arrow/field_wrapper'
23
+
24
module Fluent
  module Plugin
    class Buffer
      # File chunk whose read-side methods return the chunk's records
      # serialized by ArrowBufferStringBuilder rather than the raw file bytes.
      class ArrowFileChunk < FileChunk
        include ArrowBufferStringBuilder

        # Stores the schema, field wrappers, chunk size and format that the
        # mixed-in builder reads; the superclass is always set up with
        # +compress: :text+.
        def initialize(metadata, path, mode, schema, field_wrappers, perm: system_config.file_permission || FILE_PERMISSION, chunk_size: 1024, format: :arrow)
          super(metadata, path, mode, perm: perm, compress: :text)
          @schema, @field_wrappers = schema, field_wrappers
          @chunk_size, @format = chunk_size, format
        end

        # Returns the serialized chunk contents as a string.
        def read(**kwargs)
          rewind_chunk
          build_arrow_buffer_string
        end

        # Opens a StringIO over the serialized contents, passes it to the
        # block, and returns what StringIO.open returns. While the chunk is
        # staged, the file position is moved back to the end afterwards.
        def open(**kwargs, &block)
          rewind_chunk
          result = StringIO.open(build_arrow_buffer_string, &block)
          @chunk.seek(0, IO::SEEK_END) if staged?
          result
        end

        # Writes the serialized chunk contents to +io+.
        def write_to(io, **kwargs)
          rewind_chunk
          io.write(build_arrow_buffer_string)
        end

        private

        # Moves the underlying file to its start so every record is read.
        def rewind_chunk
          @chunk.seek(0, IO::SEEK_SET)
        end

        # Unpacks the underlying file with MessagePack and yields each record.
        def each_record(&block)
          unpacker = Fluent::MessagePackFactory.engine_factory.unpacker(@chunk)
          unpacker.each(&block)
        end
      end
    end
  end
end
@@ -18,19 +18,21 @@ require 'parquet'
18
18
  require 'fluent/msgpack_factory'
19
19
  require 'fluent/plugin/buffer/chunk'
20
20
  require 'fluent/plugin/buffer/memory_chunk'
21
+ require 'fluent/plugin/buffer/arrow_buffer_string_builder'
21
22
  require 'fluent/plugin/arrow/field_wrapper'
22
23
 
23
24
  module Fluent
24
25
  module Plugin
25
26
  class Buffer
26
27
  class ArrowMemoryChunk < MemoryChunk
28
+ include ArrowBufferStringBuilder
29
+
27
30
  def initialize(metadata, schema, field_wrappers, chunk_size: 1024, format: :arrow)
28
31
  super(metadata, compress: :text)
29
32
  @schema = schema
30
33
  @field_wrappers = field_wrappers
31
34
  @chunk_size = chunk_size
32
35
  @format = format
33
- @unpacker = Fluent::MessagePackFactory.engine_factory.unpacker
34
36
  end
35
37
 
36
38
  def read(**kwargs)
@@ -48,34 +50,8 @@ module Fluent
48
50
 
49
51
  private
50
52
 
51
- def build_arrow_buffer_string
52
- count = 0
53
- @unpacker.feed_each(@chunk) do |record|
54
- count += 1
55
- record.each do |k, v|
56
- @field_wrappers[k].append(v)
57
- end
58
- end
59
- arrow_buf = ::Arrow::ResizableBuffer.new(@chunk_bytes * 1.2)
60
-
61
- ::Arrow::BufferOutputStream.open(arrow_buf) do |output|
62
- if @format == :parquet
63
- Parquet::ArrowFileWriter.open(@schema, output) do |writer|
64
- columns = @schema.fields.map do |f|
65
- ::Arrow::Column.new(f, @field_wrappers[f.name].finish)
66
- end
67
- table = ::Arrow::Table.new(@schema, columns)
68
- writer.write_table(table, @chunk_size)
69
- end
70
- else
71
- ::Arrow::RecordBatchFileWriter.open(output, @schema) do |writer|
72
- record_batch = ::Arrow::RecordBatch.new(@schema, count, @field_wrappers.values.map(&:finish))
73
- writer.write_record_batch(record_batch)
74
- end
75
- end
76
- end
77
-
78
- arrow_buf.data.to_s
53
+ def each_record(&block)
54
+ Fluent::MessagePackFactory.engine_factory.unpacker.feed_each(@chunk, &block)
79
55
  end
80
56
  end
81
57
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fluent-plugin-arrow
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - joker1007
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-12-01 00:00:00.000000000 Z
11
+ date: 2018-12-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -114,7 +114,10 @@ files:
114
114
  - Rakefile
115
115
  - fluent-plugin-arrow.gemspec
116
116
  - lib/fluent/plugin/arrow/field_wrapper.rb
117
+ - lib/fluent/plugin/buf_arrow_file.rb
117
118
  - lib/fluent/plugin/buf_arrow_memory.rb
119
+ - lib/fluent/plugin/buffer/arrow_buffer_string_builder.rb
120
+ - lib/fluent/plugin/buffer/arrow_file_chunk.rb
118
121
  - lib/fluent/plugin/buffer/arrow_memory_chunk.rb
119
122
  - lib/fluent/plugin/formatter_arrow.rb
120
123
  - test/helper.rb