fluent-plugin-arrow 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +32 -3
- data/fluent-plugin-arrow.gemspec +1 -1
- data/lib/fluent/plugin/arrow/field_wrapper.rb +259 -0
- data/lib/fluent/plugin/buf_arrow_memory.rb +5 -26
- data/lib/fluent/plugin/buffer/arrow_memory_chunk.rb +11 -32
- data/test/plugin/test_buffer_arrow_memory_chunk.rb +20 -18
- data/test/plugin/test_field_wrapper.rb +145 -0
- metadata +5 -4
- data/test/plugin/test_formatter_arrow.rb +0 -14
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a1ed6b1b40e55501097a3a0455b2f2be93b3cf55d7e8cb831d4dc742f4007b97
|
4
|
+
data.tar.gz: fdcaed4462a919076c29f5947c3c81744b887e0099d2e5ad7a317e6845391da1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5f219215b1f8b8a57b2f9d79a6d1fdec39e10652c6d674a8d1390f7f3ce75d8752c15e89b268e36fb781bed44cfed2e128fd840b0978eb508c75dec1fc2b82a1
|
7
|
+
data.tar.gz: 2d473fbead6b6c12e361b045e7de325652b36c22d94685be289bc8a076c03f4dcb8b2c5270d922f76c0f462475d6846c96176abf4085abf24affaad0bc4b04fd
|
data/README.md
CHANGED
@@ -1,8 +1,13 @@
|
|
1
1
|
# fluent-plugin-arrow
|
2
2
|
|
3
|
-
[Fluentd](https://fluentd.org/)
|
3
|
+
[Fluentd](https://fluentd.org/) buffer plugin to output Apache Arrow and Parquet format.
|
4
4
|
|
5
|
-
|
5
|
+
## Prerequisite
|
6
|
+
|
7
|
+
- [Apache Arrow c++](https://github.com/apache/arrow/tree/master/cpp) (with -DARROW_PARQUET=ON)
|
8
|
+
- [Apache Arrow c_glib](https://github.com/apache/arrow/tree/master/c_glib)
|
9
|
+
- [red-arrow](https://github.com/apache/arrow/tree/master/ruby/red-arrow)
|
10
|
+
- [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet)
|
6
11
|
|
7
12
|
## Installation
|
8
13
|
|
@@ -31,7 +36,31 @@ $ bundle
|
|
31
36
|
You can generate configuration template:
|
32
37
|
|
33
38
|
```
|
34
|
-
|
39
|
+
<match arrow>
|
40
|
+
@type file
|
41
|
+
|
42
|
+
path arrow_test
|
43
|
+
|
44
|
+
<buffer>
|
45
|
+
@type arrow_memory
|
46
|
+
arrow_format arrow # or parquet
|
47
|
+
|
48
|
+
schema [
|
49
|
+
{"name": "key1", "type": "string"},
|
50
|
+
{"name": "key2", "type": "uint64"},
|
51
|
+
{"name": "key3", "type": "timestamp", "unit": "milli"},
|
52
|
+
{"name": "key4", "type": "list", "value_type": {"name": "value", "type": "uint64"}},
|
53
|
+
{"name": "key5", "type": "struct", "fields": [
|
54
|
+
{"name": "bar1", "type": "uint64"},
|
55
|
+
{"name": "bar2", "type": "list", "value_type": {"name": "value", "type": "string"}}
|
56
|
+
]}
|
57
|
+
]
|
58
|
+
</buffer>
|
59
|
+
|
60
|
+
<format>
|
61
|
+
@type arrow
|
62
|
+
</format>
|
63
|
+
</match>
|
35
64
|
```
|
36
65
|
|
37
66
|
You can copy and paste generated documents here.
|
data/fluent-plugin-arrow.gemspec
CHANGED
data/lib/fluent/plugin/arrow/field_wrapper.rb
ADDED
@@ -0,0 +1,259 @@
|
|
1
|
+
require "arrow"
|
2
|
+
|
3
|
+
module Fluent
|
4
|
+
module Plugin
|
5
|
+
module Arrow
|
6
|
+
class FieldWrapper
|
7
|
+
class << self
|
8
|
+
def build(field)
|
9
|
+
case field["type"]
|
10
|
+
when "string"
|
11
|
+
StringFieldWrapper.new(field)
|
12
|
+
when "int", "int8", "int16", "int32", "int64", "uint", "uint8", "uint16", "uint32", "uint64"
|
13
|
+
IntegerFieldWrapper.new(field)
|
14
|
+
when "float", "double"
|
15
|
+
FloatFieldWrapper.new(field)
|
16
|
+
when "boolean"
|
17
|
+
BooleanFieldWrapper.new(field)
|
18
|
+
when "date32"
|
19
|
+
Date32FieldWrapper.new(field)
|
20
|
+
when "date64"
|
21
|
+
Date64FieldWrapper.new(field)
|
22
|
+
when "timestamp"
|
23
|
+
TimestampFieldWrapper.new(field)
|
24
|
+
when "list"
|
25
|
+
ListFieldWrapper.new(field)
|
26
|
+
when "struct"
|
27
|
+
StructFieldWrapper.new(field)
|
28
|
+
else
|
29
|
+
raise "Unsupported data type"
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
attr_reader :field, :name, :type, :children, :arrow_field, :array_builder
|
35
|
+
|
36
|
+
def initialize(field)
|
37
|
+
@field = field
|
38
|
+
@name = field["name"]
|
39
|
+
@type = field["type"]
|
40
|
+
@children = []
|
41
|
+
|
42
|
+
field["value_type"]&.tap do |f|
|
43
|
+
@children << self.class.build(f)
|
44
|
+
end
|
45
|
+
|
46
|
+
field["fields"]&.each do |f|
|
47
|
+
@children << self.class.build(f)
|
48
|
+
end
|
49
|
+
|
50
|
+
create_arrow_field
|
51
|
+
create_array_builder
|
52
|
+
end
|
53
|
+
|
54
|
+
def append(value)
|
55
|
+
if value.nil?
|
56
|
+
@array_builder.append_null
|
57
|
+
else
|
58
|
+
@array_builder.append(cast_value(value))
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
def finish
|
63
|
+
@array_builder.finish
|
64
|
+
end
|
65
|
+
|
66
|
+
def create_arrow_field
|
67
|
+
@arrow_field = ::Arrow::Field.new(name, create_arrow_data_type)
|
68
|
+
end
|
69
|
+
|
70
|
+
def create_arrow_data_type
|
71
|
+
data_type_name = type.to_s.capitalize.gsub(/\AUint/, "UInt")
|
72
|
+
data_type_class_name = "#{data_type_name}DataType"
|
73
|
+
data_type_class = ::Arrow.const_get(data_type_class_name)
|
74
|
+
data_type_class.new
|
75
|
+
end
|
76
|
+
|
77
|
+
def create_array_builder(from_parent = nil)
|
78
|
+
if from_parent
|
79
|
+
@array_builder = from_parent
|
80
|
+
else
|
81
|
+
data_type_str = arrow_field.data_type.to_s
|
82
|
+
data_type_name = data_type_str.capitalize.gsub(/\AUint/, "UInt")
|
83
|
+
array_builder_class_name = "#{data_type_name}ArrayBuilder"
|
84
|
+
array_builder_class = ::Arrow.const_get(array_builder_class_name)
|
85
|
+
@array_builder = array_builder_class.new
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
def cast_value(value)
|
90
|
+
raise NotImplementedError
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
class StringFieldWrapper < FieldWrapper
|
95
|
+
def cast_value(value)
|
96
|
+
value.to_s
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
class IntegerFieldWrapper < FieldWrapper
|
101
|
+
def cast_value(value)
|
102
|
+
value.to_i
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
class FloatFieldWrapper < FieldWrapper
|
107
|
+
def cast_value(value)
|
108
|
+
value.to_f
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
class BooleanFieldWrapper < FieldWrapper
|
113
|
+
def cast_value(value)
|
114
|
+
!!value
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
118
|
+
require "date"
|
119
|
+
class Date32FieldWrapper < FieldWrapper
|
120
|
+
UNIX_EPOCH = Date.new(1970, 1, 1)
|
121
|
+
def cast_value(value)
|
122
|
+
date =
|
123
|
+
if value.respond_to?(:to_date)
|
124
|
+
value.to_date
|
125
|
+
else
|
126
|
+
Date.parse(value)
|
127
|
+
end
|
128
|
+
|
129
|
+
(date - UNIX_EPOCH).to_i
|
130
|
+
end
|
131
|
+
|
132
|
+
def create_array_builder(from_parent = nil)
|
133
|
+
if from_parent
|
134
|
+
@array_builder = from_parent
|
135
|
+
else
|
136
|
+
@array_builder = ::Arrow::Date32ArrayBuilder.new
|
137
|
+
end
|
138
|
+
end
|
139
|
+
end
|
140
|
+
|
141
|
+
class Date64FieldWrapper < FieldWrapper
|
142
|
+
UNIX_EPOCH = Date.new(1970, 1, 1)
|
143
|
+
def cast_value(value)
|
144
|
+
time =
|
145
|
+
if value.respond_to?(:to_time)
|
146
|
+
value.to_time
|
147
|
+
else
|
148
|
+
Time.parse(value)
|
149
|
+
end
|
150
|
+
|
151
|
+
time.to_i * 1_000 + time.usec / 1_000
|
152
|
+
end
|
153
|
+
|
154
|
+
def create_array_builder(from_parent = nil)
|
155
|
+
if from_parent
|
156
|
+
@array_builder = from_parent
|
157
|
+
else
|
158
|
+
@array_builder = ::Arrow::Date64ArrayBuilder.new
|
159
|
+
end
|
160
|
+
end
|
161
|
+
end
|
162
|
+
|
163
|
+
require "time"
|
164
|
+
class TimestampFieldWrapper < FieldWrapper
|
165
|
+
def cast_value(value)
|
166
|
+
value =
|
167
|
+
if value.is_a?(Fluent::EventTime)
|
168
|
+
Time.at(value, value.usec)
|
169
|
+
elsif value.respond_to?(:to_time)
|
170
|
+
value.to_time
|
171
|
+
elsif value.is_a?(String)
|
172
|
+
Time.parse(value)
|
173
|
+
else
|
174
|
+
value
|
175
|
+
end
|
176
|
+
|
177
|
+
return value if value.is_a?(Numeric)
|
178
|
+
|
179
|
+
case field["unit"]
|
180
|
+
when "second"
|
181
|
+
value.to_i
|
182
|
+
when "milli"
|
183
|
+
value.to_i * 1_000 + value.usec / 1_000
|
184
|
+
when "micro"
|
185
|
+
value.to_i * 1_000_000 + value.usec
|
186
|
+
else
|
187
|
+
value.to_i * 1_000_000_000 + value.nsec
|
188
|
+
end
|
189
|
+
end
|
190
|
+
|
191
|
+
def create_arrow_data_type
|
192
|
+
::Arrow::TimestampDataType.new(field["unit"].to_sym)
|
193
|
+
end
|
194
|
+
|
195
|
+
def create_array_builder(from_parent = nil)
|
196
|
+
if from_parent
|
197
|
+
@array_builder = from_parent
|
198
|
+
else
|
199
|
+
@array_builder = ::Arrow::TimestampArrayBuilder.new(arrow_field.data_type)
|
200
|
+
end
|
201
|
+
end
|
202
|
+
end
|
203
|
+
|
204
|
+
class ListFieldWrapper < FieldWrapper
|
205
|
+
def append(value)
|
206
|
+
if value.nil?
|
207
|
+
@array_builder.append_null
|
208
|
+
else
|
209
|
+
@array_builder.append
|
210
|
+
value.each do |v|
|
211
|
+
@children[0].append(v)
|
212
|
+
end
|
213
|
+
end
|
214
|
+
end
|
215
|
+
|
216
|
+
def create_arrow_data_type
|
217
|
+
::Arrow::ListDataType.new(children[0].arrow_field)
|
218
|
+
end
|
219
|
+
|
220
|
+
def create_array_builder(from_parent = nil)
|
221
|
+
if from_parent
|
222
|
+
@array_builder = from_parent
|
223
|
+
else
|
224
|
+
@array_builder = ::Arrow::ListArrayBuilder.new(arrow_field.data_type)
|
225
|
+
end
|
226
|
+
|
227
|
+
@children.each { |c| c.create_array_builder(@array_builder.value_builder) }
|
228
|
+
end
|
229
|
+
end
|
230
|
+
|
231
|
+
class StructFieldWrapper < FieldWrapper
|
232
|
+
def append(value)
|
233
|
+
if value.nil?
|
234
|
+
@array_builder.append_null
|
235
|
+
else
|
236
|
+
@array_builder.append
|
237
|
+
value.each do |k, v|
|
238
|
+
@children.find { |c| c.name == k }.append(v)
|
239
|
+
end
|
240
|
+
end
|
241
|
+
end
|
242
|
+
|
243
|
+
def create_arrow_data_type
|
244
|
+
::Arrow::StructDataType.new(children.map(&:arrow_field))
|
245
|
+
end
|
246
|
+
|
247
|
+
def create_array_builder(from_parent = nil)
|
248
|
+
if from_parent
|
249
|
+
@array_builder = from_parent
|
250
|
+
else
|
251
|
+
@array_builder = ::Arrow::StructArrayBuilder.new(arrow_field.data_type)
|
252
|
+
end
|
253
|
+
|
254
|
+
@children.each_with_index { |c, i| c.create_array_builder(@array_builder.get_field_builder(i)) }
|
255
|
+
end
|
256
|
+
end
|
257
|
+
end
|
258
|
+
end
|
259
|
+
end
|
data/lib/fluent/plugin/buf_arrow_memory.rb
CHANGED
@@ -16,6 +16,7 @@
|
|
16
16
|
require "arrow"
|
17
17
|
require 'fluent/plugin/buffer'
|
18
18
|
require 'fluent/plugin/buffer/arrow_memory_chunk'
|
19
|
+
require 'fluent/plugin/arrow/field_wrapper'
|
19
20
|
|
20
21
|
module Fluent
|
21
22
|
module Plugin
|
@@ -32,11 +33,11 @@ module Fluent
|
|
32
33
|
super
|
33
34
|
|
34
35
|
# [{"name" => foo1, "type" => "uint64"}, {"name" => foo2, "type" => "struct", "fields" => [{"name" => bar1, "type" => "string"}]}
|
35
|
-
|
36
|
-
|
36
|
+
@field_wrappers = @schema.each_with_object({}) do |field, h|
|
37
|
+
h[field["name"]] = Fluent::Plugin::Arrow::FieldWrapper.build(field)
|
37
38
|
end
|
38
39
|
|
39
|
-
@arrow_schema = Arrow::Schema.new(
|
40
|
+
@arrow_schema = ::Arrow::Schema.new(@field_wrappers.values.map(&:arrow_field))
|
40
41
|
end
|
41
42
|
|
42
43
|
def resume
|
@@ -44,29 +45,7 @@ module Fluent
|
|
44
45
|
end
|
45
46
|
|
46
47
|
def generate_chunk(metadata)
|
47
|
-
Fluent::Plugin::Buffer::ArrowMemoryChunk.new(metadata, @arrow_schema, chunk_size: @row_group_chunk_size, format: @arrow_format)
|
48
|
-
end
|
49
|
-
|
50
|
-
private
|
51
|
-
|
52
|
-
def create_arrow_field(field)
|
53
|
-
Arrow::Field.new(field["name"], create_arrow_data_type(field))
|
54
|
-
end
|
55
|
-
|
56
|
-
def create_arrow_data_type(field)
|
57
|
-
case field["type"]
|
58
|
-
when "struct"
|
59
|
-
Arrow::StructDataType.new(field["fields"].map { |f| create_arrow_field(f) })
|
60
|
-
when "list"
|
61
|
-
Arrow::ListDataType.new(create_arrow_field(field["value_type"]))
|
62
|
-
when "timestamp"
|
63
|
-
Arrow::TimestampDataType.new(field["unit"].to_sym)
|
64
|
-
else
|
65
|
-
data_type_name = field["type"].to_s.capitalize.gsub(/\AUint/, "UInt")
|
66
|
-
data_type_class_name = "#{data_type_name}DataType"
|
67
|
-
data_type_class = Arrow.const_get(data_type_class_name)
|
68
|
-
data_type_class.new
|
69
|
-
end
|
48
|
+
Fluent::Plugin::Buffer::ArrowMemoryChunk.new(metadata, @arrow_schema, @field_wrappers, chunk_size: @row_group_chunk_size, format: @arrow_format)
|
70
49
|
end
|
71
50
|
end
|
72
51
|
end
|
data/lib/fluent/plugin/buffer/arrow_memory_chunk.rb
CHANGED
@@ -15,22 +15,21 @@
|
|
15
15
|
|
16
16
|
require 'arrow'
|
17
17
|
require 'parquet'
|
18
|
+
require 'fluent/msgpack_factory'
|
18
19
|
require 'fluent/plugin/buffer/chunk'
|
19
20
|
require 'fluent/plugin/buffer/memory_chunk'
|
21
|
+
require 'fluent/plugin/arrow/field_wrapper'
|
20
22
|
|
21
23
|
module Fluent
|
22
24
|
module Plugin
|
23
25
|
class Buffer
|
24
26
|
class ArrowMemoryChunk < MemoryChunk
|
25
|
-
def initialize(metadata, schema, chunk_size: 1024, format: :arrow)
|
27
|
+
def initialize(metadata, schema, field_wrappers, chunk_size: 1024, format: :arrow)
|
26
28
|
super(metadata, compress: :text)
|
27
29
|
@schema = schema
|
30
|
+
@field_wrappers = field_wrappers
|
28
31
|
@chunk_size = chunk_size
|
29
32
|
@format = format
|
30
|
-
@array_builders = {}
|
31
|
-
@schema.fields.each do |f|
|
32
|
-
@array_builders[f.name] = field_to_array_builder(f)
|
33
|
-
end
|
34
33
|
@unpacker = Fluent::MessagePackFactory.engine_factory.unpacker
|
35
34
|
end
|
36
35
|
|
@@ -49,48 +48,28 @@ module Fluent
|
|
49
48
|
|
50
49
|
private
|
51
50
|
|
52
|
-
def field_to_array_builder(f)
|
53
|
-
data_type_str = f.data_type.to_s
|
54
|
-
if data_type_str =~ /timestamp/
|
55
|
-
return Arrow::TimestampArrayBuilder.new(f.data_type)
|
56
|
-
end
|
57
|
-
|
58
|
-
data_type_name = data_type_str.capitalize.gsub(/\AUint/, "UInt")
|
59
|
-
array_builder_class_name = "#{data_type_name}ArrayBuilder"
|
60
|
-
array_builder_class = Arrow.const_get(array_builder_class_name)
|
61
|
-
if array_builder_class.method(:new).arity > 0
|
62
|
-
array_builder_class.new(f.data_type)
|
63
|
-
else
|
64
|
-
array_builder_class.new
|
65
|
-
end
|
66
|
-
end
|
67
|
-
|
68
51
|
def build_arrow_buffer_string
|
69
52
|
count = 0
|
70
53
|
@unpacker.feed_each(@chunk) do |record|
|
71
54
|
count += 1
|
72
55
|
record.each do |k, v|
|
73
|
-
|
74
|
-
@array_builders[k].append_null
|
75
|
-
else
|
76
|
-
@array_builders[k].append(v)
|
77
|
-
end
|
56
|
+
@field_wrappers[k].append(v)
|
78
57
|
end
|
79
58
|
end
|
80
|
-
arrow_buf = Arrow::ResizableBuffer.new(@chunk_bytes * 1.2)
|
59
|
+
arrow_buf = ::Arrow::ResizableBuffer.new(@chunk_bytes * 1.2)
|
81
60
|
|
82
|
-
Arrow::BufferOutputStream.open(arrow_buf) do |output|
|
61
|
+
::Arrow::BufferOutputStream.open(arrow_buf) do |output|
|
83
62
|
if @format == :parquet
|
84
63
|
Parquet::ArrowFileWriter.open(@schema, output) do |writer|
|
85
64
|
columns = @schema.fields.map do |f|
|
86
|
-
Arrow::Column.new(f, @array_builders[f.name].finish)
|
65
|
+
::Arrow::Column.new(f, @field_wrappers[f.name].finish)
|
87
66
|
end
|
88
|
-
table = Arrow::Table.new(@schema, columns)
|
67
|
+
table = ::Arrow::Table.new(@schema, columns)
|
89
68
|
writer.write_table(table, @chunk_size)
|
90
69
|
end
|
91
70
|
else
|
92
|
-
Arrow::RecordBatchFileWriter.open(output, @schema) do |writer|
|
93
|
-
record_batch = Arrow::RecordBatch.new(@schema, count, @array_builders.values.map(&:finish))
|
71
|
+
::Arrow::RecordBatchFileWriter.open(output, @schema) do |writer|
|
72
|
+
record_batch = ::Arrow::RecordBatch.new(@schema, count, @field_wrappers.values.map(&:finish))
|
94
73
|
writer.write_record_batch(record_batch)
|
95
74
|
end
|
96
75
|
end
|
data/test/plugin/test_buffer_arrow_memory_chunk.rb
CHANGED
@@ -5,12 +5,17 @@ require "fluent/plugin/buffer/arrow_memory_chunk"
|
|
5
5
|
class ArrowMemoryChunkTest < Test::Unit::TestCase
|
6
6
|
setup do
|
7
7
|
@fields = [
|
8
|
-
Arrow::Field.new("key1", :uint64),
|
9
|
-
Arrow::Field.new("key2", :double),
|
10
|
-
Arrow::Field.new("key3", Arrow::TimestampDataType.new(:second)),
|
8
|
+
::Arrow::Field.new("key1", :uint64),
|
9
|
+
::Arrow::Field.new("key2", :double),
|
10
|
+
::Arrow::Field.new("key3", ::Arrow::TimestampDataType.new(:second)),
|
11
11
|
]
|
12
|
+
field_wrappers = {
|
13
|
+
"key1" => Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "uint64"}),
|
14
|
+
"key2" => Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "double"}),
|
15
|
+
"key3" => Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "timestamp", "unit" => "second"}),
|
16
|
+
}
|
12
17
|
@schema = Arrow::Schema.new(@fields)
|
13
|
-
@c = Fluent::Plugin::Buffer::ArrowMemoryChunk.new(Object.new, @schema)
|
18
|
+
@c = Fluent::Plugin::Buffer::ArrowMemoryChunk.new(Object.new, @schema, field_wrappers)
|
14
19
|
end
|
15
20
|
|
16
21
|
test "can #read" do
|
@@ -18,44 +23,41 @@ class ArrowMemoryChunkTest < Test::Unit::TestCase
|
|
18
23
|
d2 = {"key1" => 124, "key2" => 11.1234, "key3" => Fluent::EventTime.from_time(Time.now)}
|
19
24
|
data = [d1.to_msgpack, d2.to_msgpack]
|
20
25
|
@c.append(data)
|
21
|
-
Arrow::BufferInputStream.open(Arrow::Buffer.new(@c.read)) do |input|
|
22
|
-
reader = Arrow::RecordBatchFileReader.new(input)
|
26
|
+
::Arrow::BufferInputStream.open(::Arrow::Buffer.new(@c.read)) do |input|
|
27
|
+
reader = ::Arrow::RecordBatchFileReader.new(input)
|
23
28
|
|
24
29
|
reader.each do |record_batch|
|
25
30
|
assert { record_batch.n_rows == 2 }
|
26
31
|
|
27
|
-
assert { record_batch.find_column(@fields[0].name).class == Arrow::UInt64Array }
|
32
|
+
assert { record_batch.find_column(@fields[0].name).class == ::Arrow::UInt64Array }
|
28
33
|
assert { record_batch.find_column(@fields[0].name).values == [123, 124] }
|
29
34
|
end
|
30
35
|
end
|
31
36
|
end
|
32
37
|
|
33
38
|
test "can #write_to" do
|
34
|
-
|
35
|
-
|
39
|
+
time = Time.now
|
40
|
+
d1 = {"key1" => 123, "key2" => 10.1234, "key3" => Fluent::EventTime.from_time(time)}
|
41
|
+
d2 = {"key1" => 124, "key2" => 11.1234, "key3" => Fluent::EventTime.from_time(time)}
|
36
42
|
data = [d1.to_msgpack, d2.to_msgpack]
|
37
43
|
@c.append(data)
|
38
44
|
Tempfile.create do |tf|
|
39
45
|
@c.write_to(tf)
|
40
46
|
tf.flush
|
41
47
|
|
42
|
-
Arrow::MemoryMappedInputStream.open(tf.path) do |input|
|
43
|
-
reader = Arrow::RecordBatchFileReader.new(input)
|
48
|
+
::Arrow::MemoryMappedInputStream.open(tf.path) do |input|
|
49
|
+
reader = ::Arrow::RecordBatchFileReader.new(input)
|
44
50
|
reader.each_with_index do |record_batch, i|
|
45
51
|
reader.each do |record_batch|
|
46
52
|
assert { record_batch.n_rows == 2 }
|
47
53
|
|
48
|
-
assert { record_batch.find_column(@fields[0].name).class == Arrow::UInt64Array }
|
54
|
+
assert { record_batch.find_column(@fields[0].name).class == ::Arrow::UInt64Array }
|
49
55
|
assert { record_batch.find_column(@fields[0].name).values == [123, 124] }
|
56
|
+
assert { record_batch.find_column(@fields[1].name).values == [10.1234, 11.1234] }
|
57
|
+
assert { record_batch.find_column(@fields[2].name)[0].to_i == time.to_i }
|
50
58
|
end
|
51
59
|
end
|
52
60
|
end
|
53
61
|
end
|
54
62
|
end
|
55
|
-
|
56
|
-
private
|
57
|
-
|
58
|
-
def create_driver(conf)
|
59
|
-
Fluent::Test::Driver::Formatter.new(Fluent::Plugin::ArrowFormatter).configure(conf)
|
60
|
-
end
|
61
63
|
end
|
data/test/plugin/test_field_wrapper.rb
ADDED
@@ -0,0 +1,145 @@
|
|
1
|
+
require "helper"
|
2
|
+
require "fluent/plugin/arrow/field_wrapper"
|
3
|
+
|
4
|
+
class ArrowFieldWrapperTest < Test::Unit::TestCase
|
5
|
+
test ".build (string)" do
|
6
|
+
field_wrapper = Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "string"})
|
7
|
+
assert_equal "key1", field_wrapper.name
|
8
|
+
assert_equal "string", field_wrapper.type
|
9
|
+
assert_kind_of Arrow::Field, field_wrapper.arrow_field
|
10
|
+
end
|
11
|
+
|
12
|
+
test ".build (timestamp)" do
|
13
|
+
field_wrapper = Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "timestamp", "unit" => "nano"})
|
14
|
+
assert_equal "key1", field_wrapper.name
|
15
|
+
assert_equal "timestamp", field_wrapper.type
|
16
|
+
assert_kind_of Arrow::Field, field_wrapper.arrow_field
|
17
|
+
end
|
18
|
+
|
19
|
+
test ".build (list)" do
|
20
|
+
field_wrapper = Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "list", "value_type" => {"name" => "value", "type" => "string"}})
|
21
|
+
assert_equal "key1", field_wrapper.name
|
22
|
+
assert_equal "list", field_wrapper.type
|
23
|
+
assert_kind_of Arrow::Field, field_wrapper.arrow_field
|
24
|
+
assert_kind_of Arrow::ListDataType, field_wrapper.arrow_field.data_type
|
25
|
+
assert_kind_of Arrow::ListArrayBuilder, field_wrapper.array_builder
|
26
|
+
|
27
|
+
assert_equal "value", field_wrapper.children[0].name
|
28
|
+
assert_equal "string", field_wrapper.children[0].type
|
29
|
+
assert_kind_of Arrow::Field, field_wrapper.children[0].arrow_field
|
30
|
+
assert_kind_of Arrow::StringDataType, field_wrapper.children[0].arrow_field.data_type
|
31
|
+
assert_kind_of Arrow::StringArrayBuilder, field_wrapper.children[0].array_builder
|
32
|
+
end
|
33
|
+
|
34
|
+
test ".build (struct)" do
|
35
|
+
field_wrapper = Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "struct", "fields" => [
|
36
|
+
{"name" => "foo1", "type" => "string"},
|
37
|
+
{"name" => "foo2", "type" => "uint64"},
|
38
|
+
{"name" => "foo3", "type" => "timestamp", "unit" => "milli"},
|
39
|
+
]})
|
40
|
+
assert_equal "key1", field_wrapper.name
|
41
|
+
assert_equal "struct", field_wrapper.type
|
42
|
+
assert_kind_of Arrow::Field, field_wrapper.arrow_field
|
43
|
+
assert_kind_of Arrow::StructDataType, field_wrapper.arrow_field.data_type
|
44
|
+
assert_kind_of Arrow::StructArrayBuilder, field_wrapper.array_builder
|
45
|
+
|
46
|
+
assert_equal "foo1", field_wrapper.children[0].name
|
47
|
+
assert_equal "string", field_wrapper.children[0].type
|
48
|
+
assert_kind_of Arrow::Field, field_wrapper.children[0].arrow_field
|
49
|
+
assert_kind_of Arrow::StringDataType, field_wrapper.children[0].arrow_field.data_type
|
50
|
+
assert_kind_of Arrow::StringArrayBuilder, field_wrapper.children[0].array_builder
|
51
|
+
|
52
|
+
assert_equal "foo2", field_wrapper.children[1].name
|
53
|
+
assert_equal "uint64", field_wrapper.children[1].type
|
54
|
+
assert_kind_of Arrow::Field, field_wrapper.children[1].arrow_field
|
55
|
+
assert_kind_of Arrow::UInt64DataType, field_wrapper.children[1].arrow_field.data_type
|
56
|
+
assert_kind_of Arrow::UInt64ArrayBuilder, field_wrapper.children[1].array_builder
|
57
|
+
|
58
|
+
assert_equal "foo3", field_wrapper.children[2].name
|
59
|
+
assert_equal "timestamp", field_wrapper.children[2].type
|
60
|
+
assert_kind_of Arrow::Field, field_wrapper.children[2].arrow_field
|
61
|
+
assert_kind_of Arrow::TimestampDataType, field_wrapper.children[2].arrow_field.data_type
|
62
|
+
assert_kind_of Arrow::TimestampArrayBuilder, field_wrapper.children[2].array_builder
|
63
|
+
end
|
64
|
+
|
65
|
+
test ".build (nested)" do
|
66
|
+
field_wrapper = Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "struct", "fields" => [
|
67
|
+
{"name" => "foo1", "type" => "string"},
|
68
|
+
{"name" => "foo2", "type" => "list", "value_type" => {"name" => "value", "type" => "uint64"}},
|
69
|
+
]})
|
70
|
+
assert_equal "key1", field_wrapper.name
|
71
|
+
assert_equal "struct", field_wrapper.type
|
72
|
+
assert_kind_of Arrow::Field, field_wrapper.arrow_field
|
73
|
+
assert_kind_of Arrow::StructDataType, field_wrapper.arrow_field.data_type
|
74
|
+
assert_kind_of Arrow::StructArrayBuilder, field_wrapper.array_builder
|
75
|
+
|
76
|
+
assert_equal "foo1", field_wrapper.children[0].name
|
77
|
+
assert_equal "string", field_wrapper.children[0].type
|
78
|
+
assert_kind_of Arrow::Field, field_wrapper.children[0].arrow_field
|
79
|
+
assert_kind_of Arrow::StringDataType, field_wrapper.children[0].arrow_field.data_type
|
80
|
+
assert_kind_of Arrow::StringArrayBuilder, field_wrapper.children[0].array_builder
|
81
|
+
|
82
|
+
assert_equal "foo2", field_wrapper.children[1].name
|
83
|
+
assert_equal "list", field_wrapper.children[1].type
|
84
|
+
assert_kind_of Arrow::Field, field_wrapper.children[1].arrow_field
|
85
|
+
assert_kind_of Arrow::ListDataType, field_wrapper.children[1].arrow_field.data_type
|
86
|
+
assert_kind_of Arrow::ListArrayBuilder, field_wrapper.children[1].array_builder
|
87
|
+
|
88
|
+
assert_equal "value", field_wrapper.children[1].children[0].name
|
89
|
+
assert_equal "uint64", field_wrapper.children[1].children[0].type
|
90
|
+
assert_kind_of Arrow::Field, field_wrapper.children[1].children[0].arrow_field
|
91
|
+
assert_kind_of Arrow::UInt64DataType, field_wrapper.children[1].children[0].arrow_field.data_type
|
92
|
+
assert_kind_of Arrow::UInt64ArrayBuilder, field_wrapper.children[1].children[0].array_builder
|
93
|
+
end
|
94
|
+
|
95
|
+
test "#append (timestamp)" do
|
96
|
+
field_wrapper = Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "timestamp", "unit" => "nano"})
|
97
|
+
time = Time.now
|
98
|
+
field_wrapper.append(time)
|
99
|
+
timestamp_array = field_wrapper.finish
|
100
|
+
assert_kind_of Time, timestamp_array[0]
|
101
|
+
assert_equal time.to_i, timestamp_array[0].to_i
|
102
|
+
end
|
103
|
+
|
104
|
+
test "#append (date32)" do
|
105
|
+
field_wrapper = Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "date32"})
|
106
|
+
date = Date.today
|
107
|
+
field_wrapper.append(date)
|
108
|
+
date_array = field_wrapper.finish
|
109
|
+
assert_kind_of Date, date_array[0]
|
110
|
+
assert_equal date, date_array[0]
|
111
|
+
end
|
112
|
+
|
113
|
+
test "#append (date64)" do
|
114
|
+
field_wrapper = Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "date64"})
|
115
|
+
date = Date.today
|
116
|
+
field_wrapper.append(date)
|
117
|
+
date_array = field_wrapper.finish
|
118
|
+
assert_kind_of DateTime, date_array[0]
|
119
|
+
assert_equal date, date_array[0].to_date
|
120
|
+
end
|
121
|
+
|
122
|
+
test "#append (nested)" do
|
123
|
+
field_wrapper = Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "struct", "fields" => [
|
124
|
+
{"name" => "foo1", "type" => "string"},
|
125
|
+
{"name" => "foo2", "type" => "list", "value_type" => {"name" => "value", "type" => "uint64"}},
|
126
|
+
]})
|
127
|
+
|
128
|
+
field_wrapper.append({"foo1" => "rec1", "foo2" => [1, 2, 3]})
|
129
|
+
field_wrapper.append({"foo1" => "rec2", "foo2" => [4, 5]})
|
130
|
+
|
131
|
+
struct_array = field_wrapper.finish
|
132
|
+
assert_kind_of Arrow::StringArray, struct_array.fields[0]
|
133
|
+
assert_equal "rec1", struct_array.fields[0][0]
|
134
|
+
assert_equal "rec2", struct_array.fields[0][1]
|
135
|
+
|
136
|
+
assert_kind_of Arrow::UInt64Array, struct_array.fields[1].get_value(0)
|
137
|
+
assert_equal 1, struct_array.fields[1].get_value(0)[0]
|
138
|
+
assert_equal 2, struct_array.fields[1].get_value(0)[1]
|
139
|
+
assert_equal 3, struct_array.fields[1].get_value(0)[2]
|
140
|
+
|
141
|
+
assert_kind_of Arrow::UInt64Array, struct_array.fields[1].get_value(1)
|
142
|
+
assert_equal 4, struct_array.fields[1].get_value(1)[0]
|
143
|
+
assert_equal 5, struct_array.fields[1].get_value(1)[1]
|
144
|
+
end
|
145
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fluent-plugin-arrow
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- joker1007
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-12-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -113,13 +113,14 @@ files:
|
|
113
113
|
- README.md
|
114
114
|
- Rakefile
|
115
115
|
- fluent-plugin-arrow.gemspec
|
116
|
+
- lib/fluent/plugin/arrow/field_wrapper.rb
|
116
117
|
- lib/fluent/plugin/buf_arrow_memory.rb
|
117
118
|
- lib/fluent/plugin/buffer/arrow_memory_chunk.rb
|
118
119
|
- lib/fluent/plugin/formatter_arrow.rb
|
119
120
|
- test/helper.rb
|
120
121
|
- test/plugin/test_buf_arrow_memory.rb
|
121
122
|
- test/plugin/test_buffer_arrow_memory_chunk.rb
|
122
|
-
- test/plugin/test_formatter_arrow.rb
|
123
|
+
- test/plugin/test_field_wrapper.rb
|
123
124
|
homepage: https://github.com/joker1007/fluent-plugin-arrow
|
124
125
|
licenses:
|
125
126
|
- Apache-2.0
|
@@ -148,4 +149,4 @@ test_files:
|
|
148
149
|
- test/helper.rb
|
149
150
|
- test/plugin/test_buf_arrow_memory.rb
|
150
151
|
- test/plugin/test_buffer_arrow_memory_chunk.rb
|
151
|
-
- test/plugin/test_formatter_arrow.rb
|
152
|
+
- test/plugin/test_field_wrapper.rb
|
data/test/plugin/test_formatter_arrow.rb
DELETED
@@ -1,14 +0,0 @@
|
|
1
|
-
require "helper"
|
2
|
-
require "fluent/plugin/formatter_arrow.rb"
|
3
|
-
|
4
|
-
class ArrowFormatterTest < Test::Unit::TestCase
|
5
|
-
setup do
|
6
|
-
Fluent::Test.setup
|
7
|
-
end
|
8
|
-
|
9
|
-
private
|
10
|
-
|
11
|
-
def create_driver(conf)
|
12
|
-
Fluent::Test::Driver::Formatter.new(Fluent::Plugin::ArrowFormatter).configure(conf)
|
13
|
-
end
|
14
|
-
end
|