fluent-plugin-arrow 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +32 -3
- data/fluent-plugin-arrow.gemspec +1 -1
- data/lib/fluent/plugin/arrow/field_wrapper.rb +259 -0
- data/lib/fluent/plugin/buf_arrow_memory.rb +5 -26
- data/lib/fluent/plugin/buffer/arrow_memory_chunk.rb +11 -32
- data/test/plugin/test_buffer_arrow_memory_chunk.rb +20 -18
- data/test/plugin/test_field_wrapper.rb +145 -0
- metadata +5 -4
- data/test/plugin/test_formatter_arrow.rb +0 -14
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a1ed6b1b40e55501097a3a0455b2f2be93b3cf55d7e8cb831d4dc742f4007b97
|
4
|
+
data.tar.gz: fdcaed4462a919076c29f5947c3c81744b887e0099d2e5ad7a317e6845391da1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5f219215b1f8b8a57b2f9d79a6d1fdec39e10652c6d674a8d1390f7f3ce75d8752c15e89b268e36fb781bed44cfed2e128fd840b0978eb508c75dec1fc2b82a1
|
7
|
+
data.tar.gz: 2d473fbead6b6c12e361b045e7de325652b36c22d94685be289bc8a076c03f4dcb8b2c5270d922f76c0f462475d6846c96176abf4085abf24affaad0bc4b04fd
|
data/README.md
CHANGED
@@ -1,8 +1,13 @@
|
|
1
1
|
# fluent-plugin-arrow
|
2
2
|
|
3
|
-
[Fluentd](https://fluentd.org/)
|
3
|
+
[Fluentd](https://fluentd.org/) buffer plugin to output Apache Arrow and Parquet format.
|
4
4
|
|
5
|
-
|
5
|
+
## Prerequisite
|
6
|
+
|
7
|
+
- [Apache Arrow c++](https://github.com/apache/arrow/tree/master/cpp) (with -DARROW_PARQUET=ON)
|
8
|
+
- [Apache Arrow c_glib](https://github.com/apache/arrow/tree/master/c_glib)
|
9
|
+
- [red-arrow](https://github.com/apache/arrow/tree/master/ruby/red-arrow)
|
10
|
+
- [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet)
|
6
11
|
|
7
12
|
## Installation
|
8
13
|
|
@@ -31,7 +36,31 @@ $ bundle
|
|
31
36
|
You can generate configuration template:
|
32
37
|
|
33
38
|
```
|
34
|
-
|
39
|
+
<match arrow>
|
40
|
+
@type file
|
41
|
+
|
42
|
+
path arrow_test
|
43
|
+
|
44
|
+
<buffer>
|
45
|
+
@type arrow_memory
|
46
|
+
arrow_format arrow # or parquet
|
47
|
+
|
48
|
+
schema [
|
49
|
+
{"name": "key1", "type": "string"},
|
50
|
+
{"name": "key2", "type": "uint64"},
|
51
|
+
{"name": "key3", "type": "timestamp", "unit": "milli"},
|
52
|
+
{"name": "key4", "type": "list", "value_type": {"name": "value", "type": "uint64"}},
|
53
|
+
{"name": "key5", "type": "struct", "fields": [
|
54
|
+
{"name": "bar1", "type": "uint64"},
|
55
|
+
{"name": "bar2", "type": "list", "value_type": {"name": "value", "type": "string"}}
|
56
|
+
]}
|
57
|
+
]
|
58
|
+
</buffer>
|
59
|
+
|
60
|
+
<format>
|
61
|
+
@type arrow
|
62
|
+
</format>
|
63
|
+
</match>
|
35
64
|
```
|
36
65
|
|
37
66
|
You can copy and paste generated documents here.
|
data/fluent-plugin-arrow.gemspec
CHANGED
@@ -0,0 +1,259 @@
|
|
1
|
+
require "arrow"
|
2
|
+
|
3
|
+
module Fluent
|
4
|
+
module Plugin
|
5
|
+
module Arrow
|
6
|
+
class FieldWrapper
|
7
|
+
class << self
|
8
|
+
def build(field)
|
9
|
+
case field["type"]
|
10
|
+
when "string"
|
11
|
+
StringFieldWrapper.new(field)
|
12
|
+
when "int", "int8", "int16", "int32", "int64", "uint", "uint8", "uint16", "uint32", "uint64"
|
13
|
+
IntegerFieldWrapper.new(field)
|
14
|
+
when "float", "double"
|
15
|
+
FloatFieldWrapper.new(field)
|
16
|
+
when "boolean"
|
17
|
+
BooleanFieldWrapper.new(field)
|
18
|
+
when "date32"
|
19
|
+
Date32FieldWrapper.new(field)
|
20
|
+
when "date64"
|
21
|
+
Date64FieldWrapper.new(field)
|
22
|
+
when "timestamp"
|
23
|
+
TimestampFieldWrapper.new(field)
|
24
|
+
when "list"
|
25
|
+
ListFieldWrapper.new(field)
|
26
|
+
when "struct"
|
27
|
+
StructFieldWrapper.new(field)
|
28
|
+
else
|
29
|
+
raise "Unsupported data type"
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
attr_reader :field, :name, :type, :children, :arrow_field, :array_builder
|
35
|
+
|
36
|
+
def initialize(field)
|
37
|
+
@field = field
|
38
|
+
@name = field["name"]
|
39
|
+
@type = field["type"]
|
40
|
+
@children = []
|
41
|
+
|
42
|
+
field["value_type"]&.tap do |f|
|
43
|
+
@children << self.class.build(f)
|
44
|
+
end
|
45
|
+
|
46
|
+
field["fields"]&.each do |f|
|
47
|
+
@children << self.class.build(f)
|
48
|
+
end
|
49
|
+
|
50
|
+
create_arrow_field
|
51
|
+
create_array_builder
|
52
|
+
end
|
53
|
+
|
54
|
+
def append(value)
|
55
|
+
if value.nil?
|
56
|
+
@array_builder.append_null
|
57
|
+
else
|
58
|
+
@array_builder.append(cast_value(value))
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
def finish
|
63
|
+
@array_builder.finish
|
64
|
+
end
|
65
|
+
|
66
|
+
def create_arrow_field
|
67
|
+
@arrow_field = ::Arrow::Field.new(name, create_arrow_data_type)
|
68
|
+
end
|
69
|
+
|
70
|
+
def create_arrow_data_type
|
71
|
+
data_type_name = type.to_s.capitalize.gsub(/\AUint/, "UInt")
|
72
|
+
data_type_class_name = "#{data_type_name}DataType"
|
73
|
+
data_type_class = ::Arrow.const_get(data_type_class_name)
|
74
|
+
data_type_class.new
|
75
|
+
end
|
76
|
+
|
77
|
+
def create_array_builder(from_parent = nil)
|
78
|
+
if from_parent
|
79
|
+
@array_builder = from_parent
|
80
|
+
else
|
81
|
+
data_type_str = arrow_field.data_type.to_s
|
82
|
+
data_type_name = data_type_str.capitalize.gsub(/\AUint/, "UInt")
|
83
|
+
array_builder_class_name = "#{data_type_name}ArrayBuilder"
|
84
|
+
array_builder_class = ::Arrow.const_get(array_builder_class_name)
|
85
|
+
@array_builder = array_builder_class.new
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
def cast_value(value)
|
90
|
+
raise NotImplementedError
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
class StringFieldWrapper < FieldWrapper
|
95
|
+
def cast_value(value)
|
96
|
+
value.to_s
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
class IntegerFieldWrapper < FieldWrapper
|
101
|
+
def cast_value(value)
|
102
|
+
value.to_i
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
class FloatFieldWrapper < FieldWrapper
|
107
|
+
def cast_value(value)
|
108
|
+
value.to_f
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
class BooleanFieldWrapper < FieldWrapper
|
113
|
+
def cast_value(value)
|
114
|
+
!!value
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
118
|
+
require "date"
|
119
|
+
class Date32FieldWrapper < FieldWrapper
|
120
|
+
UNIX_EPOCH = Date.new(1970, 1, 1)
|
121
|
+
def cast_value(value)
|
122
|
+
date =
|
123
|
+
if value.respond_to?(:to_date)
|
124
|
+
value.to_date
|
125
|
+
else
|
126
|
+
Date.parse(value)
|
127
|
+
end
|
128
|
+
|
129
|
+
(date - UNIX_EPOCH).to_i
|
130
|
+
end
|
131
|
+
|
132
|
+
def create_array_builder(from_parent = nil)
|
133
|
+
if from_parent
|
134
|
+
@array_builder = from_parent
|
135
|
+
else
|
136
|
+
@array_builder = ::Arrow::Date32ArrayBuilder.new
|
137
|
+
end
|
138
|
+
end
|
139
|
+
end
|
140
|
+
|
141
|
+
class Date64FieldWrapper < FieldWrapper
|
142
|
+
UNIX_EPOCH = Date.new(1970, 1, 1)
|
143
|
+
def cast_value(value)
|
144
|
+
time =
|
145
|
+
if value.respond_to?(:to_time)
|
146
|
+
value.to_time
|
147
|
+
else
|
148
|
+
Time.parse(value)
|
149
|
+
end
|
150
|
+
|
151
|
+
time.to_i * 1_000 + time.usec / 1_000
|
152
|
+
end
|
153
|
+
|
154
|
+
def create_array_builder(from_parent = nil)
|
155
|
+
if from_parent
|
156
|
+
@array_builder = from_parent
|
157
|
+
else
|
158
|
+
@array_builder = ::Arrow::Date64ArrayBuilder.new
|
159
|
+
end
|
160
|
+
end
|
161
|
+
end
|
162
|
+
|
163
|
+
require "time"
|
164
|
+
class TimestampFieldWrapper < FieldWrapper
|
165
|
+
def cast_value(value)
|
166
|
+
value =
|
167
|
+
if value.is_a?(Fluent::EventTime)
|
168
|
+
Time.at(value, value.usec)
|
169
|
+
elsif value.respond_to?(:to_time)
|
170
|
+
value.to_time
|
171
|
+
elsif value.is_a?(String)
|
172
|
+
Time.parse(value)
|
173
|
+
else
|
174
|
+
value
|
175
|
+
end
|
176
|
+
|
177
|
+
return value if value.is_a?(Numeric)
|
178
|
+
|
179
|
+
case field["unit"]
|
180
|
+
when "second"
|
181
|
+
value.to_i
|
182
|
+
when "milli"
|
183
|
+
value.to_i * 1_000 + value.usec / 1_000
|
184
|
+
when "micro"
|
185
|
+
value.to_i * 1_000_000 + value.usec
|
186
|
+
else
|
187
|
+
value.to_i * 1_000_000_000 + value.nsec
|
188
|
+
end
|
189
|
+
end
|
190
|
+
|
191
|
+
def create_arrow_data_type
|
192
|
+
::Arrow::TimestampDataType.new(field["unit"].to_sym)
|
193
|
+
end
|
194
|
+
|
195
|
+
def create_array_builder(from_parent = nil)
|
196
|
+
if from_parent
|
197
|
+
@array_builder = from_parent
|
198
|
+
else
|
199
|
+
@array_builder = ::Arrow::TimestampArrayBuilder.new(arrow_field.data_type)
|
200
|
+
end
|
201
|
+
end
|
202
|
+
end
|
203
|
+
|
204
|
+
class ListFieldWrapper < FieldWrapper
|
205
|
+
def append(value)
|
206
|
+
if value.nil?
|
207
|
+
@array_builder.append_null
|
208
|
+
else
|
209
|
+
@array_builder.append
|
210
|
+
value.each do |v|
|
211
|
+
@children[0].append(v)
|
212
|
+
end
|
213
|
+
end
|
214
|
+
end
|
215
|
+
|
216
|
+
def create_arrow_data_type
|
217
|
+
::Arrow::ListDataType.new(children[0].arrow_field)
|
218
|
+
end
|
219
|
+
|
220
|
+
def create_array_builder(from_parent = nil)
|
221
|
+
if from_parent
|
222
|
+
@array_builder = from_parent
|
223
|
+
else
|
224
|
+
@array_builder = ::Arrow::ListArrayBuilder.new(arrow_field.data_type)
|
225
|
+
end
|
226
|
+
|
227
|
+
@children.each { |c| c.create_array_builder(@array_builder.value_builder) }
|
228
|
+
end
|
229
|
+
end
|
230
|
+
|
231
|
+
class StructFieldWrapper < FieldWrapper
|
232
|
+
def append(value)
|
233
|
+
if value.nil?
|
234
|
+
@array_builder.append_null
|
235
|
+
else
|
236
|
+
@array_builder.append
|
237
|
+
value.each do |k, v|
|
238
|
+
@children.find { |c| c.name == k }.append(v)
|
239
|
+
end
|
240
|
+
end
|
241
|
+
end
|
242
|
+
|
243
|
+
def create_arrow_data_type
|
244
|
+
::Arrow::StructDataType.new(children.map(&:arrow_field))
|
245
|
+
end
|
246
|
+
|
247
|
+
def create_array_builder(from_parent = nil)
|
248
|
+
if from_parent
|
249
|
+
@array_builder = from_parent
|
250
|
+
else
|
251
|
+
@array_builder = ::Arrow::StructArrayBuilder.new(arrow_field.data_type)
|
252
|
+
end
|
253
|
+
|
254
|
+
@children.each_with_index { |c, i| c.create_array_builder(@array_builder.get_field_builder(i)) }
|
255
|
+
end
|
256
|
+
end
|
257
|
+
end
|
258
|
+
end
|
259
|
+
end
|
@@ -16,6 +16,7 @@
|
|
16
16
|
require "arrow"
|
17
17
|
require 'fluent/plugin/buffer'
|
18
18
|
require 'fluent/plugin/buffer/arrow_memory_chunk'
|
19
|
+
require 'fluent/plugin/arrow/field_wrapper'
|
19
20
|
|
20
21
|
module Fluent
|
21
22
|
module Plugin
|
@@ -32,11 +33,11 @@ module Fluent
|
|
32
33
|
super
|
33
34
|
|
34
35
|
# [{"name" => foo1, "type" => "uint64"}, {"name" => foo2, "type" => "struct", "fields" => [{"name" => bar1, "type" => "string"}]}
|
35
|
-
|
36
|
-
|
36
|
+
@field_wrappers = @schema.each_with_object({}) do |field, h|
|
37
|
+
h[field["name"]] = Fluent::Plugin::Arrow::FieldWrapper.build(field)
|
37
38
|
end
|
38
39
|
|
39
|
-
@arrow_schema = Arrow::Schema.new(
|
40
|
+
@arrow_schema = ::Arrow::Schema.new(@field_wrappers.values.map(&:arrow_field))
|
40
41
|
end
|
41
42
|
|
42
43
|
def resume
|
@@ -44,29 +45,7 @@ module Fluent
|
|
44
45
|
end
|
45
46
|
|
46
47
|
def generate_chunk(metadata)
|
47
|
-
Fluent::Plugin::Buffer::ArrowMemoryChunk.new(metadata, @arrow_schema, chunk_size: @row_group_chunk_size, format: @arrow_format)
|
48
|
-
end
|
49
|
-
|
50
|
-
private
|
51
|
-
|
52
|
-
def create_arrow_field(field)
|
53
|
-
Arrow::Field.new(field["name"], create_arrow_data_type(field))
|
54
|
-
end
|
55
|
-
|
56
|
-
def create_arrow_data_type(field)
|
57
|
-
case field["type"]
|
58
|
-
when "struct"
|
59
|
-
Arrow::StructDataType.new(field["fields"].map { |f| create_arrow_field(f) })
|
60
|
-
when "list"
|
61
|
-
Arrow::ListDataType.new(create_arrow_field(field["value_type"]))
|
62
|
-
when "timestamp"
|
63
|
-
Arrow::TimestampDataType.new(field["unit"].to_sym)
|
64
|
-
else
|
65
|
-
data_type_name = field["type"].to_s.capitalize.gsub(/\AUint/, "UInt")
|
66
|
-
data_type_class_name = "#{data_type_name}DataType"
|
67
|
-
data_type_class = Arrow.const_get(data_type_class_name)
|
68
|
-
data_type_class.new
|
69
|
-
end
|
48
|
+
Fluent::Plugin::Buffer::ArrowMemoryChunk.new(metadata, @arrow_schema, @field_wrappers, chunk_size: @row_group_chunk_size, format: @arrow_format)
|
70
49
|
end
|
71
50
|
end
|
72
51
|
end
|
@@ -15,22 +15,21 @@
|
|
15
15
|
|
16
16
|
require 'arrow'
|
17
17
|
require 'parquet'
|
18
|
+
require 'fluent/msgpack_factory'
|
18
19
|
require 'fluent/plugin/buffer/chunk'
|
19
20
|
require 'fluent/plugin/buffer/memory_chunk'
|
21
|
+
require 'fluent/plugin/arrow/field_wrapper'
|
20
22
|
|
21
23
|
module Fluent
|
22
24
|
module Plugin
|
23
25
|
class Buffer
|
24
26
|
class ArrowMemoryChunk < MemoryChunk
|
25
|
-
def initialize(metadata, schema, chunk_size: 1024, format: :arrow)
|
27
|
+
def initialize(metadata, schema, field_wrappers, chunk_size: 1024, format: :arrow)
|
26
28
|
super(metadata, compress: :text)
|
27
29
|
@schema = schema
|
30
|
+
@field_wrappers = field_wrappers
|
28
31
|
@chunk_size = chunk_size
|
29
32
|
@format = format
|
30
|
-
@array_builders = {}
|
31
|
-
@schema.fields.each do |f|
|
32
|
-
@array_builders[f.name] = field_to_array_builder(f)
|
33
|
-
end
|
34
33
|
@unpacker = Fluent::MessagePackFactory.engine_factory.unpacker
|
35
34
|
end
|
36
35
|
|
@@ -49,48 +48,28 @@ module Fluent
|
|
49
48
|
|
50
49
|
private
|
51
50
|
|
52
|
-
def field_to_array_builder(f)
|
53
|
-
data_type_str = f.data_type.to_s
|
54
|
-
if data_type_str =~ /timestamp/
|
55
|
-
return Arrow::TimestampArrayBuilder.new(f.data_type)
|
56
|
-
end
|
57
|
-
|
58
|
-
data_type_name = data_type_str.capitalize.gsub(/\AUint/, "UInt")
|
59
|
-
array_builder_class_name = "#{data_type_name}ArrayBuilder"
|
60
|
-
array_builder_class = Arrow.const_get(array_builder_class_name)
|
61
|
-
if array_builder_class.method(:new).arity > 0
|
62
|
-
array_builder_class.new(f.data_type)
|
63
|
-
else
|
64
|
-
array_builder_class.new
|
65
|
-
end
|
66
|
-
end
|
67
|
-
|
68
51
|
def build_arrow_buffer_string
|
69
52
|
count = 0
|
70
53
|
@unpacker.feed_each(@chunk) do |record|
|
71
54
|
count += 1
|
72
55
|
record.each do |k, v|
|
73
|
-
|
74
|
-
@array_builders[k].append_null
|
75
|
-
else
|
76
|
-
@array_builders[k].append(v)
|
77
|
-
end
|
56
|
+
@field_wrappers[k].append(v)
|
78
57
|
end
|
79
58
|
end
|
80
|
-
arrow_buf = Arrow::ResizableBuffer.new(@chunk_bytes * 1.2)
|
59
|
+
arrow_buf = ::Arrow::ResizableBuffer.new(@chunk_bytes * 1.2)
|
81
60
|
|
82
|
-
Arrow::BufferOutputStream.open(arrow_buf) do |output|
|
61
|
+
::Arrow::BufferOutputStream.open(arrow_buf) do |output|
|
83
62
|
if @format == :parquet
|
84
63
|
Parquet::ArrowFileWriter.open(@schema, output) do |writer|
|
85
64
|
columns = @schema.fields.map do |f|
|
86
|
-
Arrow::Column.new(f, @
|
65
|
+
::Arrow::Column.new(f, @field_wrappers[f.name].finish)
|
87
66
|
end
|
88
|
-
table = Arrow::Table.new(@schema, columns)
|
67
|
+
table = ::Arrow::Table.new(@schema, columns)
|
89
68
|
writer.write_table(table, @chunk_size)
|
90
69
|
end
|
91
70
|
else
|
92
|
-
Arrow::RecordBatchFileWriter.open(output, @schema) do |writer|
|
93
|
-
record_batch = Arrow::RecordBatch.new(@schema, count, @
|
71
|
+
::Arrow::RecordBatchFileWriter.open(output, @schema) do |writer|
|
72
|
+
record_batch = ::Arrow::RecordBatch.new(@schema, count, @field_wrappers.values.map(&:finish))
|
94
73
|
writer.write_record_batch(record_batch)
|
95
74
|
end
|
96
75
|
end
|
@@ -5,12 +5,17 @@ require "fluent/plugin/buffer/arrow_memory_chunk"
|
|
5
5
|
class ArrowMemoryChunkTest < Test::Unit::TestCase
|
6
6
|
setup do
|
7
7
|
@fields = [
|
8
|
-
Arrow::Field.new("key1", :uint64),
|
9
|
-
Arrow::Field.new("key2", :double),
|
10
|
-
Arrow::Field.new("key3", Arrow::TimestampDataType.new(:second)),
|
8
|
+
::Arrow::Field.new("key1", :uint64),
|
9
|
+
::Arrow::Field.new("key2", :double),
|
10
|
+
::Arrow::Field.new("key3", ::Arrow::TimestampDataType.new(:second)),
|
11
11
|
]
|
12
|
+
field_wrappers = {
|
13
|
+
"key1" => Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "uint64"}),
|
14
|
+
"key2" => Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "double"}),
|
15
|
+
"key3" => Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "timestamp", "unit" => "second"}),
|
16
|
+
}
|
12
17
|
@schema = Arrow::Schema.new(@fields)
|
13
|
-
@c = Fluent::Plugin::Buffer::ArrowMemoryChunk.new(Object.new, @schema)
|
18
|
+
@c = Fluent::Plugin::Buffer::ArrowMemoryChunk.new(Object.new, @schema, field_wrappers)
|
14
19
|
end
|
15
20
|
|
16
21
|
test "can #read" do
|
@@ -18,44 +23,41 @@ class ArrowMemoryChunkTest < Test::Unit::TestCase
|
|
18
23
|
d2 = {"key1" => 124, "key2" => 11.1234, "key3" => Fluent::EventTime.from_time(Time.now)}
|
19
24
|
data = [d1.to_msgpack, d2.to_msgpack]
|
20
25
|
@c.append(data)
|
21
|
-
Arrow::BufferInputStream.open(Arrow::Buffer.new(@c.read)) do |input|
|
22
|
-
reader = Arrow::RecordBatchFileReader.new(input)
|
26
|
+
::Arrow::BufferInputStream.open(::Arrow::Buffer.new(@c.read)) do |input|
|
27
|
+
reader = ::Arrow::RecordBatchFileReader.new(input)
|
23
28
|
|
24
29
|
reader.each do |record_batch|
|
25
30
|
assert { record_batch.n_rows == 2 }
|
26
31
|
|
27
|
-
assert { record_batch.find_column(@fields[0].name).class == Arrow::UInt64Array }
|
32
|
+
assert { record_batch.find_column(@fields[0].name).class == ::Arrow::UInt64Array }
|
28
33
|
assert { record_batch.find_column(@fields[0].name).values == [123, 124] }
|
29
34
|
end
|
30
35
|
end
|
31
36
|
end
|
32
37
|
|
33
38
|
test "can #write_to" do
|
34
|
-
|
35
|
-
|
39
|
+
time = Time.now
|
40
|
+
d1 = {"key1" => 123, "key2" => 10.1234, "key3" => Fluent::EventTime.from_time(time)}
|
41
|
+
d2 = {"key1" => 124, "key2" => 11.1234, "key3" => Fluent::EventTime.from_time(time)}
|
36
42
|
data = [d1.to_msgpack, d2.to_msgpack]
|
37
43
|
@c.append(data)
|
38
44
|
Tempfile.create do |tf|
|
39
45
|
@c.write_to(tf)
|
40
46
|
tf.flush
|
41
47
|
|
42
|
-
Arrow::MemoryMappedInputStream.open(tf.path) do |input|
|
43
|
-
reader = Arrow::RecordBatchFileReader.new(input)
|
48
|
+
::Arrow::MemoryMappedInputStream.open(tf.path) do |input|
|
49
|
+
reader = ::Arrow::RecordBatchFileReader.new(input)
|
44
50
|
reader.each_with_index do |record_batch, i|
|
45
51
|
reader.each do |record_batch|
|
46
52
|
assert { record_batch.n_rows == 2 }
|
47
53
|
|
48
|
-
assert { record_batch.find_column(@fields[0].name).class == Arrow::UInt64Array }
|
54
|
+
assert { record_batch.find_column(@fields[0].name).class == ::Arrow::UInt64Array }
|
49
55
|
assert { record_batch.find_column(@fields[0].name).values == [123, 124] }
|
56
|
+
assert { record_batch.find_column(@fields[1].name).values == [10.1234, 11.1234] }
|
57
|
+
assert { record_batch.find_column(@fields[2].name)[0].to_i == time.to_i }
|
50
58
|
end
|
51
59
|
end
|
52
60
|
end
|
53
61
|
end
|
54
62
|
end
|
55
|
-
|
56
|
-
private
|
57
|
-
|
58
|
-
def create_driver(conf)
|
59
|
-
Fluent::Test::Driver::Formatter.new(Fluent::Plugin::ArrowFormatter).configure(conf)
|
60
|
-
end
|
61
63
|
end
|
@@ -0,0 +1,145 @@
|
|
1
|
+
require "helper"
|
2
|
+
require "fluent/plugin/arrow/field_wrapper"
|
3
|
+
|
4
|
+
class ArrowFieldWrapperTest < Test::Unit::TestCase
|
5
|
+
test ".build (string)" do
|
6
|
+
field_wrapper = Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "string"})
|
7
|
+
assert_equal "key1", field_wrapper.name
|
8
|
+
assert_equal "string", field_wrapper.type
|
9
|
+
assert_kind_of Arrow::Field, field_wrapper.arrow_field
|
10
|
+
end
|
11
|
+
|
12
|
+
test ".build (timestamp)" do
|
13
|
+
field_wrapper = Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "timestamp", "unit" => "nano"})
|
14
|
+
assert_equal "key1", field_wrapper.name
|
15
|
+
assert_equal "timestamp", field_wrapper.type
|
16
|
+
assert_kind_of Arrow::Field, field_wrapper.arrow_field
|
17
|
+
end
|
18
|
+
|
19
|
+
test ".build (list)" do
|
20
|
+
field_wrapper = Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "list", "value_type" => {"name" => "value", "type" => "string"}})
|
21
|
+
assert_equal "key1", field_wrapper.name
|
22
|
+
assert_equal "list", field_wrapper.type
|
23
|
+
assert_kind_of Arrow::Field, field_wrapper.arrow_field
|
24
|
+
assert_kind_of Arrow::ListDataType, field_wrapper.arrow_field.data_type
|
25
|
+
assert_kind_of Arrow::ListArrayBuilder, field_wrapper.array_builder
|
26
|
+
|
27
|
+
assert_equal "value", field_wrapper.children[0].name
|
28
|
+
assert_equal "string", field_wrapper.children[0].type
|
29
|
+
assert_kind_of Arrow::Field, field_wrapper.children[0].arrow_field
|
30
|
+
assert_kind_of Arrow::StringDataType, field_wrapper.children[0].arrow_field.data_type
|
31
|
+
assert_kind_of Arrow::StringArrayBuilder, field_wrapper.children[0].array_builder
|
32
|
+
end
|
33
|
+
|
34
|
+
test ".build (struct)" do
|
35
|
+
field_wrapper = Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "struct", "fields" => [
|
36
|
+
{"name" => "foo1", "type" => "string"},
|
37
|
+
{"name" => "foo2", "type" => "uint64"},
|
38
|
+
{"name" => "foo3", "type" => "timestamp", "unit" => "milli"},
|
39
|
+
]})
|
40
|
+
assert_equal "key1", field_wrapper.name
|
41
|
+
assert_equal "struct", field_wrapper.type
|
42
|
+
assert_kind_of Arrow::Field, field_wrapper.arrow_field
|
43
|
+
assert_kind_of Arrow::StructDataType, field_wrapper.arrow_field.data_type
|
44
|
+
assert_kind_of Arrow::StructArrayBuilder, field_wrapper.array_builder
|
45
|
+
|
46
|
+
assert_equal "foo1", field_wrapper.children[0].name
|
47
|
+
assert_equal "string", field_wrapper.children[0].type
|
48
|
+
assert_kind_of Arrow::Field, field_wrapper.children[0].arrow_field
|
49
|
+
assert_kind_of Arrow::StringDataType, field_wrapper.children[0].arrow_field.data_type
|
50
|
+
assert_kind_of Arrow::StringArrayBuilder, field_wrapper.children[0].array_builder
|
51
|
+
|
52
|
+
assert_equal "foo2", field_wrapper.children[1].name
|
53
|
+
assert_equal "uint64", field_wrapper.children[1].type
|
54
|
+
assert_kind_of Arrow::Field, field_wrapper.children[1].arrow_field
|
55
|
+
assert_kind_of Arrow::UInt64DataType, field_wrapper.children[1].arrow_field.data_type
|
56
|
+
assert_kind_of Arrow::UInt64ArrayBuilder, field_wrapper.children[1].array_builder
|
57
|
+
|
58
|
+
assert_equal "foo3", field_wrapper.children[2].name
|
59
|
+
assert_equal "timestamp", field_wrapper.children[2].type
|
60
|
+
assert_kind_of Arrow::Field, field_wrapper.children[2].arrow_field
|
61
|
+
assert_kind_of Arrow::TimestampDataType, field_wrapper.children[2].arrow_field.data_type
|
62
|
+
assert_kind_of Arrow::TimestampArrayBuilder, field_wrapper.children[2].array_builder
|
63
|
+
end
|
64
|
+
|
65
|
+
test ".build (nested)" do
|
66
|
+
field_wrapper = Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "struct", "fields" => [
|
67
|
+
{"name" => "foo1", "type" => "string"},
|
68
|
+
{"name" => "foo2", "type" => "list", "value_type" => {"name" => "value", "type" => "uint64"}},
|
69
|
+
]})
|
70
|
+
assert_equal "key1", field_wrapper.name
|
71
|
+
assert_equal "struct", field_wrapper.type
|
72
|
+
assert_kind_of Arrow::Field, field_wrapper.arrow_field
|
73
|
+
assert_kind_of Arrow::StructDataType, field_wrapper.arrow_field.data_type
|
74
|
+
assert_kind_of Arrow::StructArrayBuilder, field_wrapper.array_builder
|
75
|
+
|
76
|
+
assert_equal "foo1", field_wrapper.children[0].name
|
77
|
+
assert_equal "string", field_wrapper.children[0].type
|
78
|
+
assert_kind_of Arrow::Field, field_wrapper.children[0].arrow_field
|
79
|
+
assert_kind_of Arrow::StringDataType, field_wrapper.children[0].arrow_field.data_type
|
80
|
+
assert_kind_of Arrow::StringArrayBuilder, field_wrapper.children[0].array_builder
|
81
|
+
|
82
|
+
assert_equal "foo2", field_wrapper.children[1].name
|
83
|
+
assert_equal "list", field_wrapper.children[1].type
|
84
|
+
assert_kind_of Arrow::Field, field_wrapper.children[1].arrow_field
|
85
|
+
assert_kind_of Arrow::ListDataType, field_wrapper.children[1].arrow_field.data_type
|
86
|
+
assert_kind_of Arrow::ListArrayBuilder, field_wrapper.children[1].array_builder
|
87
|
+
|
88
|
+
assert_equal "value", field_wrapper.children[1].children[0].name
|
89
|
+
assert_equal "uint64", field_wrapper.children[1].children[0].type
|
90
|
+
assert_kind_of Arrow::Field, field_wrapper.children[1].children[0].arrow_field
|
91
|
+
assert_kind_of Arrow::UInt64DataType, field_wrapper.children[1].children[0].arrow_field.data_type
|
92
|
+
assert_kind_of Arrow::UInt64ArrayBuilder, field_wrapper.children[1].children[0].array_builder
|
93
|
+
end
|
94
|
+
|
95
|
+
test "#append (timestamp)" do
|
96
|
+
field_wrapper = Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "timestamp", "unit" => "nano"})
|
97
|
+
time = Time.now
|
98
|
+
field_wrapper.append(time)
|
99
|
+
timestamp_array = field_wrapper.finish
|
100
|
+
assert_kind_of Time, timestamp_array[0]
|
101
|
+
assert_equal time.to_i, timestamp_array[0].to_i
|
102
|
+
end
|
103
|
+
|
104
|
+
test "#append (date32)" do
|
105
|
+
field_wrapper = Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "date32"})
|
106
|
+
date = Date.today
|
107
|
+
field_wrapper.append(date)
|
108
|
+
date_array = field_wrapper.finish
|
109
|
+
assert_kind_of Date, date_array[0]
|
110
|
+
assert_equal date, date_array[0]
|
111
|
+
end
|
112
|
+
|
113
|
+
test "#append (date64)" do
|
114
|
+
field_wrapper = Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "date64"})
|
115
|
+
date = Date.today
|
116
|
+
field_wrapper.append(date)
|
117
|
+
date_array = field_wrapper.finish
|
118
|
+
assert_kind_of DateTime, date_array[0]
|
119
|
+
assert_equal date, date_array[0].to_date
|
120
|
+
end
|
121
|
+
|
122
|
+
test "#append (nested)" do
|
123
|
+
field_wrapper = Fluent::Plugin::Arrow::FieldWrapper.build({"name" => "key1", "type" => "struct", "fields" => [
|
124
|
+
{"name" => "foo1", "type" => "string"},
|
125
|
+
{"name" => "foo2", "type" => "list", "value_type" => {"name" => "value", "type" => "uint64"}},
|
126
|
+
]})
|
127
|
+
|
128
|
+
field_wrapper.append({"foo1" => "rec1", "foo2" => [1, 2, 3]})
|
129
|
+
field_wrapper.append({"foo1" => "rec2", "foo2" => [4, 5]})
|
130
|
+
|
131
|
+
struct_array = field_wrapper.finish
|
132
|
+
assert_kind_of Arrow::StringArray, struct_array.fields[0]
|
133
|
+
assert_equal "rec1", struct_array.fields[0][0]
|
134
|
+
assert_equal "rec2", struct_array.fields[0][1]
|
135
|
+
|
136
|
+
assert_kind_of Arrow::UInt64Array, struct_array.fields[1].get_value(0)
|
137
|
+
assert_equal 1, struct_array.fields[1].get_value(0)[0]
|
138
|
+
assert_equal 2, struct_array.fields[1].get_value(0)[1]
|
139
|
+
assert_equal 3, struct_array.fields[1].get_value(0)[2]
|
140
|
+
|
141
|
+
assert_kind_of Arrow::UInt64Array, struct_array.fields[1].get_value(1)
|
142
|
+
assert_equal 4, struct_array.fields[1].get_value(1)[0]
|
143
|
+
assert_equal 5, struct_array.fields[1].get_value(1)[1]
|
144
|
+
end
|
145
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fluent-plugin-arrow
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- joker1007
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-12-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -113,13 +113,14 @@ files:
|
|
113
113
|
- README.md
|
114
114
|
- Rakefile
|
115
115
|
- fluent-plugin-arrow.gemspec
|
116
|
+
- lib/fluent/plugin/arrow/field_wrapper.rb
|
116
117
|
- lib/fluent/plugin/buf_arrow_memory.rb
|
117
118
|
- lib/fluent/plugin/buffer/arrow_memory_chunk.rb
|
118
119
|
- lib/fluent/plugin/formatter_arrow.rb
|
119
120
|
- test/helper.rb
|
120
121
|
- test/plugin/test_buf_arrow_memory.rb
|
121
122
|
- test/plugin/test_buffer_arrow_memory_chunk.rb
|
122
|
-
- test/plugin/test_formatter_arrow.rb
|
123
|
+
- test/plugin/test_field_wrapper.rb
|
123
124
|
homepage: https://github.com/joker1007/fluent-plugin-arrow
|
124
125
|
licenses:
|
125
126
|
- Apache-2.0
|
@@ -148,4 +149,4 @@ test_files:
|
|
148
149
|
- test/helper.rb
|
149
150
|
- test/plugin/test_buf_arrow_memory.rb
|
150
151
|
- test/plugin/test_buffer_arrow_memory_chunk.rb
|
151
|
-
- test/plugin/test_formatter_arrow.rb
|
152
|
+
- test/plugin/test_field_wrapper.rb
|
@@ -1,14 +0,0 @@
|
|
1
|
-
require "helper"
|
2
|
-
require "fluent/plugin/formatter_arrow.rb"
|
3
|
-
|
4
|
-
class ArrowFormatterTest < Test::Unit::TestCase
|
5
|
-
setup do
|
6
|
-
Fluent::Test.setup
|
7
|
-
end
|
8
|
-
|
9
|
-
private
|
10
|
-
|
11
|
-
def create_driver(conf)
|
12
|
-
Fluent::Test::Driver::Formatter.new(Fluent::Plugin::ArrowFormatter).configure(conf)
|
13
|
-
end
|
14
|
-
end
|