feedx 0.9.2 → 0.12.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.rubocop.yml +2 -0
- data/.travis.yml +13 -6
- data/Gemfile.lock +43 -20
- data/Makefile +5 -0
- data/consumer_test.go +5 -5
- data/feedx.gemspec +3 -2
- data/feedx_test.go +13 -13
- data/format.go +16 -16
- data/format_test.go +6 -7
- data/go.mod +5 -11
- data/go.sum +43 -26
- data/internal/testdata/testdata.pb.go +124 -0
- data/internal/testdata/testdata.proto +15 -0
- data/lib/feedx/cache/abstract.rb +2 -2
- data/lib/feedx/compression.rb +11 -4
- data/lib/feedx/compression/abstract.rb +2 -2
- data/lib/feedx/compression/gzip.rb +14 -2
- data/lib/feedx/compression/none.rb +4 -4
- data/lib/feedx/consumer.rb +12 -9
- data/lib/feedx/format.rb +18 -9
- data/lib/feedx/format/abstract.rb +42 -13
- data/lib/feedx/format/json.rb +12 -8
- data/lib/feedx/format/parquet.rb +102 -0
- data/lib/feedx/format/protobuf.rb +16 -8
- data/lib/feedx/producer.rb +12 -9
- data/lib/feedx/stream.rb +22 -25
- data/producer.go +1 -4
- data/producer_test.go +1 -2
- data/reader_test.go +7 -8
- data/spec/feedx/compression/gzip_spec.rb +4 -2
- data/spec/feedx/compression/none_spec.rb +2 -2
- data/spec/feedx/compression_spec.rb +9 -9
- data/spec/feedx/consumer_spec.rb +6 -3
- data/spec/feedx/format/abstract_spec.rb +11 -8
- data/spec/feedx/format/json_spec.rb +12 -11
- data/spec/feedx/format/parquet_spec.rb +30 -0
- data/spec/feedx/format/protobuf_spec.rb +12 -11
- data/spec/feedx/format_spec.rb +8 -8
- data/spec/feedx/producer_spec.rb +6 -0
- data/spec/feedx/stream_spec.rb +20 -1
- data/spec/spec_helper.rb +17 -1
- data/writer.go +19 -18
- data/writer_test.go +3 -5
- metadata +22 -3
@@ -0,0 +1,124 @@
|
|
1
|
+
// Code generated by protoc-gen-gogo. DO NOT EDIT.
|
2
|
+
// source: internal/testdata/testdata.proto
|
3
|
+
|
4
|
+
package testdata
|
5
|
+
|
6
|
+
import (
|
7
|
+
fmt "fmt"
|
8
|
+
proto "github.com/gogo/protobuf/proto"
|
9
|
+
math "math"
|
10
|
+
)
|
11
|
+
|
12
|
+
// Reference imports to suppress errors if they are not otherwise used.
|
13
|
+
var _ = proto.Marshal
|
14
|
+
var _ = fmt.Errorf
|
15
|
+
var _ = math.Inf
|
16
|
+
|
17
|
+
// This is a compile-time assertion to ensure that this generated file
|
18
|
+
// is compatible with the proto package it is being compiled against.
|
19
|
+
// A compilation error at this line likely means your copy of the
|
20
|
+
// proto package needs to be updated.
|
21
|
+
const _ = proto.GoGoProtoPackageIsVersion3 // please upgrade the proto package
|
22
|
+
|
23
|
+
type MockEnum int32
|
24
|
+
|
25
|
+
const (
|
26
|
+
MockEnum_UNKNOWN MockEnum = 0
|
27
|
+
MockEnum_FIRST MockEnum = 3
|
28
|
+
)
|
29
|
+
|
30
|
+
var MockEnum_name = map[int32]string{
|
31
|
+
0: "UNKNOWN",
|
32
|
+
3: "FIRST",
|
33
|
+
}
|
34
|
+
|
35
|
+
var MockEnum_value = map[string]int32{
|
36
|
+
"UNKNOWN": 0,
|
37
|
+
"FIRST": 3,
|
38
|
+
}
|
39
|
+
|
40
|
+
func (x MockEnum) String() string {
|
41
|
+
return proto.EnumName(MockEnum_name, int32(x))
|
42
|
+
}
|
43
|
+
|
44
|
+
func (MockEnum) EnumDescriptor() ([]byte, []int) {
|
45
|
+
return fileDescriptor_076a9f61cb4a1904, []int{0}
|
46
|
+
}
|
47
|
+
|
48
|
+
type MockMessage struct {
|
49
|
+
Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"`
|
50
|
+
Enum MockEnum `protobuf:"varint,2,opt,name=enum,proto3,enum=feedx.internal.testdata.MockEnum" json:"enum,omitempty"`
|
51
|
+
Height uint32 `protobuf:"varint,3,opt,name=height,proto3" json:"height,omitempty"`
|
52
|
+
XXX_NoUnkeyedLiteral struct{} `json:"-"`
|
53
|
+
XXX_unrecognized []byte `json:"-"`
|
54
|
+
XXX_sizecache int32 `json:"-"`
|
55
|
+
}
|
56
|
+
|
57
|
+
func (m *MockMessage) Reset() { *m = MockMessage{} }
|
58
|
+
func (m *MockMessage) String() string { return proto.CompactTextString(m) }
|
59
|
+
func (*MockMessage) ProtoMessage() {}
|
60
|
+
func (*MockMessage) Descriptor() ([]byte, []int) {
|
61
|
+
return fileDescriptor_076a9f61cb4a1904, []int{0}
|
62
|
+
}
|
63
|
+
func (m *MockMessage) XXX_Unmarshal(b []byte) error {
|
64
|
+
return xxx_messageInfo_MockMessage.Unmarshal(m, b)
|
65
|
+
}
|
66
|
+
func (m *MockMessage) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
|
67
|
+
return xxx_messageInfo_MockMessage.Marshal(b, m, deterministic)
|
68
|
+
}
|
69
|
+
func (m *MockMessage) XXX_Merge(src proto.Message) {
|
70
|
+
xxx_messageInfo_MockMessage.Merge(m, src)
|
71
|
+
}
|
72
|
+
func (m *MockMessage) XXX_Size() int {
|
73
|
+
return xxx_messageInfo_MockMessage.Size(m)
|
74
|
+
}
|
75
|
+
func (m *MockMessage) XXX_DiscardUnknown() {
|
76
|
+
xxx_messageInfo_MockMessage.DiscardUnknown(m)
|
77
|
+
}
|
78
|
+
|
79
|
+
var xxx_messageInfo_MockMessage proto.InternalMessageInfo
|
80
|
+
|
81
|
+
func (m *MockMessage) GetName() string {
|
82
|
+
if m != nil {
|
83
|
+
return m.Name
|
84
|
+
}
|
85
|
+
return ""
|
86
|
+
}
|
87
|
+
|
88
|
+
func (m *MockMessage) GetEnum() MockEnum {
|
89
|
+
if m != nil {
|
90
|
+
return m.Enum
|
91
|
+
}
|
92
|
+
return MockEnum_UNKNOWN
|
93
|
+
}
|
94
|
+
|
95
|
+
func (m *MockMessage) GetHeight() uint32 {
|
96
|
+
if m != nil {
|
97
|
+
return m.Height
|
98
|
+
}
|
99
|
+
return 0
|
100
|
+
}
|
101
|
+
|
102
|
+
func init() {
|
103
|
+
proto.RegisterEnum("feedx.internal.testdata.MockEnum", MockEnum_name, MockEnum_value)
|
104
|
+
proto.RegisterType((*MockMessage)(nil), "feedx.internal.testdata.MockMessage")
|
105
|
+
}
|
106
|
+
|
107
|
+
func init() { proto.RegisterFile("internal/testdata/testdata.proto", fileDescriptor_076a9f61cb4a1904) }
|
108
|
+
|
109
|
+
var fileDescriptor_076a9f61cb4a1904 = []byte{
|
110
|
+
// 199 bytes of a gzipped FileDescriptorProto
|
111
|
+
0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0x52, 0xc8, 0xcc, 0x2b, 0x49,
|
112
|
+
0x2d, 0xca, 0x4b, 0xcc, 0xd1, 0x2f, 0x49, 0x2d, 0x2e, 0x49, 0x49, 0x2c, 0x49, 0x84, 0x33, 0xf4,
|
113
|
+
0x0a, 0x8a, 0xf2, 0x4b, 0xf2, 0x85, 0xc4, 0xd3, 0x52, 0x53, 0x53, 0x2a, 0xf4, 0x60, 0xea, 0xf4,
|
114
|
+
0x60, 0xd2, 0x4a, 0x05, 0x5c, 0xdc, 0xbe, 0xf9, 0xc9, 0xd9, 0xbe, 0xa9, 0xc5, 0xc5, 0x89, 0xe9,
|
115
|
+
0xa9, 0x42, 0x42, 0x5c, 0x2c, 0x79, 0x89, 0xb9, 0xa9, 0x12, 0x8c, 0x0a, 0x8c, 0x1a, 0x9c, 0x41,
|
116
|
+
0x60, 0xb6, 0x90, 0x29, 0x17, 0x4b, 0x6a, 0x5e, 0x69, 0xae, 0x04, 0x93, 0x02, 0xa3, 0x06, 0x9f,
|
117
|
+
0x91, 0xa2, 0x1e, 0x0e, 0xa3, 0xf4, 0x40, 0xe6, 0xb8, 0xe6, 0x95, 0xe6, 0x06, 0x81, 0x95, 0x0b,
|
118
|
+
0x89, 0x71, 0xb1, 0x65, 0xa4, 0x66, 0xa6, 0x67, 0x94, 0x48, 0x30, 0x2b, 0x30, 0x6a, 0xf0, 0x06,
|
119
|
+
0x41, 0x79, 0x5a, 0x4a, 0x5c, 0x1c, 0x30, 0x95, 0x42, 0xdc, 0x5c, 0xec, 0xa1, 0x7e, 0xde, 0x7e,
|
120
|
+
0xfe, 0xe1, 0x7e, 0x02, 0x0c, 0x42, 0x9c, 0x5c, 0xac, 0x6e, 0x9e, 0x41, 0xc1, 0x21, 0x02, 0xcc,
|
121
|
+
0x4e, 0x2a, 0x51, 0x4a, 0xe9, 0x99, 0x25, 0x19, 0xa5, 0x49, 0x7a, 0xc9, 0xf9, 0xb9, 0xfa, 0x60,
|
122
|
+
0x0b, 0xf5, 0x31, 0xfc, 0x98, 0xc4, 0x06, 0xf6, 0x9b, 0x31, 0x20, 0x00, 0x00, 0xff, 0xff, 0x42,
|
123
|
+
0xf6, 0x49, 0xb3, 0xff, 0x00, 0x00, 0x00,
|
124
|
+
}
|
@@ -0,0 +1,15 @@
|
|
1
|
+
syntax = "proto3";
|
2
|
+
|
3
|
+
package feedx.internal.testdata;
|
4
|
+
option go_package = "github.com/feedx/internal/testdata";
|
5
|
+
|
6
|
+
enum MockEnum {
|
7
|
+
UNKNOWN = 0;
|
8
|
+
FIRST = 3;
|
9
|
+
}
|
10
|
+
|
11
|
+
message MockMessage {
|
12
|
+
string name = 1;
|
13
|
+
MockEnum enum = 2;
|
14
|
+
uint32 height = 3;
|
15
|
+
}
|
data/lib/feedx/cache/abstract.rb
CHANGED
@@ -5,12 +5,12 @@ class Feedx::Cache::Abstract
|
|
5
5
|
end
|
6
6
|
|
7
7
|
# Read reads a key.
|
8
|
-
def read(_key, **
|
8
|
+
def read(_key, **)
|
9
9
|
raise 'Not implemented'
|
10
10
|
end
|
11
11
|
|
12
12
|
# Write writes a key/value pair.
|
13
|
-
def write(_key, _value, **
|
13
|
+
def write(_key, _value, **)
|
14
14
|
raise 'Not implemented'
|
15
15
|
end
|
16
16
|
|
data/lib/feedx/compression.rb
CHANGED
@@ -5,12 +5,19 @@ module Feedx
|
|
5
5
|
autoload :Gzip, 'feedx/compression/gzip'
|
6
6
|
|
7
7
|
class << self
|
8
|
+
def validate!(kind)
|
9
|
+
raise ArgumentError, "#{kind} does not implement #reader(io, &block)" unless kind.respond_to?(:reader)
|
10
|
+
raise ArgumentError, "#{kind} does not implement #writer(io, &block)" unless kind.respond_to?(:writer)
|
11
|
+
|
12
|
+
kind
|
13
|
+
end
|
14
|
+
|
8
15
|
def resolve(name)
|
9
16
|
case name.to_s
|
10
17
|
when 'gz', 'gzip'
|
11
|
-
Gzip
|
18
|
+
Gzip.new
|
12
19
|
when ''
|
13
|
-
None
|
20
|
+
None.new
|
14
21
|
else
|
15
22
|
raise ArgumentError, "invalid compression #{name}"
|
16
23
|
end
|
@@ -18,9 +25,9 @@ module Feedx
|
|
18
25
|
|
19
26
|
def detect(path)
|
20
27
|
if File.extname(path)[-1] == 'z'
|
21
|
-
Gzip
|
28
|
+
Gzip.new
|
22
29
|
else
|
23
|
-
None
|
30
|
+
None.new
|
24
31
|
end
|
25
32
|
end
|
26
33
|
end
|
@@ -1,11 +1,23 @@
|
|
1
1
|
require 'zlib'
|
2
2
|
|
3
3
|
class Feedx::Compression::Gzip < Feedx::Compression::Abstract
|
4
|
-
def
|
4
|
+
def reader(io, **, &block)
|
5
|
+
force_binmode(io)
|
5
6
|
Zlib::GzipReader.wrap(io, &block)
|
6
7
|
end
|
7
8
|
|
8
|
-
def
|
9
|
+
def writer(io, **, &block)
|
10
|
+
force_binmode(io)
|
9
11
|
Zlib::GzipWriter.wrap(io, &block)
|
10
12
|
end
|
13
|
+
|
14
|
+
private
|
15
|
+
|
16
|
+
def force_binmode(io)
|
17
|
+
if io.respond_to?(:binmode)
|
18
|
+
io.binmode
|
19
|
+
elsif io.respond_to?(:set_encoding)
|
20
|
+
io.set_encoding(Encoding::BINARY)
|
21
|
+
end
|
22
|
+
end
|
11
23
|
end
|
data/lib/feedx/consumer.rb
CHANGED
@@ -8,22 +8,25 @@ module Feedx
|
|
8
8
|
include Enumerable
|
9
9
|
|
10
10
|
# See constructor.
|
11
|
-
def self.each(url, klass, opts
|
12
|
-
new(url, klass, opts).each(&block)
|
11
|
+
def self.each(url, klass, **opts, &block)
|
12
|
+
new(url, klass, **opts).each(&block)
|
13
13
|
end
|
14
14
|
|
15
15
|
# @param [String] url the destination URL.
|
16
16
|
# @param [Class] klass the record class.
|
17
17
|
# @param [Hash] opts options
|
18
18
|
# @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
|
19
|
-
# @option opts [Hash] :format_options format decode options. Default: {}.
|
20
19
|
# @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
|
21
20
|
# @option opts [Feedx::Cache::Value] :cache cache value to store remote last modified time and consume conditionally.
|
22
|
-
def initialize(url, klass,
|
23
|
-
@klass
|
24
|
-
@stream
|
25
|
-
@
|
26
|
-
@
|
21
|
+
def initialize(url, klass, format_options: {}, cache: nil, **opts)
|
22
|
+
@klass = klass
|
23
|
+
@stream = Feedx::Stream.new(url, **opts)
|
24
|
+
@cache = cache
|
25
|
+
@opts = opts.merge(format_options)
|
26
|
+
|
27
|
+
return if format_options.empty? || (defined?(Gem::Deprecate) && Gem::Deprecate.skip)
|
28
|
+
|
29
|
+
warn "WARNING: passing format_options is deprecated; pass the options inline instead (called from #{caller(2..2).first})."
|
27
30
|
end
|
28
31
|
|
29
32
|
# @return [Boolean] returns true if performed.
|
@@ -38,7 +41,7 @@ module Feedx
|
|
38
41
|
end
|
39
42
|
|
40
43
|
@stream.open do |fmt|
|
41
|
-
fmt.decode_each(@klass, **@
|
44
|
+
fmt.decode_each(@klass, **@opts, &block)
|
42
45
|
end
|
43
46
|
@cache.write(remote_rev) if @cache && remote_rev
|
44
47
|
|
data/lib/feedx/format.rb
CHANGED
@@ -2,13 +2,19 @@ module Feedx
|
|
2
2
|
module Format
|
3
3
|
autoload :Abstract, 'feedx/format/abstract'
|
4
4
|
autoload :JSON, 'feedx/format/json'
|
5
|
+
autoload :Parquet, 'feedx/format/parquet'
|
5
6
|
autoload :Protobuf, 'feedx/format/protobuf'
|
6
7
|
|
7
8
|
class << self
|
8
|
-
def
|
9
|
-
raise ArgumentError, "#{kind}
|
9
|
+
def validate!(kind)
|
10
|
+
raise ArgumentError, "#{kind} does not implement #encoder(io, &block)" unless kind.respond_to?(:encoder)
|
11
|
+
raise ArgumentError, "#{kind} does not implement #decoder(io, &block)" unless kind.respond_to?(:decoder)
|
12
|
+
|
13
|
+
kind
|
14
|
+
end
|
10
15
|
|
11
|
-
|
16
|
+
def register(ext, kind)
|
17
|
+
registry[ext.to_s] = validate!(kind)
|
12
18
|
end
|
13
19
|
|
14
20
|
def resolve(name)
|
@@ -33,6 +39,9 @@ module Feedx
|
|
33
39
|
def registry
|
34
40
|
@registry ||= {
|
35
41
|
'json' => :JSON,
|
42
|
+
'jsonl' => :JSON,
|
43
|
+
'ndjson' => :JSON,
|
44
|
+
'parquet' => :Parquet,
|
36
45
|
'pb' => :Protobuf,
|
37
46
|
'proto' => :Protobuf,
|
38
47
|
'protobuf' => :Protobuf,
|
@@ -40,13 +49,13 @@ module Feedx
|
|
40
49
|
end
|
41
50
|
|
42
51
|
def _resolve(name)
|
43
|
-
name
|
44
|
-
|
45
|
-
if
|
46
|
-
|
47
|
-
registry[name.to_s] =
|
52
|
+
name = name.to_s
|
53
|
+
kind = registry[name]
|
54
|
+
if kind.is_a?(Symbol)
|
55
|
+
kind = const_get(kind).new
|
56
|
+
registry[name.to_s] = kind
|
48
57
|
end
|
49
|
-
|
58
|
+
kind
|
50
59
|
end
|
51
60
|
end
|
52
61
|
end
|
@@ -1,25 +1,54 @@
|
|
1
1
|
class Feedx::Format::Abstract
|
2
|
-
def
|
3
|
-
|
2
|
+
def decoder(io, **opts, &block)
|
3
|
+
self.class::Decoder.open(io, **opts, &block)
|
4
4
|
end
|
5
5
|
|
6
|
-
def
|
7
|
-
|
6
|
+
def encoder(io, **opts, &block)
|
7
|
+
self.class::Encoder.open(io, **opts, &block)
|
8
8
|
end
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
10
|
+
class Wrapper
|
11
|
+
def self.open(io, **opts)
|
12
|
+
inst = new(io, **opts)
|
13
|
+
yield inst
|
14
|
+
ensure
|
15
|
+
inst&.close
|
16
|
+
end
|
17
|
+
|
18
|
+
def initialize(io, **)
|
19
|
+
@io = io
|
15
20
|
end
|
16
21
|
end
|
17
22
|
|
18
|
-
|
19
|
-
|
23
|
+
class Decoder < Wrapper
|
24
|
+
def eof?
|
25
|
+
@io.eof?
|
26
|
+
end
|
27
|
+
|
28
|
+
def decode_each(target, **opts)
|
29
|
+
if block_given?
|
30
|
+
yield decode(target, **opts) until eof?
|
31
|
+
else
|
32
|
+
Enumerator.new do |acc|
|
33
|
+
acc << decode(target, **opts) until eof?
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def decode(_target, **)
|
39
|
+
raise 'Not implemented'
|
40
|
+
end
|
41
|
+
|
42
|
+
def close; end
|
20
43
|
end
|
21
44
|
|
22
|
-
|
23
|
-
|
45
|
+
class Encoder < Wrapper
|
46
|
+
def encode(_msg, **)
|
47
|
+
raise 'Not implemented'
|
48
|
+
end
|
49
|
+
|
50
|
+
def close
|
51
|
+
@io.flush if @io.respond_to?(:flush)
|
52
|
+
end
|
24
53
|
end
|
25
54
|
end
|
data/lib/feedx/format/json.rb
CHANGED
@@ -1,16 +1,20 @@
|
|
1
1
|
require 'json'
|
2
2
|
|
3
3
|
class Feedx::Format::JSON < Feedx::Format::Abstract
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
class Decoder < Feedx::Format::Abstract::Decoder
|
5
|
+
def decode(target, **)
|
6
|
+
line = @io.gets
|
7
|
+
return unless line
|
7
8
|
|
8
|
-
|
9
|
-
|
10
|
-
|
9
|
+
target = target.allocate if target.is_a?(Class)
|
10
|
+
target.from_json(line)
|
11
|
+
target
|
12
|
+
end
|
11
13
|
end
|
12
14
|
|
13
|
-
|
14
|
-
|
15
|
+
class Encoder < Feedx::Format::Abstract::Encoder
|
16
|
+
def encode(msg, **opts)
|
17
|
+
@io.write msg.to_json(**opts) << "\n"
|
18
|
+
end
|
15
19
|
end
|
16
20
|
end
|
@@ -0,0 +1,102 @@
|
|
1
|
+
require 'parquet'
|
2
|
+
require 'tmpdir'
|
3
|
+
|
4
|
+
class Feedx::Format::Parquet < Feedx::Format::Abstract
|
5
|
+
class Record < Arrow::Record
|
6
|
+
def each_pair
|
7
|
+
container.columns.each do |col|
|
8
|
+
yield col.name, col[index]
|
9
|
+
end
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
class Decoder < Feedx::Format::Abstract::Decoder
|
14
|
+
def initialize(io, **)
|
15
|
+
super(io)
|
16
|
+
|
17
|
+
@table = read_table
|
18
|
+
@cursor = 0
|
19
|
+
end
|
20
|
+
|
21
|
+
def eof?
|
22
|
+
@cursor >= @table.n_rows
|
23
|
+
end
|
24
|
+
|
25
|
+
def decode(target, **)
|
26
|
+
return if eof?
|
27
|
+
|
28
|
+
rec = Record.new(@table, @cursor)
|
29
|
+
@cursor += 1
|
30
|
+
|
31
|
+
target = target.allocate if target.is_a?(Class)
|
32
|
+
target.from_parquet(rec)
|
33
|
+
target
|
34
|
+
end
|
35
|
+
|
36
|
+
private
|
37
|
+
|
38
|
+
def read_table
|
39
|
+
tmpname = ::Dir::Tmpname.create('feedx-parquet') {|path, *| path }
|
40
|
+
IO.copy_stream(@io, tmpname)
|
41
|
+
|
42
|
+
@table = Arrow::Table.load(tmpname, format: 'parquet')
|
43
|
+
ensure
|
44
|
+
unlink!(tmpname) if tmpname
|
45
|
+
end
|
46
|
+
|
47
|
+
def unlink!(tmpname)
|
48
|
+
File.unlink(tmpname)
|
49
|
+
rescue Errno::ENOENT
|
50
|
+
nil
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
class Encoder < Feedx::Format::Abstract::Encoder
|
55
|
+
attr_reader :schema
|
56
|
+
|
57
|
+
def initialize(io, schema:, buffer_size: 1 << 20, batch_size: 10_000)
|
58
|
+
super(io)
|
59
|
+
|
60
|
+
@schema = schema
|
61
|
+
@batch_size = batch_size.to_i
|
62
|
+
@buffer_size = buffer_size.to_i
|
63
|
+
|
64
|
+
@tmpname = ::Dir::Tmpname.create('feedx-parquet') {|path, *| path }
|
65
|
+
@output = Arrow::FileOutputStream.new(@tmpname, append: false)
|
66
|
+
@writer = Parquet::ArrowFileWriter.new(@schema, @output)
|
67
|
+
@batch = []
|
68
|
+
end
|
69
|
+
|
70
|
+
def encode(msg, **opts)
|
71
|
+
msg = msg.to_parquet(@schema, **opts) if msg.respond_to?(:to_parquet)
|
72
|
+
|
73
|
+
res = @batch.push(msg)
|
74
|
+
flush_table if @batch.size >= @batch_size
|
75
|
+
res
|
76
|
+
end
|
77
|
+
|
78
|
+
def close
|
79
|
+
flush_table unless @batch.empty?
|
80
|
+
|
81
|
+
@writer.close
|
82
|
+
@output.close
|
83
|
+
IO.copy_stream(@tmpname, @io)
|
84
|
+
ensure
|
85
|
+
unlink!
|
86
|
+
end
|
87
|
+
|
88
|
+
private
|
89
|
+
|
90
|
+
def flush_table
|
91
|
+
table = Arrow::RecordBatch.new(@schema, @batch).to_table
|
92
|
+
@writer.write_table table, @buffer_size
|
93
|
+
@batch.clear
|
94
|
+
end
|
95
|
+
|
96
|
+
def unlink!
|
97
|
+
File.unlink(@tmpname)
|
98
|
+
rescue Errno::ENOENT
|
99
|
+
nil
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|