feedx 0.9.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.travis.yml +12 -10
- data/Gemfile.lock +44 -21
- data/Makefile +5 -0
- data/consumer_test.go +5 -5
- data/feedx.gemspec +3 -2
- data/feedx_test.go +13 -13
- data/format.go +16 -16
- data/format_test.go +6 -7
- data/go.mod +6 -11
- data/go.sum +47 -28
- data/internal/testdata/testdata.pb.go +124 -0
- data/internal/testdata/testdata.proto +15 -0
- data/lib/feedx/compression.rb +11 -4
- data/lib/feedx/compression/abstract.rb +2 -2
- data/lib/feedx/compression/gzip.rb +14 -2
- data/lib/feedx/compression/none.rb +4 -4
- data/lib/feedx/consumer.rb +4 -4
- data/lib/feedx/format.rb +15 -9
- data/lib/feedx/format/abstract.rb +42 -13
- data/lib/feedx/format/json.rb +12 -8
- data/lib/feedx/format/parquet.rb +102 -0
- data/lib/feedx/format/protobuf.rb +16 -8
- data/lib/feedx/producer.rb +6 -5
- data/lib/feedx/stream.rb +17 -21
- data/producer.go +4 -11
- data/producer_test.go +16 -6
- data/reader_test.go +7 -8
- data/spec/feedx/compression/gzip_spec.rb +4 -2
- data/spec/feedx/compression/none_spec.rb +2 -2
- data/spec/feedx/compression_spec.rb +9 -9
- data/spec/feedx/consumer_spec.rb +6 -3
- data/spec/feedx/format/abstract_spec.rb +11 -8
- data/spec/feedx/format/json_spec.rb +12 -11
- data/spec/feedx/format/parquet_spec.rb +30 -0
- data/spec/feedx/format/protobuf_spec.rb +12 -11
- data/spec/feedx/format_spec.rb +8 -8
- data/spec/feedx/stream_spec.rb +20 -1
- data/spec/spec_helper.rb +17 -1
- data/writer.go +20 -15
- data/writer_test.go +3 -5
- metadata +22 -3
@@ -0,0 +1,124 @@
|
|
1
|
+
// Code generated by protoc-gen-gogo. DO NOT EDIT.
|
2
|
+
// source: internal/testdata/testdata.proto
|
3
|
+
|
4
|
+
package testdata
|
5
|
+
|
6
|
+
import (
|
7
|
+
fmt "fmt"
|
8
|
+
proto "github.com/gogo/protobuf/proto"
|
9
|
+
math "math"
|
10
|
+
)
|
11
|
+
|
12
|
+
// Reference imports to suppress errors if they are not otherwise used.
|
13
|
+
var _ = proto.Marshal
|
14
|
+
var _ = fmt.Errorf
|
15
|
+
var _ = math.Inf
|
16
|
+
|
17
|
+
// This is a compile-time assertion to ensure that this generated file
|
18
|
+
// is compatible with the proto package it is being compiled against.
|
19
|
+
// A compilation error at this line likely means your copy of the
|
20
|
+
// proto package needs to be updated.
|
21
|
+
const _ = proto.GoGoProtoPackageIsVersion3 // please upgrade the proto package
|
22
|
+
|
23
|
+
type MockEnum int32
|
24
|
+
|
25
|
+
const (
|
26
|
+
MockEnum_UNKNOWN MockEnum = 0
|
27
|
+
MockEnum_FIRST MockEnum = 3
|
28
|
+
)
|
29
|
+
|
30
|
+
var MockEnum_name = map[int32]string{
|
31
|
+
0: "UNKNOWN",
|
32
|
+
3: "FIRST",
|
33
|
+
}
|
34
|
+
|
35
|
+
var MockEnum_value = map[string]int32{
|
36
|
+
"UNKNOWN": 0,
|
37
|
+
"FIRST": 3,
|
38
|
+
}
|
39
|
+
|
40
|
+
func (x MockEnum) String() string {
|
41
|
+
return proto.EnumName(MockEnum_name, int32(x))
|
42
|
+
}
|
43
|
+
|
44
|
+
func (MockEnum) EnumDescriptor() ([]byte, []int) {
|
45
|
+
return fileDescriptor_076a9f61cb4a1904, []int{0}
|
46
|
+
}
|
47
|
+
|
48
|
+
type MockMessage struct {
|
49
|
+
Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"`
|
50
|
+
Enum MockEnum `protobuf:"varint,2,opt,name=enum,proto3,enum=feedx.internal.testdata.MockEnum" json:"enum,omitempty"`
|
51
|
+
Height uint32 `protobuf:"varint,3,opt,name=height,proto3" json:"height,omitempty"`
|
52
|
+
XXX_NoUnkeyedLiteral struct{} `json:"-"`
|
53
|
+
XXX_unrecognized []byte `json:"-"`
|
54
|
+
XXX_sizecache int32 `json:"-"`
|
55
|
+
}
|
56
|
+
|
57
|
+
func (m *MockMessage) Reset() { *m = MockMessage{} }
|
58
|
+
func (m *MockMessage) String() string { return proto.CompactTextString(m) }
|
59
|
+
func (*MockMessage) ProtoMessage() {}
|
60
|
+
func (*MockMessage) Descriptor() ([]byte, []int) {
|
61
|
+
return fileDescriptor_076a9f61cb4a1904, []int{0}
|
62
|
+
}
|
63
|
+
func (m *MockMessage) XXX_Unmarshal(b []byte) error {
|
64
|
+
return xxx_messageInfo_MockMessage.Unmarshal(m, b)
|
65
|
+
}
|
66
|
+
func (m *MockMessage) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
|
67
|
+
return xxx_messageInfo_MockMessage.Marshal(b, m, deterministic)
|
68
|
+
}
|
69
|
+
func (m *MockMessage) XXX_Merge(src proto.Message) {
|
70
|
+
xxx_messageInfo_MockMessage.Merge(m, src)
|
71
|
+
}
|
72
|
+
func (m *MockMessage) XXX_Size() int {
|
73
|
+
return xxx_messageInfo_MockMessage.Size(m)
|
74
|
+
}
|
75
|
+
func (m *MockMessage) XXX_DiscardUnknown() {
|
76
|
+
xxx_messageInfo_MockMessage.DiscardUnknown(m)
|
77
|
+
}
|
78
|
+
|
79
|
+
var xxx_messageInfo_MockMessage proto.InternalMessageInfo
|
80
|
+
|
81
|
+
func (m *MockMessage) GetName() string {
|
82
|
+
if m != nil {
|
83
|
+
return m.Name
|
84
|
+
}
|
85
|
+
return ""
|
86
|
+
}
|
87
|
+
|
88
|
+
func (m *MockMessage) GetEnum() MockEnum {
|
89
|
+
if m != nil {
|
90
|
+
return m.Enum
|
91
|
+
}
|
92
|
+
return MockEnum_UNKNOWN
|
93
|
+
}
|
94
|
+
|
95
|
+
func (m *MockMessage) GetHeight() uint32 {
|
96
|
+
if m != nil {
|
97
|
+
return m.Height
|
98
|
+
}
|
99
|
+
return 0
|
100
|
+
}
|
101
|
+
|
102
|
+
func init() {
|
103
|
+
proto.RegisterEnum("feedx.internal.testdata.MockEnum", MockEnum_name, MockEnum_value)
|
104
|
+
proto.RegisterType((*MockMessage)(nil), "feedx.internal.testdata.MockMessage")
|
105
|
+
}
|
106
|
+
|
107
|
+
func init() { proto.RegisterFile("internal/testdata/testdata.proto", fileDescriptor_076a9f61cb4a1904) }
|
108
|
+
|
109
|
+
var fileDescriptor_076a9f61cb4a1904 = []byte{
|
110
|
+
// 199 bytes of a gzipped FileDescriptorProto
|
111
|
+
0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0x52, 0xc8, 0xcc, 0x2b, 0x49,
|
112
|
+
0x2d, 0xca, 0x4b, 0xcc, 0xd1, 0x2f, 0x49, 0x2d, 0x2e, 0x49, 0x49, 0x2c, 0x49, 0x84, 0x33, 0xf4,
|
113
|
+
0x0a, 0x8a, 0xf2, 0x4b, 0xf2, 0x85, 0xc4, 0xd3, 0x52, 0x53, 0x53, 0x2a, 0xf4, 0x60, 0xea, 0xf4,
|
114
|
+
0x60, 0xd2, 0x4a, 0x05, 0x5c, 0xdc, 0xbe, 0xf9, 0xc9, 0xd9, 0xbe, 0xa9, 0xc5, 0xc5, 0x89, 0xe9,
|
115
|
+
0xa9, 0x42, 0x42, 0x5c, 0x2c, 0x79, 0x89, 0xb9, 0xa9, 0x12, 0x8c, 0x0a, 0x8c, 0x1a, 0x9c, 0x41,
|
116
|
+
0x60, 0xb6, 0x90, 0x29, 0x17, 0x4b, 0x6a, 0x5e, 0x69, 0xae, 0x04, 0x93, 0x02, 0xa3, 0x06, 0x9f,
|
117
|
+
0x91, 0xa2, 0x1e, 0x0e, 0xa3, 0xf4, 0x40, 0xe6, 0xb8, 0xe6, 0x95, 0xe6, 0x06, 0x81, 0x95, 0x0b,
|
118
|
+
0x89, 0x71, 0xb1, 0x65, 0xa4, 0x66, 0xa6, 0x67, 0x94, 0x48, 0x30, 0x2b, 0x30, 0x6a, 0xf0, 0x06,
|
119
|
+
0x41, 0x79, 0x5a, 0x4a, 0x5c, 0x1c, 0x30, 0x95, 0x42, 0xdc, 0x5c, 0xec, 0xa1, 0x7e, 0xde, 0x7e,
|
120
|
+
0xfe, 0xe1, 0x7e, 0x02, 0x0c, 0x42, 0x9c, 0x5c, 0xac, 0x6e, 0x9e, 0x41, 0xc1, 0x21, 0x02, 0xcc,
|
121
|
+
0x4e, 0x2a, 0x51, 0x4a, 0xe9, 0x99, 0x25, 0x19, 0xa5, 0x49, 0x7a, 0xc9, 0xf9, 0xb9, 0xfa, 0x60,
|
122
|
+
0x0b, 0xf5, 0x31, 0xfc, 0x98, 0xc4, 0x06, 0xf6, 0x9b, 0x31, 0x20, 0x00, 0x00, 0xff, 0xff, 0x42,
|
123
|
+
0xf6, 0x49, 0xb3, 0xff, 0x00, 0x00, 0x00,
|
124
|
+
}
|
@@ -0,0 +1,15 @@
|
|
1
|
+
syntax = "proto3";
|
2
|
+
|
3
|
+
package feedx.internal.testdata;
|
4
|
+
option go_package = "github.com/feedx/internal/testdata";
|
5
|
+
|
6
|
+
enum MockEnum {
|
7
|
+
UNKNOWN = 0;
|
8
|
+
FIRST = 3;
|
9
|
+
}
|
10
|
+
|
11
|
+
message MockMessage {
|
12
|
+
string name = 1;
|
13
|
+
MockEnum enum = 2;
|
14
|
+
uint32 height = 3;
|
15
|
+
}
|
data/lib/feedx/compression.rb
CHANGED
@@ -5,12 +5,19 @@ module Feedx
|
|
5
5
|
autoload :Gzip, 'feedx/compression/gzip'
|
6
6
|
|
7
7
|
class << self
|
8
|
+
def validate!(kind)
|
9
|
+
raise ArgumentError, "#{kind} does not implement #reader(io, &block)" unless kind.respond_to?(:reader)
|
10
|
+
raise ArgumentError, "#{kind} does not implement #writer(io, &block)" unless kind.respond_to?(:writer)
|
11
|
+
|
12
|
+
kind
|
13
|
+
end
|
14
|
+
|
8
15
|
def resolve(name)
|
9
16
|
case name.to_s
|
10
17
|
when 'gz', 'gzip'
|
11
|
-
Gzip
|
18
|
+
Gzip.new
|
12
19
|
when ''
|
13
|
-
None
|
20
|
+
None.new
|
14
21
|
else
|
15
22
|
raise ArgumentError, "invalid compression #{name}"
|
16
23
|
end
|
@@ -18,9 +25,9 @@ module Feedx
|
|
18
25
|
|
19
26
|
def detect(path)
|
20
27
|
if File.extname(path)[-1] == 'z'
|
21
|
-
Gzip
|
28
|
+
Gzip.new
|
22
29
|
else
|
23
|
-
None
|
30
|
+
None.new
|
24
31
|
end
|
25
32
|
end
|
26
33
|
end
|
@@ -1,11 +1,23 @@
|
|
1
1
|
require 'zlib'
|
2
2
|
|
3
3
|
class Feedx::Compression::Gzip < Feedx::Compression::Abstract
|
4
|
-
def
|
4
|
+
def reader(io, &block)
|
5
|
+
force_binmode(io)
|
5
6
|
Zlib::GzipReader.wrap(io, &block)
|
6
7
|
end
|
7
8
|
|
8
|
-
def
|
9
|
+
def writer(io, &block)
|
10
|
+
force_binmode(io)
|
9
11
|
Zlib::GzipWriter.wrap(io, &block)
|
10
12
|
end
|
13
|
+
|
14
|
+
private
|
15
|
+
|
16
|
+
def force_binmode(io)
|
17
|
+
if io.respond_to?(:binmode)
|
18
|
+
io.binmode
|
19
|
+
elsif io.respond_to?(:set_encoding)
|
20
|
+
io.set_encoding(Encoding::BINARY)
|
21
|
+
end
|
22
|
+
end
|
11
23
|
end
|
data/lib/feedx/consumer.rb
CHANGED
@@ -8,8 +8,8 @@ module Feedx
|
|
8
8
|
include Enumerable
|
9
9
|
|
10
10
|
# See constructor.
|
11
|
-
def self.each(url, klass, opts
|
12
|
-
new(url, klass, opts).each(&block)
|
11
|
+
def self.each(url, klass, **opts, &block)
|
12
|
+
new(url, klass, **opts).each(&block)
|
13
13
|
end
|
14
14
|
|
15
15
|
# @param [String] url the destination URL.
|
@@ -19,9 +19,9 @@ module Feedx
|
|
19
19
|
# @option opts [Hash] :format_options format decode options. Default: {}.
|
20
20
|
# @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
|
21
21
|
# @option opts [Feedx::Cache::Value] :cache cache value to store remote last modified time and consume conditionally.
|
22
|
-
def initialize(url, klass, opts
|
22
|
+
def initialize(url, klass, **opts)
|
23
23
|
@klass = klass
|
24
|
-
@stream = Feedx::Stream.new(url, opts)
|
24
|
+
@stream = Feedx::Stream.new(url, **opts)
|
25
25
|
@fmt_opts = opts[:format_options] || {}
|
26
26
|
@cache = opts[:cache]
|
27
27
|
end
|
data/lib/feedx/format.rb
CHANGED
@@ -2,13 +2,19 @@ module Feedx
|
|
2
2
|
module Format
|
3
3
|
autoload :Abstract, 'feedx/format/abstract'
|
4
4
|
autoload :JSON, 'feedx/format/json'
|
5
|
+
autoload :Parquet, 'feedx/format/parquet'
|
5
6
|
autoload :Protobuf, 'feedx/format/protobuf'
|
6
7
|
|
7
8
|
class << self
|
8
|
-
def
|
9
|
-
raise ArgumentError, "#{kind}
|
9
|
+
def validate!(kind)
|
10
|
+
raise ArgumentError, "#{kind} does not implement #encoder(io, &block)" unless kind.respond_to?(:encoder)
|
11
|
+
raise ArgumentError, "#{kind} does not implement #decoder(io, &block)" unless kind.respond_to?(:decoder)
|
12
|
+
|
13
|
+
kind
|
14
|
+
end
|
10
15
|
|
11
|
-
|
16
|
+
def register(ext, kind)
|
17
|
+
registry[ext.to_s] = validate!(kind)
|
12
18
|
end
|
13
19
|
|
14
20
|
def resolve(name)
|
@@ -40,13 +46,13 @@ module Feedx
|
|
40
46
|
end
|
41
47
|
|
42
48
|
def _resolve(name)
|
43
|
-
name
|
44
|
-
|
45
|
-
if
|
46
|
-
|
47
|
-
registry[name.to_s] =
|
49
|
+
name = name.to_s
|
50
|
+
kind = registry[name]
|
51
|
+
if kind.is_a?(Symbol)
|
52
|
+
kind = const_get(kind).new
|
53
|
+
registry[name.to_s] = kind
|
48
54
|
end
|
49
|
-
|
55
|
+
kind
|
50
56
|
end
|
51
57
|
end
|
52
58
|
end
|
@@ -1,25 +1,54 @@
|
|
1
1
|
class Feedx::Format::Abstract
|
2
|
-
def
|
3
|
-
|
2
|
+
def decoder(io, **opts, &block)
|
3
|
+
self.class::Decoder.open(io, **opts, &block)
|
4
4
|
end
|
5
5
|
|
6
|
-
def
|
7
|
-
|
6
|
+
def encoder(io, **opts, &block)
|
7
|
+
self.class::Encoder.open(io, **opts, &block)
|
8
8
|
end
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
10
|
+
class Wrapper
|
11
|
+
def self.open(io, **opts)
|
12
|
+
inst = new(io, **opts)
|
13
|
+
yield inst
|
14
|
+
ensure
|
15
|
+
inst&.close
|
16
|
+
end
|
17
|
+
|
18
|
+
def initialize(io, **)
|
19
|
+
@io = io
|
15
20
|
end
|
16
21
|
end
|
17
22
|
|
18
|
-
|
19
|
-
|
23
|
+
class Decoder < Wrapper
|
24
|
+
def eof?
|
25
|
+
@io.eof?
|
26
|
+
end
|
27
|
+
|
28
|
+
def decode_each(target, **opts)
|
29
|
+
if block_given?
|
30
|
+
yield decode(target, **opts) until eof?
|
31
|
+
else
|
32
|
+
Enumerator.new do |acc|
|
33
|
+
acc << decode(target, **opts) until eof?
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def decode(_target, **)
|
39
|
+
raise 'Not implemented'
|
40
|
+
end
|
41
|
+
|
42
|
+
def close; end
|
20
43
|
end
|
21
44
|
|
22
|
-
|
23
|
-
|
45
|
+
class Encoder < Wrapper
|
46
|
+
def encode(_msg, **)
|
47
|
+
raise 'Not implemented'
|
48
|
+
end
|
49
|
+
|
50
|
+
def close
|
51
|
+
@io.flush if @io.respond_to?(:flush)
|
52
|
+
end
|
24
53
|
end
|
25
54
|
end
|
data/lib/feedx/format/json.rb
CHANGED
@@ -1,16 +1,20 @@
|
|
1
1
|
require 'json'
|
2
2
|
|
3
3
|
class Feedx::Format::JSON < Feedx::Format::Abstract
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
class Decoder < Feedx::Format::Abstract::Decoder
|
5
|
+
def decode(target, **)
|
6
|
+
line = @io.gets
|
7
|
+
return unless line
|
7
8
|
|
8
|
-
|
9
|
-
|
10
|
-
|
9
|
+
target = target.allocate if target.is_a?(Class)
|
10
|
+
target.from_json(line)
|
11
|
+
target
|
12
|
+
end
|
11
13
|
end
|
12
14
|
|
13
|
-
|
14
|
-
|
15
|
+
class Encoder < Feedx::Format::Abstract::Encoder
|
16
|
+
def encode(msg, **opts)
|
17
|
+
@io.write msg.to_json(**opts) << "\n"
|
18
|
+
end
|
15
19
|
end
|
16
20
|
end
|
@@ -0,0 +1,102 @@
|
|
1
|
+
require 'parquet'
|
2
|
+
require 'tmpdir'
|
3
|
+
|
4
|
+
class Feedx::Format::Parquet < Feedx::Format::Abstract
|
5
|
+
class Record < Arrow::Record
|
6
|
+
def each_pair
|
7
|
+
container.columns.each do |col|
|
8
|
+
yield col.name, col[index]
|
9
|
+
end
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
class Decoder < Feedx::Format::Abstract::Decoder
|
14
|
+
def initialize(io, **)
|
15
|
+
super(io)
|
16
|
+
|
17
|
+
@table = read_table
|
18
|
+
@cursor = 0
|
19
|
+
end
|
20
|
+
|
21
|
+
def eof?
|
22
|
+
@cursor >= @table.n_rows
|
23
|
+
end
|
24
|
+
|
25
|
+
def decode(target, **)
|
26
|
+
return if eof?
|
27
|
+
|
28
|
+
rec = Record.new(@table, @cursor)
|
29
|
+
@cursor += 1
|
30
|
+
|
31
|
+
target = target.allocate if target.is_a?(Class)
|
32
|
+
target.from_parquet(rec)
|
33
|
+
target
|
34
|
+
end
|
35
|
+
|
36
|
+
private
|
37
|
+
|
38
|
+
def read_table
|
39
|
+
tmpname = ::Dir::Tmpname.create('feedx-parquet') {|path, *| path }
|
40
|
+
IO.copy_stream(@io, tmpname)
|
41
|
+
|
42
|
+
@table = Arrow::Table.load(tmpname, format: 'parquet')
|
43
|
+
ensure
|
44
|
+
unlink!(tmpname) if tmpname
|
45
|
+
end
|
46
|
+
|
47
|
+
def unlink!(tmpname)
|
48
|
+
File.unlink(tmpname)
|
49
|
+
rescue Errno::ENOENT
|
50
|
+
nil
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
class Encoder < Feedx::Format::Abstract::Encoder
|
55
|
+
attr_reader :schema
|
56
|
+
|
57
|
+
def initialize(io, schema:, buffer_size: 1 << 20, batch_size: 10_000)
|
58
|
+
super(io)
|
59
|
+
|
60
|
+
@schema = schema
|
61
|
+
@batch_size = batch_size.to_i
|
62
|
+
@buffer_size = buffer_size.to_i
|
63
|
+
|
64
|
+
@tmpname = ::Dir::Tmpname.create('feedx-parquet') {|path, *| path }
|
65
|
+
@output = Arrow::FileOutputStream.new(@tmpname, append: false)
|
66
|
+
@writer = Parquet::ArrowFileWriter.new(@schema, @output)
|
67
|
+
@batch = []
|
68
|
+
end
|
69
|
+
|
70
|
+
def encode(msg, **opts)
|
71
|
+
msg = msg.to_parquet(@schema, **opts) if msg.respond_to?(:to_parquet)
|
72
|
+
|
73
|
+
res = @batch.push(msg)
|
74
|
+
flush_table if @batch.size >= @batch_size
|
75
|
+
res
|
76
|
+
end
|
77
|
+
|
78
|
+
def close
|
79
|
+
flush_table unless @batch.empty?
|
80
|
+
|
81
|
+
@writer.close
|
82
|
+
@output.close
|
83
|
+
IO.copy_stream(@tmpname, @io)
|
84
|
+
ensure
|
85
|
+
unlink!
|
86
|
+
end
|
87
|
+
|
88
|
+
private
|
89
|
+
|
90
|
+
def flush_table
|
91
|
+
table = Arrow::RecordBatch.new(@schema, @batch).to_table
|
92
|
+
@writer.write_table table, @buffer_size
|
93
|
+
@batch.clear
|
94
|
+
end
|
95
|
+
|
96
|
+
def unlink!
|
97
|
+
File.unlink(@tmpname)
|
98
|
+
rescue Errno::ENOENT
|
99
|
+
nil
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|