feedx 0.9.2 → 0.12.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/.rubocop.yml +2 -0
  4. data/.travis.yml +13 -6
  5. data/Gemfile.lock +43 -20
  6. data/Makefile +5 -0
  7. data/consumer_test.go +5 -5
  8. data/feedx.gemspec +3 -2
  9. data/feedx_test.go +13 -13
  10. data/format.go +16 -16
  11. data/format_test.go +6 -7
  12. data/go.mod +5 -11
  13. data/go.sum +43 -26
  14. data/internal/testdata/testdata.pb.go +124 -0
  15. data/internal/testdata/testdata.proto +15 -0
  16. data/lib/feedx/cache/abstract.rb +2 -2
  17. data/lib/feedx/compression.rb +11 -4
  18. data/lib/feedx/compression/abstract.rb +2 -2
  19. data/lib/feedx/compression/gzip.rb +14 -2
  20. data/lib/feedx/compression/none.rb +4 -4
  21. data/lib/feedx/consumer.rb +12 -9
  22. data/lib/feedx/format.rb +18 -9
  23. data/lib/feedx/format/abstract.rb +42 -13
  24. data/lib/feedx/format/json.rb +12 -8
  25. data/lib/feedx/format/parquet.rb +102 -0
  26. data/lib/feedx/format/protobuf.rb +16 -8
  27. data/lib/feedx/producer.rb +12 -9
  28. data/lib/feedx/stream.rb +22 -25
  29. data/producer.go +1 -4
  30. data/producer_test.go +1 -2
  31. data/reader_test.go +7 -8
  32. data/spec/feedx/compression/gzip_spec.rb +4 -2
  33. data/spec/feedx/compression/none_spec.rb +2 -2
  34. data/spec/feedx/compression_spec.rb +9 -9
  35. data/spec/feedx/consumer_spec.rb +6 -3
  36. data/spec/feedx/format/abstract_spec.rb +11 -8
  37. data/spec/feedx/format/json_spec.rb +12 -11
  38. data/spec/feedx/format/parquet_spec.rb +30 -0
  39. data/spec/feedx/format/protobuf_spec.rb +12 -11
  40. data/spec/feedx/format_spec.rb +8 -8
  41. data/spec/feedx/producer_spec.rb +6 -0
  42. data/spec/feedx/stream_spec.rb +20 -1
  43. data/spec/spec_helper.rb +17 -1
  44. data/writer.go +19 -18
  45. data/writer_test.go +3 -5
  46. metadata +22 -3
@@ -0,0 +1,124 @@
1
+ // Code generated by protoc-gen-gogo. DO NOT EDIT.
2
+ // source: internal/testdata/testdata.proto
3
+
4
+ package testdata
5
+
6
+ import (
7
+ fmt "fmt"
8
+ proto "github.com/gogo/protobuf/proto"
9
+ math "math"
10
+ )
11
+
12
+ // Reference imports to suppress errors if they are not otherwise used.
13
+ var _ = proto.Marshal
14
+ var _ = fmt.Errorf
15
+ var _ = math.Inf
16
+
17
+ // This is a compile-time assertion to ensure that this generated file
18
+ // is compatible with the proto package it is being compiled against.
19
+ // A compilation error at this line likely means your copy of the
20
+ // proto package needs to be updated.
21
+ const _ = proto.GoGoProtoPackageIsVersion3 // please upgrade the proto package
22
+
23
+ type MockEnum int32
24
+
25
+ const (
26
+ MockEnum_UNKNOWN MockEnum = 0
27
+ MockEnum_FIRST MockEnum = 3
28
+ )
29
+
30
+ var MockEnum_name = map[int32]string{
31
+ 0: "UNKNOWN",
32
+ 3: "FIRST",
33
+ }
34
+
35
+ var MockEnum_value = map[string]int32{
36
+ "UNKNOWN": 0,
37
+ "FIRST": 3,
38
+ }
39
+
40
+ func (x MockEnum) String() string {
41
+ return proto.EnumName(MockEnum_name, int32(x))
42
+ }
43
+
44
+ func (MockEnum) EnumDescriptor() ([]byte, []int) {
45
+ return fileDescriptor_076a9f61cb4a1904, []int{0}
46
+ }
47
+
48
+ type MockMessage struct {
49
+ Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"`
50
+ Enum MockEnum `protobuf:"varint,2,opt,name=enum,proto3,enum=feedx.internal.testdata.MockEnum" json:"enum,omitempty"`
51
+ Height uint32 `protobuf:"varint,3,opt,name=height,proto3" json:"height,omitempty"`
52
+ XXX_NoUnkeyedLiteral struct{} `json:"-"`
53
+ XXX_unrecognized []byte `json:"-"`
54
+ XXX_sizecache int32 `json:"-"`
55
+ }
56
+
57
+ func (m *MockMessage) Reset() { *m = MockMessage{} }
58
+ func (m *MockMessage) String() string { return proto.CompactTextString(m) }
59
+ func (*MockMessage) ProtoMessage() {}
60
+ func (*MockMessage) Descriptor() ([]byte, []int) {
61
+ return fileDescriptor_076a9f61cb4a1904, []int{0}
62
+ }
63
+ func (m *MockMessage) XXX_Unmarshal(b []byte) error {
64
+ return xxx_messageInfo_MockMessage.Unmarshal(m, b)
65
+ }
66
+ func (m *MockMessage) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
67
+ return xxx_messageInfo_MockMessage.Marshal(b, m, deterministic)
68
+ }
69
+ func (m *MockMessage) XXX_Merge(src proto.Message) {
70
+ xxx_messageInfo_MockMessage.Merge(m, src)
71
+ }
72
+ func (m *MockMessage) XXX_Size() int {
73
+ return xxx_messageInfo_MockMessage.Size(m)
74
+ }
75
+ func (m *MockMessage) XXX_DiscardUnknown() {
76
+ xxx_messageInfo_MockMessage.DiscardUnknown(m)
77
+ }
78
+
79
+ var xxx_messageInfo_MockMessage proto.InternalMessageInfo
80
+
81
+ func (m *MockMessage) GetName() string {
82
+ if m != nil {
83
+ return m.Name
84
+ }
85
+ return ""
86
+ }
87
+
88
+ func (m *MockMessage) GetEnum() MockEnum {
89
+ if m != nil {
90
+ return m.Enum
91
+ }
92
+ return MockEnum_UNKNOWN
93
+ }
94
+
95
+ func (m *MockMessage) GetHeight() uint32 {
96
+ if m != nil {
97
+ return m.Height
98
+ }
99
+ return 0
100
+ }
101
+
102
+ func init() {
103
+ proto.RegisterEnum("feedx.internal.testdata.MockEnum", MockEnum_name, MockEnum_value)
104
+ proto.RegisterType((*MockMessage)(nil), "feedx.internal.testdata.MockMessage")
105
+ }
106
+
107
+ func init() { proto.RegisterFile("internal/testdata/testdata.proto", fileDescriptor_076a9f61cb4a1904) }
108
+
109
+ var fileDescriptor_076a9f61cb4a1904 = []byte{
110
+ // 199 bytes of a gzipped FileDescriptorProto
111
+ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0x52, 0xc8, 0xcc, 0x2b, 0x49,
112
+ 0x2d, 0xca, 0x4b, 0xcc, 0xd1, 0x2f, 0x49, 0x2d, 0x2e, 0x49, 0x49, 0x2c, 0x49, 0x84, 0x33, 0xf4,
113
+ 0x0a, 0x8a, 0xf2, 0x4b, 0xf2, 0x85, 0xc4, 0xd3, 0x52, 0x53, 0x53, 0x2a, 0xf4, 0x60, 0xea, 0xf4,
114
+ 0x60, 0xd2, 0x4a, 0x05, 0x5c, 0xdc, 0xbe, 0xf9, 0xc9, 0xd9, 0xbe, 0xa9, 0xc5, 0xc5, 0x89, 0xe9,
115
+ 0xa9, 0x42, 0x42, 0x5c, 0x2c, 0x79, 0x89, 0xb9, 0xa9, 0x12, 0x8c, 0x0a, 0x8c, 0x1a, 0x9c, 0x41,
116
+ 0x60, 0xb6, 0x90, 0x29, 0x17, 0x4b, 0x6a, 0x5e, 0x69, 0xae, 0x04, 0x93, 0x02, 0xa3, 0x06, 0x9f,
117
+ 0x91, 0xa2, 0x1e, 0x0e, 0xa3, 0xf4, 0x40, 0xe6, 0xb8, 0xe6, 0x95, 0xe6, 0x06, 0x81, 0x95, 0x0b,
118
+ 0x89, 0x71, 0xb1, 0x65, 0xa4, 0x66, 0xa6, 0x67, 0x94, 0x48, 0x30, 0x2b, 0x30, 0x6a, 0xf0, 0x06,
119
+ 0x41, 0x79, 0x5a, 0x4a, 0x5c, 0x1c, 0x30, 0x95, 0x42, 0xdc, 0x5c, 0xec, 0xa1, 0x7e, 0xde, 0x7e,
120
+ 0xfe, 0xe1, 0x7e, 0x02, 0x0c, 0x42, 0x9c, 0x5c, 0xac, 0x6e, 0x9e, 0x41, 0xc1, 0x21, 0x02, 0xcc,
121
+ 0x4e, 0x2a, 0x51, 0x4a, 0xe9, 0x99, 0x25, 0x19, 0xa5, 0x49, 0x7a, 0xc9, 0xf9, 0xb9, 0xfa, 0x60,
122
+ 0x0b, 0xf5, 0x31, 0xfc, 0x98, 0xc4, 0x06, 0xf6, 0x9b, 0x31, 0x20, 0x00, 0x00, 0xff, 0xff, 0x42,
123
+ 0xf6, 0x49, 0xb3, 0xff, 0x00, 0x00, 0x00,
124
+ }
@@ -0,0 +1,15 @@
1
+ syntax = "proto3";
2
+
3
+ package feedx.internal.testdata;
4
+ option go_package = "github.com/feedx/internal/testdata";
5
+
6
+ enum MockEnum {
7
+ UNKNOWN = 0;
8
+ FIRST = 3;
9
+ }
10
+
11
+ message MockMessage {
12
+ string name = 1;
13
+ MockEnum enum = 2;
14
+ uint32 height = 3;
15
+ }
@@ -5,12 +5,12 @@ class Feedx::Cache::Abstract
5
5
  end
6
6
 
7
7
  # Read reads a key.
8
- def read(_key, **_opts)
8
+ def read(_key, **)
9
9
  raise 'Not implemented'
10
10
  end
11
11
 
12
12
  # Write writes a key/value pair.
13
- def write(_key, _value, **_opts)
13
+ def write(_key, _value, **)
14
14
  raise 'Not implemented'
15
15
  end
16
16
 
@@ -5,12 +5,19 @@ module Feedx
5
5
  autoload :Gzip, 'feedx/compression/gzip'
6
6
 
7
7
  class << self
8
+ def validate!(kind)
9
+ raise ArgumentError, "#{kind} does not implement #reader(io, &block)" unless kind.respond_to?(:reader)
10
+ raise ArgumentError, "#{kind} does not implement #writer(io, &block)" unless kind.respond_to?(:writer)
11
+
12
+ kind
13
+ end
14
+
8
15
  def resolve(name)
9
16
  case name.to_s
10
17
  when 'gz', 'gzip'
11
- Gzip
18
+ Gzip.new
12
19
  when ''
13
- None
20
+ None.new
14
21
  else
15
22
  raise ArgumentError, "invalid compression #{name}"
16
23
  end
@@ -18,9 +25,9 @@ module Feedx
18
25
 
19
26
  def detect(path)
20
27
  if File.extname(path)[-1] == 'z'
21
- Gzip
28
+ Gzip.new
22
29
  else
23
- None
30
+ None.new
24
31
  end
25
32
  end
26
33
  end
@@ -1,9 +1,9 @@
1
1
  class Feedx::Compression::Abstract
2
- def self.reader(_io, &_block)
2
+ def reader(_io, **, &_block)
3
3
  raise 'Not implemented'
4
4
  end
5
5
 
6
- def self.writer(_io, &_block)
6
+ def writer(_io, **, &_block)
7
7
  raise 'Not implemented'
8
8
  end
9
9
  end
@@ -1,11 +1,23 @@
1
1
  require 'zlib'
2
2
 
3
3
  class Feedx::Compression::Gzip < Feedx::Compression::Abstract
4
- def self.reader(io, &block)
4
+ def reader(io, **, &block)
5
+ force_binmode(io)
5
6
  Zlib::GzipReader.wrap(io, &block)
6
7
  end
7
8
 
8
- def self.writer(io, &block)
9
+ def writer(io, **, &block)
10
+ force_binmode(io)
9
11
  Zlib::GzipWriter.wrap(io, &block)
10
12
  end
13
+
14
+ private
15
+
16
+ def force_binmode(io)
17
+ if io.respond_to?(:binmode)
18
+ io.binmode
19
+ elsif io.respond_to?(:set_encoding)
20
+ io.set_encoding(Encoding::BINARY)
21
+ end
22
+ end
11
23
  end
@@ -1,9 +1,9 @@
1
1
  class Feedx::Compression::None < Feedx::Compression::Abstract
2
- def self.reader(io, &block)
3
- block.call(io)
2
+ def reader(io, **)
3
+ yield(io)
4
4
  end
5
5
 
6
- def self.writer(io, &block)
7
- block.call(io)
6
+ def writer(io, **)
7
+ yield(io)
8
8
  end
9
9
  end
@@ -8,22 +8,25 @@ module Feedx
8
8
  include Enumerable
9
9
 
10
10
  # See constructor.
11
- def self.each(url, klass, opts = {}, &block)
12
- new(url, klass, opts).each(&block)
11
+ def self.each(url, klass, **opts, &block)
12
+ new(url, klass, **opts).each(&block)
13
13
  end
14
14
 
15
15
  # @param [String] url the destination URL.
16
16
  # @param [Class] klass the record class.
17
17
  # @param [Hash] opts options
18
18
  # @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
19
- # @option opts [Hash] :format_options format decode options. Default: {}.
20
19
  # @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
21
20
  # @option opts [Feedx::Cache::Value] :cache cache value to store remote last modified time and consume conditionally.
22
- def initialize(url, klass, opts = {})
23
- @klass = klass
24
- @stream = Feedx::Stream.new(url, opts)
25
- @fmt_opts = opts[:format_options] || {}
26
- @cache = opts[:cache]
21
+ def initialize(url, klass, format_options: {}, cache: nil, **opts)
22
+ @klass = klass
23
+ @stream = Feedx::Stream.new(url, **opts)
24
+ @cache = cache
25
+ @opts = opts.merge(format_options)
26
+
27
+ return if format_options.empty? || (defined?(Gem::Deprecate) && Gem::Deprecate.skip)
28
+
29
+ warn "WARNING: passing format_options is deprecated; pass the options inline instead (called from #{caller(2..2).first})."
27
30
  end
28
31
 
29
32
  # @return [Boolean] returns true if performed.
@@ -38,7 +41,7 @@ module Feedx
38
41
  end
39
42
 
40
43
  @stream.open do |fmt|
41
- fmt.decode_each(@klass, **@fmt_opts, &block)
44
+ fmt.decode_each(@klass, **@opts, &block)
42
45
  end
43
46
  @cache.write(remote_rev) if @cache && remote_rev
44
47
 
@@ -2,13 +2,19 @@ module Feedx
2
2
  module Format
3
3
  autoload :Abstract, 'feedx/format/abstract'
4
4
  autoload :JSON, 'feedx/format/json'
5
+ autoload :Parquet, 'feedx/format/parquet'
5
6
  autoload :Protobuf, 'feedx/format/protobuf'
6
7
 
7
8
  class << self
8
- def register(ext, kind)
9
- raise ArgumentError, "#{kind} is not a subclass of Feedx::Format::Abstract" unless kind.is_a?(Class) && kind < Abstract
9
+ def validate!(kind)
10
+ raise ArgumentError, "#{kind} does not implement #encoder(io, &block)" unless kind.respond_to?(:encoder)
11
+ raise ArgumentError, "#{kind} does not implement #decoder(io, &block)" unless kind.respond_to?(:decoder)
12
+
13
+ kind
14
+ end
10
15
 
11
- registry[ext.to_s] = kind
16
+ def register(ext, kind)
17
+ registry[ext.to_s] = validate!(kind)
12
18
  end
13
19
 
14
20
  def resolve(name)
@@ -33,6 +39,9 @@ module Feedx
33
39
  def registry
34
40
  @registry ||= {
35
41
  'json' => :JSON,
42
+ 'jsonl' => :JSON,
43
+ 'ndjson' => :JSON,
44
+ 'parquet' => :Parquet,
36
45
  'pb' => :Protobuf,
37
46
  'proto' => :Protobuf,
38
47
  'protobuf' => :Protobuf,
@@ -40,13 +49,13 @@ module Feedx
40
49
  end
41
50
 
42
51
  def _resolve(name)
43
- name = name.to_s
44
- klass = registry[name]
45
- if klass.is_a?(Symbol)
46
- klass = const_get(klass)
47
- registry[name.to_s] = klass
52
+ name = name.to_s
53
+ kind = registry[name]
54
+ if kind.is_a?(Symbol)
55
+ kind = const_get(kind).new
56
+ registry[name.to_s] = kind
48
57
  end
49
- klass
58
+ kind
50
59
  end
51
60
  end
52
61
  end
@@ -1,25 +1,54 @@
1
1
  class Feedx::Format::Abstract
2
- def initialize(io)
3
- @io = io
2
+ def decoder(io, **opts, &block)
3
+ self.class::Decoder.open(io, **opts, &block)
4
4
  end
5
5
 
6
- def eof?
7
- @io.eof?
6
+ def encoder(io, **opts, &block)
7
+ self.class::Encoder.open(io, **opts, &block)
8
8
  end
9
9
 
10
- def decode_each(klass, **opts)
11
- if block_given?
12
- yield decode(klass, **opts) until eof?
13
- else
14
- Enumerator.new {|y| y << decode(klass, **opts) until eof? }
10
+ class Wrapper
11
+ def self.open(io, **opts)
12
+ inst = new(io, **opts)
13
+ yield inst
14
+ ensure
15
+ inst&.close
16
+ end
17
+
18
+ def initialize(io, **)
19
+ @io = io
15
20
  end
16
21
  end
17
22
 
18
- def decode(_klass, **)
19
- raise 'Not implemented'
23
+ class Decoder < Wrapper
24
+ def eof?
25
+ @io.eof?
26
+ end
27
+
28
+ def decode_each(target, **opts)
29
+ if block_given?
30
+ yield decode(target, **opts) until eof?
31
+ else
32
+ Enumerator.new do |acc|
33
+ acc << decode(target, **opts) until eof?
34
+ end
35
+ end
36
+ end
37
+
38
+ def decode(_target, **)
39
+ raise 'Not implemented'
40
+ end
41
+
42
+ def close; end
20
43
  end
21
44
 
22
- def encode(_msg, **)
23
- raise 'Not implemented'
45
+ class Encoder < Wrapper
46
+ def encode(_msg, **)
47
+ raise 'Not implemented'
48
+ end
49
+
50
+ def close
51
+ @io.flush if @io.respond_to?(:flush)
52
+ end
24
53
  end
25
54
  end
@@ -1,16 +1,20 @@
1
1
  require 'json'
2
2
 
3
3
  class Feedx::Format::JSON < Feedx::Format::Abstract
4
- def decode(obj, **)
5
- line = @io.gets
6
- return unless line
4
+ class Decoder < Feedx::Format::Abstract::Decoder
5
+ def decode(target, **)
6
+ line = @io.gets
7
+ return unless line
7
8
 
8
- obj = obj.allocate if obj.is_a?(Class)
9
- obj.from_json(line)
10
- obj
9
+ target = target.allocate if target.is_a?(Class)
10
+ target.from_json(line)
11
+ target
12
+ end
11
13
  end
12
14
 
13
- def encode(msg, **opts)
14
- @io.write msg.to_json(**opts) << "\n"
15
+ class Encoder < Feedx::Format::Abstract::Encoder
16
+ def encode(msg, **opts)
17
+ @io.write msg.to_json(**opts) << "\n"
18
+ end
15
19
  end
16
20
  end
@@ -0,0 +1,102 @@
1
+ require 'parquet'
2
+ require 'tmpdir'
3
+
4
+ class Feedx::Format::Parquet < Feedx::Format::Abstract
5
+ class Record < Arrow::Record
6
+ def each_pair
7
+ container.columns.each do |col|
8
+ yield col.name, col[index]
9
+ end
10
+ end
11
+ end
12
+
13
+ class Decoder < Feedx::Format::Abstract::Decoder
14
+ def initialize(io, **)
15
+ super(io)
16
+
17
+ @table = read_table
18
+ @cursor = 0
19
+ end
20
+
21
+ def eof?
22
+ @cursor >= @table.n_rows
23
+ end
24
+
25
+ def decode(target, **)
26
+ return if eof?
27
+
28
+ rec = Record.new(@table, @cursor)
29
+ @cursor += 1
30
+
31
+ target = target.allocate if target.is_a?(Class)
32
+ target.from_parquet(rec)
33
+ target
34
+ end
35
+
36
+ private
37
+
38
+ def read_table
39
+ tmpname = ::Dir::Tmpname.create('feedx-parquet') {|path, *| path }
40
+ IO.copy_stream(@io, tmpname)
41
+
42
+ @table = Arrow::Table.load(tmpname, format: 'parquet')
43
+ ensure
44
+ unlink!(tmpname) if tmpname
45
+ end
46
+
47
+ def unlink!(tmpname)
48
+ File.unlink(tmpname)
49
+ rescue Errno::ENOENT
50
+ nil
51
+ end
52
+ end
53
+
54
+ class Encoder < Feedx::Format::Abstract::Encoder
55
+ attr_reader :schema
56
+
57
+ def initialize(io, schema:, buffer_size: 1 << 20, batch_size: 10_000)
58
+ super(io)
59
+
60
+ @schema = schema
61
+ @batch_size = batch_size.to_i
62
+ @buffer_size = buffer_size.to_i
63
+
64
+ @tmpname = ::Dir::Tmpname.create('feedx-parquet') {|path, *| path }
65
+ @output = Arrow::FileOutputStream.new(@tmpname, append: false)
66
+ @writer = Parquet::ArrowFileWriter.new(@schema, @output)
67
+ @batch = []
68
+ end
69
+
70
+ def encode(msg, **opts)
71
+ msg = msg.to_parquet(@schema, **opts) if msg.respond_to?(:to_parquet)
72
+
73
+ res = @batch.push(msg)
74
+ flush_table if @batch.size >= @batch_size
75
+ res
76
+ end
77
+
78
+ def close
79
+ flush_table unless @batch.empty?
80
+
81
+ @writer.close
82
+ @output.close
83
+ IO.copy_stream(@tmpname, @io)
84
+ ensure
85
+ unlink!
86
+ end
87
+
88
+ private
89
+
90
+ def flush_table
91
+ table = Arrow::RecordBatch.new(@schema, @batch).to_table
92
+ @writer.write_table table, @buffer_size
93
+ @batch.clear
94
+ end
95
+
96
+ def unlink!
97
+ File.unlink(@tmpname)
98
+ rescue Errno::ENOENT
99
+ nil
100
+ end
101
+ end
102
+ end