feedx 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,124 @@
1
+ // Code generated by protoc-gen-gogo. DO NOT EDIT.
2
+ // source: internal/testdata/testdata.proto
3
+
4
+ package testdata
5
+
6
+ import (
7
+ fmt "fmt"
8
+ proto "github.com/gogo/protobuf/proto"
9
+ math "math"
10
+ )
11
+
12
+ // Reference imports to suppress errors if they are not otherwise used.
13
+ var _ = proto.Marshal
14
+ var _ = fmt.Errorf
15
+ var _ = math.Inf
16
+
17
+ // This is a compile-time assertion to ensure that this generated file
18
+ // is compatible with the proto package it is being compiled against.
19
+ // A compilation error at this line likely means your copy of the
20
+ // proto package needs to be updated.
21
+ const _ = proto.GoGoProtoPackageIsVersion3 // please upgrade the proto package
22
+
23
// MockEnum is an int32-backed enumeration. Two values are declared below:
// MockEnum_UNKNOWN (0) and MockEnum_FIRST (3); 1 and 2 are not assigned.
+ type MockEnum int32
24
+
25
+ const (
26
+ MockEnum_UNKNOWN MockEnum = 0
27
+ MockEnum_FIRST MockEnum = 3
28
+ )
29
+
30
// MockEnum_name maps each numeric MockEnum value to its string name.
+ var MockEnum_name = map[int32]string{
31
+ 0: "UNKNOWN",
32
+ 3: "FIRST",
33
+ }
34
+
35
// MockEnum_value is the inverse lookup: string name to numeric value.
+ var MockEnum_value = map[string]int32{
36
+ "UNKNOWN": 0,
37
+ "FIRST": 3,
38
+ }
39
+
40
// String returns the result of proto.EnumName for x, using MockEnum_name as
// the lookup table.
+ func (x MockEnum) String() string {
41
+ return proto.EnumName(MockEnum_name, int32(x))
42
+ }
43
+
44
// EnumDescriptor returns the gzipped file descriptor bytes of this file
// together with the index path []int{0}.
+ func (MockEnum) EnumDescriptor() ([]byte, []int) {
45
+ return fileDescriptor_076a9f61cb4a1904, []int{0}
46
+ }
47
+
48
// MockMessage is the generated message struct. Per its protobuf struct tags
// it carries Name (field 1, bytes), Enum (field 2, varint, enum
// feedx.internal.testdata.MockEnum) and Height (field 3, varint).
// The XXX_ fields are excluded from JSON output by their `json:"-"` tags.
+ type MockMessage struct {
49
+ Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"`
50
+ Enum MockEnum `protobuf:"varint,2,opt,name=enum,proto3,enum=feedx.internal.testdata.MockEnum" json:"enum,omitempty"`
51
+ Height uint32 `protobuf:"varint,3,opt,name=height,proto3" json:"height,omitempty"`
52
+ XXX_NoUnkeyedLiteral struct{} `json:"-"`
53
+ XXX_unrecognized []byte `json:"-"`
54
+ XXX_sizecache int32 `json:"-"`
55
+ }
56
+
57
// Reset overwrites *m with the zero MockMessage.
+ func (m *MockMessage) Reset() { *m = MockMessage{} }
58
// String renders m via proto.CompactTextString.
+ func (m *MockMessage) String() string { return proto.CompactTextString(m) }
59
// ProtoMessage is an empty marker method.
+ func (*MockMessage) ProtoMessage() {}
60
// Descriptor returns the gzipped file descriptor bytes of this file together
// with the index path []int{0}.
+ func (*MockMessage) Descriptor() ([]byte, []int) {
61
+ return fileDescriptor_076a9f61cb4a1904, []int{0}
62
+ }
63
// The XXX_ methods below are thin forwards: each one delegates to the method
// of the same name on the package-level xxx_messageInfo_MockMessage value.

// XXX_Unmarshal forwards to xxx_messageInfo_MockMessage.Unmarshal(m, b).
+ func (m *MockMessage) XXX_Unmarshal(b []byte) error {
64
+ return xxx_messageInfo_MockMessage.Unmarshal(m, b)
65
+ }
66
// XXX_Marshal forwards to xxx_messageInfo_MockMessage.Marshal(b, m, deterministic).
+ func (m *MockMessage) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
67
+ return xxx_messageInfo_MockMessage.Marshal(b, m, deterministic)
68
+ }
69
// XXX_Merge forwards to xxx_messageInfo_MockMessage.Merge(m, src).
+ func (m *MockMessage) XXX_Merge(src proto.Message) {
70
+ xxx_messageInfo_MockMessage.Merge(m, src)
71
+ }
72
// XXX_Size forwards to xxx_messageInfo_MockMessage.Size(m).
+ func (m *MockMessage) XXX_Size() int {
73
+ return xxx_messageInfo_MockMessage.Size(m)
74
+ }
75
// XXX_DiscardUnknown forwards to xxx_messageInfo_MockMessage.DiscardUnknown(m).
+ func (m *MockMessage) XXX_DiscardUnknown() {
76
+ xxx_messageInfo_MockMessage.DiscardUnknown(m)
77
+ }
78
+
79
// xxx_messageInfo_MockMessage is the zero-valued proto.InternalMessageInfo
// that all XXX_ methods above delegate to.
+ var xxx_messageInfo_MockMessage proto.InternalMessageInfo
80
+
81
// GetName returns m.Name, or "" when m is nil, so it is safe to call on a
// nil receiver.
+ func (m *MockMessage) GetName() string {
82
+ if m != nil {
83
+ return m.Name
84
+ }
85
+ return ""
86
+ }
87
+
88
// GetEnum returns m.Enum, or MockEnum_UNKNOWN when m is nil.
+ func (m *MockMessage) GetEnum() MockEnum {
89
+ if m != nil {
90
+ return m.Enum
91
+ }
92
+ return MockEnum_UNKNOWN
93
+ }
94
+
95
// GetHeight returns m.Height, or 0 when m is nil.
+ func (m *MockMessage) GetHeight() uint32 {
96
+ if m != nil {
97
+ return m.Height
98
+ }
99
+ return 0
100
+ }
101
+
102
// init registers the enum and the message type with the proto package under
// their fully-qualified names in the feedx.internal.testdata namespace.
+ func init() {
103
+ proto.RegisterEnum("feedx.internal.testdata.MockEnum", MockEnum_name, MockEnum_value)
104
+ proto.RegisterType((*MockMessage)(nil), "feedx.internal.testdata.MockMessage")
105
+ }
106
+
107
// This second init registers the gzipped file descriptor under the source
// path "internal/testdata/testdata.proto".
+ func init() { proto.RegisterFile("internal/testdata/testdata.proto", fileDescriptor_076a9f61cb4a1904) }
108
+
109
// fileDescriptor_076a9f61cb4a1904 holds the descriptor bytes that are passed
// to proto.RegisterFile and returned by the Descriptor/EnumDescriptor methods.
// The bytes are generated output and must not be edited by hand.
+ var fileDescriptor_076a9f61cb4a1904 = []byte{
110
+ // 199 bytes of a gzipped FileDescriptorProto
111
+ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0x52, 0xc8, 0xcc, 0x2b, 0x49,
112
+ 0x2d, 0xca, 0x4b, 0xcc, 0xd1, 0x2f, 0x49, 0x2d, 0x2e, 0x49, 0x49, 0x2c, 0x49, 0x84, 0x33, 0xf4,
113
+ 0x0a, 0x8a, 0xf2, 0x4b, 0xf2, 0x85, 0xc4, 0xd3, 0x52, 0x53, 0x53, 0x2a, 0xf4, 0x60, 0xea, 0xf4,
114
+ 0x60, 0xd2, 0x4a, 0x05, 0x5c, 0xdc, 0xbe, 0xf9, 0xc9, 0xd9, 0xbe, 0xa9, 0xc5, 0xc5, 0x89, 0xe9,
115
+ 0xa9, 0x42, 0x42, 0x5c, 0x2c, 0x79, 0x89, 0xb9, 0xa9, 0x12, 0x8c, 0x0a, 0x8c, 0x1a, 0x9c, 0x41,
116
+ 0x60, 0xb6, 0x90, 0x29, 0x17, 0x4b, 0x6a, 0x5e, 0x69, 0xae, 0x04, 0x93, 0x02, 0xa3, 0x06, 0x9f,
117
+ 0x91, 0xa2, 0x1e, 0x0e, 0xa3, 0xf4, 0x40, 0xe6, 0xb8, 0xe6, 0x95, 0xe6, 0x06, 0x81, 0x95, 0x0b,
118
+ 0x89, 0x71, 0xb1, 0x65, 0xa4, 0x66, 0xa6, 0x67, 0x94, 0x48, 0x30, 0x2b, 0x30, 0x6a, 0xf0, 0x06,
119
+ 0x41, 0x79, 0x5a, 0x4a, 0x5c, 0x1c, 0x30, 0x95, 0x42, 0xdc, 0x5c, 0xec, 0xa1, 0x7e, 0xde, 0x7e,
120
+ 0xfe, 0xe1, 0x7e, 0x02, 0x0c, 0x42, 0x9c, 0x5c, 0xac, 0x6e, 0x9e, 0x41, 0xc1, 0x21, 0x02, 0xcc,
121
+ 0x4e, 0x2a, 0x51, 0x4a, 0xe9, 0x99, 0x25, 0x19, 0xa5, 0x49, 0x7a, 0xc9, 0xf9, 0xb9, 0xfa, 0x60,
122
+ 0x0b, 0xf5, 0x31, 0xfc, 0x98, 0xc4, 0x06, 0xf6, 0x9b, 0x31, 0x20, 0x00, 0x00, 0xff, 0xff, 0x42,
123
+ 0xf6, 0x49, 0xb3, 0xff, 0x00, 0x00, 0x00,
124
+ }
@@ -0,0 +1,15 @@
1
+ syntax = "proto3";
2
+
3
+ package feedx.internal.testdata;
4
+ option go_package = "github.com/feedx/internal/testdata";
5
+
6
// MockEnum declares two values; numbers 1 and 2 are intentionally or
// incidentally unused (FIRST is 3) — NOTE(review): confirm the gap is wanted.
+ enum MockEnum {
7
+ UNKNOWN = 0;
8
+ FIRST = 3;
9
+ }
10
+
11
// MockMessage has three fields: a string name (1), a MockEnum value (2) and
// an unsigned 32-bit height (3).
+ message MockMessage {
12
+ string name = 1;
13
+ MockEnum enum = 2;
14
+ uint32 height = 3;
15
+ }
@@ -5,12 +5,19 @@ module Feedx
5
5
  autoload :Gzip, 'feedx/compression/gzip'
6
6
 
7
7
  class << self
8
# Checks that +kind+ responds to both :reader and :writer.
# Raises ArgumentError naming the first missing method; returns +kind+
# unchanged when both are present.
+ def validate!(kind)
9
+ raise ArgumentError, "#{kind} does not implement #reader(io, &block)" unless kind.respond_to?(:reader)
10
+ raise ArgumentError, "#{kind} does not implement #writer(io, &block)" unless kind.respond_to?(:writer)
11
+
12
+ kind
13
+ end
14
+
8
15
  def resolve(name)
9
16
  case name.to_s
10
17
  when 'gz', 'gzip'
11
- Gzip
18
+ Gzip.new
12
19
  when ''
13
- None
20
+ None.new
14
21
  else
15
22
  raise ArgumentError, "invalid compression #{name}"
16
23
  end
@@ -18,9 +25,9 @@ module Feedx
18
25
 
19
26
  # Chooses a compression from the file extension of +path+: Gzip when the
  # extension's last character is 'z', None otherwise (including no extension).
  # The added rows return instances (Gzip.new / None.new) where the removed
  # rows returned the classes themselves.
  def detect(path)
20
27
  if File.extname(path)[-1] == 'z'
21
- Gzip
28
+ Gzip.new
22
29
  else
23
- None
30
+ None.new
24
31
  end
25
32
  end
26
33
  end
@@ -1,9 +1,9 @@
1
1
  # Base class for compression kinds. Both methods raise 'Not implemented'.
  # This hunk turns them from class methods (self.reader / self.writer) into
  # instance methods (reader / writer).
  class Feedx::Compression::Abstract
2
- def self.reader(_io, &_block)
2
+ def reader(_io, &_block)
3
3
  raise 'Not implemented'
4
4
  end
5
5
 
6
- def self.writer(_io, &_block)
6
+ def writer(_io, &_block)
7
7
  raise 'Not implemented'
8
8
  end
9
9
  end
@@ -1,11 +1,23 @@
1
1
  require 'zlib'
2
2
 
3
3
  # Gzip compression backed by Zlib::GzipReader / Zlib::GzipWriter.
  # This hunk converts reader/writer from class methods to instance methods and
  # makes both call force_binmode(io) before wrapping the stream.
  class Feedx::Compression::Gzip < Feedx::Compression::Abstract
4
- def self.reader(io, &block)
4
+ def reader(io, &block)
5
+ force_binmode(io)
5
6
  Zlib::GzipReader.wrap(io, &block)
6
7
  end
7
8
 
8
- def self.writer(io, &block)
9
+ def writer(io, &block)
10
+ force_binmode(io)
9
11
  Zlib::GzipWriter.wrap(io, &block)
10
12
  end
13
+
14
+ private
15
+
16
# Switches +io+ to binary: calls #binmode when available, otherwise
# #set_encoding(Encoding::BINARY) when available; does nothing if the
# object supports neither.
+ def force_binmode(io)
17
+ if io.respond_to?(:binmode)
18
+ io.binmode
19
+ elsif io.respond_to?(:set_encoding)
20
+ io.set_encoding(Encoding::BINARY)
21
+ end
22
+ end
11
23
  end
@@ -1,9 +1,9 @@
1
1
  # Pass-through compression: reader and writer hand +io+ straight to the
  # caller's block. This hunk converts both from class methods using an
  # explicit &block / block.call to instance methods using yield.
  class Feedx::Compression::None < Feedx::Compression::Abstract
2
- def self.reader(io, &block)
3
- block.call(io)
2
+ def reader(io)
3
+ yield(io)
4
4
  end
5
5
 
6
- def self.writer(io, &block)
7
- block.call(io)
6
+ def writer(io)
7
+ yield(io)
8
8
  end
9
9
  end
@@ -8,8 +8,8 @@ module Feedx
8
8
  include Enumerable
9
9
 
10
10
  # See constructor.
11
# Builds a new instance and iterates it with the given block. The added rows
# take options as keyword arguments (**opts) and forward them as keywords,
# where the removed rows took a positional hash (opts={}).
- def self.each(url, klass, opts={}, &block)
12
- new(url, klass, opts).each(&block)
11
+ def self.each(url, klass, **opts, &block)
12
+ new(url, klass, **opts).each(&block)
13
13
  end
14
14
 
15
15
  # @param [String] url the destination URL.
@@ -19,9 +19,9 @@ module Feedx
19
19
  # @option opts [Hash] :format_options format decode options. Default: {}.
20
20
  # @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
21
21
  # @option opts [Feedx::Cache::Value] :cache cache value to store remote last modified time and consume conditionally.
22
- def initialize(url, klass, opts={})
22
# Options are now keyword arguments rather than a positional hash.
+ def initialize(url, klass, **opts)
23
23
  @klass = klass
24
- @stream = Feedx::Stream.new(url, opts)
24
# The complete option set, including :format_options and :cache read below,
# is forwarded to Feedx::Stream.new as keywords.
# NOTE(review): confirm Feedx::Stream accepts or ignores those two keys.
+ @stream = Feedx::Stream.new(url, **opts)
25
25
  @fmt_opts = opts[:format_options] || {}
26
26
  @cache = opts[:cache]
27
27
  end
@@ -2,13 +2,19 @@ module Feedx
2
2
  module Format
3
3
  autoload :Abstract, 'feedx/format/abstract'
4
4
  autoload :JSON, 'feedx/format/json'
5
+ autoload :Parquet, 'feedx/format/parquet'
5
6
  autoload :Protobuf, 'feedx/format/protobuf'
6
7
 
7
8
  class << self
8
# The removed register required +kind+ to be a subclass of
# Feedx::Format::Abstract. The added rows replace that with validate!, which
# only checks that +kind+ responds to :encoder and :decoder.
- def register(ext, kind)
9
- raise ArgumentError, "#{kind} is not a subclass of Feedx::Format::Abstract" unless kind.is_a?(Class) && kind < Abstract
9
# Raises ArgumentError naming the first missing method; returns +kind+
# unchanged when both :encoder and :decoder are present.
+ def validate!(kind)
10
+ raise ArgumentError, "#{kind} does not implement #encoder(io, &block)" unless kind.respond_to?(:encoder)
11
+ raise ArgumentError, "#{kind} does not implement #decoder(io, &block)" unless kind.respond_to?(:decoder)
12
+
13
+ kind
14
+ end
10
15
 
11
- registry[ext.to_s] = kind
16
# Stores the validated +kind+ in the registry under the stringified extension.
+ def register(ext, kind)
17
+ registry[ext.to_s] = validate!(kind)
12
18
  end
13
19
 
14
20
  def resolve(name)
@@ -40,13 +46,13 @@ module Feedx
40
46
  end
41
47
 
42
48
  # Looks up +name+ (stringified) in the registry. A Symbol entry is resolved
  # lazily with const_get and the result is written back to the registry, so
  # the lookup happens once per name. The added rows store an instance
  # (const_get(kind).new) where the removed rows stored the class itself.
  # Returns the registry entry, or nil when the name is not registered.
  def _resolve(name)
43
- name = name.to_s
44
- klass = registry[name]
45
- if klass.is_a?(Symbol)
46
- klass = const_get(klass)
47
- registry[name.to_s] = klass
49
+ name = name.to_s
50
+ kind = registry[name]
51
+ if kind.is_a?(Symbol)
52
+ kind = const_get(kind).new
53
+ registry[name.to_s] = kind
48
54
  end
49
- klass
55
+ kind
50
56
  end
51
57
  end
52
58
  end
@@ -1,25 +1,54 @@
1
1
  # Base class for formats. The removed rows describe an object wrapping a
  # single IO (initialize/eof?/decode_each/decode/encode). The added rows turn
  # it into a factory: #decoder and #encoder open the nested Decoder / Encoder
  # class of the concrete format (self.class::Decoder / self.class::Encoder).
  class Feedx::Format::Abstract
2
- def initialize(io)
3
- @io = io
2
# Opens self.class::Decoder on +io+, forwarding options and the block.
+ def decoder(io, **opts, &block)
3
+ self.class::Decoder.open(io, **opts, &block)
4
4
  end
5
5
 
6
- def eof?
7
- @io.eof?
6
# Opens self.class::Encoder on +io+, forwarding options and the block.
+ def encoder(io, **opts, &block)
7
+ self.class::Encoder.open(io, **opts, &block)
8
8
  end
9
9
 
10
- def decode_each(klass, **opts)
11
- if block_given?
12
- yield decode(klass, **opts) until eof?
13
- else
14
- Enumerator.new {|y| y << decode(klass, **opts) until eof? }
10
# Shared base of Decoder and Encoder: holds the IO and provides .open.
# NOTE(review): Wrapper itself defines no #close; .open relies on the
# subclasses (Decoder, Encoder) to supply it.
+ class Wrapper
11
# Instantiates the wrapper, yields it, and always calls #close afterwards.
# The safe-navigation call skips #close when construction itself raised
# (inst is still nil in that case).
+ def self.open(io, **opts)
12
+ inst = new(io, **opts)
13
+ yield inst
14
+ ensure
15
+ inst&.close
16
+ end
17
+
18
# Stores +io+; any keyword options are accepted and ignored here.
+ def initialize(io, **)
19
+ @io = io
15
20
  end
16
21
  end
17
22
 
18
- def decode(_klass, **)
19
- raise 'Not implemented'
23
+ class Decoder < Wrapper
24
# Delegates to the underlying IO's #eof?.
+ def eof?
25
+ @io.eof?
26
+ end
27
+
28
# With a block: yields decode(target, **opts) repeatedly until eof?.
# Without a block: returns an Enumerator producing the same sequence.
+ def decode_each(target, **opts)
29
+ if block_given?
30
+ yield decode(target, **opts) until eof?
31
+ else
32
+ Enumerator.new do |acc|
33
+ acc << decode(target, **opts) until eof?
34
+ end
35
+ end
36
+ end
37
+
38
# Abstract: concrete decoders override this.
+ def decode(_target, **)
39
+ raise 'Not implemented'
40
+ end
41
+
42
# No-op by default.
+ def close; end
20
43
  end
21
44
 
22
- def encode(_msg, **)
23
- raise 'Not implemented'
45
+ class Encoder < Wrapper
46
# Abstract: concrete encoders override this.
+ def encode(_msg, **)
47
+ raise 'Not implemented'
48
+ end
49
+
50
# Flushes the underlying IO when it supports #flush.
+ def close
51
+ @io.flush if @io.respond_to?(:flush)
52
+ end
24
53
  end
25
54
  end
@@ -1,16 +1,20 @@
1
1
  require 'json'
2
2
 
3
3
  # Line-delimited JSON format. This hunk moves the former instance methods
  # decode/encode into nested Decoder and Encoder classes and renames the
  # decode argument from obj to target; the method bodies are otherwise the same.
  class Feedx::Format::JSON < Feedx::Format::Abstract
4
- def decode(obj, **)
5
- line = @io.gets
6
- return unless line
4
+ class Decoder < Feedx::Format::Abstract::Decoder
5
# Reads one line from the IO and returns nil when there is none. Otherwise
# +target+ (allocated without running its initializer when a Class is given)
# is populated via #from_json(line) and returned.
+ def decode(target, **)
6
+ line = @io.gets
7
+ return unless line
7
8
 
8
- obj = obj.allocate if obj.is_a?(Class)
9
- obj.from_json(line)
10
- obj
9
+ target = target.allocate if target.is_a?(Class)
10
+ target.from_json(line)
11
+ target
12
+ end
11
13
  end
12
14
 
13
- def encode(msg, **opts)
14
- @io.write msg.to_json(**opts) << "\n"
15
+ class Encoder < Feedx::Format::Abstract::Encoder
16
# Writes msg.to_json(**opts) followed by a newline to the IO.
# NOTE(review): << appends to the string returned by #to_json in place, so
# this assumes #to_json returns a fresh, unfrozen string — confirm.
+ def encode(msg, **opts)
17
+ @io.write msg.to_json(**opts) << "\n"
18
+ end
15
19
  end
16
20
  end
@@ -0,0 +1,102 @@
1
+ require 'parquet'
2
+ require 'tmpdir'
3
+
4
# Parquet format for Feedx.
#
# Both directions stage data in a temporary file: the Arrow calls used here
# (Arrow::Table.load, Arrow::FileOutputStream.new) are given a file path, not
# the IO object handed in by the caller.
class Feedx::Format::Parquet < Feedx::Format::Abstract
  # Row view over an Arrow table that can be walked as name/value pairs.
  class Record < Arrow::Record
    # Yields each column name together with this record's value in that column.
    def each_pair
      container.columns.each do |col|
        yield col.name, col[index]
      end
    end
  end

  # Loads the whole parquet payload from the IO at construction time and then
  # hands out one row per #decode call.
  class Decoder < Feedx::Format::Abstract::Decoder
    # @param [IO] io source stream; it is copied in full during construction.
    def initialize(io, **)
      super(io)

      @table  = read_table
      @cursor = 0
    end

    # @return [Boolean] true once every row of the loaded table was decoded.
    def eof?
      @cursor >= @table.n_rows
    end

    # Decodes the next row into +target+.
    #
    # @param [Class,Object] target a class (instantiated via .allocate, so its
    #   initializer is not run) or an existing object. It must respond to
    #   #from_parquet(record).
    # @return [Object,nil] the populated target, or nil when no rows remain.
    def decode(target, **)
      return if eof?

      rec = Record.new(@table, @cursor)
      @cursor += 1

      target = target.allocate if target.is_a?(Class)
      target.from_parquet(rec)
      target
    end

    private

    # Copies the IO into a temporary file, loads it as a parquet table and
    # removes the temporary file again, whether or not loading succeeded.
    #
    # @return [Arrow::Table] the loaded table; the caller stores it.
    def read_table
      tmpname = ::Dir::Tmpname.create('feedx-parquet') {|path, *| path }
      IO.copy_stream(@io, tmpname)

      Arrow::Table.load(tmpname, format: 'parquet')
    ensure
      # tmpname is nil when Tmpname.create itself raised.
      unlink!(tmpname) if tmpname
    end

    # Deletes +tmpname+; a file that is already gone is not an error.
    def unlink!(tmpname)
      File.unlink(tmpname)
    rescue Errno::ENOENT
      nil
    end
  end

  # Buffers encoded messages, writes them in batches to a temporary parquet
  # file and copies that file into the target IO on #close.
  class Encoder < Feedx::Format::Abstract::Encoder
    attr_reader :schema

    # @param [IO] io destination stream; it receives data only on #close.
    # @param schema the schema handed to Parquet::ArrowFileWriter and
    #   Arrow::RecordBatch.
    # @param [Integer] buffer_size second argument passed to the writer's
    #   #write_table. Default: 1 << 20.
    # @param [Integer] batch_size number of buffered messages that triggers a
    #   flush to the writer. Default: 10_000.
    def initialize(io, schema:, buffer_size: 1 << 20, batch_size: 10_000)
      super(io)

      @schema = schema
      @batch_size = batch_size.to_i
      @buffer_size = buffer_size.to_i
      @batch = []

      @tmpname = ::Dir::Tmpname.create('feedx-parquet') {|path, *| path }
      @output = Arrow::FileOutputStream.new(@tmpname, append: false)
      @writer = Parquet::ArrowFileWriter.new(@schema, @output)
    rescue StandardError
      # Construction failed part-way. Feedx::Format::Abstract::Wrapper.open
      # never sees this instance, so #close will not run; release what was
      # already acquired here before re-raising.
      begin
        @output&.close
      ensure
        unlink! if @tmpname
      end
      raise
    end

    # Buffers +msg+, converting it with #to_parquet(schema, **opts) first when
    # it responds to that method, and flushes once batch_size is reached.
    #
    # @return [Array] the internal batch buffer (the result of Array#push);
    #   it is emptied again whenever a flush happens.
    def encode(msg, **opts)
      msg = msg.to_parquet(@schema, **opts) if msg.respond_to?(:to_parquet)

      res = @batch.push(msg)
      flush_table if @batch.size >= @batch_size
      res
    end

    # Flushes pending messages, finalizes the temporary parquet file, copies it
    # into the target IO and removes it.
    #
    # @return [Integer] the number of bytes copied into the target IO.
    def close
      begin
        flush_table unless @batch.empty?
        @writer.close
      ensure
        # Release the output stream even when flushing or closing the writer
        # raised; otherwise the handle stays open.
        @output.close
      end

      copied = IO.copy_stream(@tmpname, @io)
      # Keep the base Encoder#close behaviour (flush the target IO if it can).
      super
      copied
    ensure
      unlink!
    end

    private

    # Writes the buffered messages as one table and empties the buffer.
    def flush_table
      table = Arrow::RecordBatch.new(@schema, @batch).to_table
      @writer.write_table table, @buffer_size
      @batch.clear
    end

    # Deletes the temporary file; a file that is already gone is not an error.
    def unlink!
      File.unlink(@tmpname)
    rescue Errno::ENOENT
      nil
    end
  end
end