feedx 0.10.1 → 0.12.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +2 -0
  3. data/.travis.yml +13 -6
  4. data/Gemfile.lock +42 -19
  5. data/Makefile +5 -0
  6. data/consumer_test.go +5 -5
  7. data/feedx.gemspec +3 -2
  8. data/feedx_test.go +12 -9
  9. data/format.go +16 -16
  10. data/format_test.go +6 -7
  11. data/go.mod +5 -10
  12. data/go.sum +43 -24
  13. data/internal/testdata/testdata.pb.go +124 -0
  14. data/internal/testdata/testdata.proto +15 -0
  15. data/lib/feedx/cache/abstract.rb +2 -2
  16. data/lib/feedx/compression.rb +11 -4
  17. data/lib/feedx/compression/abstract.rb +2 -2
  18. data/lib/feedx/compression/gzip.rb +14 -16
  19. data/lib/feedx/compression/none.rb +4 -4
  20. data/lib/feedx/consumer.rb +17 -11
  21. data/lib/feedx/format.rb +18 -9
  22. data/lib/feedx/format/abstract.rb +42 -13
  23. data/lib/feedx/format/json.rb +12 -8
  24. data/lib/feedx/format/parquet.rb +102 -0
  25. data/lib/feedx/format/protobuf.rb +16 -8
  26. data/lib/feedx/producer.rb +20 -14
  27. data/lib/feedx/stream.rb +42 -25
  28. data/producer_test.go +1 -2
  29. data/reader_test.go +7 -8
  30. data/spec/feedx/compression/gzip_spec.rb +2 -2
  31. data/spec/feedx/compression/none_spec.rb +2 -2
  32. data/spec/feedx/compression_spec.rb +9 -9
  33. data/spec/feedx/consumer_spec.rb +6 -3
  34. data/spec/feedx/format/abstract_spec.rb +11 -8
  35. data/spec/feedx/format/json_spec.rb +12 -11
  36. data/spec/feedx/format/parquet_spec.rb +30 -0
  37. data/spec/feedx/format/protobuf_spec.rb +12 -11
  38. data/spec/feedx/format_spec.rb +8 -8
  39. data/spec/feedx/producer_spec.rb +6 -0
  40. data/spec/feedx/stream_spec.rb +28 -3
  41. data/spec/spec_helper.rb +17 -1
  42. data/writer_test.go +1 -1
  43. metadata +22 -3
@@ -0,0 +1,102 @@
1
+ require 'parquet'
2
+ require 'tmpdir'
3
+
4
+ class Feedx::Format::Parquet < Feedx::Format::Abstract
5
+ class Record < Arrow::Record
6
+ def each_pair
7
+ container.columns.each do |col|
8
+ yield col.name, col[index]
9
+ end
10
+ end
11
+ end
12
+
13
+ class Decoder < Feedx::Format::Abstract::Decoder
14
+ def initialize(io, **)
15
+ super(io)
16
+
17
+ @table = read_table
18
+ @cursor = 0
19
+ end
20
+
21
+ def eof?
22
+ @cursor >= @table.n_rows
23
+ end
24
+
25
+ def decode(target, **)
26
+ return if eof?
27
+
28
+ rec = Record.new(@table, @cursor)
29
+ @cursor += 1
30
+
31
+ target = target.allocate if target.is_a?(Class)
32
+ target.from_parquet(rec)
33
+ target
34
+ end
35
+
36
+ private
37
+
38
+ def read_table
39
+ tmpname = ::Dir::Tmpname.create('feedx-parquet') {|path, *| path }
40
+ IO.copy_stream(@io, tmpname)
41
+
42
+ @table = Arrow::Table.load(tmpname, format: 'parquet')
43
+ ensure
44
+ unlink!(tmpname) if tmpname
45
+ end
46
+
47
+ def unlink!(tmpname)
48
+ File.unlink(tmpname)
49
+ rescue Errno::ENOENT
50
+ nil
51
+ end
52
+ end
53
+
54
+ class Encoder < Feedx::Format::Abstract::Encoder
55
+ attr_reader :schema
56
+
57
+ def initialize(io, schema:, buffer_size: 1 << 20, batch_size: 10_000)
58
+ super(io)
59
+
60
+ @schema = schema
61
+ @batch_size = batch_size.to_i
62
+ @buffer_size = buffer_size.to_i
63
+
64
+ @tmpname = ::Dir::Tmpname.create('feedx-parquet') {|path, *| path }
65
+ @output = Arrow::FileOutputStream.new(@tmpname, append: false)
66
+ @writer = Parquet::ArrowFileWriter.new(@schema, @output)
67
+ @batch = []
68
+ end
69
+
70
+ def encode(msg, **opts)
71
+ msg = msg.to_parquet(@schema, **opts) if msg.respond_to?(:to_parquet)
72
+
73
+ res = @batch.push(msg)
74
+ flush_table if @batch.size >= @batch_size
75
+ res
76
+ end
77
+
78
+ def close
79
+ flush_table unless @batch.empty?
80
+
81
+ @writer.close
82
+ @output.close
83
+ IO.copy_stream(@tmpname, @io)
84
+ ensure
85
+ unlink!
86
+ end
87
+
88
+ private
89
+
90
+ def flush_table
91
+ table = Arrow::RecordBatch.new(@schema, @batch).to_table
92
+ @writer.write_table table, @buffer_size
93
+ @batch.clear
94
+ end
95
+
96
+ def unlink!
97
+ File.unlink(@tmpname)
98
+ rescue Errno::ENOENT
99
+ nil
100
+ end
101
+ end
102
+ end
@@ -1,16 +1,24 @@
1
1
  require 'pbio'
2
2
 
3
3
  class Feedx::Format::Protobuf < Feedx::Format::Abstract
4
- def initialize(io)
5
- super PBIO::Delimited.new(io)
6
- end
4
+ class Decoder < Feedx::Format::Abstract::Decoder
5
+ def initialize(io, **opts)
6
+ super PBIO::Delimited.new(io), **opts
7
+ end
7
8
 
8
- def decode(klass, **)
9
- @io.read(klass)
9
+ def decode(target, **)
10
+ @io.read(target)
11
+ end
10
12
  end
11
13
 
12
- def encode(msg, **opts)
13
- msg = msg.to_pb(**opts) if msg.respond_to?(:to_pb)
14
- @io.write msg
14
+ class Encoder < Feedx::Format::Abstract::Encoder
15
+ def initialize(io, **opts)
16
+ super PBIO::Delimited.new(io), **opts
17
+ end
18
+
19
+ def encode(msg, **opts)
20
+ msg = msg.to_pb(**opts) if msg.respond_to?(:to_pb)
21
+ @io.write msg
22
+ end
15
23
  end
16
24
  end
@@ -6,46 +6,52 @@ module Feedx
6
6
  # Produces a relation as an encoded feed to a remote location.
7
7
  class Producer
8
8
  # See constructor.
9
- def self.perform(url, opts = {}, &block)
10
- new(url, opts, &block).perform
9
+ def self.perform(url, **opts, &block)
10
+ new(url, **opts, &block).perform
11
11
  end
12
12
 
13
13
  # @param [String] url the destination URL.
14
14
  # @param [Hash] opts options
15
15
  # @option opts [Enumerable,ActiveRecord::Relation] :enum relation or enumerator to stream.
16
16
  # @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
17
- # @option opts [Hash] :format_options format encode options. Default: {}.
18
17
  # @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
19
18
  # @option opts [Time,Proc] :last_modified the last modified time, used to determine if a push is necessary.
20
19
  # @yield A block factory to generate the relation or enumerator.
21
20
  # @yieldreturn [Enumerable,ActiveRecord::Relation] the relation or enumerator to stream.
22
- def initialize(url, opts = {}, &block)
23
- @enum = opts[:enum] || block
21
+ def initialize(url, last_modified: nil, format_options: {}, enum: nil, **opts, &block)
22
+ @enum = enum || block
24
23
  raise ArgumentError, "#{self.class.name}.new expects an :enum option or a block factory" unless @enum
25
24
 
26
- @stream = Feedx::Stream.new(url, opts)
27
- @last_mod = opts[:last_modified]
28
- @fmt_opts = opts[:format_options] || {}
25
+ @url = url
26
+ @opts = opts.merge(format_options)
27
+ @last_mod = last_modified
28
+
29
+ return if format_options.empty? || (defined?(Gem::Deprecate) && Gem::Deprecate.skip)
30
+
31
+ warn "WARNING: passing format_options is deprecated; pass the options inline instead (called from #{caller(2..2).first})."
29
32
  end
30
33
 
31
34
  def perform
32
- enum = @enum.is_a?(Proc) ? @enum.call : @enum
33
- last_mod = @last_mod.is_a?(Proc) ? @last_mod.call(enum) : @last_mod
35
+ stream = Feedx::Stream.new(@url, **@opts)
36
+ enum = @enum.is_a?(Proc) ? @enum.call : @enum
37
+ last_mod = @last_mod.is_a?(Proc) ? @last_mod.call(enum) : @last_mod
34
38
  local_rev = last_mod.is_a?(Integer) ? last_mod : (last_mod.to_f * 1000).floor
35
39
 
36
40
  begin
37
- metadata = @stream.blob.info.metadata
41
+ metadata = stream.blob.info.metadata
38
42
  remote_rev = (metadata[META_LAST_MODIFIED] || metadata[META_LAST_MODIFIED_DC]).to_i
39
43
  return -1 unless local_rev > remote_rev
40
44
  rescue BFS::FileNotFound
41
45
  nil
42
46
  end if local_rev.positive?
43
47
 
44
- @stream.create metadata: { META_LAST_MODIFIED => local_rev.to_s } do |fmt|
48
+ stream.create metadata: { META_LAST_MODIFIED => local_rev.to_s } do |fmt|
45
49
  iter = enum.respond_to?(:find_each) ? :find_each : :each
46
- enum.send(iter) {|rec| fmt.encode(rec, **@fmt_opts) }
50
+ enum.send(iter) {|rec| fmt.encode(rec, **@opts) }
47
51
  end
48
- @stream.blob.info.size
52
+ stream.blob.info.size
53
+ ensure
54
+ stream&.close
49
55
  end
50
56
  end
51
57
  end
@@ -6,25 +6,42 @@ module Feedx
6
6
  class Stream
7
7
  attr_reader :blob
8
8
 
9
+ # Behaves like new, but accepts an optional block.
10
+ # If a block is given, streams are automatically closed after the block is yielded.
11
+ def self.open(url, **opts)
12
+ stream = new(url, **opts)
13
+ return stream unless block_given?
14
+
15
+ begin
16
+ yield stream
17
+ ensure
18
+ stream.close
19
+ end
20
+ end
21
+
9
22
  # @param [String] url the blob URL.
10
23
  # @param [Hash] opts options
11
24
  # @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
12
25
  # @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
13
- def initialize(url, opts = {})
26
+ def initialize(url, format: nil, compress: nil, **opts)
14
27
  @blob = BFS::Blob.new(url)
15
- @format = detect_format(opts[:format])
16
- @compress = detect_compress(opts[:compress])
28
+ @format = detect_format(format)
29
+ @compress = detect_compress(compress)
30
+ @opts = opts
31
+
32
+ BFS.defer(self, :close)
17
33
  end
18
34
 
19
35
  # Opens the remote for reading.
20
36
  # @param [Hash] opts BFS::Blob#open options
21
37
  # @yield A block over a formatted stream.
22
38
  # @yieldparam [Feedx::Format::Abstract] formatted input stream.
23
- def open(opts = {})
24
- @blob.open(opts) do |io|
25
- @compress.reader(io) do |cio|
26
- fmt = @format.new(cio)
27
- yield fmt
39
+ def open(**opts)
40
+ @blob.open(**opts) do |io|
41
+ @compress.reader(io, **@opts) do |cio|
42
+ @format.decoder(cio, **@opts) do |fmt|
43
+ yield fmt
44
+ end
28
45
  end
29
46
  end
30
47
  end
@@ -33,28 +50,31 @@ module Feedx
33
50
  # @param [Hash] opts BFS::Blob#create options
34
51
  # @yield A block over a formatted stream.
35
52
  # @yieldparam [Feedx::Format::Abstract] formatted output stream.
36
- def create(opts = {})
37
- @blob.create(opts) do |io|
38
- @compress.writer(io) do |cio|
39
- fmt = @format.new(cio)
40
- yield fmt
53
+ def create(**opts)
54
+ @blob.create(**opts) do |io|
55
+ @compress.writer(io, **@opts) do |cio|
56
+ @format.encoder(cio, **@opts) do |fmt|
57
+ yield fmt
58
+ end
41
59
  end
42
60
  end
43
61
  end
44
62
 
63
+ # Closes the underlying connection.
64
+ def close
65
+ @blob.close
66
+ end
67
+
45
68
  private
46
69
 
47
70
  def detect_format(val)
48
71
  case val
49
72
  when nil
50
73
  Feedx::Format.detect(@blob.path)
51
- when Class
52
- parent = Feedx::Format::Abstract
53
- raise ArgumentError, "Class #{val} must extend #{parent}" unless val < parent
54
-
55
- val
56
- else
74
+ when String, Symbol
57
75
  Feedx::Format.resolve(val)
76
+ else
77
+ Feedx::Format.validate!(val)
58
78
  end
59
79
  end
60
80
 
@@ -62,13 +82,10 @@ module Feedx
62
82
  case val
63
83
  when nil
64
84
  Feedx::Compression.detect(@blob.path)
65
- when Class
66
- parent = Feedx::Compression::Abstract
67
- raise ArgumentError, "Class #{val} must extend #{parent}" unless val < parent
68
-
69
- val
70
- else
85
+ when String, Symbol
71
86
  Feedx::Compression.resolve(val)
87
+ else
88
+ Feedx::Compression.validate!(val)
72
89
  end
73
90
  end
74
91
  end
@@ -23,8 +23,7 @@ var _ = Describe("Producer", func() {
23
23
  atomic.AddUint32(&numRuns, 1)
24
24
 
25
25
  for i := 0; i < 10; i++ {
26
- fix := fixture
27
- if err := w.Encode(&fix); err != nil {
26
+ if err := w.Encode(seed()); err != nil {
28
27
  return err
29
28
  }
30
29
  }
@@ -5,10 +5,9 @@ import (
5
5
  "io"
6
6
  "io/ioutil"
7
7
 
8
- "github.com/bsm/feedx"
9
-
10
8
  "github.com/bsm/bfs"
11
- tbp "github.com/golang/protobuf/proto/proto3_proto"
9
+ "github.com/bsm/feedx"
10
+ "github.com/bsm/feedx/internal/testdata"
12
11
  . "github.com/onsi/ginkgo"
13
12
  . "github.com/onsi/gomega"
14
13
  )
@@ -34,23 +33,23 @@ var _ = Describe("Reader", func() {
34
33
  It("should read", func() {
35
34
  data, err := ioutil.ReadAll(subject)
36
35
  Expect(err).NotTo(HaveOccurred())
37
- Expect(len(data)).To(BeNumerically("~", 140, 20))
36
+ Expect(len(data)).To(BeNumerically("~", 110, 20))
38
37
  Expect(subject.NumRead()).To(Equal(0))
39
38
  })
40
39
 
41
40
  It("should decode", func() {
42
- var msgs []tbp.Message
41
+ var msgs []*testdata.MockMessage
43
42
  for {
44
- var msg tbp.Message
43
+ var msg testdata.MockMessage
45
44
  err := subject.Decode(&msg)
46
45
  if err == io.EOF {
47
46
  break
48
47
  }
49
48
  Expect(err).NotTo(HaveOccurred())
50
- msgs = append(msgs, msg)
49
+ msgs = append(msgs, &msg)
51
50
  }
52
51
 
53
- Expect(msgs).To(Equal([]tbp.Message{fixture, fixture, fixture}))
52
+ Expect(msgs).To(ConsistOf(seed(), seed(), seed()))
54
53
  Expect(subject.NumRead()).To(Equal(3))
55
54
  })
56
55
  })
@@ -3,13 +3,13 @@ require 'spec_helper'
3
3
  RSpec.describe Feedx::Compression::Gzip do
4
4
  it 'should wrap readers/writers' do
5
5
  wio = StringIO.new
6
- described_class.writer(wio) {|w| w.write 'xyz' * 1000 }
6
+ subject.writer(wio) {|w| w.write 'xyz' * 1000 }
7
7
  expect(wio.size).to be_within(20).of(40)
8
8
  expect(wio.string.encoding).to eq(Encoding::BINARY)
9
9
 
10
10
  data = ''
11
11
  StringIO.open(wio.string) do |rio|
12
- described_class.reader(rio) {|z| data = z.read }
12
+ subject.reader(rio) {|z| data = z.read }
13
13
  end
14
14
  expect(data.size).to eq(3000)
15
15
  expect(data.encoding).to eq(Encoding.default_external)
@@ -3,12 +3,12 @@ require 'spec_helper'
3
3
  RSpec.describe Feedx::Compression::None do
4
4
  it 'should wrap readers/writers' do
5
5
  wio = StringIO.new
6
- described_class.writer(wio) {|w| w.write 'xyz' * 1000 }
6
+ subject.writer(wio) {|w| w.write 'xyz' * 1000 }
7
7
  expect(wio.size).to eq(3000)
8
8
 
9
9
  data = ''
10
10
  StringIO.open(wio.string) do |rio|
11
- described_class.reader(rio) {|z| data = z.read }
11
+ subject.reader(rio) {|z| data = z.read }
12
12
  end
13
13
  expect(data.size).to eq(3000)
14
14
  end
@@ -2,18 +2,18 @@ require 'spec_helper'
2
2
 
3
3
  RSpec.describe Feedx::Compression do
4
4
  it 'should resolve' do
5
- expect(described_class.resolve(:gzip)).to eq(described_class::Gzip)
6
- expect(described_class.resolve(:gz)).to eq(described_class::Gzip)
7
- expect(described_class.resolve(nil)).to eq(described_class::None)
5
+ expect(described_class.resolve(:gzip)).to be_instance_of(described_class::Gzip)
6
+ expect(described_class.resolve(:gz)).to be_instance_of(described_class::Gzip)
7
+ expect(described_class.resolve(nil)).to be_instance_of(described_class::None)
8
8
  expect { described_class.resolve(:txt) }.to raise_error(/invalid compression txt/)
9
9
  end
10
10
 
11
11
  it 'should detect' do
12
- expect(described_class.detect('path/to/file.jsonz')).to eq(described_class::Gzip)
13
- expect(described_class.detect('path/to/file.json.gz')).to eq(described_class::Gzip)
14
- expect(described_class.detect('path/to/file.json')).to eq(described_class::None)
15
- expect(described_class.detect('path/to/file.pbz')).to eq(described_class::Gzip)
16
- expect(described_class.detect('path/to/file.pb.gz')).to eq(described_class::Gzip)
17
- expect(described_class.detect('path/to/file.pb')).to eq(described_class::None)
12
+ expect(described_class.detect('path/to/file.jsonz')).to be_instance_of(described_class::Gzip)
13
+ expect(described_class.detect('path/to/file.json.gz')).to be_instance_of(described_class::Gzip)
14
+ expect(described_class.detect('path/to/file.json')).to be_instance_of(described_class::None)
15
+ expect(described_class.detect('path/to/file.pbz')).to be_instance_of(described_class::Gzip)
16
+ expect(described_class.detect('path/to/file.pb.gz')).to be_instance_of(described_class::Gzip)
17
+ expect(described_class.detect('path/to/file.pb')).to be_instance_of(described_class::None)
18
18
  end
19
19
  end
@@ -36,10 +36,13 @@ RSpec.describe Feedx::Consumer do
36
36
 
37
37
  private
38
38
 
39
- def mock_produce!(opts = {})
39
+ def mock_produce!(enum: mock_enum, **opts)
40
40
  url = 'mock:///dir/file.json'
41
- opts[:enum] ||= %w[x y z].map {|t| Feedx::TestCase::Model.new(t) } * 100
42
- Feedx::Producer.perform url, opts
41
+ Feedx::Producer.perform url, enum: enum, **opts
43
42
  url
44
43
  end
44
+
45
+ def mock_enum
46
+ %w[x y z].map {|t| Feedx::TestCase::Model.new(t) } * 100
47
+ end
45
48
  end
@@ -1,17 +1,20 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  RSpec.describe Feedx::Format::Abstract do
4
- subject { Feedx::Format::JSON.new(wio) }
4
+ subject { Feedx::Format::JSON.new }
5
5
  let(:wio) { StringIO.new }
6
+ let(:rio) { StringIO.open(wio.string) }
6
7
 
7
8
  it 'should decode each' do
8
- subject.encode(Feedx::TestCase::Model.new('X'))
9
- subject.encode(Feedx::TestCase::Model.new('Y'))
10
- subject.encode(Feedx::TestCase::Message.new(title: 'Z'))
11
- StringIO.open(wio.string) do |rio|
12
- fmt = subject.class.new(rio)
13
- dec = fmt.decode_each(Feedx::TestCase::Model).to_a
14
- expect(dec.map(&:title)).to eq(%w[X Y Z])
9
+ subject.encoder wio do |enc|
10
+ enc.encode(Feedx::TestCase::Model.new('X'))
11
+ enc.encode(Feedx::TestCase::Model.new('Y'))
12
+ enc.encode(Feedx::TestCase::Message.new(title: 'Z'))
13
+ end
14
+
15
+ subject.decoder rio do |dec|
16
+ acc = dec.decode_each(Feedx::TestCase::Model).to_a
17
+ expect(acc.map(&:title)).to eq(%w[X Y Z])
15
18
  end
16
19
  end
17
20
  end