feedx 0.10.0 → 0.12.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/.rubocop.yml +2 -0
  4. data/.travis.yml +13 -6
  5. data/Gemfile.lock +43 -20
  6. data/Makefile +5 -0
  7. data/consumer_test.go +5 -5
  8. data/feedx.gemspec +3 -2
  9. data/feedx_test.go +12 -9
  10. data/format.go +16 -16
  11. data/format_test.go +6 -7
  12. data/go.mod +5 -10
  13. data/go.sum +43 -24
  14. data/internal/testdata/testdata.pb.go +124 -0
  15. data/internal/testdata/testdata.proto +15 -0
  16. data/lib/feedx/cache/abstract.rb +2 -2
  17. data/lib/feedx/compression.rb +11 -4
  18. data/lib/feedx/compression/abstract.rb +2 -2
  19. data/lib/feedx/compression/gzip.rb +14 -2
  20. data/lib/feedx/compression/none.rb +4 -4
  21. data/lib/feedx/consumer.rb +17 -11
  22. data/lib/feedx/format.rb +18 -9
  23. data/lib/feedx/format/abstract.rb +42 -13
  24. data/lib/feedx/format/json.rb +12 -8
  25. data/lib/feedx/format/parquet.rb +102 -0
  26. data/lib/feedx/format/protobuf.rb +16 -8
  27. data/lib/feedx/producer.rb +20 -14
  28. data/lib/feedx/stream.rb +41 -25
  29. data/producer_test.go +1 -2
  30. data/reader_test.go +7 -8
  31. data/spec/feedx/compression/gzip_spec.rb +4 -2
  32. data/spec/feedx/compression/none_spec.rb +2 -2
  33. data/spec/feedx/compression_spec.rb +9 -9
  34. data/spec/feedx/consumer_spec.rb +6 -3
  35. data/spec/feedx/format/abstract_spec.rb +11 -8
  36. data/spec/feedx/format/json_spec.rb +12 -11
  37. data/spec/feedx/format/parquet_spec.rb +30 -0
  38. data/spec/feedx/format/protobuf_spec.rb +12 -11
  39. data/spec/feedx/format_spec.rb +8 -8
  40. data/spec/feedx/producer_spec.rb +6 -0
  41. data/spec/feedx/stream_spec.rb +26 -3
  42. data/spec/spec_helper.rb +17 -1
  43. data/writer_test.go +1 -1
  44. metadata +22 -3
@@ -0,0 +1,102 @@
1
+ require 'parquet'
2
+ require 'tmpdir'
3
+
4
+ class Feedx::Format::Parquet < Feedx::Format::Abstract
5
+ class Record < Arrow::Record
6
+ def each_pair
7
+ container.columns.each do |col|
8
+ yield col.name, col[index]
9
+ end
10
+ end
11
+ end
12
+
13
+ class Decoder < Feedx::Format::Abstract::Decoder
14
+ def initialize(io, **)
15
+ super(io)
16
+
17
+ @table = read_table
18
+ @cursor = 0
19
+ end
20
+
21
+ def eof?
22
+ @cursor >= @table.n_rows
23
+ end
24
+
25
+ def decode(target, **)
26
+ return if eof?
27
+
28
+ rec = Record.new(@table, @cursor)
29
+ @cursor += 1
30
+
31
+ target = target.allocate if target.is_a?(Class)
32
+ target.from_parquet(rec)
33
+ target
34
+ end
35
+
36
+ private
37
+
38
+ def read_table
39
+ tmpname = ::Dir::Tmpname.create('feedx-parquet') {|path, *| path }
40
+ IO.copy_stream(@io, tmpname)
41
+
42
+ @table = Arrow::Table.load(tmpname, format: 'parquet')
43
+ ensure
44
+ unlink!(tmpname) if tmpname
45
+ end
46
+
47
+ def unlink!(tmpname)
48
+ File.unlink(tmpname)
49
+ rescue Errno::ENOENT
50
+ nil
51
+ end
52
+ end
53
+
54
+ class Encoder < Feedx::Format::Abstract::Encoder
55
+ attr_reader :schema
56
+
57
+ def initialize(io, schema:, buffer_size: 1 << 20, batch_size: 10_000)
58
+ super(io)
59
+
60
+ @schema = schema
61
+ @batch_size = batch_size.to_i
62
+ @buffer_size = buffer_size.to_i
63
+
64
+ @tmpname = ::Dir::Tmpname.create('feedx-parquet') {|path, *| path }
65
+ @output = Arrow::FileOutputStream.new(@tmpname, append: false)
66
+ @writer = Parquet::ArrowFileWriter.new(@schema, @output)
67
+ @batch = []
68
+ end
69
+
70
+ def encode(msg, **opts)
71
+ msg = msg.to_parquet(@schema, **opts) if msg.respond_to?(:to_parquet)
72
+
73
+ res = @batch.push(msg)
74
+ flush_table if @batch.size >= @batch_size
75
+ res
76
+ end
77
+
78
+ def close
79
+ flush_table unless @batch.empty?
80
+
81
+ @writer.close
82
+ @output.close
83
+ IO.copy_stream(@tmpname, @io)
84
+ ensure
85
+ unlink!
86
+ end
87
+
88
+ private
89
+
90
+ def flush_table
91
+ table = Arrow::RecordBatch.new(@schema, @batch).to_table
92
+ @writer.write_table table, @buffer_size
93
+ @batch.clear
94
+ end
95
+
96
+ def unlink!
97
+ File.unlink(@tmpname)
98
+ rescue Errno::ENOENT
99
+ nil
100
+ end
101
+ end
102
+ end
@@ -1,16 +1,24 @@
1
1
  require 'pbio'
2
2
 
3
3
  class Feedx::Format::Protobuf < Feedx::Format::Abstract
4
- def initialize(io)
5
- super PBIO::Delimited.new(io)
6
- end
4
+ class Decoder < Feedx::Format::Abstract::Decoder
5
+ def initialize(io, **opts)
6
+ super PBIO::Delimited.new(io), **opts
7
+ end
7
8
 
8
- def decode(klass, **)
9
- @io.read(klass)
9
+ def decode(target, **)
10
+ @io.read(target)
11
+ end
10
12
  end
11
13
 
12
- def encode(msg, **opts)
13
- msg = msg.to_pb(**opts) if msg.respond_to?(:to_pb)
14
- @io.write msg
14
+ class Encoder < Feedx::Format::Abstract::Encoder
15
+ def initialize(io, **opts)
16
+ super PBIO::Delimited.new(io), **opts
17
+ end
18
+
19
+ def encode(msg, **opts)
20
+ msg = msg.to_pb(**opts) if msg.respond_to?(:to_pb)
21
+ @io.write msg
22
+ end
15
23
  end
16
24
  end
@@ -6,46 +6,52 @@ module Feedx
6
6
  # Produces a relation as an encoded feed to a remote location.
7
7
  class Producer
8
8
  # See constructor.
9
- def self.perform(url, opts = {}, &block)
10
- new(url, opts, &block).perform
9
+ def self.perform(url, **opts, &block)
10
+ new(url, **opts, &block).perform
11
11
  end
12
12
 
13
13
  # @param [String] url the destination URL.
14
14
  # @param [Hash] opts options
15
15
  # @option opts [Enumerable,ActiveRecord::Relation] :enum relation or enumerator to stream.
16
16
  # @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
17
- # @option opts [Hash] :format_options format encode options. Default: {}.
18
17
  # @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
19
18
  # @option opts [Time,Proc] :last_modified the last modified time, used to determine if a push is necessary.
20
19
  # @yield A block factory to generate the relation or enumerator.
21
20
  # @yieldreturn [Enumerable,ActiveRecord::Relation] the relation or enumerator to stream.
22
- def initialize(url, opts = {}, &block)
23
- @enum = opts[:enum] || block
21
+ def initialize(url, last_modified: nil, format_options: {}, enum: nil, **opts, &block)
22
+ @enum = enum || block
24
23
  raise ArgumentError, "#{self.class.name}.new expects an :enum option or a block factory" unless @enum
25
24
 
26
- @stream = Feedx::Stream.new(url, opts)
27
- @last_mod = opts[:last_modified]
28
- @fmt_opts = opts[:format_options] || {}
25
+ @url = url
26
+ @opts = opts.merge(format_options)
27
+ @last_mod = last_modified
28
+
29
+ return if format_options.empty? || (defined?(Gem::Deprecate) && Gem::Deprecate.skip)
30
+
31
+ warn "WARNING: passing format_options is deprecated; pass the options inline instead (called from #{caller(2..2).first})."
29
32
  end
30
33
 
31
34
  def perform
32
- enum = @enum.is_a?(Proc) ? @enum.call : @enum
33
- last_mod = @last_mod.is_a?(Proc) ? @last_mod.call(enum) : @last_mod
35
+ stream = Feedx::Stream.new(@url, **@opts)
36
+ enum = @enum.is_a?(Proc) ? @enum.call : @enum
37
+ last_mod = @last_mod.is_a?(Proc) ? @last_mod.call(enum) : @last_mod
34
38
  local_rev = last_mod.is_a?(Integer) ? last_mod : (last_mod.to_f * 1000).floor
35
39
 
36
40
  begin
37
- metadata = @stream.blob.info.metadata
41
+ metadata = stream.blob.info.metadata
38
42
  remote_rev = (metadata[META_LAST_MODIFIED] || metadata[META_LAST_MODIFIED_DC]).to_i
39
43
  return -1 unless local_rev > remote_rev
40
44
  rescue BFS::FileNotFound
41
45
  nil
42
46
  end if local_rev.positive?
43
47
 
44
- @stream.create metadata: { META_LAST_MODIFIED => local_rev.to_s } do |fmt|
48
+ stream.create metadata: { META_LAST_MODIFIED => local_rev.to_s } do |fmt|
45
49
  iter = enum.respond_to?(:find_each) ? :find_each : :each
46
- enum.send(iter) {|rec| fmt.encode(rec, **@fmt_opts) }
50
+ enum.send(iter) {|rec| fmt.encode(rec, **@opts) }
47
51
  end
48
- @stream.blob.info.size
52
+ stream.blob.info.size
53
+ ensure
54
+ stream&.close
49
55
  end
50
56
  end
51
57
  end
@@ -6,25 +6,41 @@ module Feedx
6
6
  class Stream
7
7
  attr_reader :blob
8
8
 
9
+ # Behaves like new, but accepts an optional block.
10
+ # If a block is given, streams are automatically closed after the block is yielded.
11
+ def self.open(url, **opts)
12
+ stream = new(url, **opts)
13
+ begin
14
+ yield stream
15
+ ensure
16
+ stream.close
17
+ end if block_given?
18
+ stream
19
+ end
20
+
9
21
  # @param [String] url the blob URL.
10
22
  # @param [Hash] opts options
11
23
  # @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
12
24
  # @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
13
- def initialize(url, opts = {})
25
+ def initialize(url, format: nil, compress: nil, **opts)
14
26
  @blob = BFS::Blob.new(url)
15
- @format = detect_format(opts[:format])
16
- @compress = detect_compress(opts[:compress])
27
+ @format = detect_format(format)
28
+ @compress = detect_compress(compress)
29
+ @opts = opts
30
+
31
+ BFS.defer(self, :close)
17
32
  end
18
33
 
19
34
  # Opens the remote for reading.
20
35
  # @param [Hash] opts BFS::Blob#open options
21
36
  # @yield A block over a formatted stream.
22
37
  # @yieldparam [Feedx::Format::Abstract] formatted input stream.
23
- def open(opts = {})
24
- @blob.open(opts) do |io|
25
- @compress.reader(io) do |cio|
26
- fmt = @format.new(cio)
27
- yield fmt
38
+ def open(**opts)
39
+ @blob.open(**opts) do |io|
40
+ @compress.reader(io, **@opts) do |cio|
41
+ @format.decoder(cio, **@opts) do |fmt|
42
+ yield fmt
43
+ end
28
44
  end
29
45
  end
30
46
  end
@@ -33,28 +49,31 @@ module Feedx
33
49
  # @param [Hash] opts BFS::Blob#create options
34
50
  # @yield A block over a formatted stream.
35
51
  # @yieldparam [Feedx::Format::Abstract] formatted output stream.
36
- def create(opts = {})
37
- @blob.create(opts) do |io|
38
- @compress.writer(io) do |cio|
39
- fmt = @format.new(cio)
40
- yield fmt
52
+ def create(**opts)
53
+ @blob.create(**opts) do |io|
54
+ @compress.writer(io, **@opts) do |cio|
55
+ @format.encoder(cio, **@opts) do |fmt|
56
+ yield fmt
57
+ end
41
58
  end
42
59
  end
43
60
  end
44
61
 
62
+ # Closes the underlying connection.
63
+ def close
64
+ @blob.close
65
+ end
66
+
45
67
  private
46
68
 
47
69
  def detect_format(val)
48
70
  case val
49
71
  when nil
50
72
  Feedx::Format.detect(@blob.path)
51
- when Class
52
- parent = Feedx::Format::Abstract
53
- raise ArgumentError, "Class #{val} must extend #{parent}" unless val < parent
54
-
55
- val
56
- else
73
+ when String, Symbol
57
74
  Feedx::Format.resolve(val)
75
+ else
76
+ Feedx::Format.validate!(val)
58
77
  end
59
78
  end
60
79
 
@@ -62,13 +81,10 @@ module Feedx
62
81
  case val
63
82
  when nil
64
83
  Feedx::Compression.detect(@blob.path)
65
- when Class
66
- parent = Feedx::Compression::Abstract
67
- raise ArgumentError, "Class #{val} must extend #{parent}" unless val < parent
68
-
69
- val
70
- else
84
+ when String, Symbol
71
85
  Feedx::Compression.resolve(val)
86
+ else
87
+ Feedx::Compression.validate!(val)
72
88
  end
73
89
  end
74
90
  end
@@ -23,8 +23,7 @@ var _ = Describe("Producer", func() {
23
23
  atomic.AddUint32(&numRuns, 1)
24
24
 
25
25
  for i := 0; i < 10; i++ {
26
- fix := fixture
27
- if err := w.Encode(&fix); err != nil {
26
+ if err := w.Encode(seed()); err != nil {
28
27
  return err
29
28
  }
30
29
  }
@@ -5,10 +5,9 @@ import (
5
5
  "io"
6
6
  "io/ioutil"
7
7
 
8
- "github.com/bsm/feedx"
9
-
10
8
  "github.com/bsm/bfs"
11
- tbp "github.com/golang/protobuf/proto/proto3_proto"
9
+ "github.com/bsm/feedx"
10
+ "github.com/bsm/feedx/internal/testdata"
12
11
  . "github.com/onsi/ginkgo"
13
12
  . "github.com/onsi/gomega"
14
13
  )
@@ -34,23 +33,23 @@ var _ = Describe("Reader", func() {
34
33
  It("should read", func() {
35
34
  data, err := ioutil.ReadAll(subject)
36
35
  Expect(err).NotTo(HaveOccurred())
37
- Expect(len(data)).To(BeNumerically("~", 140, 20))
36
+ Expect(len(data)).To(BeNumerically("~", 110, 20))
38
37
  Expect(subject.NumRead()).To(Equal(0))
39
38
  })
40
39
 
41
40
  It("should decode", func() {
42
- var msgs []tbp.Message
41
+ var msgs []*testdata.MockMessage
43
42
  for {
44
- var msg tbp.Message
43
+ var msg testdata.MockMessage
45
44
  err := subject.Decode(&msg)
46
45
  if err == io.EOF {
47
46
  break
48
47
  }
49
48
  Expect(err).NotTo(HaveOccurred())
50
- msgs = append(msgs, msg)
49
+ msgs = append(msgs, &msg)
51
50
  }
52
51
 
53
- Expect(msgs).To(Equal([]tbp.Message{fixture, fixture, fixture}))
52
+ Expect(msgs).To(ConsistOf(seed(), seed(), seed()))
54
53
  Expect(subject.NumRead()).To(Equal(3))
55
54
  })
56
55
  })
@@ -3,13 +3,15 @@ require 'spec_helper'
3
3
  RSpec.describe Feedx::Compression::Gzip do
4
4
  it 'should wrap readers/writers' do
5
5
  wio = StringIO.new
6
- described_class.writer(wio) {|w| w.write 'xyz' * 1000 }
6
+ subject.writer(wio) {|w| w.write 'xyz' * 1000 }
7
7
  expect(wio.size).to be_within(20).of(40)
8
+ expect(wio.string.encoding).to eq(Encoding::BINARY)
8
9
 
9
10
  data = ''
10
11
  StringIO.open(wio.string) do |rio|
11
- described_class.reader(rio) {|z| data = z.read }
12
+ subject.reader(rio) {|z| data = z.read }
12
13
  end
13
14
  expect(data.size).to eq(3000)
15
+ expect(data.encoding).to eq(Encoding.default_external)
14
16
  end
15
17
  end
@@ -3,12 +3,12 @@ require 'spec_helper'
3
3
  RSpec.describe Feedx::Compression::None do
4
4
  it 'should wrap readers/writers' do
5
5
  wio = StringIO.new
6
- described_class.writer(wio) {|w| w.write 'xyz' * 1000 }
6
+ subject.writer(wio) {|w| w.write 'xyz' * 1000 }
7
7
  expect(wio.size).to eq(3000)
8
8
 
9
9
  data = ''
10
10
  StringIO.open(wio.string) do |rio|
11
- described_class.reader(rio) {|z| data = z.read }
11
+ subject.reader(rio) {|z| data = z.read }
12
12
  end
13
13
  expect(data.size).to eq(3000)
14
14
  end
@@ -2,18 +2,18 @@ require 'spec_helper'
2
2
 
3
3
  RSpec.describe Feedx::Compression do
4
4
  it 'should resolve' do
5
- expect(described_class.resolve(:gzip)).to eq(described_class::Gzip)
6
- expect(described_class.resolve(:gz)).to eq(described_class::Gzip)
7
- expect(described_class.resolve(nil)).to eq(described_class::None)
5
+ expect(described_class.resolve(:gzip)).to be_instance_of(described_class::Gzip)
6
+ expect(described_class.resolve(:gz)).to be_instance_of(described_class::Gzip)
7
+ expect(described_class.resolve(nil)).to be_instance_of(described_class::None)
8
8
  expect { described_class.resolve(:txt) }.to raise_error(/invalid compression txt/)
9
9
  end
10
10
 
11
11
  it 'should detect' do
12
- expect(described_class.detect('path/to/file.jsonz')).to eq(described_class::Gzip)
13
- expect(described_class.detect('path/to/file.json.gz')).to eq(described_class::Gzip)
14
- expect(described_class.detect('path/to/file.json')).to eq(described_class::None)
15
- expect(described_class.detect('path/to/file.pbz')).to eq(described_class::Gzip)
16
- expect(described_class.detect('path/to/file.pb.gz')).to eq(described_class::Gzip)
17
- expect(described_class.detect('path/to/file.pb')).to eq(described_class::None)
12
+ expect(described_class.detect('path/to/file.jsonz')).to be_instance_of(described_class::Gzip)
13
+ expect(described_class.detect('path/to/file.json.gz')).to be_instance_of(described_class::Gzip)
14
+ expect(described_class.detect('path/to/file.json')).to be_instance_of(described_class::None)
15
+ expect(described_class.detect('path/to/file.pbz')).to be_instance_of(described_class::Gzip)
16
+ expect(described_class.detect('path/to/file.pb.gz')).to be_instance_of(described_class::Gzip)
17
+ expect(described_class.detect('path/to/file.pb')).to be_instance_of(described_class::None)
18
18
  end
19
19
  end
@@ -36,10 +36,13 @@ RSpec.describe Feedx::Consumer do
36
36
 
37
37
  private
38
38
 
39
- def mock_produce!(opts = {})
39
+ def mock_produce!(enum: mock_enum, **opts)
40
40
  url = 'mock:///dir/file.json'
41
- opts[:enum] ||= %w[x y z].map {|t| Feedx::TestCase::Model.new(t) } * 100
42
- Feedx::Producer.perform url, opts
41
+ Feedx::Producer.perform url, enum: enum, **opts
43
42
  url
44
43
  end
44
+
45
+ def mock_enum
46
+ %w[x y z].map {|t| Feedx::TestCase::Model.new(t) } * 100
47
+ end
45
48
  end
@@ -1,17 +1,20 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  RSpec.describe Feedx::Format::Abstract do
4
- subject { Feedx::Format::JSON.new(wio) }
4
+ subject { Feedx::Format::JSON.new }
5
5
  let(:wio) { StringIO.new }
6
+ let(:rio) { StringIO.open(wio.string) }
6
7
 
7
8
  it 'should decode each' do
8
- subject.encode(Feedx::TestCase::Model.new('X'))
9
- subject.encode(Feedx::TestCase::Model.new('Y'))
10
- subject.encode(Feedx::TestCase::Message.new(title: 'Z'))
11
- StringIO.open(wio.string) do |rio|
12
- fmt = subject.class.new(rio)
13
- dec = fmt.decode_each(Feedx::TestCase::Model).to_a
14
- expect(dec.map(&:title)).to eq(%w[X Y Z])
9
+ subject.encoder wio do |enc|
10
+ enc.encode(Feedx::TestCase::Model.new('X'))
11
+ enc.encode(Feedx::TestCase::Model.new('Y'))
12
+ enc.encode(Feedx::TestCase::Message.new(title: 'Z'))
13
+ end
14
+
15
+ subject.decoder rio do |dec|
16
+ acc = dec.decode_each(Feedx::TestCase::Model).to_a
17
+ expect(acc.map(&:title)).to eq(%w[X Y Z])
15
18
  end
16
19
  end
17
20
  end