feedx 0.11.0 → 0.12.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. checksums.yaml +4 -4
  2. data/.editorconfig +3 -0
  3. data/.github/workflows/test.yml +60 -0
  4. data/.gitignore +1 -0
  5. data/.rubocop.yml +15 -4
  6. data/Gemfile +0 -2
  7. data/Gemfile.lock +80 -50
  8. data/Makefile +6 -6
  9. data/README.md +1 -1
  10. data/compression.go +18 -0
  11. data/compression_test.go +14 -2
  12. data/consumer_test.go +2 -2
  13. data/ext/parquet/decoder.go +170 -0
  14. data/ext/parquet/decoder_test.go +88 -0
  15. data/ext/parquet/go.mod +10 -0
  16. data/ext/parquet/go.sum +152 -0
  17. data/ext/parquet/parquet.go +78 -0
  18. data/ext/parquet/parquet_test.go +28 -0
  19. data/ext/parquet/reader.go +89 -0
  20. data/ext/parquet/testdata/alltypes_plain.parquet +0 -0
  21. data/ext/parquet/types.go +51 -0
  22. data/feedx.gemspec +5 -6
  23. data/feedx_test.go +2 -2
  24. data/format.go +45 -15
  25. data/format_test.go +4 -2
  26. data/go.mod +10 -5
  27. data/go.sum +90 -25
  28. data/internal/testdata/testdata.pb.go +176 -77
  29. data/lib/feedx/cache/abstract.rb +2 -2
  30. data/lib/feedx/cache/memory.rb +1 -0
  31. data/lib/feedx/compression/abstract.rb +2 -2
  32. data/lib/feedx/compression/gzip.rb +2 -2
  33. data/lib/feedx/compression/none.rb +2 -2
  34. data/lib/feedx/consumer.rb +15 -9
  35. data/lib/feedx/format.rb +4 -1
  36. data/lib/feedx/producer.rb +27 -22
  37. data/lib/feedx/stream.rb +30 -13
  38. data/producer_test.go +2 -2
  39. data/reader_test.go +2 -2
  40. data/spec/feedx/cache/memory_spec.rb +2 -2
  41. data/spec/feedx/cache/value_spec.rb +1 -1
  42. data/spec/feedx/compression/gzip_spec.rb +1 -1
  43. data/spec/feedx/compression/none_spec.rb +1 -1
  44. data/spec/feedx/compression_spec.rb +2 -2
  45. data/spec/feedx/consumer_spec.rb +5 -4
  46. data/spec/feedx/format/abstract_spec.rb +2 -1
  47. data/spec/feedx/format/json_spec.rb +6 -6
  48. data/spec/feedx/format/parquet_spec.rb +1 -1
  49. data/spec/feedx/format/protobuf_spec.rb +1 -1
  50. data/spec/feedx/format_spec.rb +2 -2
  51. data/spec/feedx/producer_spec.rb +15 -8
  52. data/spec/feedx/stream_spec.rb +36 -18
  53. data/writer_test.go +2 -2
  54. metadata +24 -23
  55. data/.travis.yml +0 -24
@@ -5,12 +5,12 @@ class Feedx::Cache::Abstract
5
5
  end
6
6
 
7
7
  # Read reads a key.
8
- def read(_key, **_opts)
8
+ def read(_key, **)
9
9
  raise 'Not implemented'
10
10
  end
11
11
 
12
12
  # Write writes a key/value pair.
13
- def write(_key, _value, **_opts)
13
+ def write(_key, _value, **)
14
14
  raise 'Not implemented'
15
15
  end
16
16
 
@@ -3,6 +3,7 @@ require 'monitor'
3
3
  # Thread-safe in-memory cache. Use for testing only.
4
4
  class Feedx::Cache::Memory < Feedx::Cache::Abstract
5
5
  def initialize
6
+ super
6
7
  @monitor = Monitor.new
7
8
  @entries = {}
8
9
  end
@@ -1,9 +1,9 @@
1
1
  class Feedx::Compression::Abstract
2
- def reader(_io, &_block)
2
+ def reader(_io, **, &_block)
3
3
  raise 'Not implemented'
4
4
  end
5
5
 
6
- def writer(_io, &_block)
6
+ def writer(_io, **, &_block)
7
7
  raise 'Not implemented'
8
8
  end
9
9
  end
@@ -1,12 +1,12 @@
1
1
  require 'zlib'
2
2
 
3
3
  class Feedx::Compression::Gzip < Feedx::Compression::Abstract
4
- def reader(io, &block)
4
+ def reader(io, **, &block)
5
5
  force_binmode(io)
6
6
  Zlib::GzipReader.wrap(io, &block)
7
7
  end
8
8
 
9
- def writer(io, &block)
9
+ def writer(io, **, &block)
10
10
  force_binmode(io)
11
11
  Zlib::GzipWriter.wrap(io, &block)
12
12
  end
@@ -1,9 +1,9 @@
1
1
  class Feedx::Compression::None < Feedx::Compression::Abstract
2
- def reader(io)
2
+ def reader(io, **)
3
3
  yield(io)
4
4
  end
5
5
 
6
- def writer(io)
6
+ def writer(io, **)
7
7
  yield(io)
8
8
  end
9
9
  end
@@ -16,33 +16,39 @@ module Feedx
16
16
  # @param [Class] klass the record class.
17
17
  # @param [Hash] opts options
18
18
  # @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
19
- # @option opts [Hash] :format_options format decode options. Default: {}.
20
19
  # @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
21
20
  # @option opts [Feedx::Cache::Value] :cache cache value to store remote last modified time and consume conditionally.
22
- def initialize(url, klass, **opts)
23
- @klass = klass
24
- @stream = Feedx::Stream.new(url, **opts)
25
- @fmt_opts = opts[:format_options] || {}
26
- @cache = opts[:cache]
21
+ def initialize(url, klass, format_options: {}, cache: nil, **opts)
22
+ @klass = klass
23
+ @url = url
24
+ @opts = opts.merge(format_options)
25
+ @cache = cache
26
+
27
+ return if format_options.empty? || (defined?(Gem::Deprecate) && Gem::Deprecate.skip)
28
+
29
+ warn "WARNING: passing format_options is deprecated; pass the options inline instead (called from #{caller(2..2).first})."
27
30
  end
28
31
 
29
32
  # @return [Boolean] returns true if performed.
30
33
  def each(&block)
34
+ stream = Feedx::Stream.new(@url, **@opts)
31
35
  remote_rev = nil
32
36
 
33
37
  if @cache
34
- metadata = @stream.blob.info.metadata
38
+ metadata = stream.blob.info.metadata
35
39
  local_rev = @cache.read.to_i
36
40
  remote_rev = (metadata[META_LAST_MODIFIED] || metadata[META_LAST_MODIFIED_DC]).to_i
37
41
  return false if remote_rev.positive? && remote_rev <= local_rev
38
42
  end
39
43
 
40
- @stream.open do |fmt|
41
- fmt.decode_each(@klass, **@fmt_opts, &block)
44
+ stream.open do |fmt|
45
+ fmt.decode_each(@klass, **@opts, &block)
42
46
  end
43
47
  @cache.write(remote_rev) if @cache && remote_rev
44
48
 
45
49
  true
50
+ ensure
51
+ stream&.close
46
52
  end
47
53
  end
48
54
  end
data/lib/feedx/format.rb CHANGED
@@ -27,7 +27,7 @@ module Feedx
27
27
  ext = File.extname(base)
28
28
  raise ArgumentError, 'unable to detect format' if ext.empty?
29
29
 
30
- kind = _resolve(ext[1..-1]) || _resolve(ext[1..-2])
30
+ kind = _resolve(ext[1..]) || _resolve(ext[1..-2])
31
31
  return kind if kind
32
32
 
33
33
  base = base[0..-ext.size - 1]
@@ -39,6 +39,9 @@ module Feedx
39
39
  def registry
40
40
  @registry ||= {
41
41
  'json' => :JSON,
42
+ 'jsonl' => :JSON,
43
+ 'ndjson' => :JSON,
44
+ 'parquet' => :Parquet,
42
45
  'pb' => :Protobuf,
43
46
  'proto' => :Protobuf,
44
47
  'protobuf' => :Protobuf,
@@ -14,38 +14,43 @@ module Feedx
14
14
  # @param [Hash] opts options
15
15
  # @option opts [Enumerable,ActiveRecord::Relation] :enum relation or enumerator to stream.
16
16
  # @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
17
- # @option opts [Hash] :format_options format encode options. Default: {}.
18
17
  # @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
19
18
  # @option opts [Time,Proc] :last_modified the last modified time, used to determine if a push is necessary.
20
19
  # @yield A block factory to generate the relation or enumerator.
21
20
  # @yieldreturn [Enumerable,ActiveRecord::Relation] the relation or enumerator to stream.
22
- def initialize(url, **opts, &block)
23
- @enum = opts[:enum] || block
21
+ def initialize(url, last_modified: nil, format_options: {}, enum: nil, **opts, &block)
22
+ @enum = enum || block
24
23
  raise ArgumentError, "#{self.class.name}.new expects an :enum option or a block factory" unless @enum
25
24
 
26
- @stream = Feedx::Stream.new(url, **opts)
27
- @last_mod = opts[:last_modified]
28
- @fmt_opts = opts[:format_options] || {}
25
+ @url = url
26
+ @opts = opts.merge(format_options)
27
+ @last_mod = last_modified
28
+
29
+ return if format_options.empty? || (defined?(Gem::Deprecate) && Gem::Deprecate.skip)
30
+
31
+ warn "WARNING: passing format_options is deprecated; pass the options inline instead (called from #{caller(2..2).first})."
29
32
  end
30
33
 
31
34
  def perform
32
- enum = @enum.is_a?(Proc) ? @enum.call : @enum
33
- last_mod = @last_mod.is_a?(Proc) ? @last_mod.call(enum) : @last_mod
34
- local_rev = last_mod.is_a?(Integer) ? last_mod : (last_mod.to_f * 1000).floor
35
-
36
- begin
37
- metadata = @stream.blob.info.metadata
38
- remote_rev = (metadata[META_LAST_MODIFIED] || metadata[META_LAST_MODIFIED_DC]).to_i
39
- return -1 unless local_rev > remote_rev
40
- rescue BFS::FileNotFound
41
- nil
42
- end if local_rev.positive?
43
-
44
- @stream.create metadata: { META_LAST_MODIFIED => local_rev.to_s } do |fmt|
45
- iter = enum.respond_to?(:find_each) ? :find_each : :each
46
- enum.send(iter) {|rec| fmt.encode(rec, **@fmt_opts) }
35
+ Feedx::Stream.open(@url, **@opts) do |stream|
36
+ enum = @enum.is_a?(Proc) ? @enum.call : @enum
37
+ last_mod = @last_mod.is_a?(Proc) ? @last_mod.call(enum) : @last_mod
38
+ local_rev = last_mod.is_a?(Integer) ? last_mod : (last_mod.to_f * 1000).floor
39
+
40
+ begin
41
+ metadata = stream.blob.info.metadata
42
+ remote_rev = (metadata[META_LAST_MODIFIED] || metadata[META_LAST_MODIFIED_DC]).to_i
43
+ return -1 unless local_rev > remote_rev
44
+ rescue BFS::FileNotFound
45
+ nil
46
+ end if local_rev.positive?
47
+
48
+ stream.create metadata: { META_LAST_MODIFIED => local_rev.to_s } do |fmt|
49
+ iter = enum.respond_to?(:find_each) ? :find_each : :each
50
+ enum.send(iter) {|rec| fmt.encode(rec, **@opts) }
51
+ end
52
+ stream.blob.info.size
47
53
  end
48
- @stream.blob.info.size
49
54
  end
50
55
  end
51
56
  end
data/lib/feedx/stream.rb CHANGED
@@ -6,26 +6,40 @@ module Feedx
6
6
  class Stream
7
7
  attr_reader :blob
8
8
 
9
+ # Behaves like new, but accepts an optional block.
10
+ # If a block is given, streams are automatically closed after the block is yielded.
11
+ def self.open(url, **opts)
12
+ stream = new(url, **opts)
13
+ return stream unless block_given?
14
+
15
+ begin
16
+ yield stream
17
+ ensure
18
+ stream.close
19
+ end
20
+ end
21
+
9
22
  # @param [String] url the blob URL.
10
23
  # @param [Hash] opts options
11
24
  # @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
12
25
  # @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
13
- def initialize(url, **opts)
26
+ def initialize(url, format: nil, compress: nil, **opts)
14
27
  @blob = BFS::Blob.new(url)
15
- @format = detect_format(opts[:format])
16
- @compress = detect_compress(opts[:compress])
28
+ @format = detect_format(format)
29
+ @compress = detect_compress(compress)
30
+ @opts = opts
31
+
32
+ BFS.defer(self, :close)
17
33
  end
18
34
 
19
35
  # Opens the remote for reading.
20
36
  # @param [Hash] opts BFS::Blob#open options
21
37
  # @yield A block over a formatted stream.
22
38
  # @yieldparam [Feedx::Format::Abstract] formatted input stream.
23
- def open(**opts)
39
+ def open(**opts, &block)
24
40
  @blob.open(**opts) do |io|
25
- @compress.reader(io) do |cio|
26
- @format.decoder(cio) do |fmt|
27
- yield fmt
28
- end
41
+ @compress.reader(io, **@opts) do |cio|
42
+ @format.decoder(cio, **@opts, &block)
29
43
  end
30
44
  end
31
45
  end
@@ -34,16 +48,19 @@ module Feedx
34
48
  # @param [Hash] opts BFS::Blob#create options
35
49
  # @yield A block over a formatted stream.
36
50
  # @yieldparam [Feedx::Format::Abstract] formatted output stream.
37
- def create(**opts)
51
+ def create(**opts, &block)
38
52
  @blob.create(**opts) do |io|
39
- @compress.writer(io) do |cio|
40
- @format.encoder(cio) do |fmt|
41
- yield fmt
42
- end
53
+ @compress.writer(io, **@opts) do |cio|
54
+ @format.encoder(cio, **@opts, &block)
43
55
  end
44
56
  end
45
57
  end
46
58
 
59
+ # Closes the underlying connection.
60
+ def close
61
+ @blob.close
62
+ end
63
+
47
64
  private
48
65
 
49
66
  def detect_format(val)
data/producer_test.go CHANGED
@@ -7,8 +7,8 @@ import (
7
7
 
8
8
  "github.com/bsm/bfs"
9
9
  "github.com/bsm/feedx"
10
- . "github.com/onsi/ginkgo"
11
- . "github.com/onsi/gomega"
10
+ . "github.com/bsm/ginkgo"
11
+ . "github.com/bsm/gomega"
12
12
  )
13
13
 
14
14
  var _ = Describe("Producer", func() {
data/reader_test.go CHANGED
@@ -8,8 +8,8 @@ import (
8
8
  "github.com/bsm/bfs"
9
9
  "github.com/bsm/feedx"
10
10
  "github.com/bsm/feedx/internal/testdata"
11
- . "github.com/onsi/ginkgo"
12
- . "github.com/onsi/gomega"
11
+ . "github.com/bsm/ginkgo"
12
+ . "github.com/bsm/gomega"
13
13
  )
14
14
 
15
15
  var _ = Describe("Reader", func() {
@@ -1,7 +1,7 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  RSpec.describe Feedx::Cache::Memory do
4
- it 'should read/write' do
4
+ it 'read/writes' do
5
5
  expect(subject.fetch('key')).to be_nil
6
6
  expect(subject.fetch('key') { 'value' }).to eq('value')
7
7
  expect(subject.fetch('key')).to eq('value')
@@ -16,7 +16,7 @@ RSpec.describe Feedx::Cache::Memory do
16
16
  expect(subject.fetch('key')).to be_nil
17
17
  end
18
18
 
19
- it 'should write strings' do
19
+ it 'writes strings' do
20
20
  subject.write('key', 5)
21
21
  expect(subject.read('key')).to eq('5')
22
22
  end
@@ -5,7 +5,7 @@ RSpec.describe Feedx::Cache::Value do
5
5
  described_class.new(Feedx::Cache::Memory.new, 'key')
6
6
  end
7
7
 
8
- it 'should read/write' do
8
+ it 'read/writes' do
9
9
  expect(subject.fetch).to be_nil
10
10
  expect(subject.fetch { 'value' }).to eq('value')
11
11
  expect(subject.fetch).to eq('value')
@@ -1,7 +1,7 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  RSpec.describe Feedx::Compression::Gzip do
4
- it 'should wrap readers/writers' do
4
+ it 'wraps readers/writers' do
5
5
  wio = StringIO.new
6
6
  subject.writer(wio) {|w| w.write 'xyz' * 1000 }
7
7
  expect(wio.size).to be_within(20).of(40)
@@ -1,7 +1,7 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  RSpec.describe Feedx::Compression::None do
4
- it 'should wrap readers/writers' do
4
+ it 'wraps readers/writers' do
5
5
  wio = StringIO.new
6
6
  subject.writer(wio) {|w| w.write 'xyz' * 1000 }
7
7
  expect(wio.size).to eq(3000)
@@ -1,14 +1,14 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  RSpec.describe Feedx::Compression do
4
- it 'should resolve' do
4
+ it 'resolves' do
5
5
  expect(described_class.resolve(:gzip)).to be_instance_of(described_class::Gzip)
6
6
  expect(described_class.resolve(:gz)).to be_instance_of(described_class::Gzip)
7
7
  expect(described_class.resolve(nil)).to be_instance_of(described_class::None)
8
8
  expect { described_class.resolve(:txt) }.to raise_error(/invalid compression txt/)
9
9
  end
10
10
 
11
- it 'should detect' do
11
+ it 'detects' do
12
12
  expect(described_class.detect('path/to/file.jsonz')).to be_instance_of(described_class::Gzip)
13
13
  expect(described_class.detect('path/to/file.json.gz')).to be_instance_of(described_class::Gzip)
14
14
  expect(described_class.detect('path/to/file.json')).to be_instance_of(described_class::None)
@@ -4,15 +4,16 @@ RSpec.describe Feedx::Consumer do
4
4
  let(:bucket) { BFS::Bucket::InMem.new }
5
5
  let(:klass) { Feedx::TestCase::Model }
6
6
  let(:cache) { Feedx::Cache::Memory.new.value('my-consumer') }
7
+
7
8
  before { allow(BFS).to receive(:resolve).and_return(bucket) }
8
9
 
9
- it 'should reject invalid inputs' do
10
+ it 'rejects invalid inputs' do
10
11
  expect do
11
- described_class.each('mock:///dir/file.txt', klass) {}
12
+ described_class.each('mock:///dir/file.txt', klass)
12
13
  end.to raise_error(/unable to detect format/)
13
14
  end
14
15
 
15
- it 'should consume feeds' do
16
+ it 'consumes feeds' do
16
17
  url = mock_produce!
17
18
  csm = described_class.new(url, klass)
18
19
  expect(csm).to be_a(Enumerable)
@@ -24,7 +25,7 @@ RSpec.describe Feedx::Consumer do
24
25
  expect(cnt).to eq(300)
25
26
  end
26
27
 
27
- it 'should perform conditionally' do
28
+ it 'performs conditionally' do
28
29
  url = mock_produce! last_modified: Time.at(1515151515)
29
30
  expect(described_class.new(url, klass, cache: cache).count).to eq(300)
30
31
  expect(described_class.new(url, klass, cache: cache).count).to eq(0)
@@ -2,10 +2,11 @@ require 'spec_helper'
2
2
 
3
3
  RSpec.describe Feedx::Format::Abstract do
4
4
  subject { Feedx::Format::JSON.new }
5
+
5
6
  let(:wio) { StringIO.new }
6
7
  let(:rio) { StringIO.open(wio.string) }
7
8
 
8
- it 'should decode each' do
9
+ it 'decodes each' do
9
10
  subject.encoder wio do |enc|
10
11
  enc.encode(Feedx::TestCase::Model.new('X'))
11
12
  enc.encode(Feedx::TestCase::Model.new('Y'))
@@ -4,17 +4,17 @@ RSpec.describe Feedx::Format::JSON do
4
4
  let(:wio) { StringIO.new }
5
5
  let(:rio) { StringIO.open(wio.string) }
6
6
 
7
- it 'should encode/decode' do
7
+ it 'encode/decodes' do
8
8
  subject.encoder wio do |enc|
9
9
  enc.encode(Feedx::TestCase::Model.new('X'))
10
10
  enc.encode(Feedx::TestCase::Model.new('Y'))
11
11
  enc.encode(Feedx::TestCase::Message.new(title: 'Z'))
12
12
  end
13
- expect(wio.string.lines).to eq [
14
- %({"title":"X","updated_at":"2018-01-05 11:25:15 UTC"}\n),
15
- %({"title":"Y","updated_at":"2018-01-05 11:25:15 UTC"}\n),
16
- %({"title":"Z"}\n),
17
- ]
13
+ expect(wio.string).to eq(<<~JSON)
14
+ {"title":"X","updated_at":"2018-01-05 11:25:15 UTC"}
15
+ {"title":"Y","updated_at":"2018-01-05 11:25:15 UTC"}
16
+ {"title":"Z"}
17
+ JSON
18
18
 
19
19
  subject.decoder rio do |dec|
20
20
  expect(dec.decode(Feedx::TestCase::Model)).to eq(Feedx::TestCase::Model.new('X'))
@@ -11,7 +11,7 @@ RSpec.describe Feedx::Format::Parquet do
11
11
  ])
12
12
  end
13
13
 
14
- it 'should encode/decode' do
14
+ it 'encode/decodes' do
15
15
  subject.encoder wio, schema: schema, batch_size: 2 do |enc|
16
16
  enc.encode(Feedx::TestCase::Model.new('X'))
17
17
  enc.encode(Feedx::TestCase::Model.new('Y'))