feedx 0.11.0 → 0.12.5

This diff compares publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.
Files changed (55)
  1. checksums.yaml +4 -4
  2. data/.editorconfig +3 -0
  3. data/.github/workflows/test.yml +60 -0
  4. data/.gitignore +1 -0
  5. data/.rubocop.yml +15 -4
  6. data/Gemfile +0 -2
  7. data/Gemfile.lock +80 -50
  8. data/Makefile +6 -6
  9. data/README.md +1 -1
  10. data/compression.go +18 -0
  11. data/compression_test.go +14 -2
  12. data/consumer_test.go +2 -2
  13. data/ext/parquet/decoder.go +170 -0
  14. data/ext/parquet/decoder_test.go +88 -0
  15. data/ext/parquet/go.mod +10 -0
  16. data/ext/parquet/go.sum +152 -0
  17. data/ext/parquet/parquet.go +78 -0
  18. data/ext/parquet/parquet_test.go +28 -0
  19. data/ext/parquet/reader.go +89 -0
  20. data/ext/parquet/testdata/alltypes_plain.parquet +0 -0
  21. data/ext/parquet/types.go +51 -0
  22. data/feedx.gemspec +5 -6
  23. data/feedx_test.go +2 -2
  24. data/format.go +45 -15
  25. data/format_test.go +4 -2
  26. data/go.mod +10 -5
  27. data/go.sum +90 -25
  28. data/internal/testdata/testdata.pb.go +176 -77
  29. data/lib/feedx/cache/abstract.rb +2 -2
  30. data/lib/feedx/cache/memory.rb +1 -0
  31. data/lib/feedx/compression/abstract.rb +2 -2
  32. data/lib/feedx/compression/gzip.rb +2 -2
  33. data/lib/feedx/compression/none.rb +2 -2
  34. data/lib/feedx/consumer.rb +15 -9
  35. data/lib/feedx/format.rb +4 -1
  36. data/lib/feedx/producer.rb +27 -22
  37. data/lib/feedx/stream.rb +30 -13
  38. data/producer_test.go +2 -2
  39. data/reader_test.go +2 -2
  40. data/spec/feedx/cache/memory_spec.rb +2 -2
  41. data/spec/feedx/cache/value_spec.rb +1 -1
  42. data/spec/feedx/compression/gzip_spec.rb +1 -1
  43. data/spec/feedx/compression/none_spec.rb +1 -1
  44. data/spec/feedx/compression_spec.rb +2 -2
  45. data/spec/feedx/consumer_spec.rb +5 -4
  46. data/spec/feedx/format/abstract_spec.rb +2 -1
  47. data/spec/feedx/format/json_spec.rb +6 -6
  48. data/spec/feedx/format/parquet_spec.rb +1 -1
  49. data/spec/feedx/format/protobuf_spec.rb +1 -1
  50. data/spec/feedx/format_spec.rb +2 -2
  51. data/spec/feedx/producer_spec.rb +15 -8
  52. data/spec/feedx/stream_spec.rb +36 -18
  53. data/writer_test.go +2 -2
  54. metadata +24 -23
  55. data/.travis.yml +0 -24
data/lib/feedx/cache/abstract.rb CHANGED
@@ -5,12 +5,12 @@ class Feedx::Cache::Abstract
   end
 
   # Read reads a key.
-  def read(_key, **_opts)
+  def read(_key, **)
     raise 'Not implemented'
   end
 
   # Write writes a key/value pair.
-  def write(_key, _value, **_opts)
+  def write(_key, _value, **)
     raise 'Not implemented'
   end
 
data/lib/feedx/cache/memory.rb CHANGED
@@ -3,6 +3,7 @@ require 'monitor'
 # Thread-safe in-memory cache. Use for testing only.
 class Feedx::Cache::Memory < Feedx::Cache::Abstract
   def initialize
+    super
     @monitor = Monitor.new
     @entries = {}
   end
data/lib/feedx/compression/abstract.rb CHANGED
@@ -1,9 +1,9 @@
 class Feedx::Compression::Abstract
-  def reader(_io, &_block)
+  def reader(_io, **, &_block)
     raise 'Not implemented'
   end
 
-  def writer(_io, &_block)
+  def writer(_io, **, &_block)
     raise 'Not implemented'
   end
 end
data/lib/feedx/compression/gzip.rb CHANGED
@@ -1,12 +1,12 @@
 require 'zlib'
 
 class Feedx::Compression::Gzip < Feedx::Compression::Abstract
-  def reader(io, &block)
+  def reader(io, **, &block)
     force_binmode(io)
     Zlib::GzipReader.wrap(io, &block)
   end
 
-  def writer(io, &block)
+  def writer(io, **, &block)
     force_binmode(io)
     Zlib::GzipWriter.wrap(io, &block)
   end
data/lib/feedx/compression/none.rb CHANGED
@@ -1,9 +1,9 @@
 class Feedx::Compression::None < Feedx::Compression::Abstract
-  def reader(io)
+  def reader(io, **)
     yield(io)
   end
 
-  def writer(io)
+  def writer(io, **)
     yield(io)
   end
 end
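Note: the bare `**` added to the compression reader/writer signatures above accepts and silently discards any keyword options, so format-level options can now travel through the stream/compression/format chain without every layer declaring them. A minimal sketch of the effect; the :batch_size option is illustrative and only meaningful to a format encoder, never to the compression layer:

    require 'stringio'
    require 'feedx'

    wio = StringIO.new
    # Gzip#writer ignores the extra keyword and still yields a GzipWriter.
    Feedx::Compression::Gzip.new.writer(wio, batch_size: 500) {|w| w.write('hello') }
    wio.string.bytesize # => size of the gzipped payload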
data/lib/feedx/consumer.rb CHANGED
@@ -16,33 +16,39 @@ module Feedx
     # @param [Class] klass the record class.
     # @param [Hash] opts options
     # @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
-    # @option opts [Hash] :format_options format decode options. Default: {}.
     # @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
     # @option opts [Feedx::Cache::Value] :cache cache value to store remote last modified time and consume conditionally.
-    def initialize(url, klass, **opts)
-      @klass = klass
-      @stream = Feedx::Stream.new(url, **opts)
-      @fmt_opts = opts[:format_options] || {}
-      @cache = opts[:cache]
+    def initialize(url, klass, format_options: {}, cache: nil, **opts)
+      @klass = klass
+      @url = url
+      @opts = opts.merge(format_options)
+      @cache = cache
+
+      return if format_options.empty? || (defined?(Gem::Deprecate) && Gem::Deprecate.skip)
+
+      warn "WARNING: passing format_options is deprecated; pass the options inline instead (called from #{caller(2..2).first})."
     end
 
     # @return [Boolean] returns true if performed.
     def each(&block)
+      stream = Feedx::Stream.new(@url, **@opts)
       remote_rev = nil
 
      if @cache
-        metadata = @stream.blob.info.metadata
+        metadata = stream.blob.info.metadata
        local_rev = @cache.read.to_i
        remote_rev = (metadata[META_LAST_MODIFIED] || metadata[META_LAST_MODIFIED_DC]).to_i
        return false if remote_rev.positive? && remote_rev <= local_rev
      end
 
-      @stream.open do |fmt|
-        fmt.decode_each(@klass, **@fmt_opts, &block)
+      stream.open do |fmt|
+        fmt.decode_each(@klass, **@opts, &block)
      end
      @cache.write(remote_rev) if @cache && remote_rev
 
      true
+    ensure
+      stream&.close
    end
  end
end
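With this change, decode options are passed inline to Feedx::Consumer and forwarded to the format decoder, while the nested :format_options hash keeps working but emits a deprecation warning. A usage sketch; the URL, the MyRecord class and the :symbolize_keys option are placeholders, not part of the library:

    require 'feedx'

    # Deprecated: nested :format_options hash (still accepted, warns on construction).
    Feedx::Consumer.new('s3://my-bucket/feed.json.gz', MyRecord, format_options: { symbolize_keys: true })

    # Preferred: pass decode options inline; they are merged into @opts and
    # handed to fmt.decode_each.
    Feedx::Consumer.new('s3://my-bucket/feed.json.gz', MyRecord, symbolize_keys: true).each do |record|
      # process record
    end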
data/lib/feedx/format.rb CHANGED
@@ -27,7 +27,7 @@ module Feedx
       ext = File.extname(base)
       raise ArgumentError, 'unable to detect format' if ext.empty?
 
-      kind = _resolve(ext[1..-1]) || _resolve(ext[1..-2])
+      kind = _resolve(ext[1..]) || _resolve(ext[1..-2])
       return kind if kind
 
       base = base[0..-ext.size - 1]
@@ -39,6 +39,9 @@ module Feedx
     def registry
       @registry ||= {
         'json' => :JSON,
+        'jsonl' => :JSON,
+        'ndjson' => :JSON,
+        'parquet' => :Parquet,
         'pb' => :Protobuf,
         'proto' => :Protobuf,
         'protobuf' => :Protobuf,
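The registry additions mean .jsonl and .ndjson paths resolve to the JSON format and .parquet to the new Parquet format, alongside the existing extensions. A small sketch of the expected resolution, assuming Feedx::Format.detect mirrors Feedx::Compression.detect; compressed variants fall through via the trailing-extension fallback:

    require 'feedx'

    Feedx::Format.detect('events.jsonl')     # resolves to the JSON format
    Feedx::Format.detect('events.ndjson.gz') # resolves to the JSON format
    Feedx::Format.detect('events.parquet')   # resolves to the Parquet format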
data/lib/feedx/producer.rb CHANGED
@@ -14,38 +14,43 @@ module Feedx
   # @param [Hash] opts options
   # @option opts [Enumerable,ActiveRecord::Relation] :enum relation or enumerator to stream.
   # @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
-  # @option opts [Hash] :format_options format encode options. Default: {}.
   # @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
   # @option opts [Time,Proc] :last_modified the last modified time, used to determine if a push is necessary.
   # @yield A block factory to generate the relation or enumerator.
   # @yieldreturn [Enumerable,ActiveRecord::Relation] the relation or enumerator to stream.
-  def initialize(url, **opts, &block)
-    @enum = opts[:enum] || block
+  def initialize(url, last_modified: nil, format_options: {}, enum: nil, **opts, &block)
+    @enum = enum || block
     raise ArgumentError, "#{self.class.name}.new expects an :enum option or a block factory" unless @enum
 
-    @stream = Feedx::Stream.new(url, **opts)
-    @last_mod = opts[:last_modified]
-    @fmt_opts = opts[:format_options] || {}
+    @url = url
+    @opts = opts.merge(format_options)
+    @last_mod = last_modified
+
+    return if format_options.empty? || (defined?(Gem::Deprecate) && Gem::Deprecate.skip)
+
+    warn "WARNING: passing format_options is deprecated; pass the options inline instead (called from #{caller(2..2).first})."
   end
 
   def perform
-    enum = @enum.is_a?(Proc) ? @enum.call : @enum
-    last_mod = @last_mod.is_a?(Proc) ? @last_mod.call(enum) : @last_mod
-    local_rev = last_mod.is_a?(Integer) ? last_mod : (last_mod.to_f * 1000).floor
-
-    begin
-      metadata = @stream.blob.info.metadata
-      remote_rev = (metadata[META_LAST_MODIFIED] || metadata[META_LAST_MODIFIED_DC]).to_i
-      return -1 unless local_rev > remote_rev
-    rescue BFS::FileNotFound
-      nil
-    end if local_rev.positive?
-
-    @stream.create metadata: { META_LAST_MODIFIED => local_rev.to_s } do |fmt|
-      iter = enum.respond_to?(:find_each) ? :find_each : :each
-      enum.send(iter) {|rec| fmt.encode(rec, **@fmt_opts) }
+    Feedx::Stream.open(@url, **@opts) do |stream|
+      enum = @enum.is_a?(Proc) ? @enum.call : @enum
+      last_mod = @last_mod.is_a?(Proc) ? @last_mod.call(enum) : @last_mod
+      local_rev = last_mod.is_a?(Integer) ? last_mod : (last_mod.to_f * 1000).floor
+
+      begin
+        metadata = stream.blob.info.metadata
+        remote_rev = (metadata[META_LAST_MODIFIED] || metadata[META_LAST_MODIFIED_DC]).to_i
+        return -1 unless local_rev > remote_rev
+      rescue BFS::FileNotFound
+        nil
+      end if local_rev.positive?
+
+      stream.create metadata: { META_LAST_MODIFIED => local_rev.to_s } do |fmt|
+        iter = enum.respond_to?(:find_each) ? :find_each : :each
+        enum.send(iter) {|rec| fmt.encode(rec, **@opts) }
+      end
+      stream.blob.info.size
     end
-    @stream.blob.info.size
   end
 end
 end
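Producer#perform now wraps all of its work in Feedx::Stream.open, so the blob handle is released even on an early return, and encode options are taken from the merged inline options. A usage sketch; the bucket URL and MyRecord are illustrative, and records only need to be encodable by the chosen format:

    require 'feedx'

    records = [MyRecord.new('X'), MyRecord.new('Y')]

    producer = Feedx::Producer.new('s3://my-bucket/feed.json.gz', last_modified: Time.now) { records }
    size = producer.perform
    # => number of bytes written, or -1 if the remote copy is already up to date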
data/lib/feedx/stream.rb CHANGED
@@ -6,26 +6,40 @@ module Feedx
   class Stream
     attr_reader :blob
 
+    # Behaves like new, but accepts an optional block.
+    # If a block is given, streams are automatically closed after the block is yielded.
+    def self.open(url, **opts)
+      stream = new(url, **opts)
+      return stream unless block_given?
+
+      begin
+        yield stream
+      ensure
+        stream.close
+      end
+    end
+
     # @param [String] url the blob URL.
     # @param [Hash] opts options
     # @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
     # @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
-    def initialize(url, **opts)
+    def initialize(url, format: nil, compress: nil, **opts)
       @blob = BFS::Blob.new(url)
-      @format = detect_format(opts[:format])
-      @compress = detect_compress(opts[:compress])
+      @format = detect_format(format)
+      @compress = detect_compress(compress)
+      @opts = opts
+
+      BFS.defer(self, :close)
     end
 
     # Opens the remote for reading.
     # @param [Hash] opts BFS::Blob#open options
     # @yield A block over a formatted stream.
     # @yieldparam [Feedx::Format::Abstract] formatted input stream.
-    def open(**opts)
+    def open(**opts, &block)
       @blob.open(**opts) do |io|
-        @compress.reader(io) do |cio|
-          @format.decoder(cio) do |fmt|
-            yield fmt
-          end
+        @compress.reader(io, **@opts) do |cio|
+          @format.decoder(cio, **@opts, &block)
         end
       end
     end
@@ -34,16 +48,19 @@ module Feedx
     # @param [Hash] opts BFS::Blob#create options
     # @yield A block over a formatted stream.
     # @yieldparam [Feedx::Format::Abstract] formatted output stream.
-    def create(**opts)
+    def create(**opts, &block)
       @blob.create(**opts) do |io|
-        @compress.writer(io) do |cio|
-          @format.encoder(cio) do |fmt|
-            yield fmt
-          end
+        @compress.writer(io, **@opts) do |cio|
+          @format.encoder(cio, **@opts, &block)
         end
       end
     end
 
+    # Closes the underlying connection.
+    def close
+      @blob.close
+    end
+
     private
 
     def detect_format(val)
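Stream.open behaves like File.open: without a block it returns the stream, with a block it yields the stream and always closes it; new(...) additionally registers a BFS.defer(self, :close) fallback. A short sketch, with an illustrative URL:

    require 'feedx'

    # Block form: the underlying BFS::Blob is closed automatically.
    Feedx::Stream.open('s3://my-bucket/feed.json.gz') do |stream|
      stream.open do |fmt|
        # read records via fmt.decode(MyRecord) / fmt.decode_each(MyRecord)
      end
    end

    # Non-block form: close explicitly (or rely on the BFS.defer fallback).
    stream = Feedx::Stream.open('s3://my-bucket/feed.json.gz')
    begin
      stream.create do |fmt|
        # write records via fmt.encode(record)
      end
    ensure
      stream.close
    end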
data/producer_test.go CHANGED
@@ -7,8 +7,8 @@ import (
 
 	"github.com/bsm/bfs"
 	"github.com/bsm/feedx"
-	. "github.com/onsi/ginkgo"
-	. "github.com/onsi/gomega"
+	. "github.com/bsm/ginkgo"
+	. "github.com/bsm/gomega"
 )
 
 var _ = Describe("Producer", func() {
data/reader_test.go CHANGED
@@ -8,8 +8,8 @@ import (
 	"github.com/bsm/bfs"
 	"github.com/bsm/feedx"
 	"github.com/bsm/feedx/internal/testdata"
-	. "github.com/onsi/ginkgo"
-	. "github.com/onsi/gomega"
+	. "github.com/bsm/ginkgo"
+	. "github.com/bsm/gomega"
 )
 
 var _ = Describe("Reader", func() {
data/spec/feedx/cache/memory_spec.rb CHANGED
@@ -1,7 +1,7 @@
 require 'spec_helper'
 
 RSpec.describe Feedx::Cache::Memory do
-  it 'should read/write' do
+  it 'read/writes' do
     expect(subject.fetch('key')).to be_nil
     expect(subject.fetch('key') { 'value' }).to eq('value')
     expect(subject.fetch('key')).to eq('value')
@@ -16,7 +16,7 @@ RSpec.describe Feedx::Cache::Memory do
     expect(subject.fetch('key')).to be_nil
   end
 
-  it 'should write strings' do
+  it 'writes strings' do
     subject.write('key', 5)
     expect(subject.read('key')).to eq('5')
   end
data/spec/feedx/cache/value_spec.rb CHANGED
@@ -5,7 +5,7 @@ RSpec.describe Feedx::Cache::Value do
     described_class.new(Feedx::Cache::Memory.new, 'key')
   end
 
-  it 'should read/write' do
+  it 'read/writes' do
     expect(subject.fetch).to be_nil
     expect(subject.fetch { 'value' }).to eq('value')
     expect(subject.fetch).to eq('value')
data/spec/feedx/compression/gzip_spec.rb CHANGED
@@ -1,7 +1,7 @@
 require 'spec_helper'
 
 RSpec.describe Feedx::Compression::Gzip do
-  it 'should wrap readers/writers' do
+  it 'wraps readers/writers' do
     wio = StringIO.new
     subject.writer(wio) {|w| w.write 'xyz' * 1000 }
     expect(wio.size).to be_within(20).of(40)
data/spec/feedx/compression/none_spec.rb CHANGED
@@ -1,7 +1,7 @@
 require 'spec_helper'
 
 RSpec.describe Feedx::Compression::None do
-  it 'should wrap readers/writers' do
+  it 'wraps readers/writers' do
     wio = StringIO.new
     subject.writer(wio) {|w| w.write 'xyz' * 1000 }
     expect(wio.size).to eq(3000)
data/spec/feedx/compression_spec.rb CHANGED
@@ -1,14 +1,14 @@
 require 'spec_helper'
 
 RSpec.describe Feedx::Compression do
-  it 'should resolve' do
+  it 'resolves' do
     expect(described_class.resolve(:gzip)).to be_instance_of(described_class::Gzip)
     expect(described_class.resolve(:gz)).to be_instance_of(described_class::Gzip)
     expect(described_class.resolve(nil)).to be_instance_of(described_class::None)
     expect { described_class.resolve(:txt) }.to raise_error(/invalid compression txt/)
   end
 
-  it 'should detect' do
+  it 'detects' do
     expect(described_class.detect('path/to/file.jsonz')).to be_instance_of(described_class::Gzip)
     expect(described_class.detect('path/to/file.json.gz')).to be_instance_of(described_class::Gzip)
     expect(described_class.detect('path/to/file.json')).to be_instance_of(described_class::None)
data/spec/feedx/consumer_spec.rb CHANGED
@@ -4,15 +4,16 @@ RSpec.describe Feedx::Consumer do
   let(:bucket) { BFS::Bucket::InMem.new }
   let(:klass) { Feedx::TestCase::Model }
   let(:cache) { Feedx::Cache::Memory.new.value('my-consumer') }
+
   before { allow(BFS).to receive(:resolve).and_return(bucket) }
 
-  it 'should reject invalid inputs' do
+  it 'rejects invalid inputs' do
     expect do
-      described_class.each('mock:///dir/file.txt', klass) {}
+      described_class.each('mock:///dir/file.txt', klass)
     end.to raise_error(/unable to detect format/)
   end
 
-  it 'should consume feeds' do
+  it 'consumes feeds' do
     url = mock_produce!
     csm = described_class.new(url, klass)
     expect(csm).to be_a(Enumerable)
@@ -24,7 +25,7 @@ RSpec.describe Feedx::Consumer do
     expect(cnt).to eq(300)
   end
 
-  it 'should perform conditionally' do
+  it 'performs conditionally' do
     url = mock_produce! last_modified: Time.at(1515151515)
     expect(described_class.new(url, klass, cache: cache).count).to eq(300)
     expect(described_class.new(url, klass, cache: cache).count).to eq(0)
data/spec/feedx/format/abstract_spec.rb CHANGED
@@ -2,10 +2,11 @@ require 'spec_helper'
 
 RSpec.describe Feedx::Format::Abstract do
   subject { Feedx::Format::JSON.new }
+
   let(:wio) { StringIO.new }
   let(:rio) { StringIO.open(wio.string) }
 
-  it 'should decode each' do
+  it 'decodes each' do
     subject.encoder wio do |enc|
       enc.encode(Feedx::TestCase::Model.new('X'))
       enc.encode(Feedx::TestCase::Model.new('Y'))
data/spec/feedx/format/json_spec.rb CHANGED
@@ -4,17 +4,17 @@ RSpec.describe Feedx::Format::JSON do
   let(:wio) { StringIO.new }
   let(:rio) { StringIO.open(wio.string) }
 
-  it 'should encode/decode' do
+  it 'encode/decodes' do
     subject.encoder wio do |enc|
       enc.encode(Feedx::TestCase::Model.new('X'))
       enc.encode(Feedx::TestCase::Model.new('Y'))
       enc.encode(Feedx::TestCase::Message.new(title: 'Z'))
     end
-    expect(wio.string.lines).to eq [
-      %({"title":"X","updated_at":"2018-01-05 11:25:15 UTC"}\n),
-      %({"title":"Y","updated_at":"2018-01-05 11:25:15 UTC"}\n),
-      %({"title":"Z"}\n),
-    ]
+    expect(wio.string).to eq(<<~JSON)
+      {"title":"X","updated_at":"2018-01-05 11:25:15 UTC"}
+      {"title":"Y","updated_at":"2018-01-05 11:25:15 UTC"}
+      {"title":"Z"}
+    JSON
 
     subject.decoder rio do |dec|
       expect(dec.decode(Feedx::TestCase::Model)).to eq(Feedx::TestCase::Model.new('X'))
data/spec/feedx/format/parquet_spec.rb CHANGED
@@ -11,7 +11,7 @@ RSpec.describe Feedx::Format::Parquet do
     ])
   end
 
-  it 'should encode/decode' do
+  it 'encode/decodes' do
     subject.encoder wio, schema: schema, batch_size: 2 do |enc|
       enc.encode(Feedx::TestCase::Model.new('X'))
       enc.encode(Feedx::TestCase::Model.new('Y'))