feedx 0.11.0 → 0.12.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.editorconfig +3 -0
- data/.github/workflows/test.yml +60 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +15 -4
- data/Gemfile +0 -2
- data/Gemfile.lock +80 -50
- data/Makefile +6 -6
- data/README.md +1 -1
- data/compression.go +18 -0
- data/compression_test.go +14 -2
- data/consumer_test.go +2 -2
- data/ext/parquet/decoder.go +170 -0
- data/ext/parquet/decoder_test.go +88 -0
- data/ext/parquet/go.mod +10 -0
- data/ext/parquet/go.sum +152 -0
- data/ext/parquet/parquet.go +78 -0
- data/ext/parquet/parquet_test.go +28 -0
- data/ext/parquet/reader.go +89 -0
- data/ext/parquet/testdata/alltypes_plain.parquet +0 -0
- data/ext/parquet/types.go +51 -0
- data/feedx.gemspec +5 -6
- data/feedx_test.go +2 -2
- data/format.go +45 -15
- data/format_test.go +4 -2
- data/go.mod +10 -5
- data/go.sum +90 -25
- data/internal/testdata/testdata.pb.go +176 -77
- data/lib/feedx/cache/abstract.rb +2 -2
- data/lib/feedx/cache/memory.rb +1 -0
- data/lib/feedx/compression/abstract.rb +2 -2
- data/lib/feedx/compression/gzip.rb +2 -2
- data/lib/feedx/compression/none.rb +2 -2
- data/lib/feedx/consumer.rb +15 -9
- data/lib/feedx/format.rb +4 -1
- data/lib/feedx/producer.rb +27 -22
- data/lib/feedx/stream.rb +30 -13
- data/producer_test.go +2 -2
- data/reader_test.go +2 -2
- data/spec/feedx/cache/memory_spec.rb +2 -2
- data/spec/feedx/cache/value_spec.rb +1 -1
- data/spec/feedx/compression/gzip_spec.rb +1 -1
- data/spec/feedx/compression/none_spec.rb +1 -1
- data/spec/feedx/compression_spec.rb +2 -2
- data/spec/feedx/consumer_spec.rb +5 -4
- data/spec/feedx/format/abstract_spec.rb +2 -1
- data/spec/feedx/format/json_spec.rb +6 -6
- data/spec/feedx/format/parquet_spec.rb +1 -1
- data/spec/feedx/format/protobuf_spec.rb +1 -1
- data/spec/feedx/format_spec.rb +2 -2
- data/spec/feedx/producer_spec.rb +15 -8
- data/spec/feedx/stream_spec.rb +36 -18
- data/writer_test.go +2 -2
- metadata +24 -23
- data/.travis.yml +0 -24
data/lib/feedx/cache/abstract.rb
CHANGED
@@ -5,12 +5,12 @@ class Feedx::Cache::Abstract
|
|
5
5
|
end
|
6
6
|
|
7
7
|
# Read reads a key.
|
8
|
-
def read(_key, **
|
8
|
+
def read(_key, **)
|
9
9
|
raise 'Not implemented'
|
10
10
|
end
|
11
11
|
|
12
12
|
# Write writes a key/value pair.
|
13
|
-
def write(_key, _value, **
|
13
|
+
def write(_key, _value, **)
|
14
14
|
raise 'Not implemented'
|
15
15
|
end
|
16
16
|
|
data/lib/feedx/cache/memory.rb
CHANGED
data/lib/feedx/compression/gzip.rb
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
require 'zlib'
|
2
2
|
|
3
3
|
class Feedx::Compression::Gzip < Feedx::Compression::Abstract
|
4
|
-
def reader(io, &block)
|
4
|
+
def reader(io, **, &block)
|
5
5
|
force_binmode(io)
|
6
6
|
Zlib::GzipReader.wrap(io, &block)
|
7
7
|
end
|
8
8
|
|
9
|
-
def writer(io, &block)
|
9
|
+
def writer(io, **, &block)
|
10
10
|
force_binmode(io)
|
11
11
|
Zlib::GzipWriter.wrap(io, &block)
|
12
12
|
end
|
data/lib/feedx/consumer.rb
CHANGED
@@ -16,33 +16,39 @@ module Feedx
|
|
16
16
|
# @param [Class] klass the record class.
|
17
17
|
# @param [Hash] opts options
|
18
18
|
# @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
|
19
|
-
# @option opts [Hash] :format_options format decode options. Default: {}.
|
20
19
|
# @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
|
21
20
|
# @option opts [Feedx::Cache::Value] :cache cache value to store remote last modified time and consume conditionally.
|
22
|
-
def initialize(url, klass, **opts)
|
23
|
-
@klass
|
24
|
-
@
|
25
|
-
@
|
26
|
-
@cache
|
21
|
+
def initialize(url, klass, format_options: {}, cache: nil, **opts)
|
22
|
+
@klass = klass
|
23
|
+
@url = url
|
24
|
+
@opts = opts.merge(format_options)
|
25
|
+
@cache = cache
|
26
|
+
|
27
|
+
return if format_options.empty? || (defined?(Gem::Deprecate) && Gem::Deprecate.skip)
|
28
|
+
|
29
|
+
warn "WARNING: passing format_options is deprecated; pass the options inline instead (called from #{caller(2..2).first})."
|
27
30
|
end
|
28
31
|
|
29
32
|
# @return [Boolean] returns true if performed.
|
30
33
|
def each(&block)
|
34
|
+
stream = Feedx::Stream.new(@url, **@opts)
|
31
35
|
remote_rev = nil
|
32
36
|
|
33
37
|
if @cache
|
34
|
-
metadata =
|
38
|
+
metadata = stream.blob.info.metadata
|
35
39
|
local_rev = @cache.read.to_i
|
36
40
|
remote_rev = (metadata[META_LAST_MODIFIED] || metadata[META_LAST_MODIFIED_DC]).to_i
|
37
41
|
return false if remote_rev.positive? && remote_rev <= local_rev
|
38
42
|
end
|
39
43
|
|
40
|
-
|
41
|
-
fmt.decode_each(@klass, **@
|
44
|
+
stream.open do |fmt|
|
45
|
+
fmt.decode_each(@klass, **@opts, &block)
|
42
46
|
end
|
43
47
|
@cache.write(remote_rev) if @cache && remote_rev
|
44
48
|
|
45
49
|
true
|
50
|
+
ensure
|
51
|
+
stream&.close
|
46
52
|
end
|
47
53
|
end
|
48
54
|
end
|
data/lib/feedx/format.rb
CHANGED
@@ -27,7 +27,7 @@ module Feedx
|
|
27
27
|
ext = File.extname(base)
|
28
28
|
raise ArgumentError, 'unable to detect format' if ext.empty?
|
29
29
|
|
30
|
-
kind = _resolve(ext[1
|
30
|
+
kind = _resolve(ext[1..]) || _resolve(ext[1..-2])
|
31
31
|
return kind if kind
|
32
32
|
|
33
33
|
base = base[0..-ext.size - 1]
|
@@ -39,6 +39,9 @@ module Feedx
|
|
39
39
|
def registry
|
40
40
|
@registry ||= {
|
41
41
|
'json' => :JSON,
|
42
|
+
'jsonl' => :JSON,
|
43
|
+
'ndjson' => :JSON,
|
44
|
+
'parquet' => :Parquet,
|
42
45
|
'pb' => :Protobuf,
|
43
46
|
'proto' => :Protobuf,
|
44
47
|
'protobuf' => :Protobuf,
|
data/lib/feedx/producer.rb
CHANGED
@@ -14,38 +14,43 @@ module Feedx
|
|
14
14
|
# @param [Hash] opts options
|
15
15
|
# @option opts [Enumerable,ActiveRecord::Relation] :enum relation or enumerator to stream.
|
16
16
|
# @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
|
17
|
-
# @option opts [Hash] :format_options format encode options. Default: {}.
|
18
17
|
# @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
|
19
18
|
# @option opts [Time,Proc] :last_modified the last modified time, used to determine if a push is necessary.
|
20
19
|
# @yield A block factory to generate the relation or enumerator.
|
21
20
|
# @yieldreturn [Enumerable,ActiveRecord::Relation] the relation or enumerator to stream.
|
22
|
-
def initialize(url, **opts, &block)
|
23
|
-
@enum =
|
21
|
+
def initialize(url, last_modified: nil, format_options: {}, enum: nil, **opts, &block)
|
22
|
+
@enum = enum || block
|
24
23
|
raise ArgumentError, "#{self.class.name}.new expects an :enum option or a block factory" unless @enum
|
25
24
|
|
26
|
-
@
|
27
|
-
@
|
28
|
-
@
|
25
|
+
@url = url
|
26
|
+
@opts = opts.merge(format_options)
|
27
|
+
@last_mod = last_modified
|
28
|
+
|
29
|
+
return if format_options.empty? || (defined?(Gem::Deprecate) && Gem::Deprecate.skip)
|
30
|
+
|
31
|
+
warn "WARNING: passing format_options is deprecated; pass the options inline instead (called from #{caller(2..2).first})."
|
29
32
|
end
|
30
33
|
|
31
34
|
def perform
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
35
|
+
Feedx::Stream.open(@url, **@opts) do |stream|
|
36
|
+
enum = @enum.is_a?(Proc) ? @enum.call : @enum
|
37
|
+
last_mod = @last_mod.is_a?(Proc) ? @last_mod.call(enum) : @last_mod
|
38
|
+
local_rev = last_mod.is_a?(Integer) ? last_mod : (last_mod.to_f * 1000).floor
|
39
|
+
|
40
|
+
begin
|
41
|
+
metadata = stream.blob.info.metadata
|
42
|
+
remote_rev = (metadata[META_LAST_MODIFIED] || metadata[META_LAST_MODIFIED_DC]).to_i
|
43
|
+
return -1 unless local_rev > remote_rev
|
44
|
+
rescue BFS::FileNotFound
|
45
|
+
nil
|
46
|
+
end if local_rev.positive?
|
47
|
+
|
48
|
+
stream.create metadata: { META_LAST_MODIFIED => local_rev.to_s } do |fmt|
|
49
|
+
iter = enum.respond_to?(:find_each) ? :find_each : :each
|
50
|
+
enum.send(iter) {|rec| fmt.encode(rec, **@opts) }
|
51
|
+
end
|
52
|
+
stream.blob.info.size
|
47
53
|
end
|
48
|
-
@stream.blob.info.size
|
49
54
|
end
|
50
55
|
end
|
51
56
|
end
|
data/lib/feedx/stream.rb
CHANGED
@@ -6,26 +6,40 @@ module Feedx
|
|
6
6
|
class Stream
|
7
7
|
attr_reader :blob
|
8
8
|
|
9
|
+
# Behaves like new, but accepts an optional block.
|
10
|
+
# If a block is given, streams are automatically closed after the block is yielded.
|
11
|
+
def self.open(url, **opts)
|
12
|
+
stream = new(url, **opts)
|
13
|
+
return stream unless block_given?
|
14
|
+
|
15
|
+
begin
|
16
|
+
yield stream
|
17
|
+
ensure
|
18
|
+
stream.close
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
9
22
|
# @param [String] url the blob URL.
|
10
23
|
# @param [Hash] opts options
|
11
24
|
# @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
|
12
25
|
# @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
|
13
|
-
def initialize(url, **opts)
|
26
|
+
def initialize(url, format: nil, compress: nil, **opts)
|
14
27
|
@blob = BFS::Blob.new(url)
|
15
|
-
@format = detect_format(
|
16
|
-
@compress = detect_compress(
|
28
|
+
@format = detect_format(format)
|
29
|
+
@compress = detect_compress(compress)
|
30
|
+
@opts = opts
|
31
|
+
|
32
|
+
BFS.defer(self, :close)
|
17
33
|
end
|
18
34
|
|
19
35
|
# Opens the remote for reading.
|
20
36
|
# @param [Hash] opts BFS::Blob#open options
|
21
37
|
# @yield A block over a formatted stream.
|
22
38
|
# @yieldparam [Feedx::Format::Abstract] formatted input stream.
|
23
|
-
def open(**opts)
|
39
|
+
def open(**opts, &block)
|
24
40
|
@blob.open(**opts) do |io|
|
25
|
-
@compress.reader(io) do |cio|
|
26
|
-
@format.decoder(cio
|
27
|
-
yield fmt
|
28
|
-
end
|
41
|
+
@compress.reader(io, **@opts) do |cio|
|
42
|
+
@format.decoder(cio, **@opts, &block)
|
29
43
|
end
|
30
44
|
end
|
31
45
|
end
|
@@ -34,16 +48,19 @@ module Feedx
|
|
34
48
|
# @param [Hash] opts BFS::Blob#create options
|
35
49
|
# @yield A block over a formatted stream.
|
36
50
|
# @yieldparam [Feedx::Format::Abstract] formatted output stream.
|
37
|
-
def create(**opts)
|
51
|
+
def create(**opts, &block)
|
38
52
|
@blob.create(**opts) do |io|
|
39
|
-
@compress.writer(io) do |cio|
|
40
|
-
@format.encoder(cio
|
41
|
-
yield fmt
|
42
|
-
end
|
53
|
+
@compress.writer(io, **@opts) do |cio|
|
54
|
+
@format.encoder(cio, **@opts, &block)
|
43
55
|
end
|
44
56
|
end
|
45
57
|
end
|
46
58
|
|
59
|
+
# Closes the underlying connection.
|
60
|
+
def close
|
61
|
+
@blob.close
|
62
|
+
end
|
63
|
+
|
47
64
|
private
|
48
65
|
|
49
66
|
def detect_format(val)
|
data/producer_test.go
CHANGED
data/reader_test.go
CHANGED
@@ -8,8 +8,8 @@ import (
|
|
8
8
|
"github.com/bsm/bfs"
|
9
9
|
"github.com/bsm/feedx"
|
10
10
|
"github.com/bsm/feedx/internal/testdata"
|
11
|
-
. "github.com/
|
12
|
-
. "github.com/
|
11
|
+
. "github.com/bsm/ginkgo"
|
12
|
+
. "github.com/bsm/gomega"
|
13
13
|
)
|
14
14
|
|
15
15
|
var _ = Describe("Reader", func() {
|
data/spec/feedx/cache/memory_spec.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
RSpec.describe Feedx::Cache::Memory do
|
4
|
-
it '
|
4
|
+
it 'read/writes' do
|
5
5
|
expect(subject.fetch('key')).to be_nil
|
6
6
|
expect(subject.fetch('key') { 'value' }).to eq('value')
|
7
7
|
expect(subject.fetch('key')).to eq('value')
|
@@ -16,7 +16,7 @@ RSpec.describe Feedx::Cache::Memory do
|
|
16
16
|
expect(subject.fetch('key')).to be_nil
|
17
17
|
end
|
18
18
|
|
19
|
-
it '
|
19
|
+
it 'writes strings' do
|
20
20
|
subject.write('key', 5)
|
21
21
|
expect(subject.read('key')).to eq('5')
|
22
22
|
end
|
data/spec/feedx/cache/value_spec.rb
CHANGED
@@ -5,7 +5,7 @@ RSpec.describe Feedx::Cache::Value do
|
|
5
5
|
described_class.new(Feedx::Cache::Memory.new, 'key')
|
6
6
|
end
|
7
7
|
|
8
|
-
it '
|
8
|
+
it 'read/writes' do
|
9
9
|
expect(subject.fetch).to be_nil
|
10
10
|
expect(subject.fetch { 'value' }).to eq('value')
|
11
11
|
expect(subject.fetch).to eq('value')
|
data/spec/feedx/compression_spec.rb
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
RSpec.describe Feedx::Compression do
|
4
|
-
it '
|
4
|
+
it 'resolves' do
|
5
5
|
expect(described_class.resolve(:gzip)).to be_instance_of(described_class::Gzip)
|
6
6
|
expect(described_class.resolve(:gz)).to be_instance_of(described_class::Gzip)
|
7
7
|
expect(described_class.resolve(nil)).to be_instance_of(described_class::None)
|
8
8
|
expect { described_class.resolve(:txt) }.to raise_error(/invalid compression txt/)
|
9
9
|
end
|
10
10
|
|
11
|
-
it '
|
11
|
+
it 'detects' do
|
12
12
|
expect(described_class.detect('path/to/file.jsonz')).to be_instance_of(described_class::Gzip)
|
13
13
|
expect(described_class.detect('path/to/file.json.gz')).to be_instance_of(described_class::Gzip)
|
14
14
|
expect(described_class.detect('path/to/file.json')).to be_instance_of(described_class::None)
|
data/spec/feedx/consumer_spec.rb
CHANGED
@@ -4,15 +4,16 @@ RSpec.describe Feedx::Consumer do
|
|
4
4
|
let(:bucket) { BFS::Bucket::InMem.new }
|
5
5
|
let(:klass) { Feedx::TestCase::Model }
|
6
6
|
let(:cache) { Feedx::Cache::Memory.new.value('my-consumer') }
|
7
|
+
|
7
8
|
before { allow(BFS).to receive(:resolve).and_return(bucket) }
|
8
9
|
|
9
|
-
it '
|
10
|
+
it 'rejects invalid inputs' do
|
10
11
|
expect do
|
11
|
-
described_class.each('mock:///dir/file.txt', klass)
|
12
|
+
described_class.each('mock:///dir/file.txt', klass)
|
12
13
|
end.to raise_error(/unable to detect format/)
|
13
14
|
end
|
14
15
|
|
15
|
-
it '
|
16
|
+
it 'consumes feeds' do
|
16
17
|
url = mock_produce!
|
17
18
|
csm = described_class.new(url, klass)
|
18
19
|
expect(csm).to be_a(Enumerable)
|
@@ -24,7 +25,7 @@ RSpec.describe Feedx::Consumer do
|
|
24
25
|
expect(cnt).to eq(300)
|
25
26
|
end
|
26
27
|
|
27
|
-
it '
|
28
|
+
it 'performs conditionally' do
|
28
29
|
url = mock_produce! last_modified: Time.at(1515151515)
|
29
30
|
expect(described_class.new(url, klass, cache: cache).count).to eq(300)
|
30
31
|
expect(described_class.new(url, klass, cache: cache).count).to eq(0)
|
data/spec/feedx/format/abstract_spec.rb
CHANGED
@@ -2,10 +2,11 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
RSpec.describe Feedx::Format::Abstract do
|
4
4
|
subject { Feedx::Format::JSON.new }
|
5
|
+
|
5
6
|
let(:wio) { StringIO.new }
|
6
7
|
let(:rio) { StringIO.open(wio.string) }
|
7
8
|
|
8
|
-
it '
|
9
|
+
it 'decodes each' do
|
9
10
|
subject.encoder wio do |enc|
|
10
11
|
enc.encode(Feedx::TestCase::Model.new('X'))
|
11
12
|
enc.encode(Feedx::TestCase::Model.new('Y'))
|
data/spec/feedx/format/json_spec.rb
CHANGED
@@ -4,17 +4,17 @@ RSpec.describe Feedx::Format::JSON do
|
|
4
4
|
let(:wio) { StringIO.new }
|
5
5
|
let(:rio) { StringIO.open(wio.string) }
|
6
6
|
|
7
|
-
it '
|
7
|
+
it 'encode/decodes' do
|
8
8
|
subject.encoder wio do |enc|
|
9
9
|
enc.encode(Feedx::TestCase::Model.new('X'))
|
10
10
|
enc.encode(Feedx::TestCase::Model.new('Y'))
|
11
11
|
enc.encode(Feedx::TestCase::Message.new(title: 'Z'))
|
12
12
|
end
|
13
|
-
expect(wio.string
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
13
|
+
expect(wio.string).to eq(<<~JSON)
|
14
|
+
{"title":"X","updated_at":"2018-01-05 11:25:15 UTC"}
|
15
|
+
{"title":"Y","updated_at":"2018-01-05 11:25:15 UTC"}
|
16
|
+
{"title":"Z"}
|
17
|
+
JSON
|
18
18
|
|
19
19
|
subject.decoder rio do |dec|
|
20
20
|
expect(dec.decode(Feedx::TestCase::Model)).to eq(Feedx::TestCase::Model.new('X'))
|
data/spec/feedx/format/parquet_spec.rb
CHANGED
@@ -11,7 +11,7 @@ RSpec.describe Feedx::Format::Parquet do
|
|
11
11
|
])
|
12
12
|
end
|
13
13
|
|
14
|
-
it '
|
14
|
+
it 'encode/decodes' do
|
15
15
|
subject.encoder wio, schema: schema, batch_size: 2 do |enc|
|
16
16
|
enc.encode(Feedx::TestCase::Model.new('X'))
|
17
17
|
enc.encode(Feedx::TestCase::Model.new('Y'))
|