feedx 0.11.0 → 0.12.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.editorconfig +3 -0
- data/.github/workflows/test.yml +60 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +15 -4
- data/Gemfile +0 -2
- data/Gemfile.lock +80 -50
- data/Makefile +6 -6
- data/README.md +1 -1
- data/compression.go +18 -0
- data/compression_test.go +14 -2
- data/consumer_test.go +2 -2
- data/ext/parquet/decoder.go +170 -0
- data/ext/parquet/decoder_test.go +88 -0
- data/ext/parquet/go.mod +10 -0
- data/ext/parquet/go.sum +152 -0
- data/ext/parquet/parquet.go +78 -0
- data/ext/parquet/parquet_test.go +28 -0
- data/ext/parquet/reader.go +89 -0
- data/ext/parquet/testdata/alltypes_plain.parquet +0 -0
- data/ext/parquet/types.go +51 -0
- data/feedx.gemspec +5 -6
- data/feedx_test.go +2 -2
- data/format.go +45 -15
- data/format_test.go +4 -2
- data/go.mod +10 -5
- data/go.sum +90 -25
- data/internal/testdata/testdata.pb.go +176 -77
- data/lib/feedx/cache/abstract.rb +2 -2
- data/lib/feedx/cache/memory.rb +1 -0
- data/lib/feedx/compression/abstract.rb +2 -2
- data/lib/feedx/compression/gzip.rb +2 -2
- data/lib/feedx/compression/none.rb +2 -2
- data/lib/feedx/consumer.rb +15 -9
- data/lib/feedx/format.rb +4 -1
- data/lib/feedx/producer.rb +27 -22
- data/lib/feedx/stream.rb +30 -13
- data/producer_test.go +2 -2
- data/reader_test.go +2 -2
- data/spec/feedx/cache/memory_spec.rb +2 -2
- data/spec/feedx/cache/value_spec.rb +1 -1
- data/spec/feedx/compression/gzip_spec.rb +1 -1
- data/spec/feedx/compression/none_spec.rb +1 -1
- data/spec/feedx/compression_spec.rb +2 -2
- data/spec/feedx/consumer_spec.rb +5 -4
- data/spec/feedx/format/abstract_spec.rb +2 -1
- data/spec/feedx/format/json_spec.rb +6 -6
- data/spec/feedx/format/parquet_spec.rb +1 -1
- data/spec/feedx/format/protobuf_spec.rb +1 -1
- data/spec/feedx/format_spec.rb +2 -2
- data/spec/feedx/producer_spec.rb +15 -8
- data/spec/feedx/stream_spec.rb +36 -18
- data/writer_test.go +2 -2
- metadata +24 -23
- data/.travis.yml +0 -24
data/lib/feedx/cache/abstract.rb
CHANGED
@@ -5,12 +5,12 @@ class Feedx::Cache::Abstract
|
|
5
5
|
end
|
6
6
|
|
7
7
|
# Read reads a key.
|
8
|
-
def read(_key, **
|
8
|
+
def read(_key, **)
|
9
9
|
raise 'Not implemented'
|
10
10
|
end
|
11
11
|
|
12
12
|
# Write writes a key/value pair.
|
13
|
-
def write(_key, _value, **
|
13
|
+
def write(_key, _value, **)
|
14
14
|
raise 'Not implemented'
|
15
15
|
end
|
16
16
|
|
data/lib/feedx/cache/memory.rb
CHANGED
data/lib/feedx/compression/gzip.rb
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
require 'zlib'
|
2
2
|
|
3
3
|
class Feedx::Compression::Gzip < Feedx::Compression::Abstract
|
4
|
-
def reader(io, &block)
|
4
|
+
def reader(io, **, &block)
|
5
5
|
force_binmode(io)
|
6
6
|
Zlib::GzipReader.wrap(io, &block)
|
7
7
|
end
|
8
8
|
|
9
|
-
def writer(io, &block)
|
9
|
+
def writer(io, **, &block)
|
10
10
|
force_binmode(io)
|
11
11
|
Zlib::GzipWriter.wrap(io, &block)
|
12
12
|
end
|
data/lib/feedx/consumer.rb
CHANGED
@@ -16,33 +16,39 @@ module Feedx
|
|
16
16
|
# @param [Class] klass the record class.
|
17
17
|
# @param [Hash] opts options
|
18
18
|
# @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
|
19
|
-
# @option opts [Hash] :format_options format decode options. Default: {}.
|
20
19
|
# @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
|
21
20
|
# @option opts [Feedx::Cache::Value] :cache cache value to store remote last modified time and consume conditionally.
|
22
|
-
def initialize(url, klass, **opts)
|
23
|
-
@klass
|
24
|
-
@
|
25
|
-
@
|
26
|
-
@cache
|
21
|
+
def initialize(url, klass, format_options: {}, cache: nil, **opts)
|
22
|
+
@klass = klass
|
23
|
+
@url = url
|
24
|
+
@opts = opts.merge(format_options)
|
25
|
+
@cache = cache
|
26
|
+
|
27
|
+
return if format_options.empty? || (defined?(Gem::Deprecate) && Gem::Deprecate.skip)
|
28
|
+
|
29
|
+
warn "WARNING: passing format_options is deprecated; pass the options inline instead (called from #{caller(2..2).first})."
|
27
30
|
end
|
28
31
|
|
29
32
|
# @return [Boolean] returns true if performed.
|
30
33
|
def each(&block)
|
34
|
+
stream = Feedx::Stream.new(@url, **@opts)
|
31
35
|
remote_rev = nil
|
32
36
|
|
33
37
|
if @cache
|
34
|
-
metadata =
|
38
|
+
metadata = stream.blob.info.metadata
|
35
39
|
local_rev = @cache.read.to_i
|
36
40
|
remote_rev = (metadata[META_LAST_MODIFIED] || metadata[META_LAST_MODIFIED_DC]).to_i
|
37
41
|
return false if remote_rev.positive? && remote_rev <= local_rev
|
38
42
|
end
|
39
43
|
|
40
|
-
|
41
|
-
fmt.decode_each(@klass, **@
|
44
|
+
stream.open do |fmt|
|
45
|
+
fmt.decode_each(@klass, **@opts, &block)
|
42
46
|
end
|
43
47
|
@cache.write(remote_rev) if @cache && remote_rev
|
44
48
|
|
45
49
|
true
|
50
|
+
ensure
|
51
|
+
stream&.close
|
46
52
|
end
|
47
53
|
end
|
48
54
|
end
|
data/lib/feedx/format.rb
CHANGED
@@ -27,7 +27,7 @@ module Feedx
|
|
27
27
|
ext = File.extname(base)
|
28
28
|
raise ArgumentError, 'unable to detect format' if ext.empty?
|
29
29
|
|
30
|
-
kind = _resolve(ext[1
|
30
|
+
kind = _resolve(ext[1..]) || _resolve(ext[1..-2])
|
31
31
|
return kind if kind
|
32
32
|
|
33
33
|
base = base[0..-ext.size - 1]
|
@@ -39,6 +39,9 @@ module Feedx
|
|
39
39
|
def registry
|
40
40
|
@registry ||= {
|
41
41
|
'json' => :JSON,
|
42
|
+
'jsonl' => :JSON,
|
43
|
+
'ndjson' => :JSON,
|
44
|
+
'parquet' => :Parquet,
|
42
45
|
'pb' => :Protobuf,
|
43
46
|
'proto' => :Protobuf,
|
44
47
|
'protobuf' => :Protobuf,
|
data/lib/feedx/producer.rb
CHANGED
@@ -14,38 +14,43 @@ module Feedx
|
|
14
14
|
# @param [Hash] opts options
|
15
15
|
# @option opts [Enumerable,ActiveRecord::Relation] :enum relation or enumerator to stream.
|
16
16
|
# @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
|
17
|
-
# @option opts [Hash] :format_options format encode options. Default: {}.
|
18
17
|
# @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
|
19
18
|
# @option opts [Time,Proc] :last_modified the last modified time, used to determine if a push is necessary.
|
20
19
|
# @yield A block factory to generate the relation or enumerator.
|
21
20
|
# @yieldreturn [Enumerable,ActiveRecord::Relation] the relation or enumerator to stream.
|
22
|
-
def initialize(url, **opts, &block)
|
23
|
-
@enum =
|
21
|
+
def initialize(url, last_modified: nil, format_options: {}, enum: nil, **opts, &block)
|
22
|
+
@enum = enum || block
|
24
23
|
raise ArgumentError, "#{self.class.name}.new expects an :enum option or a block factory" unless @enum
|
25
24
|
|
26
|
-
@
|
27
|
-
@
|
28
|
-
@
|
25
|
+
@url = url
|
26
|
+
@opts = opts.merge(format_options)
|
27
|
+
@last_mod = last_modified
|
28
|
+
|
29
|
+
return if format_options.empty? || (defined?(Gem::Deprecate) && Gem::Deprecate.skip)
|
30
|
+
|
31
|
+
warn "WARNING: passing format_options is deprecated; pass the options inline instead (called from #{caller(2..2).first})."
|
29
32
|
end
|
30
33
|
|
31
34
|
def perform
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
35
|
+
Feedx::Stream.open(@url, **@opts) do |stream|
|
36
|
+
enum = @enum.is_a?(Proc) ? @enum.call : @enum
|
37
|
+
last_mod = @last_mod.is_a?(Proc) ? @last_mod.call(enum) : @last_mod
|
38
|
+
local_rev = last_mod.is_a?(Integer) ? last_mod : (last_mod.to_f * 1000).floor
|
39
|
+
|
40
|
+
begin
|
41
|
+
metadata = stream.blob.info.metadata
|
42
|
+
remote_rev = (metadata[META_LAST_MODIFIED] || metadata[META_LAST_MODIFIED_DC]).to_i
|
43
|
+
return -1 unless local_rev > remote_rev
|
44
|
+
rescue BFS::FileNotFound
|
45
|
+
nil
|
46
|
+
end if local_rev.positive?
|
47
|
+
|
48
|
+
stream.create metadata: { META_LAST_MODIFIED => local_rev.to_s } do |fmt|
|
49
|
+
iter = enum.respond_to?(:find_each) ? :find_each : :each
|
50
|
+
enum.send(iter) {|rec| fmt.encode(rec, **@opts) }
|
51
|
+
end
|
52
|
+
stream.blob.info.size
|
47
53
|
end
|
48
|
-
@stream.blob.info.size
|
49
54
|
end
|
50
55
|
end
|
51
56
|
end
|
data/lib/feedx/stream.rb
CHANGED
@@ -6,26 +6,40 @@ module Feedx
|
|
6
6
|
class Stream
|
7
7
|
attr_reader :blob
|
8
8
|
|
9
|
+
# Behaves like new, but accepts an optional block.
|
10
|
+
# If a block is given, streams are automatically closed after the block is yielded.
|
11
|
+
def self.open(url, **opts)
|
12
|
+
stream = new(url, **opts)
|
13
|
+
return stream unless block_given?
|
14
|
+
|
15
|
+
begin
|
16
|
+
yield stream
|
17
|
+
ensure
|
18
|
+
stream.close
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
9
22
|
# @param [String] url the blob URL.
|
10
23
|
# @param [Hash] opts options
|
11
24
|
# @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
|
12
25
|
# @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
|
13
|
-
def initialize(url, **opts)
|
26
|
+
def initialize(url, format: nil, compress: nil, **opts)
|
14
27
|
@blob = BFS::Blob.new(url)
|
15
|
-
@format = detect_format(
|
16
|
-
@compress = detect_compress(
|
28
|
+
@format = detect_format(format)
|
29
|
+
@compress = detect_compress(compress)
|
30
|
+
@opts = opts
|
31
|
+
|
32
|
+
BFS.defer(self, :close)
|
17
33
|
end
|
18
34
|
|
19
35
|
# Opens the remote for reading.
|
20
36
|
# @param [Hash] opts BFS::Blob#open options
|
21
37
|
# @yield A block over a formatted stream.
|
22
38
|
# @yieldparam [Feedx::Format::Abstract] formatted input stream.
|
23
|
-
def open(**opts)
|
39
|
+
def open(**opts, &block)
|
24
40
|
@blob.open(**opts) do |io|
|
25
|
-
@compress.reader(io) do |cio|
|
26
|
-
@format.decoder(cio
|
27
|
-
yield fmt
|
28
|
-
end
|
41
|
+
@compress.reader(io, **@opts) do |cio|
|
42
|
+
@format.decoder(cio, **@opts, &block)
|
29
43
|
end
|
30
44
|
end
|
31
45
|
end
|
@@ -34,16 +48,19 @@ module Feedx
|
|
34
48
|
# @param [Hash] opts BFS::Blob#create options
|
35
49
|
# @yield A block over a formatted stream.
|
36
50
|
# @yieldparam [Feedx::Format::Abstract] formatted output stream.
|
37
|
-
def create(**opts)
|
51
|
+
def create(**opts, &block)
|
38
52
|
@blob.create(**opts) do |io|
|
39
|
-
@compress.writer(io) do |cio|
|
40
|
-
@format.encoder(cio
|
41
|
-
yield fmt
|
42
|
-
end
|
53
|
+
@compress.writer(io, **@opts) do |cio|
|
54
|
+
@format.encoder(cio, **@opts, &block)
|
43
55
|
end
|
44
56
|
end
|
45
57
|
end
|
46
58
|
|
59
|
+
# Closes the underlying connection.
|
60
|
+
def close
|
61
|
+
@blob.close
|
62
|
+
end
|
63
|
+
|
47
64
|
private
|
48
65
|
|
49
66
|
def detect_format(val)
|
data/producer_test.go
CHANGED
data/reader_test.go
CHANGED
@@ -8,8 +8,8 @@ import (
|
|
8
8
|
"github.com/bsm/bfs"
|
9
9
|
"github.com/bsm/feedx"
|
10
10
|
"github.com/bsm/feedx/internal/testdata"
|
11
|
-
. "github.com/
|
12
|
-
. "github.com/
|
11
|
+
. "github.com/bsm/ginkgo"
|
12
|
+
. "github.com/bsm/gomega"
|
13
13
|
)
|
14
14
|
|
15
15
|
var _ = Describe("Reader", func() {
|
data/spec/feedx/cache/memory_spec.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
RSpec.describe Feedx::Cache::Memory do
|
4
|
-
it '
|
4
|
+
it 'read/writes' do
|
5
5
|
expect(subject.fetch('key')).to be_nil
|
6
6
|
expect(subject.fetch('key') { 'value' }).to eq('value')
|
7
7
|
expect(subject.fetch('key')).to eq('value')
|
@@ -16,7 +16,7 @@ RSpec.describe Feedx::Cache::Memory do
|
|
16
16
|
expect(subject.fetch('key')).to be_nil
|
17
17
|
end
|
18
18
|
|
19
|
-
it '
|
19
|
+
it 'writes strings' do
|
20
20
|
subject.write('key', 5)
|
21
21
|
expect(subject.read('key')).to eq('5')
|
22
22
|
end
|
data/spec/feedx/cache/value_spec.rb
CHANGED
@@ -5,7 +5,7 @@ RSpec.describe Feedx::Cache::Value do
|
|
5
5
|
described_class.new(Feedx::Cache::Memory.new, 'key')
|
6
6
|
end
|
7
7
|
|
8
|
-
it '
|
8
|
+
it 'read/writes' do
|
9
9
|
expect(subject.fetch).to be_nil
|
10
10
|
expect(subject.fetch { 'value' }).to eq('value')
|
11
11
|
expect(subject.fetch).to eq('value')
|
data/spec/feedx/compression_spec.rb
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
RSpec.describe Feedx::Compression do
|
4
|
-
it '
|
4
|
+
it 'resolves' do
|
5
5
|
expect(described_class.resolve(:gzip)).to be_instance_of(described_class::Gzip)
|
6
6
|
expect(described_class.resolve(:gz)).to be_instance_of(described_class::Gzip)
|
7
7
|
expect(described_class.resolve(nil)).to be_instance_of(described_class::None)
|
8
8
|
expect { described_class.resolve(:txt) }.to raise_error(/invalid compression txt/)
|
9
9
|
end
|
10
10
|
|
11
|
-
it '
|
11
|
+
it 'detects' do
|
12
12
|
expect(described_class.detect('path/to/file.jsonz')).to be_instance_of(described_class::Gzip)
|
13
13
|
expect(described_class.detect('path/to/file.json.gz')).to be_instance_of(described_class::Gzip)
|
14
14
|
expect(described_class.detect('path/to/file.json')).to be_instance_of(described_class::None)
|
data/spec/feedx/consumer_spec.rb
CHANGED
@@ -4,15 +4,16 @@ RSpec.describe Feedx::Consumer do
|
|
4
4
|
let(:bucket) { BFS::Bucket::InMem.new }
|
5
5
|
let(:klass) { Feedx::TestCase::Model }
|
6
6
|
let(:cache) { Feedx::Cache::Memory.new.value('my-consumer') }
|
7
|
+
|
7
8
|
before { allow(BFS).to receive(:resolve).and_return(bucket) }
|
8
9
|
|
9
|
-
it '
|
10
|
+
it 'rejects invalid inputs' do
|
10
11
|
expect do
|
11
|
-
described_class.each('mock:///dir/file.txt', klass)
|
12
|
+
described_class.each('mock:///dir/file.txt', klass)
|
12
13
|
end.to raise_error(/unable to detect format/)
|
13
14
|
end
|
14
15
|
|
15
|
-
it '
|
16
|
+
it 'consumes feeds' do
|
16
17
|
url = mock_produce!
|
17
18
|
csm = described_class.new(url, klass)
|
18
19
|
expect(csm).to be_a(Enumerable)
|
@@ -24,7 +25,7 @@ RSpec.describe Feedx::Consumer do
|
|
24
25
|
expect(cnt).to eq(300)
|
25
26
|
end
|
26
27
|
|
27
|
-
it '
|
28
|
+
it 'performs conditionally' do
|
28
29
|
url = mock_produce! last_modified: Time.at(1515151515)
|
29
30
|
expect(described_class.new(url, klass, cache: cache).count).to eq(300)
|
30
31
|
expect(described_class.new(url, klass, cache: cache).count).to eq(0)
|
data/spec/feedx/format/abstract_spec.rb
CHANGED
@@ -2,10 +2,11 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
RSpec.describe Feedx::Format::Abstract do
|
4
4
|
subject { Feedx::Format::JSON.new }
|
5
|
+
|
5
6
|
let(:wio) { StringIO.new }
|
6
7
|
let(:rio) { StringIO.open(wio.string) }
|
7
8
|
|
8
|
-
it '
|
9
|
+
it 'decodes each' do
|
9
10
|
subject.encoder wio do |enc|
|
10
11
|
enc.encode(Feedx::TestCase::Model.new('X'))
|
11
12
|
enc.encode(Feedx::TestCase::Model.new('Y'))
|
data/spec/feedx/format/json_spec.rb
CHANGED
@@ -4,17 +4,17 @@ RSpec.describe Feedx::Format::JSON do
|
|
4
4
|
let(:wio) { StringIO.new }
|
5
5
|
let(:rio) { StringIO.open(wio.string) }
|
6
6
|
|
7
|
-
it '
|
7
|
+
it 'encode/decodes' do
|
8
8
|
subject.encoder wio do |enc|
|
9
9
|
enc.encode(Feedx::TestCase::Model.new('X'))
|
10
10
|
enc.encode(Feedx::TestCase::Model.new('Y'))
|
11
11
|
enc.encode(Feedx::TestCase::Message.new(title: 'Z'))
|
12
12
|
end
|
13
|
-
expect(wio.string
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
13
|
+
expect(wio.string).to eq(<<~JSON)
|
14
|
+
{"title":"X","updated_at":"2018-01-05 11:25:15 UTC"}
|
15
|
+
{"title":"Y","updated_at":"2018-01-05 11:25:15 UTC"}
|
16
|
+
{"title":"Z"}
|
17
|
+
JSON
|
18
18
|
|
19
19
|
subject.decoder rio do |dec|
|
20
20
|
expect(dec.decode(Feedx::TestCase::Model)).to eq(Feedx::TestCase::Model.new('X'))
|
data/spec/feedx/format/parquet_spec.rb
CHANGED
@@ -11,7 +11,7 @@ RSpec.describe Feedx::Format::Parquet do
|
|
11
11
|
])
|
12
12
|
end
|
13
13
|
|
14
|
-
it '
|
14
|
+
it 'encode/decodes' do
|
15
15
|
subject.encoder wio, schema: schema, batch_size: 2 do |enc|
|
16
16
|
enc.encode(Feedx::TestCase::Model.new('X'))
|
17
17
|
enc.encode(Feedx::TestCase::Model.new('Y'))
|