feedx 0.10.1 → 0.12.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +2 -0
- data/.travis.yml +13 -6
- data/Gemfile.lock +42 -19
- data/Makefile +5 -0
- data/consumer_test.go +5 -5
- data/feedx.gemspec +3 -2
- data/feedx_test.go +12 -9
- data/format.go +16 -16
- data/format_test.go +6 -7
- data/go.mod +5 -10
- data/go.sum +43 -24
- data/internal/testdata/testdata.pb.go +124 -0
- data/internal/testdata/testdata.proto +15 -0
- data/lib/feedx/cache/abstract.rb +2 -2
- data/lib/feedx/compression.rb +11 -4
- data/lib/feedx/compression/abstract.rb +2 -2
- data/lib/feedx/compression/gzip.rb +14 -16
- data/lib/feedx/compression/none.rb +4 -4
- data/lib/feedx/consumer.rb +17 -11
- data/lib/feedx/format.rb +18 -9
- data/lib/feedx/format/abstract.rb +42 -13
- data/lib/feedx/format/json.rb +12 -8
- data/lib/feedx/format/parquet.rb +102 -0
- data/lib/feedx/format/protobuf.rb +16 -8
- data/lib/feedx/producer.rb +20 -14
- data/lib/feedx/stream.rb +42 -25
- data/producer_test.go +1 -2
- data/reader_test.go +7 -8
- data/spec/feedx/compression/gzip_spec.rb +2 -2
- data/spec/feedx/compression/none_spec.rb +2 -2
- data/spec/feedx/compression_spec.rb +9 -9
- data/spec/feedx/consumer_spec.rb +6 -3
- data/spec/feedx/format/abstract_spec.rb +11 -8
- data/spec/feedx/format/json_spec.rb +12 -11
- data/spec/feedx/format/parquet_spec.rb +30 -0
- data/spec/feedx/format/protobuf_spec.rb +12 -11
- data/spec/feedx/format_spec.rb +8 -8
- data/spec/feedx/producer_spec.rb +6 -0
- data/spec/feedx/stream_spec.rb +28 -3
- data/spec/spec_helper.rb +17 -1
- data/writer_test.go +1 -1
- metadata +22 -3
@@ -0,0 +1,102 @@
|
|
1
|
+
require 'parquet'
|
2
|
+
require 'tmpdir'
|
3
|
+
|
4
|
+
class Feedx::Format::Parquet < Feedx::Format::Abstract
|
5
|
+
class Record < Arrow::Record
|
6
|
+
def each_pair
|
7
|
+
container.columns.each do |col|
|
8
|
+
yield col.name, col[index]
|
9
|
+
end
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
class Decoder < Feedx::Format::Abstract::Decoder
|
14
|
+
def initialize(io, **)
|
15
|
+
super(io)
|
16
|
+
|
17
|
+
@table = read_table
|
18
|
+
@cursor = 0
|
19
|
+
end
|
20
|
+
|
21
|
+
def eof?
|
22
|
+
@cursor >= @table.n_rows
|
23
|
+
end
|
24
|
+
|
25
|
+
def decode(target, **)
|
26
|
+
return if eof?
|
27
|
+
|
28
|
+
rec = Record.new(@table, @cursor)
|
29
|
+
@cursor += 1
|
30
|
+
|
31
|
+
target = target.allocate if target.is_a?(Class)
|
32
|
+
target.from_parquet(rec)
|
33
|
+
target
|
34
|
+
end
|
35
|
+
|
36
|
+
private
|
37
|
+
|
38
|
+
def read_table
|
39
|
+
tmpname = ::Dir::Tmpname.create('feedx-parquet') {|path, *| path }
|
40
|
+
IO.copy_stream(@io, tmpname)
|
41
|
+
|
42
|
+
@table = Arrow::Table.load(tmpname, format: 'parquet')
|
43
|
+
ensure
|
44
|
+
unlink!(tmpname) if tmpname
|
45
|
+
end
|
46
|
+
|
47
|
+
def unlink!(tmpname)
|
48
|
+
File.unlink(tmpname)
|
49
|
+
rescue Errno::ENOENT
|
50
|
+
nil
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
class Encoder < Feedx::Format::Abstract::Encoder
|
55
|
+
attr_reader :schema
|
56
|
+
|
57
|
+
def initialize(io, schema:, buffer_size: 1 << 20, batch_size: 10_000)
|
58
|
+
super(io)
|
59
|
+
|
60
|
+
@schema = schema
|
61
|
+
@batch_size = batch_size.to_i
|
62
|
+
@buffer_size = buffer_size.to_i
|
63
|
+
|
64
|
+
@tmpname = ::Dir::Tmpname.create('feedx-parquet') {|path, *| path }
|
65
|
+
@output = Arrow::FileOutputStream.new(@tmpname, append: false)
|
66
|
+
@writer = Parquet::ArrowFileWriter.new(@schema, @output)
|
67
|
+
@batch = []
|
68
|
+
end
|
69
|
+
|
70
|
+
def encode(msg, **opts)
|
71
|
+
msg = msg.to_parquet(@schema, **opts) if msg.respond_to?(:to_parquet)
|
72
|
+
|
73
|
+
res = @batch.push(msg)
|
74
|
+
flush_table if @batch.size >= @batch_size
|
75
|
+
res
|
76
|
+
end
|
77
|
+
|
78
|
+
def close
|
79
|
+
flush_table unless @batch.empty?
|
80
|
+
|
81
|
+
@writer.close
|
82
|
+
@output.close
|
83
|
+
IO.copy_stream(@tmpname, @io)
|
84
|
+
ensure
|
85
|
+
unlink!
|
86
|
+
end
|
87
|
+
|
88
|
+
private
|
89
|
+
|
90
|
+
def flush_table
|
91
|
+
table = Arrow::RecordBatch.new(@schema, @batch).to_table
|
92
|
+
@writer.write_table table, @buffer_size
|
93
|
+
@batch.clear
|
94
|
+
end
|
95
|
+
|
96
|
+
def unlink!
|
97
|
+
File.unlink(@tmpname)
|
98
|
+
rescue Errno::ENOENT
|
99
|
+
nil
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
require 'pbio'

class Feedx::Format::Protobuf < Feedx::Format::Abstract
  # Decodes length-delimited protobuf messages from a stream.
  class Decoder < Feedx::Format::Abstract::Decoder
    # Wraps the raw IO in a PBIO::Delimited reader before delegating to super.
    def initialize(io, **opts)
      super PBIO::Delimited.new(io), **opts
    end

    # Reads the next delimited message into target.
    def decode(target, **)
      @io.read(target)
    end
  end

  # Encodes messages as a length-delimited protobuf stream.
  class Encoder < Feedx::Format::Abstract::Encoder
    # Wraps the raw IO in a PBIO::Delimited writer before delegating to super.
    def initialize(io, **opts)
      super PBIO::Delimited.new(io), **opts
    end

    # Writes a single message; objects responding to #to_pb are converted first.
    def encode(msg, **opts)
      msg = msg.to_pb(**opts) if msg.respond_to?(:to_pb)
      @io.write msg
    end
  end
end
|
data/lib/feedx/producer.rb
CHANGED
# Produces a relation as an encoded feed to a remote location.
class Producer
  # See constructor.
  def self.perform(url, **opts, &block)
    new(url, **opts, &block).perform
  end

  # @param [String] url the destination URL.
  # @param [Hash] opts options
  # @option opts [Enumerable,ActiveRecord::Relation] :enum relation or enumerator to stream.
  # @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
  # @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
  # @option opts [Time,Proc] :last_modified the last modified time, used to determine if a push is necessary.
  # @yield A block factory to generate the relation or enumerator.
  # @yieldreturn [Enumerable,ActiveRecord::Relation] the relation or enumerator to stream.
  def initialize(url, last_modified: nil, format_options: {}, enum: nil, **opts, &block)
    @enum = enum || block
    raise ArgumentError, "#{self.class.name}.new expects an :enum option or a block factory" unless @enum

    @url = url
    # format_options is deprecated; merged into the remaining opts for compatibility.
    @opts = opts.merge(format_options)
    @last_mod = last_modified

    return if format_options.empty? || (defined?(Gem::Deprecate) && Gem::Deprecate.skip)

    warn "WARNING: passing format_options is deprecated; pass the options inline instead (called from #{caller(2..2).first})."
  end

  # Pushes the feed, unless the remote copy is already at least as new as
  # the local revision.
  # @return [Integer] the size of the written blob, or -1 when skipped.
  def perform
    stream = Feedx::Stream.new(@url, **@opts)
    enum = @enum.is_a?(Proc) ? @enum.call : @enum
    last_mod = @last_mod.is_a?(Proc) ? @last_mod.call(enum) : @last_mod
    # Revisions are compared as integer timestamps (millis unless already an Integer).
    local_rev = last_mod.is_a?(Integer) ? last_mod : (last_mod.to_f * 1000).floor

    begin
      metadata = stream.blob.info.metadata
      remote_rev = (metadata[META_LAST_MODIFIED] || metadata[META_LAST_MODIFIED_DC]).to_i
      return -1 unless local_rev > remote_rev
    rescue BFS::FileNotFound
      # No remote feed yet - always push.
      nil
    end if local_rev.positive?

    stream.create metadata: { META_LAST_MODIFIED => local_rev.to_s } do |fmt|
      # Prefer #find_each (ActiveRecord batching) when available.
      iter = enum.respond_to?(:find_each) ? :find_each : :each
      enum.send(iter) {|rec| fmt.encode(rec, **@opts) }
    end
    stream.blob.info.size
  ensure
    stream&.close
  end
end
|
51
57
|
end
|
data/lib/feedx/stream.rb
CHANGED
@@ -6,25 +6,42 @@ module Feedx
|
|
6
6
|
class Stream
|
7
7
|
attr_reader :blob
|
8
8
|
|
9
|
+
# Behaves like new, but accepts an optional block.
# If a block is given, streams are automatically closed after the block is yielded.
# @param [String] url the blob URL.
# @param [Hash] opts see #initialize.
# @return [Feedx::Stream] the stream, when no block is given.
def self.open(url, **opts)
  stream = new(url, **opts)
  return stream unless block_given?

  begin
    yield stream
  ensure
    stream.close
  end
end
|
21
|
+
|
9
22
|
# @param [String] url the blob URL.
# @param [Hash] opts options
# @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
# @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
def initialize(url, format: nil, compress: nil, **opts)
  @blob = BFS::Blob.new(url)
  @format = detect_format(format)
  @compress = detect_compress(compress)
  # Remaining opts are forwarded to the compression/format layers on open/create.
  @opts = opts

  # NOTE(review): presumably registers #close for deferred cleanup via BFS - confirm.
  BFS.defer(self, :close)
end
|
18
34
|
|
19
35
|
# Opens the remote for reading.
# @param [Hash] opts BFS::Blob#open options
# @yield A block over a formatted stream.
# @yieldparam [Feedx::Format::Abstract] formatted input stream.
def open(**opts)
  @blob.open(**opts) do |io|
    # Layering: raw blob IO -> decompression -> format decoder.
    @compress.reader(io, **@opts) do |cio|
      @format.decoder(cio, **@opts) do |fmt|
        yield fmt
      end
    end
  end
end
|
@@ -33,28 +50,31 @@ module Feedx
|
|
33
50
|
# @param [Hash] opts BFS::Blob#create options
# @yield A block over a formatted stream.
# @yieldparam [Feedx::Format::Abstract] formatted output stream.
def create(**opts)
  @blob.create(**opts) do |io|
    # Layering: raw blob IO -> compression -> format encoder.
    @compress.writer(io, **@opts) do |cio|
      @format.encoder(cio, **@opts) do |fmt|
        yield fmt
      end
    end
  end
end
|
44
62
|
|
63
|
+
# Closes the underlying connection.
def close
  @blob.close
end
|
67
|
+
|
45
68
|
private
|
46
69
|
|
47
70
|
# Resolves the formatter from an explicit value, or from the blob's
# file extension when none is given.
# @param [nil,String,Symbol,Object] val the requested format.
def detect_format(val)
  case val
  when nil
    Feedx::Format.detect(@blob.path)
  when String, Symbol
    Feedx::Format.resolve(val)
  else
    # Anything else is validated as a formatter instance/class.
    Feedx::Format.validate!(val)
  end
end
|
60
80
|
|
@@ -62,13 +82,10 @@ module Feedx
|
|
62
82
|
case val
|
63
83
|
when nil
|
64
84
|
Feedx::Compression.detect(@blob.path)
|
65
|
-
when
|
66
|
-
parent = Feedx::Compression::Abstract
|
67
|
-
raise ArgumentError, "Class #{val} must extend #{parent}" unless val < parent
|
68
|
-
|
69
|
-
val
|
70
|
-
else
|
85
|
+
when String, Symbol
|
71
86
|
Feedx::Compression.resolve(val)
|
87
|
+
else
|
88
|
+
Feedx::Compression.validate!(val)
|
72
89
|
end
|
73
90
|
end
|
74
91
|
end
|
data/producer_test.go
CHANGED
data/reader_test.go
CHANGED
@@ -5,10 +5,9 @@ import (
|
|
5
5
|
"io"
|
6
6
|
"io/ioutil"
|
7
7
|
|
8
|
-
"github.com/bsm/feedx"
|
9
|
-
|
10
8
|
"github.com/bsm/bfs"
|
11
|
-
|
9
|
+
"github.com/bsm/feedx"
|
10
|
+
"github.com/bsm/feedx/internal/testdata"
|
12
11
|
. "github.com/onsi/ginkgo"
|
13
12
|
. "github.com/onsi/gomega"
|
14
13
|
)
|
@@ -34,23 +33,23 @@ var _ = Describe("Reader", func() {
|
|
34
33
|
It("should read", func() {
|
35
34
|
data, err := ioutil.ReadAll(subject)
|
36
35
|
Expect(err).NotTo(HaveOccurred())
|
37
|
-
Expect(len(data)).To(BeNumerically("~",
|
36
|
+
Expect(len(data)).To(BeNumerically("~", 110, 20))
|
38
37
|
Expect(subject.NumRead()).To(Equal(0))
|
39
38
|
})
|
40
39
|
|
41
40
|
It("should decode", func() {
|
42
|
-
var msgs []
|
41
|
+
var msgs []*testdata.MockMessage
|
43
42
|
for {
|
44
|
-
var msg
|
43
|
+
var msg testdata.MockMessage
|
45
44
|
err := subject.Decode(&msg)
|
46
45
|
if err == io.EOF {
|
47
46
|
break
|
48
47
|
}
|
49
48
|
Expect(err).NotTo(HaveOccurred())
|
50
|
-
msgs = append(msgs, msg)
|
49
|
+
msgs = append(msgs, &msg)
|
51
50
|
}
|
52
51
|
|
53
|
-
Expect(msgs).To(
|
52
|
+
Expect(msgs).To(ConsistOf(seed(), seed(), seed()))
|
54
53
|
Expect(subject.NumRead()).To(Equal(3))
|
55
54
|
})
|
56
55
|
})
|
@@ -3,13 +3,13 @@ require 'spec_helper'
|
|
3
3
|
RSpec.describe Feedx::Compression::Gzip do
|
4
4
|
it 'should wrap readers/writers' do
|
5
5
|
wio = StringIO.new
|
6
|
-
|
6
|
+
subject.writer(wio) {|w| w.write 'xyz' * 1000 }
|
7
7
|
expect(wio.size).to be_within(20).of(40)
|
8
8
|
expect(wio.string.encoding).to eq(Encoding::BINARY)
|
9
9
|
|
10
10
|
data = ''
|
11
11
|
StringIO.open(wio.string) do |rio|
|
12
|
-
|
12
|
+
subject.reader(rio) {|z| data = z.read }
|
13
13
|
end
|
14
14
|
expect(data.size).to eq(3000)
|
15
15
|
expect(data.encoding).to eq(Encoding.default_external)
|
@@ -3,12 +3,12 @@ require 'spec_helper'
|
|
3
3
|
RSpec.describe Feedx::Compression::None do
|
4
4
|
it 'should wrap readers/writers' do
|
5
5
|
wio = StringIO.new
|
6
|
-
|
6
|
+
subject.writer(wio) {|w| w.write 'xyz' * 1000 }
|
7
7
|
expect(wio.size).to eq(3000)
|
8
8
|
|
9
9
|
data = ''
|
10
10
|
StringIO.open(wio.string) do |rio|
|
11
|
-
|
11
|
+
subject.reader(rio) {|z| data = z.read }
|
12
12
|
end
|
13
13
|
expect(data.size).to eq(3000)
|
14
14
|
end
|
@@ -2,18 +2,18 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
RSpec.describe Feedx::Compression do
  # Explicit names (and nil) resolve to compression instances.
  it 'should resolve' do
    expect(described_class.resolve(:gzip)).to be_instance_of(described_class::Gzip)
    expect(described_class.resolve(:gz)).to be_instance_of(described_class::Gzip)
    expect(described_class.resolve(nil)).to be_instance_of(described_class::None)
    expect { described_class.resolve(:txt) }.to raise_error(/invalid compression txt/)
  end

  # File extensions determine the compression codec.
  it 'should detect' do
    expect(described_class.detect('path/to/file.jsonz')).to be_instance_of(described_class::Gzip)
    expect(described_class.detect('path/to/file.json.gz')).to be_instance_of(described_class::Gzip)
    expect(described_class.detect('path/to/file.json')).to be_instance_of(described_class::None)
    expect(described_class.detect('path/to/file.pbz')).to be_instance_of(described_class::Gzip)
    expect(described_class.detect('path/to/file.pb.gz')).to be_instance_of(described_class::Gzip)
    expect(described_class.detect('path/to/file.pb')).to be_instance_of(described_class::None)
  end
end
|
data/spec/feedx/consumer_spec.rb
CHANGED
@@ -36,10 +36,13 @@ RSpec.describe Feedx::Consumer do
|
|
36
36
|
|
37
37
|
private
|
38
38
|
|
39
|
-
def mock_produce!(
|
39
|
+
# Produces a mock feed at a fixed URL and returns that URL.
def mock_produce!(enum: mock_enum, **opts)
  url = 'mock:///dir/file.json'
  Feedx::Producer.perform url, enum: enum, **opts
  url
end
|
44
|
+
|
45
|
+
# 300 test models, cycling through the titles x, y, z.
def mock_enum
  %w[x y z].map {|t| Feedx::TestCase::Model.new(t) } * 100
end
|
45
48
|
end
|
@@ -1,17 +1,20 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
RSpec.describe Feedx::Format::Abstract do
  subject { Feedx::Format::JSON.new }
  let(:wio) { StringIO.new }
  # Reader over whatever was written to wio.
  let(:rio) { StringIO.open(wio.string) }

  # Round-trip: encode three records, then decode them all back.
  it 'should decode each' do
    subject.encoder wio do |enc|
      enc.encode(Feedx::TestCase::Model.new('X'))
      enc.encode(Feedx::TestCase::Model.new('Y'))
      enc.encode(Feedx::TestCase::Message.new(title: 'Z'))
    end

    subject.decoder rio do |dec|
      acc = dec.decode_each(Feedx::TestCase::Model).to_a
      expect(acc.map(&:title)).to eq(%w[X Y Z])
    end
  end
end
|