feedx 0.10.1 → 0.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +2 -0
- data/.travis.yml +13 -6
- data/Gemfile.lock +42 -19
- data/Makefile +5 -0
- data/consumer_test.go +5 -5
- data/feedx.gemspec +3 -2
- data/feedx_test.go +12 -9
- data/format.go +16 -16
- data/format_test.go +6 -7
- data/go.mod +5 -10
- data/go.sum +43 -24
- data/internal/testdata/testdata.pb.go +124 -0
- data/internal/testdata/testdata.proto +15 -0
- data/lib/feedx/cache/abstract.rb +2 -2
- data/lib/feedx/compression.rb +11 -4
- data/lib/feedx/compression/abstract.rb +2 -2
- data/lib/feedx/compression/gzip.rb +14 -16
- data/lib/feedx/compression/none.rb +4 -4
- data/lib/feedx/consumer.rb +17 -11
- data/lib/feedx/format.rb +18 -9
- data/lib/feedx/format/abstract.rb +42 -13
- data/lib/feedx/format/json.rb +12 -8
- data/lib/feedx/format/parquet.rb +102 -0
- data/lib/feedx/format/protobuf.rb +16 -8
- data/lib/feedx/producer.rb +20 -14
- data/lib/feedx/stream.rb +42 -25
- data/producer_test.go +1 -2
- data/reader_test.go +7 -8
- data/spec/feedx/compression/gzip_spec.rb +2 -2
- data/spec/feedx/compression/none_spec.rb +2 -2
- data/spec/feedx/compression_spec.rb +9 -9
- data/spec/feedx/consumer_spec.rb +6 -3
- data/spec/feedx/format/abstract_spec.rb +11 -8
- data/spec/feedx/format/json_spec.rb +12 -11
- data/spec/feedx/format/parquet_spec.rb +30 -0
- data/spec/feedx/format/protobuf_spec.rb +12 -11
- data/spec/feedx/format_spec.rb +8 -8
- data/spec/feedx/producer_spec.rb +6 -0
- data/spec/feedx/stream_spec.rb +28 -3
- data/spec/spec_helper.rb +17 -1
- data/writer_test.go +1 -1
- metadata +22 -3
@@ -0,0 +1,102 @@
|
|
1
|
+
require 'parquet'
|
2
|
+
require 'tmpdir'
|
3
|
+
|
4
|
+
# Parquet format support. Records are decoded from, and encoded into, Parquet
# data through Arrow tables. Both directions stage the data in a temporary
# file, because the table is loaded from / written to a filesystem path rather
# than directly from / to the wrapped IO.
class Feedx::Format::Parquet < Feedx::Format::Abstract
  # A single table row, exposed as column-name/value pairs.
  class Record < Arrow::Record
    # Yields the name and this row's value for every column of the container.
    #
    # @yieldparam [Object] name the column name.
    # @yieldparam [Object] value the column value at this record's index.
    # @return [Enumerator] when called without a block.
    def each_pair
      return enum_for(:each_pair) unless block_given?

      container.columns.each do |col|
        yield col.name, col[index]
      end
    end
  end

  # Decodes rows of a Parquet table one at a time.
  class Decoder < Feedx::Format::Abstract::Decoder
    # Loads the complete input into a table during construction.
    #
    # @param [IO] io the input stream; it is copied in full before decoding starts.
    def initialize(io, **)
      super(io)

      @table = read_table
      @cursor = 0
    end

    # @return [Boolean] true once every row of the table has been decoded.
    def eof?
      @cursor >= @table.n_rows
    end

    # Decodes the next row into the target.
    #
    # @param [Class,Object] target a class (instantiated via +allocate+, i.e.
    #   without running its initializer) or an existing instance. Either way
    #   the object must respond to +from_parquet+.
    # @return [Object,nil] the populated target, or nil when no rows are left.
    def decode(target, **)
      return if eof?

      rec = Record.new(@table, @cursor)
      @cursor += 1

      target = target.allocate if target.is_a?(Class)
      target.from_parquet(rec)
      target
    end

    private

    # Copies the input to a temporary file, loads that file as a Parquet table
    # and removes the file again, whether or not loading succeeded.
    #
    # The caller (#initialize) stores the result, so it is simply returned here.
    def read_table
      tmpname = ::Dir::Tmpname.create('feedx-parquet') {|path, *| path }
      IO.copy_stream(@io, tmpname)

      Arrow::Table.load(tmpname, format: 'parquet')
    ensure
      unlink!(tmpname) if tmpname
    end

    # Removes the temporary file; a file that is already gone is not an error.
    def unlink!(tmpname)
      File.unlink(tmpname)
    rescue Errno::ENOENT
      nil
    end
  end

  # Buffers messages and writes them as Parquet data in batches.
  class Encoder < Feedx::Format::Abstract::Encoder
    attr_reader :schema

    # @param [IO] io the output stream; it receives the data on #close.
    # @param schema the schema used to build record batches and the file writer.
    # @param [Integer] buffer_size passed as the second argument to the
    #   writer's +write_table+ on every flush.
    # @param [Integer] batch_size number of buffered messages that triggers a flush.
    def initialize(io, schema:, buffer_size: 1 << 20, batch_size: 10_000)
      super(io)

      @schema = schema
      @batch_size = batch_size.to_i
      @buffer_size = buffer_size.to_i
      @batch = []

      @tmpname = ::Dir::Tmpname.create('feedx-parquet') {|path, *| path }
      @output = Arrow::FileOutputStream.new(@tmpname, append: false)
      begin
        @writer = Parquet::ArrowFileWriter.new(@schema, @output)
      rescue StandardError
        # Without a writer nobody will call #close, so release the stream and
        # remove the temporary file right away instead of leaking them.
        @output.close
        unlink!
        raise
      end
    end

    # Buffers a message and flushes the buffer once it holds batch_size entries.
    #
    # @param [Object] msg the message; if it responds to +to_parquet+ it is
    #   converted with the schema and the given options first.
    # @return [Array] the internal batch buffer (the result of Array#push);
    #   note that it is emptied whenever this call triggers a flush.
    def encode(msg, **opts)
      msg = msg.to_parquet(@schema, **opts) if msg.respond_to?(:to_parquet)

      res = @batch.push(msg)
      flush_table if @batch.size >= @batch_size
      res
    end

    # Flushes pending messages, finalises the writer, copies the temporary
    # file into the wrapped IO and removes the temporary file.
    def close
      begin
        flush_table unless @batch.empty?
        @writer.close
      ensure
        # Always release the temp-file stream, even when flushing or
        # finalising the writer fails; otherwise the handle would leak.
        @output.close
      end

      IO.copy_stream(@tmpname, @io)
    ensure
      unlink!
    end

    private

    # Writes the buffered messages as one table and empties the buffer.
    def flush_table
      table = Arrow::RecordBatch.new(@schema, @batch).to_table
      @writer.write_table table, @buffer_size
      @batch.clear
    end

    # Removes the temporary file; a file that is already gone is not an error.
    def unlink!
      File.unlink(@tmpname)
    rescue Errno::ENOENT
      nil
    end
  end
end
|
@@ -1,16 +1,24 @@
|
|
1
1
|
require 'pbio'
|
2
2
|
|
3
3
|
class Feedx::Format::Protobuf < Feedx::Format::Abstract
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
class Decoder < Feedx::Format::Abstract::Decoder
|
5
|
+
def initialize(io, **opts)
|
6
|
+
super PBIO::Delimited.new(io), **opts
|
7
|
+
end
|
7
8
|
|
8
|
-
|
9
|
-
|
9
|
+
def decode(target, **)
|
10
|
+
@io.read(target)
|
11
|
+
end
|
10
12
|
end
|
11
13
|
|
12
|
-
|
13
|
-
|
14
|
-
|
14
|
+
class Encoder < Feedx::Format::Abstract::Encoder
|
15
|
+
def initialize(io, **opts)
|
16
|
+
super PBIO::Delimited.new(io), **opts
|
17
|
+
end
|
18
|
+
|
19
|
+
def encode(msg, **opts)
|
20
|
+
msg = msg.to_pb(**opts) if msg.respond_to?(:to_pb)
|
21
|
+
@io.write msg
|
22
|
+
end
|
15
23
|
end
|
16
24
|
end
|
data/lib/feedx/producer.rb
CHANGED
@@ -6,46 +6,52 @@ module Feedx
|
|
6
6
|
# Produces a relation as an encoded feed to a remote location.
|
7
7
|
class Producer
|
8
8
|
# See constructor.
|
9
|
-
def self.perform(url, opts
|
10
|
-
new(url, opts, &block).perform
|
9
|
+
def self.perform(url, **opts, &block)
|
10
|
+
new(url, **opts, &block).perform
|
11
11
|
end
|
12
12
|
|
13
13
|
# @param [String] url the destination URL.
|
14
14
|
# @param [Hash] opts options
|
15
15
|
# @option opts [Enumerable,ActiveRecord::Relation] :enum relation or enumerator to stream.
|
16
16
|
# @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
|
17
|
-
# @option opts [Hash] :format_options format encode options. Default: {}.
|
18
17
|
# @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
|
19
18
|
# @option opts [Time,Proc] :last_modified the last modified time, used to determine if a push is necessary.
|
20
19
|
# @yield A block factory to generate the relation or enumerator.
|
21
20
|
# @yieldreturn [Enumerable,ActiveRecord::Relation] the relation or enumerator to stream.
|
22
|
-
def initialize(url,
|
23
|
-
@enum =
|
21
|
+
def initialize(url, last_modified: nil, format_options: {}, enum: nil, **opts, &block)
|
22
|
+
@enum = enum || block
|
24
23
|
raise ArgumentError, "#{self.class.name}.new expects an :enum option or a block factory" unless @enum
|
25
24
|
|
26
|
-
@
|
27
|
-
@
|
28
|
-
@
|
25
|
+
@url = url
|
26
|
+
@opts = opts.merge(format_options)
|
27
|
+
@last_mod = last_modified
|
28
|
+
|
29
|
+
return if format_options.empty? || (defined?(Gem::Deprecate) && Gem::Deprecate.skip)
|
30
|
+
|
31
|
+
warn "WARNING: passing format_options is deprecated; pass the options inline instead (called from #{caller(2..2).first})."
|
29
32
|
end
|
30
33
|
|
31
34
|
def perform
|
32
|
-
|
33
|
-
|
35
|
+
stream = Feedx::Stream.new(@url, **@opts)
|
36
|
+
enum = @enum.is_a?(Proc) ? @enum.call : @enum
|
37
|
+
last_mod = @last_mod.is_a?(Proc) ? @last_mod.call(enum) : @last_mod
|
34
38
|
local_rev = last_mod.is_a?(Integer) ? last_mod : (last_mod.to_f * 1000).floor
|
35
39
|
|
36
40
|
begin
|
37
|
-
metadata =
|
41
|
+
metadata = stream.blob.info.metadata
|
38
42
|
remote_rev = (metadata[META_LAST_MODIFIED] || metadata[META_LAST_MODIFIED_DC]).to_i
|
39
43
|
return -1 unless local_rev > remote_rev
|
40
44
|
rescue BFS::FileNotFound
|
41
45
|
nil
|
42
46
|
end if local_rev.positive?
|
43
47
|
|
44
|
-
|
48
|
+
stream.create metadata: { META_LAST_MODIFIED => local_rev.to_s } do |fmt|
|
45
49
|
iter = enum.respond_to?(:find_each) ? :find_each : :each
|
46
|
-
enum.send(iter) {|rec| fmt.encode(rec, **@
|
50
|
+
enum.send(iter) {|rec| fmt.encode(rec, **@opts) }
|
47
51
|
end
|
48
|
-
|
52
|
+
stream.blob.info.size
|
53
|
+
ensure
|
54
|
+
stream&.close
|
49
55
|
end
|
50
56
|
end
|
51
57
|
end
|
data/lib/feedx/stream.rb
CHANGED
@@ -6,25 +6,42 @@ module Feedx
|
|
6
6
|
class Stream
|
7
7
|
attr_reader :blob
|
8
8
|
|
9
|
+
# Behaves like new, but accepts an optional block.
|
10
|
+
# If a block is given, streams are automatically closed after the block is yielded.
|
11
|
+
def self.open(url, **opts)
|
12
|
+
stream = new(url, **opts)
|
13
|
+
return stream unless block_given?
|
14
|
+
|
15
|
+
begin
|
16
|
+
yield stream
|
17
|
+
ensure
|
18
|
+
stream.close
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
9
22
|
# @param [String] url the blob URL.
|
10
23
|
# @param [Hash] opts options
|
11
24
|
# @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
|
12
25
|
# @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
|
13
|
-
def initialize(url,
|
26
|
+
def initialize(url, format: nil, compress: nil, **opts)
|
14
27
|
@blob = BFS::Blob.new(url)
|
15
|
-
@format = detect_format(
|
16
|
-
@compress = detect_compress(
|
28
|
+
@format = detect_format(format)
|
29
|
+
@compress = detect_compress(compress)
|
30
|
+
@opts = opts
|
31
|
+
|
32
|
+
BFS.defer(self, :close)
|
17
33
|
end
|
18
34
|
|
19
35
|
# Opens the remote for reading.
|
20
36
|
# @param [Hash] opts BFS::Blob#open options
|
21
37
|
# @yield A block over a formatted stream.
|
22
38
|
# @yieldparam [Feedx::Format::Abstract] formatted input stream.
|
23
|
-
def open(opts
|
24
|
-
@blob.open(opts) do |io|
|
25
|
-
@compress.reader(io) do |cio|
|
26
|
-
|
27
|
-
|
39
|
+
def open(**opts)
|
40
|
+
@blob.open(**opts) do |io|
|
41
|
+
@compress.reader(io, **@opts) do |cio|
|
42
|
+
@format.decoder(cio, **@opts) do |fmt|
|
43
|
+
yield fmt
|
44
|
+
end
|
28
45
|
end
|
29
46
|
end
|
30
47
|
end
|
@@ -33,28 +50,31 @@ module Feedx
|
|
33
50
|
# @param [Hash] opts BFS::Blob#create options
|
34
51
|
# @yield A block over a formatted stream.
|
35
52
|
# @yieldparam [Feedx::Format::Abstract] formatted output stream.
|
36
|
-
def create(opts
|
37
|
-
@blob.create(opts) do |io|
|
38
|
-
@compress.writer(io) do |cio|
|
39
|
-
|
40
|
-
|
53
|
+
def create(**opts)
|
54
|
+
@blob.create(**opts) do |io|
|
55
|
+
@compress.writer(io, **@opts) do |cio|
|
56
|
+
@format.encoder(cio, **@opts) do |fmt|
|
57
|
+
yield fmt
|
58
|
+
end
|
41
59
|
end
|
42
60
|
end
|
43
61
|
end
|
44
62
|
|
63
|
+
# Closes the underlying connection.
|
64
|
+
def close
|
65
|
+
@blob.close
|
66
|
+
end
|
67
|
+
|
45
68
|
private
|
46
69
|
|
47
70
|
def detect_format(val)
|
48
71
|
case val
|
49
72
|
when nil
|
50
73
|
Feedx::Format.detect(@blob.path)
|
51
|
-
when
|
52
|
-
parent = Feedx::Format::Abstract
|
53
|
-
raise ArgumentError, "Class #{val} must extend #{parent}" unless val < parent
|
54
|
-
|
55
|
-
val
|
56
|
-
else
|
74
|
+
when String, Symbol
|
57
75
|
Feedx::Format.resolve(val)
|
76
|
+
else
|
77
|
+
Feedx::Format.validate!(val)
|
58
78
|
end
|
59
79
|
end
|
60
80
|
|
@@ -62,13 +82,10 @@ module Feedx
|
|
62
82
|
case val
|
63
83
|
when nil
|
64
84
|
Feedx::Compression.detect(@blob.path)
|
65
|
-
when
|
66
|
-
parent = Feedx::Compression::Abstract
|
67
|
-
raise ArgumentError, "Class #{val} must extend #{parent}" unless val < parent
|
68
|
-
|
69
|
-
val
|
70
|
-
else
|
85
|
+
when String, Symbol
|
71
86
|
Feedx::Compression.resolve(val)
|
87
|
+
else
|
88
|
+
Feedx::Compression.validate!(val)
|
72
89
|
end
|
73
90
|
end
|
74
91
|
end
|
data/producer_test.go
CHANGED
data/reader_test.go
CHANGED
@@ -5,10 +5,9 @@ import (
|
|
5
5
|
"io"
|
6
6
|
"io/ioutil"
|
7
7
|
|
8
|
-
"github.com/bsm/feedx"
|
9
|
-
|
10
8
|
"github.com/bsm/bfs"
|
11
|
-
|
9
|
+
"github.com/bsm/feedx"
|
10
|
+
"github.com/bsm/feedx/internal/testdata"
|
12
11
|
. "github.com/onsi/ginkgo"
|
13
12
|
. "github.com/onsi/gomega"
|
14
13
|
)
|
@@ -34,23 +33,23 @@ var _ = Describe("Reader", func() {
|
|
34
33
|
It("should read", func() {
|
35
34
|
data, err := ioutil.ReadAll(subject)
|
36
35
|
Expect(err).NotTo(HaveOccurred())
|
37
|
-
Expect(len(data)).To(BeNumerically("~",
|
36
|
+
Expect(len(data)).To(BeNumerically("~", 110, 20))
|
38
37
|
Expect(subject.NumRead()).To(Equal(0))
|
39
38
|
})
|
40
39
|
|
41
40
|
It("should decode", func() {
|
42
|
-
var msgs []
|
41
|
+
var msgs []*testdata.MockMessage
|
43
42
|
for {
|
44
|
-
var msg
|
43
|
+
var msg testdata.MockMessage
|
45
44
|
err := subject.Decode(&msg)
|
46
45
|
if err == io.EOF {
|
47
46
|
break
|
48
47
|
}
|
49
48
|
Expect(err).NotTo(HaveOccurred())
|
50
|
-
msgs = append(msgs, msg)
|
49
|
+
msgs = append(msgs, &msg)
|
51
50
|
}
|
52
51
|
|
53
|
-
Expect(msgs).To(
|
52
|
+
Expect(msgs).To(ConsistOf(seed(), seed(), seed()))
|
54
53
|
Expect(subject.NumRead()).To(Equal(3))
|
55
54
|
})
|
56
55
|
})
|
@@ -3,13 +3,13 @@ require 'spec_helper'
|
|
3
3
|
RSpec.describe Feedx::Compression::Gzip do
|
4
4
|
it 'should wrap readers/writers' do
|
5
5
|
wio = StringIO.new
|
6
|
-
|
6
|
+
subject.writer(wio) {|w| w.write 'xyz' * 1000 }
|
7
7
|
expect(wio.size).to be_within(20).of(40)
|
8
8
|
expect(wio.string.encoding).to eq(Encoding::BINARY)
|
9
9
|
|
10
10
|
data = ''
|
11
11
|
StringIO.open(wio.string) do |rio|
|
12
|
-
|
12
|
+
subject.reader(rio) {|z| data = z.read }
|
13
13
|
end
|
14
14
|
expect(data.size).to eq(3000)
|
15
15
|
expect(data.encoding).to eq(Encoding.default_external)
|
@@ -3,12 +3,12 @@ require 'spec_helper'
|
|
3
3
|
RSpec.describe Feedx::Compression::None do
|
4
4
|
it 'should wrap readers/writers' do
|
5
5
|
wio = StringIO.new
|
6
|
-
|
6
|
+
subject.writer(wio) {|w| w.write 'xyz' * 1000 }
|
7
7
|
expect(wio.size).to eq(3000)
|
8
8
|
|
9
9
|
data = ''
|
10
10
|
StringIO.open(wio.string) do |rio|
|
11
|
-
|
11
|
+
subject.reader(rio) {|z| data = z.read }
|
12
12
|
end
|
13
13
|
expect(data.size).to eq(3000)
|
14
14
|
end
|
@@ -2,18 +2,18 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
RSpec.describe Feedx::Compression do
|
4
4
|
it 'should resolve' do
|
5
|
-
expect(described_class.resolve(:gzip)).to
|
6
|
-
expect(described_class.resolve(:gz)).to
|
7
|
-
expect(described_class.resolve(nil)).to
|
5
|
+
expect(described_class.resolve(:gzip)).to be_instance_of(described_class::Gzip)
|
6
|
+
expect(described_class.resolve(:gz)).to be_instance_of(described_class::Gzip)
|
7
|
+
expect(described_class.resolve(nil)).to be_instance_of(described_class::None)
|
8
8
|
expect { described_class.resolve(:txt) }.to raise_error(/invalid compression txt/)
|
9
9
|
end
|
10
10
|
|
11
11
|
it 'should detect' do
|
12
|
-
expect(described_class.detect('path/to/file.jsonz')).to
|
13
|
-
expect(described_class.detect('path/to/file.json.gz')).to
|
14
|
-
expect(described_class.detect('path/to/file.json')).to
|
15
|
-
expect(described_class.detect('path/to/file.pbz')).to
|
16
|
-
expect(described_class.detect('path/to/file.pb.gz')).to
|
17
|
-
expect(described_class.detect('path/to/file.pb')).to
|
12
|
+
expect(described_class.detect('path/to/file.jsonz')).to be_instance_of(described_class::Gzip)
|
13
|
+
expect(described_class.detect('path/to/file.json.gz')).to be_instance_of(described_class::Gzip)
|
14
|
+
expect(described_class.detect('path/to/file.json')).to be_instance_of(described_class::None)
|
15
|
+
expect(described_class.detect('path/to/file.pbz')).to be_instance_of(described_class::Gzip)
|
16
|
+
expect(described_class.detect('path/to/file.pb.gz')).to be_instance_of(described_class::Gzip)
|
17
|
+
expect(described_class.detect('path/to/file.pb')).to be_instance_of(described_class::None)
|
18
18
|
end
|
19
19
|
end
|
data/spec/feedx/consumer_spec.rb
CHANGED
@@ -36,10 +36,13 @@ RSpec.describe Feedx::Consumer do
|
|
36
36
|
|
37
37
|
private
|
38
38
|
|
39
|
-
def mock_produce!(
|
39
|
+
def mock_produce!(enum: mock_enum, **opts)
|
40
40
|
url = 'mock:///dir/file.json'
|
41
|
-
|
42
|
-
Feedx::Producer.perform url, opts
|
41
|
+
Feedx::Producer.perform url, enum: enum, **opts
|
43
42
|
url
|
44
43
|
end
|
44
|
+
|
45
|
+
def mock_enum
|
46
|
+
%w[x y z].map {|t| Feedx::TestCase::Model.new(t) } * 100
|
47
|
+
end
|
45
48
|
end
|
@@ -1,17 +1,20 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
RSpec.describe Feedx::Format::Abstract do
|
4
|
-
subject { Feedx::Format::JSON.new
|
4
|
+
subject { Feedx::Format::JSON.new }
|
5
5
|
let(:wio) { StringIO.new }
|
6
|
+
let(:rio) { StringIO.open(wio.string) }
|
6
7
|
|
7
8
|
it 'should decode each' do
|
8
|
-
subject.
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
9
|
+
subject.encoder wio do |enc|
|
10
|
+
enc.encode(Feedx::TestCase::Model.new('X'))
|
11
|
+
enc.encode(Feedx::TestCase::Model.new('Y'))
|
12
|
+
enc.encode(Feedx::TestCase::Message.new(title: 'Z'))
|
13
|
+
end
|
14
|
+
|
15
|
+
subject.decoder rio do |dec|
|
16
|
+
acc = dec.decode_each(Feedx::TestCase::Model).to_a
|
17
|
+
expect(acc.map(&:title)).to eq(%w[X Y Z])
|
15
18
|
end
|
16
19
|
end
|
17
20
|
end
|