feedx 0.10.2 → 0.12.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.editorconfig +3 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +2 -0
- data/.travis.yml +12 -2
- data/Gemfile +0 -2
- data/Gemfile.lock +50 -30
- data/Makefile +10 -5
- data/compression.go +18 -0
- data/compression_test.go +12 -0
- data/consumer_test.go +5 -4
- data/ext/parquet/decoder.go +170 -0
- data/ext/parquet/decoder_test.go +88 -0
- data/ext/parquet/go.mod +12 -0
- data/ext/parquet/go.sum +134 -0
- data/ext/parquet/parquet.go +78 -0
- data/ext/parquet/parquet_test.go +28 -0
- data/ext/parquet/reader.go +89 -0
- data/ext/parquet/testdata/alltypes_plain.parquet +0 -0
- data/ext/parquet/types.go +51 -0
- data/feedx.gemspec +3 -2
- data/feedx_test.go +8 -24
- data/format.go +50 -20
- data/format_test.go +8 -6
- data/go.mod +9 -11
- data/go.sum +76 -28
- data/internal/testdata/testdata.pb.go +223 -0
- data/internal/testdata/testdata.proto +15 -0
- data/lib/feedx/cache/abstract.rb +2 -2
- data/lib/feedx/cache/memory.rb +1 -0
- data/lib/feedx/compression.rb +11 -4
- data/lib/feedx/compression/abstract.rb +2 -2
- data/lib/feedx/compression/gzip.rb +14 -16
- data/lib/feedx/compression/none.rb +4 -4
- data/lib/feedx/consumer.rb +15 -9
- data/lib/feedx/format.rb +18 -9
- data/lib/feedx/format/abstract.rb +42 -13
- data/lib/feedx/format/json.rb +12 -8
- data/lib/feedx/format/parquet.rb +102 -0
- data/lib/feedx/format/protobuf.rb +16 -8
- data/lib/feedx/producer.rb +27 -22
- data/lib/feedx/stream.rb +36 -23
- data/producer_test.go +1 -2
- data/reader_test.go +6 -6
- data/spec/feedx/compression/gzip_spec.rb +2 -2
- data/spec/feedx/compression/none_spec.rb +2 -2
- data/spec/feedx/compression_spec.rb +9 -9
- data/spec/feedx/consumer_spec.rb +1 -1
- data/spec/feedx/format/abstract_spec.rb +11 -8
- data/spec/feedx/format/json_spec.rb +17 -16
- data/spec/feedx/format/parquet_spec.rb +30 -0
- data/spec/feedx/format/protobuf_spec.rb +12 -11
- data/spec/feedx/format_spec.rb +8 -8
- data/spec/feedx/producer_spec.rb +6 -0
- data/spec/feedx/stream_spec.rb +43 -6
- data/spec/spec_helper.rb +17 -1
- metadata +33 -5
@@ -0,0 +1,15 @@
|
|
1
|
+
syntax = "proto3";
|
2
|
+
|
3
|
+
package feedx.internal.testdata;
|
4
|
+
option go_package = "github.com/feedx/internal/testdata";
|
5
|
+
|
6
|
+
enum MockEnum {
|
7
|
+
UNKNOWN = 0;
|
8
|
+
FIRST = 3;
|
9
|
+
}
|
10
|
+
|
11
|
+
message MockMessage {
|
12
|
+
string name = 1;
|
13
|
+
MockEnum enum = 2;
|
14
|
+
uint32 height = 3;
|
15
|
+
}
|
data/lib/feedx/cache/abstract.rb
CHANGED
@@ -5,12 +5,12 @@ class Feedx::Cache::Abstract
|
|
5
5
|
end
|
6
6
|
|
7
7
|
# Read reads a key.
|
8
|
-
def read(_key, **
|
8
|
+
def read(_key, **)
|
9
9
|
raise 'Not implemented'
|
10
10
|
end
|
11
11
|
|
12
12
|
# Write writes a key/value pair.
|
13
|
-
def write(_key, _value, **
|
13
|
+
def write(_key, _value, **)
|
14
14
|
raise 'Not implemented'
|
15
15
|
end
|
16
16
|
|
data/lib/feedx/cache/memory.rb
CHANGED
data/lib/feedx/compression.rb
CHANGED
@@ -5,12 +5,19 @@ module Feedx
|
|
5
5
|
autoload :Gzip, 'feedx/compression/gzip'
|
6
6
|
|
7
7
|
class << self
|
8
|
+
def validate!(kind)
|
9
|
+
raise ArgumentError, "#{kind} does not implement #reader(io, &block)" unless kind.respond_to?(:reader)
|
10
|
+
raise ArgumentError, "#{kind} does not implement #writer(io, &block)" unless kind.respond_to?(:writer)
|
11
|
+
|
12
|
+
kind
|
13
|
+
end
|
14
|
+
|
8
15
|
def resolve(name)
|
9
16
|
case name.to_s
|
10
17
|
when 'gz', 'gzip'
|
11
|
-
Gzip
|
18
|
+
Gzip.new
|
12
19
|
when ''
|
13
|
-
None
|
20
|
+
None.new
|
14
21
|
else
|
15
22
|
raise ArgumentError, "invalid compression #{name}"
|
16
23
|
end
|
@@ -18,9 +25,9 @@ module Feedx
|
|
18
25
|
|
19
26
|
def detect(path)
|
20
27
|
if File.extname(path)[-1] == 'z'
|
21
|
-
Gzip
|
28
|
+
Gzip.new
|
22
29
|
else
|
23
|
-
None
|
30
|
+
None.new
|
24
31
|
end
|
25
32
|
end
|
26
33
|
end
|
@@ -1,25 +1,23 @@
|
|
1
1
|
require 'zlib'
|
2
2
|
|
3
3
|
class Feedx::Compression::Gzip < Feedx::Compression::Abstract
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
end
|
4
|
+
def reader(io, **, &block)
|
5
|
+
force_binmode(io)
|
6
|
+
Zlib::GzipReader.wrap(io, &block)
|
7
|
+
end
|
9
8
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
9
|
+
def writer(io, **, &block)
|
10
|
+
force_binmode(io)
|
11
|
+
Zlib::GzipWriter.wrap(io, &block)
|
12
|
+
end
|
14
13
|
|
15
|
-
|
14
|
+
private
|
16
15
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
end
|
16
|
+
def force_binmode(io)
|
17
|
+
if io.respond_to?(:binmode)
|
18
|
+
io.binmode
|
19
|
+
elsif io.respond_to?(:set_encoding)
|
20
|
+
io.set_encoding(Encoding::BINARY)
|
23
21
|
end
|
24
22
|
end
|
25
23
|
end
|
data/lib/feedx/consumer.rb
CHANGED
@@ -16,33 +16,39 @@ module Feedx
|
|
16
16
|
# @param [Class] klass the record class.
|
17
17
|
# @param [Hash] opts options
|
18
18
|
# @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
|
19
|
-
# @option opts [Hash] :format_options format decode options. Default: {}.
|
20
19
|
# @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
|
21
20
|
# @option opts [Feedx::Cache::Value] :cache cache value to store remote last modified time and consume conditionally.
|
22
|
-
def initialize(url, klass, **opts)
|
23
|
-
@klass
|
24
|
-
@
|
25
|
-
@
|
26
|
-
@cache
|
21
|
+
def initialize(url, klass, format_options: {}, cache: nil, **opts)
|
22
|
+
@klass = klass
|
23
|
+
@url = url
|
24
|
+
@opts = opts.merge(format_options)
|
25
|
+
@cache = cache
|
26
|
+
|
27
|
+
return if format_options.empty? || (defined?(Gem::Deprecate) && Gem::Deprecate.skip)
|
28
|
+
|
29
|
+
warn "WARNING: passing format_options is deprecated; pass the options inline instead (called from #{caller(2..2).first})."
|
27
30
|
end
|
28
31
|
|
29
32
|
# @return [Boolean] returns true if performed.
|
30
33
|
def each(&block)
|
34
|
+
stream = Feedx::Stream.new(@url, **@opts)
|
31
35
|
remote_rev = nil
|
32
36
|
|
33
37
|
if @cache
|
34
|
-
metadata =
|
38
|
+
metadata = stream.blob.info.metadata
|
35
39
|
local_rev = @cache.read.to_i
|
36
40
|
remote_rev = (metadata[META_LAST_MODIFIED] || metadata[META_LAST_MODIFIED_DC]).to_i
|
37
41
|
return false if remote_rev.positive? && remote_rev <= local_rev
|
38
42
|
end
|
39
43
|
|
40
|
-
|
41
|
-
fmt.decode_each(@klass, **@
|
44
|
+
stream.open do |fmt|
|
45
|
+
fmt.decode_each(@klass, **@opts, &block)
|
42
46
|
end
|
43
47
|
@cache.write(remote_rev) if @cache && remote_rev
|
44
48
|
|
45
49
|
true
|
50
|
+
ensure
|
51
|
+
stream&.close
|
46
52
|
end
|
47
53
|
end
|
48
54
|
end
|
data/lib/feedx/format.rb
CHANGED
@@ -2,13 +2,19 @@ module Feedx
|
|
2
2
|
module Format
|
3
3
|
autoload :Abstract, 'feedx/format/abstract'
|
4
4
|
autoload :JSON, 'feedx/format/json'
|
5
|
+
autoload :Parquet, 'feedx/format/parquet'
|
5
6
|
autoload :Protobuf, 'feedx/format/protobuf'
|
6
7
|
|
7
8
|
class << self
|
8
|
-
def
|
9
|
-
raise ArgumentError, "#{kind}
|
9
|
+
def validate!(kind)
|
10
|
+
raise ArgumentError, "#{kind} does not implement #encoder(io, &block)" unless kind.respond_to?(:encoder)
|
11
|
+
raise ArgumentError, "#{kind} does not implement #decoder(io, &block)" unless kind.respond_to?(:decoder)
|
12
|
+
|
13
|
+
kind
|
14
|
+
end
|
10
15
|
|
11
|
-
|
16
|
+
def register(ext, kind)
|
17
|
+
registry[ext.to_s] = validate!(kind)
|
12
18
|
end
|
13
19
|
|
14
20
|
def resolve(name)
|
@@ -33,6 +39,9 @@ module Feedx
|
|
33
39
|
def registry
|
34
40
|
@registry ||= {
|
35
41
|
'json' => :JSON,
|
42
|
+
'jsonl' => :JSON,
|
43
|
+
'ndjson' => :JSON,
|
44
|
+
'parquet' => :Parquet,
|
36
45
|
'pb' => :Protobuf,
|
37
46
|
'proto' => :Protobuf,
|
38
47
|
'protobuf' => :Protobuf,
|
@@ -40,13 +49,13 @@ module Feedx
|
|
40
49
|
end
|
41
50
|
|
42
51
|
def _resolve(name)
|
43
|
-
name
|
44
|
-
|
45
|
-
if
|
46
|
-
|
47
|
-
registry[name.to_s] =
|
52
|
+
name = name.to_s
|
53
|
+
kind = registry[name]
|
54
|
+
if kind.is_a?(Symbol)
|
55
|
+
kind = const_get(kind).new
|
56
|
+
registry[name.to_s] = kind
|
48
57
|
end
|
49
|
-
|
58
|
+
kind
|
50
59
|
end
|
51
60
|
end
|
52
61
|
end
|
@@ -1,25 +1,54 @@
|
|
1
1
|
class Feedx::Format::Abstract
|
2
|
-
def
|
3
|
-
|
2
|
+
def decoder(io, **opts, &block)
|
3
|
+
self.class::Decoder.open(io, **opts, &block)
|
4
4
|
end
|
5
5
|
|
6
|
-
def
|
7
|
-
|
6
|
+
def encoder(io, **opts, &block)
|
7
|
+
self.class::Encoder.open(io, **opts, &block)
|
8
8
|
end
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
10
|
+
class Wrapper
|
11
|
+
def self.open(io, **opts)
|
12
|
+
inst = new(io, **opts)
|
13
|
+
yield inst
|
14
|
+
ensure
|
15
|
+
inst&.close
|
16
|
+
end
|
17
|
+
|
18
|
+
def initialize(io, **)
|
19
|
+
@io = io
|
15
20
|
end
|
16
21
|
end
|
17
22
|
|
18
|
-
|
19
|
-
|
23
|
+
class Decoder < Wrapper
|
24
|
+
def eof?
|
25
|
+
@io.eof?
|
26
|
+
end
|
27
|
+
|
28
|
+
def decode_each(target, **opts)
|
29
|
+
if block_given?
|
30
|
+
yield decode(target, **opts) until eof?
|
31
|
+
else
|
32
|
+
Enumerator.new do |acc|
|
33
|
+
acc << decode(target, **opts) until eof?
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def decode(_target, **)
|
39
|
+
raise 'Not implemented'
|
40
|
+
end
|
41
|
+
|
42
|
+
def close; end
|
20
43
|
end
|
21
44
|
|
22
|
-
|
23
|
-
|
45
|
+
class Encoder < Wrapper
|
46
|
+
def encode(_msg, **)
|
47
|
+
raise 'Not implemented'
|
48
|
+
end
|
49
|
+
|
50
|
+
def close
|
51
|
+
@io.flush if @io.respond_to?(:flush)
|
52
|
+
end
|
24
53
|
end
|
25
54
|
end
|
data/lib/feedx/format/json.rb
CHANGED
@@ -1,16 +1,20 @@
|
|
1
1
|
require 'json'
|
2
2
|
|
3
3
|
class Feedx::Format::JSON < Feedx::Format::Abstract
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
class Decoder < Feedx::Format::Abstract::Decoder
|
5
|
+
def decode(target, **)
|
6
|
+
line = @io.gets
|
7
|
+
return unless line
|
7
8
|
|
8
|
-
|
9
|
-
|
10
|
-
|
9
|
+
target = target.allocate if target.is_a?(Class)
|
10
|
+
target.from_json(line)
|
11
|
+
target
|
12
|
+
end
|
11
13
|
end
|
12
14
|
|
13
|
-
|
14
|
-
|
15
|
+
class Encoder < Feedx::Format::Abstract::Encoder
|
16
|
+
def encode(msg, **opts)
|
17
|
+
@io.write msg.to_json(**opts) << "\n"
|
18
|
+
end
|
15
19
|
end
|
16
20
|
end
|
@@ -0,0 +1,102 @@
|
|
1
|
+
require 'parquet'
|
2
|
+
require 'tmpdir'
|
3
|
+
|
4
|
+
class Feedx::Format::Parquet < Feedx::Format::Abstract
|
5
|
+
class Record < Arrow::Record
|
6
|
+
def each_pair
|
7
|
+
container.columns.each do |col|
|
8
|
+
yield col.name, col[index]
|
9
|
+
end
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
class Decoder < Feedx::Format::Abstract::Decoder
|
14
|
+
def initialize(io, **)
|
15
|
+
super(io)
|
16
|
+
|
17
|
+
@table = read_table
|
18
|
+
@cursor = 0
|
19
|
+
end
|
20
|
+
|
21
|
+
def eof?
|
22
|
+
@cursor >= @table.n_rows
|
23
|
+
end
|
24
|
+
|
25
|
+
def decode(target, **)
|
26
|
+
return if eof?
|
27
|
+
|
28
|
+
rec = Record.new(@table, @cursor)
|
29
|
+
@cursor += 1
|
30
|
+
|
31
|
+
target = target.allocate if target.is_a?(Class)
|
32
|
+
target.from_parquet(rec)
|
33
|
+
target
|
34
|
+
end
|
35
|
+
|
36
|
+
private
|
37
|
+
|
38
|
+
def read_table
|
39
|
+
tmpname = ::Dir::Tmpname.create('feedx-parquet') {|path, *| path }
|
40
|
+
IO.copy_stream(@io, tmpname)
|
41
|
+
|
42
|
+
@table = Arrow::Table.load(tmpname, format: 'parquet')
|
43
|
+
ensure
|
44
|
+
unlink!(tmpname) if tmpname
|
45
|
+
end
|
46
|
+
|
47
|
+
def unlink!(tmpname)
|
48
|
+
File.unlink(tmpname)
|
49
|
+
rescue Errno::ENOENT
|
50
|
+
nil
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
class Encoder < Feedx::Format::Abstract::Encoder
|
55
|
+
attr_reader :schema
|
56
|
+
|
57
|
+
def initialize(io, schema:, buffer_size: 1 << 20, batch_size: 10_000)
|
58
|
+
super(io)
|
59
|
+
|
60
|
+
@schema = schema
|
61
|
+
@batch_size = batch_size.to_i
|
62
|
+
@buffer_size = buffer_size.to_i
|
63
|
+
|
64
|
+
@tmpname = ::Dir::Tmpname.create('feedx-parquet') {|path, *| path }
|
65
|
+
@output = Arrow::FileOutputStream.new(@tmpname, append: false)
|
66
|
+
@writer = Parquet::ArrowFileWriter.new(@schema, @output)
|
67
|
+
@batch = []
|
68
|
+
end
|
69
|
+
|
70
|
+
def encode(msg, **opts)
|
71
|
+
msg = msg.to_parquet(@schema, **opts) if msg.respond_to?(:to_parquet)
|
72
|
+
|
73
|
+
res = @batch.push(msg)
|
74
|
+
flush_table if @batch.size >= @batch_size
|
75
|
+
res
|
76
|
+
end
|
77
|
+
|
78
|
+
def close
|
79
|
+
flush_table unless @batch.empty?
|
80
|
+
|
81
|
+
@writer.close
|
82
|
+
@output.close
|
83
|
+
IO.copy_stream(@tmpname, @io)
|
84
|
+
ensure
|
85
|
+
unlink!
|
86
|
+
end
|
87
|
+
|
88
|
+
private
|
89
|
+
|
90
|
+
def flush_table
|
91
|
+
table = Arrow::RecordBatch.new(@schema, @batch).to_table
|
92
|
+
@writer.write_table table, @buffer_size
|
93
|
+
@batch.clear
|
94
|
+
end
|
95
|
+
|
96
|
+
def unlink!
|
97
|
+
File.unlink(@tmpname)
|
98
|
+
rescue Errno::ENOENT
|
99
|
+
nil
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
@@ -1,16 +1,24 @@
|
|
1
1
|
require 'pbio'
|
2
2
|
|
3
3
|
class Feedx::Format::Protobuf < Feedx::Format::Abstract
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
class Decoder < Feedx::Format::Abstract::Decoder
|
5
|
+
def initialize(io, **opts)
|
6
|
+
super PBIO::Delimited.new(io), **opts
|
7
|
+
end
|
7
8
|
|
8
|
-
|
9
|
-
|
9
|
+
def decode(target, **)
|
10
|
+
@io.read(target)
|
11
|
+
end
|
10
12
|
end
|
11
13
|
|
12
|
-
|
13
|
-
|
14
|
-
|
14
|
+
class Encoder < Feedx::Format::Abstract::Encoder
|
15
|
+
def initialize(io, **opts)
|
16
|
+
super PBIO::Delimited.new(io), **opts
|
17
|
+
end
|
18
|
+
|
19
|
+
def encode(msg, **opts)
|
20
|
+
msg = msg.to_pb(**opts) if msg.respond_to?(:to_pb)
|
21
|
+
@io.write msg
|
22
|
+
end
|
15
23
|
end
|
16
24
|
end
|