feedx 0.10.2 → 0.12.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. checksums.yaml +4 -4
  2. data/.editorconfig +3 -0
  3. data/.gitignore +1 -0
  4. data/.rubocop.yml +2 -0
  5. data/.travis.yml +12 -2
  6. data/Gemfile +0 -2
  7. data/Gemfile.lock +50 -30
  8. data/Makefile +10 -5
  9. data/compression.go +18 -0
  10. data/compression_test.go +12 -0
  11. data/consumer_test.go +5 -4
  12. data/ext/parquet/decoder.go +170 -0
  13. data/ext/parquet/decoder_test.go +88 -0
  14. data/ext/parquet/go.mod +12 -0
  15. data/ext/parquet/go.sum +134 -0
  16. data/ext/parquet/parquet.go +78 -0
  17. data/ext/parquet/parquet_test.go +28 -0
  18. data/ext/parquet/reader.go +89 -0
  19. data/ext/parquet/testdata/alltypes_plain.parquet +0 -0
  20. data/ext/parquet/types.go +51 -0
  21. data/feedx.gemspec +3 -2
  22. data/feedx_test.go +8 -24
  23. data/format.go +50 -20
  24. data/format_test.go +8 -6
  25. data/go.mod +9 -11
  26. data/go.sum +76 -28
  27. data/internal/testdata/testdata.pb.go +223 -0
  28. data/internal/testdata/testdata.proto +15 -0
  29. data/lib/feedx/cache/abstract.rb +2 -2
  30. data/lib/feedx/cache/memory.rb +1 -0
  31. data/lib/feedx/compression.rb +11 -4
  32. data/lib/feedx/compression/abstract.rb +2 -2
  33. data/lib/feedx/compression/gzip.rb +14 -16
  34. data/lib/feedx/compression/none.rb +4 -4
  35. data/lib/feedx/consumer.rb +15 -9
  36. data/lib/feedx/format.rb +18 -9
  37. data/lib/feedx/format/abstract.rb +42 -13
  38. data/lib/feedx/format/json.rb +12 -8
  39. data/lib/feedx/format/parquet.rb +102 -0
  40. data/lib/feedx/format/protobuf.rb +16 -8
  41. data/lib/feedx/producer.rb +27 -22
  42. data/lib/feedx/stream.rb +36 -23
  43. data/producer_test.go +1 -2
  44. data/reader_test.go +6 -6
  45. data/spec/feedx/compression/gzip_spec.rb +2 -2
  46. data/spec/feedx/compression/none_spec.rb +2 -2
  47. data/spec/feedx/compression_spec.rb +9 -9
  48. data/spec/feedx/consumer_spec.rb +1 -1
  49. data/spec/feedx/format/abstract_spec.rb +11 -8
  50. data/spec/feedx/format/json_spec.rb +17 -16
  51. data/spec/feedx/format/parquet_spec.rb +30 -0
  52. data/spec/feedx/format/protobuf_spec.rb +12 -11
  53. data/spec/feedx/format_spec.rb +8 -8
  54. data/spec/feedx/producer_spec.rb +6 -0
  55. data/spec/feedx/stream_spec.rb +43 -6
  56. data/spec/spec_helper.rb +17 -1
  57. metadata +33 -5
@@ -0,0 +1,15 @@
1
+ syntax = "proto3";
2
+
3
+ package feedx.internal.testdata;
4
+ option go_package = "github.com/feedx/internal/testdata";
5
+
6
+ enum MockEnum {
7
+ UNKNOWN = 0;
8
+ FIRST = 3;
9
+ }
10
+
11
+ message MockMessage {
12
+ string name = 1;
13
+ MockEnum enum = 2;
14
+ uint32 height = 3;
15
+ }
@@ -5,12 +5,12 @@ class Feedx::Cache::Abstract
5
5
  end
6
6
 
7
7
  # Read reads a key.
8
- def read(_key, **_opts)
8
+ def read(_key, **)
9
9
  raise 'Not implemented'
10
10
  end
11
11
 
12
12
  # Write writes a key/value pair.
13
- def write(_key, _value, **_opts)
13
+ def write(_key, _value, **)
14
14
  raise 'Not implemented'
15
15
  end
16
16
 
@@ -3,6 +3,7 @@ require 'monitor'
3
3
  # Thread-safe in-memory cache. Use for testing only.
4
4
  class Feedx::Cache::Memory < Feedx::Cache::Abstract
5
5
  def initialize
6
+ super
6
7
  @monitor = Monitor.new
7
8
  @entries = {}
8
9
  end
@@ -5,12 +5,19 @@ module Feedx
5
5
  autoload :Gzip, 'feedx/compression/gzip'
6
6
 
7
7
  class << self
8
+ def validate!(kind)
9
+ raise ArgumentError, "#{kind} does not implement #reader(io, &block)" unless kind.respond_to?(:reader)
10
+ raise ArgumentError, "#{kind} does not implement #writer(io, &block)" unless kind.respond_to?(:writer)
11
+
12
+ kind
13
+ end
14
+
8
15
  def resolve(name)
9
16
  case name.to_s
10
17
  when 'gz', 'gzip'
11
- Gzip
18
+ Gzip.new
12
19
  when ''
13
- None
20
+ None.new
14
21
  else
15
22
  raise ArgumentError, "invalid compression #{name}"
16
23
  end
@@ -18,9 +25,9 @@ module Feedx
18
25
 
19
26
  def detect(path)
20
27
  if File.extname(path)[-1] == 'z'
21
- Gzip
28
+ Gzip.new
22
29
  else
23
- None
30
+ None.new
24
31
  end
25
32
  end
26
33
  end
@@ -1,9 +1,9 @@
1
1
  class Feedx::Compression::Abstract
2
- def self.reader(_io, &_block)
2
+ def reader(_io, **, &_block)
3
3
  raise 'Not implemented'
4
4
  end
5
5
 
6
- def self.writer(_io, &_block)
6
+ def writer(_io, **, &_block)
7
7
  raise 'Not implemented'
8
8
  end
9
9
  end
@@ -1,25 +1,23 @@
1
1
  require 'zlib'
2
2
 
3
3
  class Feedx::Compression::Gzip < Feedx::Compression::Abstract
4
- class << self
5
- def reader(io, &block)
6
- force_binmode(io)
7
- Zlib::GzipReader.wrap(io, &block)
8
- end
4
+ def reader(io, **, &block)
5
+ force_binmode(io)
6
+ Zlib::GzipReader.wrap(io, &block)
7
+ end
9
8
 
10
- def writer(io, &block)
11
- force_binmode(io)
12
- Zlib::GzipWriter.wrap(io, &block)
13
- end
9
+ def writer(io, **, &block)
10
+ force_binmode(io)
11
+ Zlib::GzipWriter.wrap(io, &block)
12
+ end
14
13
 
15
- private
14
+ private
16
15
 
17
- def force_binmode(io)
18
- if io.respond_to?(:binmode)
19
- io.binmode
20
- elsif io.respond_to?(:set_encoding)
21
- io.set_encoding(Encoding::BINARY)
22
- end
16
+ def force_binmode(io)
17
+ if io.respond_to?(:binmode)
18
+ io.binmode
19
+ elsif io.respond_to?(:set_encoding)
20
+ io.set_encoding(Encoding::BINARY)
23
21
  end
24
22
  end
25
23
  end
@@ -1,9 +1,9 @@
1
1
  class Feedx::Compression::None < Feedx::Compression::Abstract
2
- def self.reader(io, &block)
3
- block.call(io)
2
+ def reader(io, **)
3
+ yield(io)
4
4
  end
5
5
 
6
- def self.writer(io, &block)
7
- block.call(io)
6
+ def writer(io, **)
7
+ yield(io)
8
8
  end
9
9
  end
@@ -16,33 +16,39 @@ module Feedx
16
16
  # @param [Class] klass the record class.
17
17
  # @param [Hash] opts options
18
18
  # @option opts [Symbol,Class<Feedx::Format::Abstract>] :format custom formatter. Default: from file extension.
19
- # @option opts [Hash] :format_options format decode options. Default: {}.
20
19
  # @option opts [Symbol,Class<Feedx::Compression::Abstract>] :compress enable compression. Default: from file extension.
21
20
  # @option opts [Feedx::Cache::Value] :cache cache value to store remote last modified time and consume conditionally.
22
- def initialize(url, klass, **opts)
23
- @klass = klass
24
- @stream = Feedx::Stream.new(url, **opts)
25
- @fmt_opts = opts[:format_options] || {}
26
- @cache = opts[:cache]
21
+ def initialize(url, klass, format_options: {}, cache: nil, **opts)
22
+ @klass = klass
23
+ @url = url
24
+ @opts = opts.merge(format_options)
25
+ @cache = cache
26
+
27
+ return if format_options.empty? || (defined?(Gem::Deprecate) && Gem::Deprecate.skip)
28
+
29
+ warn "WARNING: passing format_options is deprecated; pass the options inline instead (called from #{caller(2..2).first})."
27
30
  end
28
31
 
29
32
  # @return [Boolean] returns true if performed.
30
33
  def each(&block)
34
+ stream = Feedx::Stream.new(@url, **@opts)
31
35
  remote_rev = nil
32
36
 
33
37
  if @cache
34
- metadata = @stream.blob.info.metadata
38
+ metadata = stream.blob.info.metadata
35
39
  local_rev = @cache.read.to_i
36
40
  remote_rev = (metadata[META_LAST_MODIFIED] || metadata[META_LAST_MODIFIED_DC]).to_i
37
41
  return false if remote_rev.positive? && remote_rev <= local_rev
38
42
  end
39
43
 
40
- @stream.open do |fmt|
41
- fmt.decode_each(@klass, **@fmt_opts, &block)
44
+ stream.open do |fmt|
45
+ fmt.decode_each(@klass, **@opts, &block)
42
46
  end
43
47
  @cache.write(remote_rev) if @cache && remote_rev
44
48
 
45
49
  true
50
+ ensure
51
+ stream&.close
46
52
  end
47
53
  end
48
54
  end
@@ -2,13 +2,19 @@ module Feedx
2
2
  module Format
3
3
  autoload :Abstract, 'feedx/format/abstract'
4
4
  autoload :JSON, 'feedx/format/json'
5
+ autoload :Parquet, 'feedx/format/parquet'
5
6
  autoload :Protobuf, 'feedx/format/protobuf'
6
7
 
7
8
  class << self
8
- def register(ext, kind)
9
- raise ArgumentError, "#{kind} is not a subclass of Feedx::Format::Abstract" unless kind.is_a?(Class) && kind < Abstract
9
+ def validate!(kind)
10
+ raise ArgumentError, "#{kind} does not implement #encoder(io, &block)" unless kind.respond_to?(:encoder)
11
+ raise ArgumentError, "#{kind} does not implement #decoder(io, &block)" unless kind.respond_to?(:decoder)
12
+
13
+ kind
14
+ end
10
15
 
11
- registry[ext.to_s] = kind
16
+ def register(ext, kind)
17
+ registry[ext.to_s] = validate!(kind)
12
18
  end
13
19
 
14
20
  def resolve(name)
@@ -33,6 +39,9 @@ module Feedx
33
39
  def registry
34
40
  @registry ||= {
35
41
  'json' => :JSON,
42
+ 'jsonl' => :JSON,
43
+ 'ndjson' => :JSON,
44
+ 'parquet' => :Parquet,
36
45
  'pb' => :Protobuf,
37
46
  'proto' => :Protobuf,
38
47
  'protobuf' => :Protobuf,
@@ -40,13 +49,13 @@ module Feedx
40
49
  end
41
50
 
42
51
  def _resolve(name)
43
- name = name.to_s
44
- klass = registry[name]
45
- if klass.is_a?(Symbol)
46
- klass = const_get(klass)
47
- registry[name.to_s] = klass
52
+ name = name.to_s
53
+ kind = registry[name]
54
+ if kind.is_a?(Symbol)
55
+ kind = const_get(kind).new
56
+ registry[name.to_s] = kind
48
57
  end
49
- klass
58
+ kind
50
59
  end
51
60
  end
52
61
  end
@@ -1,25 +1,54 @@
1
1
  class Feedx::Format::Abstract
2
- def initialize(io)
3
- @io = io
2
+ def decoder(io, **opts, &block)
3
+ self.class::Decoder.open(io, **opts, &block)
4
4
  end
5
5
 
6
- def eof?
7
- @io.eof?
6
+ def encoder(io, **opts, &block)
7
+ self.class::Encoder.open(io, **opts, &block)
8
8
  end
9
9
 
10
- def decode_each(klass, **opts)
11
- if block_given?
12
- yield decode(klass, **opts) until eof?
13
- else
14
- Enumerator.new {|y| y << decode(klass, **opts) until eof? }
10
+ class Wrapper
11
+ def self.open(io, **opts)
12
+ inst = new(io, **opts)
13
+ yield inst
14
+ ensure
15
+ inst&.close
16
+ end
17
+
18
+ def initialize(io, **)
19
+ @io = io
15
20
  end
16
21
  end
17
22
 
18
- def decode(_klass, **)
19
- raise 'Not implemented'
23
+ class Decoder < Wrapper
24
+ def eof?
25
+ @io.eof?
26
+ end
27
+
28
+ def decode_each(target, **opts)
29
+ if block_given?
30
+ yield decode(target, **opts) until eof?
31
+ else
32
+ Enumerator.new do |acc|
33
+ acc << decode(target, **opts) until eof?
34
+ end
35
+ end
36
+ end
37
+
38
+ def decode(_target, **)
39
+ raise 'Not implemented'
40
+ end
41
+
42
+ def close; end
20
43
  end
21
44
 
22
- def encode(_msg, **)
23
- raise 'Not implemented'
45
+ class Encoder < Wrapper
46
+ def encode(_msg, **)
47
+ raise 'Not implemented'
48
+ end
49
+
50
+ def close
51
+ @io.flush if @io.respond_to?(:flush)
52
+ end
24
53
  end
25
54
  end
@@ -1,16 +1,20 @@
1
1
  require 'json'
2
2
 
3
3
  class Feedx::Format::JSON < Feedx::Format::Abstract
4
- def decode(obj, **)
5
- line = @io.gets
6
- return unless line
4
+ class Decoder < Feedx::Format::Abstract::Decoder
5
+ def decode(target, **)
6
+ line = @io.gets
7
+ return unless line
7
8
 
8
- obj = obj.allocate if obj.is_a?(Class)
9
- obj.from_json(line)
10
- obj
9
+ target = target.allocate if target.is_a?(Class)
10
+ target.from_json(line)
11
+ target
12
+ end
11
13
  end
12
14
 
13
- def encode(msg, **opts)
14
- @io.write msg.to_json(**opts) << "\n"
15
+ class Encoder < Feedx::Format::Abstract::Encoder
16
+ def encode(msg, **opts)
17
+ @io.write msg.to_json(**opts) << "\n"
18
+ end
15
19
  end
16
20
  end
@@ -0,0 +1,102 @@
1
+ require 'parquet'
2
+ require 'tmpdir'
3
+
4
+ class Feedx::Format::Parquet < Feedx::Format::Abstract
5
+ class Record < Arrow::Record
6
+ def each_pair
7
+ container.columns.each do |col|
8
+ yield col.name, col[index]
9
+ end
10
+ end
11
+ end
12
+
13
+ class Decoder < Feedx::Format::Abstract::Decoder
14
+ def initialize(io, **)
15
+ super(io)
16
+
17
+ @table = read_table
18
+ @cursor = 0
19
+ end
20
+
21
+ def eof?
22
+ @cursor >= @table.n_rows
23
+ end
24
+
25
+ def decode(target, **)
26
+ return if eof?
27
+
28
+ rec = Record.new(@table, @cursor)
29
+ @cursor += 1
30
+
31
+ target = target.allocate if target.is_a?(Class)
32
+ target.from_parquet(rec)
33
+ target
34
+ end
35
+
36
+ private
37
+
38
+ def read_table
39
+ tmpname = ::Dir::Tmpname.create('feedx-parquet') {|path, *| path }
40
+ IO.copy_stream(@io, tmpname)
41
+
42
+ @table = Arrow::Table.load(tmpname, format: 'parquet')
43
+ ensure
44
+ unlink!(tmpname) if tmpname
45
+ end
46
+
47
+ def unlink!(tmpname)
48
+ File.unlink(tmpname)
49
+ rescue Errno::ENOENT
50
+ nil
51
+ end
52
+ end
53
+
54
+ class Encoder < Feedx::Format::Abstract::Encoder
55
+ attr_reader :schema
56
+
57
+ def initialize(io, schema:, buffer_size: 1 << 20, batch_size: 10_000)
58
+ super(io)
59
+
60
+ @schema = schema
61
+ @batch_size = batch_size.to_i
62
+ @buffer_size = buffer_size.to_i
63
+
64
+ @tmpname = ::Dir::Tmpname.create('feedx-parquet') {|path, *| path }
65
+ @output = Arrow::FileOutputStream.new(@tmpname, append: false)
66
+ @writer = Parquet::ArrowFileWriter.new(@schema, @output)
67
+ @batch = []
68
+ end
69
+
70
+ def encode(msg, **opts)
71
+ msg = msg.to_parquet(@schema, **opts) if msg.respond_to?(:to_parquet)
72
+
73
+ res = @batch.push(msg)
74
+ flush_table if @batch.size >= @batch_size
75
+ res
76
+ end
77
+
78
+ def close
79
+ flush_table unless @batch.empty?
80
+
81
+ @writer.close
82
+ @output.close
83
+ IO.copy_stream(@tmpname, @io)
84
+ ensure
85
+ unlink!
86
+ end
87
+
88
+ private
89
+
90
+ def flush_table
91
+ table = Arrow::RecordBatch.new(@schema, @batch).to_table
92
+ @writer.write_table table, @buffer_size
93
+ @batch.clear
94
+ end
95
+
96
+ def unlink!
97
+ File.unlink(@tmpname)
98
+ rescue Errno::ENOENT
99
+ nil
100
+ end
101
+ end
102
+ end
@@ -1,16 +1,24 @@
1
1
  require 'pbio'
2
2
 
3
3
  class Feedx::Format::Protobuf < Feedx::Format::Abstract
4
- def initialize(io)
5
- super PBIO::Delimited.new(io)
6
- end
4
+ class Decoder < Feedx::Format::Abstract::Decoder
5
+ def initialize(io, **opts)
6
+ super PBIO::Delimited.new(io), **opts
7
+ end
7
8
 
8
- def decode(klass, **)
9
- @io.read(klass)
9
+ def decode(target, **)
10
+ @io.read(target)
11
+ end
10
12
  end
11
13
 
12
- def encode(msg, **opts)
13
- msg = msg.to_pb(**opts) if msg.respond_to?(:to_pb)
14
- @io.write msg
14
+ class Encoder < Feedx::Format::Abstract::Encoder
15
+ def initialize(io, **opts)
16
+ super PBIO::Delimited.new(io), **opts
17
+ end
18
+
19
+ def encode(msg, **opts)
20
+ msg = msg.to_pb(**opts) if msg.respond_to?(:to_pb)
21
+ @io.write msg
22
+ end
15
23
  end
16
24
  end