feedx 0.12.7 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/test.yml +2 -37
  3. data/.golangci.yml +13 -4
  4. data/.rubocop.yml +8 -14
  5. data/.tool-versions +1 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +54 -68
  8. data/Makefile +3 -3
  9. data/README.md +3 -1
  10. data/compression.go +29 -0
  11. data/compression_test.go +73 -61
  12. data/consumer.go +96 -152
  13. data/consumer_test.go +124 -59
  14. data/example_test.go +140 -0
  15. data/feedx.gemspec +2 -10
  16. data/feedx.go +16 -31
  17. data/feedx_ext_test.go +13 -3
  18. data/feedx_test.go +24 -26
  19. data/format.go +29 -19
  20. data/format_test.go +84 -56
  21. data/go.mod +11 -7
  22. data/go.sum +16 -138
  23. data/incremental.go +122 -0
  24. data/incremental_test.go +62 -0
  25. data/lib/feedx/cache/abstract.rb +3 -3
  26. data/lib/feedx/cache/value.rb +6 -6
  27. data/lib/feedx/compression/abstract.rb +2 -2
  28. data/lib/feedx/compression/gzip.rb +4 -4
  29. data/lib/feedx/consumer.rb +8 -8
  30. data/lib/feedx/format/abstract.rb +6 -6
  31. data/lib/feedx/format/json.rb +2 -2
  32. data/lib/feedx/format/protobuf.rb +6 -6
  33. data/lib/feedx/format.rb +1 -3
  34. data/lib/feedx/producer.rb +11 -11
  35. data/lib/feedx/stream.rb +2 -2
  36. data/lib/feedx.rb +2 -3
  37. data/manifest.go +65 -0
  38. data/producer.go +34 -137
  39. data/producer_test.go +46 -60
  40. data/reader.go +142 -41
  41. data/reader_test.go +86 -35
  42. data/scheduler.go +176 -0
  43. data/scheduler_test.go +128 -0
  44. data/writer.go +13 -13
  45. data/writer_test.go +61 -44
  46. metadata +12 -137
  47. data/.github/workflows/lint.yml +0 -18
  48. data/ext/parquet/decoder.go +0 -59
  49. data/ext/parquet/decoder_test.go +0 -88
  50. data/ext/parquet/encoder.go +0 -27
  51. data/ext/parquet/encoder_test.go +0 -70
  52. data/ext/parquet/go.mod +0 -12
  53. data/ext/parquet/go.sum +0 -193
  54. data/ext/parquet/parquet.go +0 -78
  55. data/ext/parquet/parquet_test.go +0 -28
  56. data/ext/parquet/testdata/alltypes_plain.parquet +0 -0
  57. data/lib/feedx/format/parquet.rb +0 -102
  58. data/spec/feedx/cache/memory_spec.rb +0 -23
  59. data/spec/feedx/cache/value_spec.rb +0 -19
  60. data/spec/feedx/compression/gzip_spec.rb +0 -17
  61. data/spec/feedx/compression/none_spec.rb +0 -15
  62. data/spec/feedx/compression_spec.rb +0 -19
  63. data/spec/feedx/consumer_spec.rb +0 -49
  64. data/spec/feedx/format/abstract_spec.rb +0 -21
  65. data/spec/feedx/format/json_spec.rb +0 -27
  66. data/spec/feedx/format/parquet_spec.rb +0 -30
  67. data/spec/feedx/format/protobuf_spec.rb +0 -23
  68. data/spec/feedx/format_spec.rb +0 -21
  69. data/spec/feedx/producer_spec.rb +0 -74
  70. data/spec/feedx/stream_spec.rb +0 -109
  71. data/spec/spec_helper.rb +0 -57
data/producer.go CHANGED
@@ -2,8 +2,6 @@ package feedx
2
2
 
3
3
  import (
4
4
  "context"
5
- "sync/atomic"
6
- "time"
7
5
 
8
6
  "github.com/bsm/bfs"
9
7
  )
@@ -11,146 +9,66 @@ import (
11
9
  // ProduceFunc is a callback which is run by the producer on every iteration.
12
10
  type ProduceFunc func(*Writer) error
13
11
 
14
- // ProducerOptions configure the producer instance.
15
- type ProducerOptions struct {
16
- WriterOptions
17
-
18
- // The interval used by producer to initiate a cycle.
19
- // Default: 1m
20
- Interval time.Duration
21
-
22
- // LastModCheck this function will be called before each push attempt
23
- // to dynamically determine the last modified time.
24
- LastModCheck func(context.Context) (time.Time, error)
25
-
26
- // AfterPush callbacks are triggered after each push cycle, receiving
27
- // the push state and error (if occurred).
28
- AfterPush func(*ProducerPush, error)
29
- }
30
-
31
- func (o *ProducerOptions) norm(name string) {
32
- o.WriterOptions.norm(name)
33
- if o.Interval <= 0 {
34
- o.Interval = time.Minute
35
- }
36
- }
37
-
38
- // ProducerPush contains the state of the last push.
39
- type ProducerPush struct {
40
- // Producer exposes the current producer state.
41
- *Producer
42
- // Updated indicates is the push resulted in an update.
43
- Updated bool
44
- }
45
-
46
- // Producer (continously) produces a feed.
12
+ // Producer instances push data feeds to remote locations.
47
13
  type Producer struct {
48
14
  remote *bfs.Object
49
15
  ownRemote bool
50
-
51
- opt ProducerOptions
52
- ctx context.Context
53
- stop context.CancelFunc
54
- pfn ProduceFunc
55
-
56
- numWritten, lastPush, lastMod int64
57
16
  }
58
17
 
59
18
  // NewProducer inits a new feed producer.
60
- func NewProducer(ctx context.Context, remoteURL string, opt *ProducerOptions, pfn ProduceFunc) (*Producer, error) {
19
+ func NewProducer(ctx context.Context, remoteURL string) (*Producer, error) {
61
20
  remote, err := bfs.NewObject(ctx, remoteURL)
62
21
  if err != nil {
63
22
  return nil, err
64
23
  }
65
24
 
66
- p, err := NewProducerForRemote(ctx, remote, opt, pfn)
67
- if err != nil {
68
- _ = remote.Close()
69
- return nil, err
70
- }
71
- p.ownRemote = true
72
- return p, nil
25
+ pcr := NewProducerForRemote(remote)
26
+ pcr.ownRemote = true
27
+ return pcr, nil
73
28
  }
74
29
 
75
30
  // NewProducerForRemote starts a new feed producer with a remote.
76
- func NewProducerForRemote(ctx context.Context, remote *bfs.Object, opt *ProducerOptions, pfn ProduceFunc) (*Producer, error) {
77
- var o ProducerOptions
78
- if opt != nil {
79
- o = *opt
80
- }
81
- o.norm(remote.Name())
82
-
83
- ctx, stop := context.WithCancel(ctx)
84
- p := &Producer{
85
- remote: remote,
86
- opt: o,
87
- pfn: pfn,
88
- ctx: ctx,
89
- stop: stop,
90
- }
91
-
92
- // run initial push
93
- if _, err := p.push(); err != nil {
94
- _ = p.Close()
95
- return nil, err
96
- }
97
-
98
- // start continuous loop
99
- go p.loop()
100
-
101
- return p, nil
102
- }
103
-
104
- // LastPush returns time of last push attempt.
105
- func (p *Producer) LastPush() time.Time {
106
- return timestamp(atomic.LoadInt64(&p.lastPush)).Time()
107
- }
108
-
109
- // LastModified returns time at which the remote feed was last modified.
110
- func (p *Producer) LastModified() time.Time {
111
- return timestamp(atomic.LoadInt64(&p.lastMod)).Time()
112
- }
113
-
114
- // NumWritten returns the number of values produced during the last push.
115
- func (p *Producer) NumWritten() int {
116
- return int(atomic.LoadInt64(&p.numWritten))
31
+ func NewProducerForRemote(remote *bfs.Object) *Producer {
32
+ return &Producer{remote: remote}
117
33
  }
118
34
 
119
35
  // Close stops the producer.
120
36
  func (p *Producer) Close() error {
121
- p.stop()
122
- if p.ownRemote {
123
- return p.remote.Close()
37
+ if p.ownRemote && p.remote != nil {
38
+ err := p.remote.Close()
39
+ p.remote = nil
40
+ return err
124
41
  }
125
42
  return nil
126
43
  }
127
44
 
128
- func (p *Producer) push() (*ProducerPush, error) {
129
- start := time.Now()
130
- atomic.StoreInt64(&p.lastPush, timestampFromTime(start).Millis())
131
-
132
- // setup write options
133
- wopt := p.opt.WriterOptions
134
- wopt.LastMod = start
135
- if p.opt.LastModCheck != nil {
136
- modTime, err := p.opt.LastModCheck(p.ctx)
137
- if err != nil {
138
- return nil, err
139
- }
140
- wopt.LastMod = modTime
141
- }
45
+ func (p *Producer) Produce(ctx context.Context, version int64, opt *WriterOptions, pfn ProduceFunc) (*Status, error) {
46
+ status := Status{LocalVersion: version}
142
47
 
143
- // retrieve original last modified time, skip if not modified
144
- if rts, err := remoteLastModified(p.ctx, p.remote); err != nil {
48
+ // retrieve previous remote version
49
+ remoteVersion, err := fetchRemoteVersion(ctx, p.remote)
50
+ if err != nil {
145
51
  return nil, err
146
- } else if rts == timestampFromTime(wopt.LastMod) {
147
- return &ProducerPush{Producer: p}, nil
148
52
  }
53
+ status.RemoteVersion = remoteVersion
149
54
 
150
- writer := NewWriter(p.ctx, p.remote, &wopt)
55
+ // skip if not modified
56
+ if skipSync(version, remoteVersion) {
57
+ status.Skipped = true
58
+ return &status, nil
59
+ }
60
+
61
+ // set version for writer
62
+ if opt == nil {
63
+ opt = new(WriterOptions)
64
+ }
65
+ opt.Version = version
66
+
67
+ // init writer and perform
68
+ writer := NewWriter(ctx, p.remote, opt)
151
69
  defer writer.Discard()
152
70
 
153
- if err := p.pfn(writer); err != nil {
71
+ if err := pfn(writer); err != nil {
154
72
  return nil, err
155
73
  }
156
74
 
@@ -158,27 +76,6 @@ func (p *Producer) push() (*ProducerPush, error) {
158
76
  return nil, err
159
77
  }
160
78
 
161
- atomic.StoreInt64(&p.numWritten, int64(writer.NumWritten()))
162
- atomic.StoreInt64(&p.lastMod, timestampFromTime(wopt.LastMod).Millis())
163
- return &ProducerPush{
164
- Producer: p,
165
- Updated: true,
166
- }, nil
167
- }
168
-
169
- func (p *Producer) loop() {
170
- ticker := time.NewTicker(p.opt.Interval)
171
- defer ticker.Stop()
172
-
173
- for {
174
- select {
175
- case <-p.ctx.Done():
176
- return
177
- case <-ticker.C:
178
- state, err := p.push()
179
- if p.opt.AfterPush != nil {
180
- p.opt.AfterPush(state, err)
181
- }
182
- }
183
- }
79
+ status.NumItems = writer.NumWritten()
80
+ return &status, nil
184
81
  }
data/producer_test.go CHANGED
@@ -1,79 +1,65 @@
1
1
  package feedx_test
2
2
 
3
3
  import (
4
- "context"
5
- "sync/atomic"
6
- "time"
4
+ "reflect"
5
+ "testing"
7
6
 
8
7
  "github.com/bsm/bfs"
9
8
  "github.com/bsm/feedx"
10
- . "github.com/bsm/ginkgo"
11
- . "github.com/bsm/gomega"
12
9
  )
13
10
 
14
- var _ = Describe("Producer", func() {
15
- var subject *feedx.Producer
16
- var obj *bfs.Object
17
- var numRuns uint32
18
- var ctx = context.Background()
11
+ func TestProducer(t *testing.T) {
12
+ obj := bfs.NewInMemObject("path/to/file.json")
13
+ defer obj.Close()
19
14
 
20
- setup := func(o *feedx.ProducerOptions) {
21
- var err error
22
- subject, err = feedx.NewProducerForRemote(ctx, obj, o, func(w *feedx.Writer) error {
23
- atomic.AddUint32(&numRuns, 1)
15
+ pcr := feedx.NewProducerForRemote(obj)
16
+ defer pcr.Close()
24
17
 
25
- for i := 0; i < 10; i++ {
26
- if err := w.Encode(seed()); err != nil {
27
- return err
28
- }
29
- }
30
- return nil
31
- })
32
- Expect(err).NotTo(HaveOccurred())
33
- }
34
-
35
- BeforeEach(func() {
36
- atomic.StoreUint32(&numRuns, 0)
37
- obj = bfs.NewInMemObject("path/to/file.jsonz")
18
+ // first attempt
19
+ testProduce(t, pcr, 101, &feedx.Status{
20
+ LocalVersion: 101,
21
+ NumItems: 10,
38
22
  })
39
23
 
40
- AfterEach(func() {
41
- if subject != nil {
42
- Expect(subject.Close()).To(Succeed())
43
- }
24
+ // second attempt
25
+ testProduce(t, pcr, 101, &feedx.Status{
26
+ LocalVersion: 101,
27
+ RemoteVersion: 101,
28
+ Skipped: true,
44
29
  })
45
30
 
46
- It("produces", func() {
47
- setup(nil)
48
-
49
- Expect(subject.LastPush()).To(BeTemporally("~", time.Now(), time.Second))
50
- Expect(subject.LastModified()).To(BeTemporally("~", time.Now(), time.Second))
51
- Expect(subject.NumWritten()).To(Equal(10))
52
- Expect(subject.Close()).To(Succeed())
53
-
54
- info, err := obj.Head(ctx)
55
- Expect(err).NotTo(HaveOccurred())
56
- Expect(info.Size).To(BeNumerically("~", 75, 10))
31
+ // updated version
32
+ testProduce(t, pcr, 134, &feedx.Status{
33
+ LocalVersion: 134,
34
+ RemoteVersion: 101,
35
+ NumItems: 13,
57
36
  })
58
37
 
59
- It("produces with custom last-mod check", func() {
60
- setup(&feedx.ProducerOptions{
61
- Interval: 50 * time.Millisecond,
62
- LastModCheck: func(_ context.Context) (time.Time, error) { return time.Unix(1515151515, 987654321), nil },
63
- })
64
-
65
- firstPush := subject.LastPush()
66
- Expect(firstPush).To(BeTemporally("~", time.Now(), time.Second))
67
- Expect(subject.LastModified()).To(Equal(time.Unix(1515151515, 987000000)))
68
- Expect(subject.NumWritten()).To(Equal(10))
69
- Expect(atomic.LoadUint32(&numRuns)).To(Equal(uint32(1)))
38
+ meta, err := obj.Head(t.Context())
39
+ if err != nil {
40
+ t.Fatal("unexpected error", err)
41
+ }
42
+ if exp := (bfs.Metadata{"X-Feedx-Version": "134"}); !reflect.DeepEqual(exp, meta.Metadata) {
43
+ t.Errorf("expected %#v, got %#v", exp, meta)
44
+ }
45
+ }
70
46
 
71
- info, err := obj.Head(ctx)
72
- Expect(err).NotTo(HaveOccurred())
73
- Expect(info.Size).To(BeNumerically("~", 75, 10))
74
- Expect(info.Metadata).To(HaveKeyWithValue("X-Feedx-Last-Modified", "1515151515987"))
47
+ func testProduce(t *testing.T, pcr *feedx.Producer, version int64, exp *feedx.Status) {
48
+ t.Helper()
75
49
 
76
- Eventually(func() bool { return subject.LastPush().After(firstPush) }).Should(BeTrue())
77
- Expect(atomic.LoadUint32(&numRuns)).To(Equal(uint32(1)))
50
+ status, err := pcr.Produce(t.Context(), version, nil, func(w *feedx.Writer) error {
51
+ for i := int64(0); i < version/10; i++ {
52
+ if err := w.Encode(seed()); err != nil {
53
+ return err
54
+ }
55
+ }
56
+ return nil
78
57
  })
79
- })
58
+ if err != nil {
59
+ t.Fatal("unexpected error", err)
60
+ }
61
+
62
+ if !reflect.DeepEqual(exp, status) {
63
+ t.Errorf("expected %#v, got %#v", exp, status)
64
+ }
65
+ }
data/reader.go CHANGED
@@ -2,8 +2,8 @@ package feedx
2
2
 
3
3
  import (
4
4
  "context"
5
+ "errors"
5
6
  "io"
6
- "time"
7
7
 
8
8
  "github.com/bsm/bfs"
9
9
  )
@@ -30,42 +30,159 @@ func (o *ReaderOptions) norm(name string) {
30
30
 
31
31
  // Reader reads data from a remote feed.
32
32
  type Reader struct {
33
- remote *bfs.Object
34
- opt ReaderOptions
35
- ctx context.Context
36
- num int
33
+ ctx context.Context
34
+ opt *ReaderOptions
37
35
 
38
- br io.ReadCloser // bfs reader
39
- cr io.ReadCloser // compression reader
40
- fd FormatDecoder
36
+ remotes []*bfs.Object
37
+ ownRemotes bool
38
+
39
+ cur *streamReader
40
+ pos int
41
+
42
+ num int64
41
43
  }
42
44
 
43
45
  // NewReader inits a new reader.
44
46
  func NewReader(ctx context.Context, remote *bfs.Object, opt *ReaderOptions) (*Reader, error) {
45
- var o ReaderOptions
46
- if opt != nil {
47
- o = *opt
48
- }
49
- o.norm(remote.Name())
47
+ return MultiReader(ctx, []*bfs.Object{remote}, opt), nil
48
+ }
50
49
 
50
+ // MultiReader inits a new reader for multiple remotes. Remotes are read sequentially as if concatenated.
51
+ // Once all remotes are fully read, Read will return EOF.
52
+ func MultiReader(ctx context.Context, remotes []*bfs.Object, opt *ReaderOptions) *Reader {
51
53
  return &Reader{
52
- remote: remote,
53
- opt: o,
54
- ctx: ctx,
55
- }, nil
54
+ remotes: remotes,
55
+ opt: opt,
56
+ ctx: ctx,
57
+ }
56
58
  }
57
59
 
58
60
  // Read reads raw bytes from the feed.
61
+ // At end of feed, Read returns 0, io.EOF.
59
62
  func (r *Reader) Read(p []byte) (int, error) {
63
+ if !r.ensureCurrent() {
64
+ return 0, io.EOF
65
+ }
66
+
67
+ n, err := r.cur.Read(p)
68
+ if errors.Is(err, io.EOF) {
69
+ if more, err := r.nextRemote(); err != nil {
70
+ return n, err
71
+ } else if more {
72
+ return n, nil // dont return EOF until all remotes read
73
+ }
74
+ }
75
+ return n, err
76
+ }
77
+
78
+ // Decode decodes the next formatted value from the feed.
79
+ // At end of feed, Decode returns io.EOF.
80
+ func (r *Reader) Decode(v interface{}) error {
81
+ if !r.ensureCurrent() {
82
+ return io.EOF
83
+ }
84
+
85
+ err := r.cur.Decode(v)
86
+ if errors.Is(err, io.EOF) {
87
+ if more, err := r.nextRemote(); err != nil {
88
+ return err
89
+ } else if more {
90
+ return r.Decode(v) // start decoding from next remote
91
+ }
92
+ } else if err == nil {
93
+ r.num++
94
+ }
95
+ return err
96
+ }
97
+
98
+ // NumRead returns the number of read values.
99
+ func (r *Reader) NumRead() int64 {
100
+ return r.num
101
+ }
102
+
103
+ // Version returns the version of the remote feed.
104
+ func (r *Reader) Version() (int64, error) {
105
+ var max int64
106
+ for _, remote := range r.remotes {
107
+ v, err := fetchRemoteVersion(r.ctx, remote)
108
+ if err != nil {
109
+ return 0, err
110
+ } else if v > max {
111
+ max = v
112
+ }
113
+ }
114
+
115
+ return max, nil
116
+ }
117
+
118
+ // Close closes the reader.
119
+ func (r *Reader) Close() (err error) {
120
+ if r.cur != nil {
121
+ err = r.cur.Close()
122
+ }
123
+ if r.ownRemotes {
124
+ for _, remote := range r.remotes {
125
+ if e := remote.Close(); e != nil {
126
+ err = errors.Join(err, e)
127
+ }
128
+ }
129
+ }
130
+ return
131
+ }
132
+
133
+ func (r *Reader) ensureCurrent() bool {
134
+ if r.pos >= len(r.remotes) {
135
+ return false
136
+ }
137
+
138
+ if r.cur == nil {
139
+ remote := r.remotes[r.pos]
140
+
141
+ var o ReaderOptions
142
+ if r.opt != nil {
143
+ o = *r.opt
144
+ }
145
+ o.norm(remote.Name())
146
+
147
+ r.cur = &streamReader{
148
+ remote: remote,
149
+ opt: o,
150
+ ctx: r.ctx,
151
+ }
152
+ }
153
+ return true
154
+ }
155
+
156
+ func (r *Reader) nextRemote() (bool, error) {
157
+ if err := r.cur.Close(); err != nil {
158
+ return false, err
159
+ }
160
+ // unset current, increment cursor
161
+ r.cur = nil
162
+ r.pos++
163
+ return r.pos < len(r.remotes), nil
164
+ }
165
+
166
+ type streamReader struct {
167
+ remote *bfs.Object
168
+ opt ReaderOptions
169
+ ctx context.Context
170
+
171
+ br io.ReadCloser // bfs reader
172
+ cr io.ReadCloser // compression reader
173
+ fd FormatDecoder
174
+ }
175
+
176
+ // Read reads raw bytes from the feed.
177
+ func (r *streamReader) Read(p []byte) (int, error) {
60
178
  if err := r.ensureOpen(); err != nil {
61
179
  return 0, err
62
180
  }
63
-
64
181
  return r.cr.Read(p)
65
182
  }
66
183
 
67
184
  // Decode decodes the next formatted value from the feed.
68
- func (r *Reader) Decode(v interface{}) error {
185
+ func (r *streamReader) Decode(v interface{}) error {
69
186
  if err := r.ensureOpen(); err != nil {
70
187
  return err
71
188
  }
@@ -78,47 +195,31 @@ func (r *Reader) Decode(v interface{}) error {
78
195
  r.fd = fd
79
196
  }
80
197
 
81
- if err := r.fd.Decode(v); err != nil {
82
- return err
83
- }
84
-
85
- r.num++
86
- return nil
87
- }
88
-
89
- // NumRead returns the number of read values.
90
- func (r *Reader) NumRead() int {
91
- return r.num
92
- }
93
-
94
- // LastModified returns the last modified time of the remote feed.
95
- func (r *Reader) LastModified() (time.Time, error) {
96
- lastMod, err := remoteLastModified(r.ctx, r.remote)
97
- return lastMod.Time(), err
198
+ return r.fd.Decode(v)
98
199
  }
99
200
 
100
201
  // Close closes the reader.
101
- func (r *Reader) Close() error {
202
+ func (r *streamReader) Close() error {
102
203
  var err error
103
204
  if r.fd != nil {
104
205
  if e := r.fd.Close(); e != nil {
105
- err = e
206
+ err = errors.Join(err, e)
106
207
  }
107
208
  }
108
209
  if r.cr != nil {
109
210
  if e := r.cr.Close(); e != nil {
110
- err = e
211
+ err = errors.Join(err, e)
111
212
  }
112
213
  }
113
214
  if r.br != nil {
114
215
  if e := r.br.Close(); e != nil {
115
- err = e
216
+ err = errors.Join(err, e)
116
217
  }
117
218
  }
118
219
  return err
119
220
  }
120
221
 
121
- func (r *Reader) ensureOpen() error {
222
+ func (r *streamReader) ensureOpen() error {
122
223
  if r.br == nil {
123
224
  br, err := r.remote.Open(r.ctx)
124
225
  if err != nil {