bucket_cake 5.2.1 → 6.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c70c6ebd3e74463c225cf381f435bd9ba555eb2a
4
- data.tar.gz: 40d780925bac1c782eea33810d0c31662f0fee40
3
+ metadata.gz: 2883992442100adbcd532c41921f5054242b8c6c
4
+ data.tar.gz: 5e4d397a504c4a976b0eb5bbbbb27d8c7c83ed99
5
5
  SHA512:
6
- metadata.gz: 1682f264a01266b295152ea1a66cf3f76266a5ae6edcf823ed08c4d0038fe7ac6300a33bb7a1f8419c80fe0ed42ea3945a9cee096ce37ca029c1c653d315efe9
7
- data.tar.gz: cd5a860472517fdc2f0d2b80ce8fc665c2505b2a36615d9efbceb215d969c504c53a35a609f8bfd5ec2b7df300486b990dd961e4036fd885478f6cbb9144b9ad
6
+ metadata.gz: 7868a377f26766dd2416aab145894eb963f588ea92bafa3aaf5364110449af913af0017f7ebfca1b635f5bc0293b5f2c19e7a0096fff4fa4dc983896034d793e
7
+ data.tar.gz: f09f55f39ee24be7ac16e0ca99ba90a3242790fee052c03b0961c6a25b4c91862f06d68d5c87c652df840e21a7f4d4bc26285607c6295e38ac3efe6704b76792
@@ -1,46 +1,91 @@
1
1
  # frozen_string_literal: true
2
2
  module BucketCake
3
3
  class Base
4
- class Range < self
5
- private
4
+ def items
5
+ klass = self.class::PROTOCLASS.call
6
+ Decoder.new(files, klass).items.lazy
7
+ end
6
8
 
7
- def source
8
- Source::Range.new(self.class::FOLDER, cursor)
9
+ private
10
+
11
+ def files
12
+ Enumerator.new do |y|
13
+ keys.each do |key|
14
+ y << bucket.object(key).get.body
15
+ end
9
16
  end
10
17
  end
11
18
 
12
- class Latest < self
13
- def initialize(cursor = nil)
14
- super(cursor)
15
- end
19
+ def folder
20
+ self.class::FOLDER
21
+ end
16
22
 
17
- def has_new_data?
18
- cursor.nil? || cursor != source.latest_cursor
23
+ def bucket
24
+ @bucket ||= Aws::S3::Bucket.new(ENV.fetch('CAKE_DATA_BUCKET'))
25
+ end
26
+
27
+ class Range < self
28
+ include TimeHelper
29
+
30
+ attr_reader :start_time, :end_time
31
+
32
+ def initialize(start_time, end_time)
33
+ assert_time(start_time)
34
+ assert_time(end_time)
35
+ raise 'Invalid time: end must be after start' unless end_time > start_time
36
+
37
+ @start_time = start_time
38
+ @end_time = end_time
19
39
  end
20
40
 
21
41
  private
22
42
 
23
- def source
24
- Source::Latest.new(self.class::FOLDER, cursor)
43
+ def keys
44
+ hour_keys(folder, start_time, end_time)
25
45
  end
26
46
  end
27
47
 
28
- attr_reader :cursor
48
+ class Latest < self
49
+ LATEST_FILE = 'latest.gz'
29
50
 
30
- def initialize(cursor)
31
- raise 'BucketCake: cursor has invalid format!' unless cursor.nil? || cursor =~ CURSOR_REGEXP
32
- @cursor = cursor
33
- end
51
+ private
34
52
 
35
- def items
36
- get source, self.class::PROTOCLASS.call
53
+ def keys
54
+ ["#{folder}/#{LATEST_FILE}"]
55
+ end
37
56
  end
38
57
 
39
- private
58
+ class Realtime < self
59
+ include TimeHelper
60
+
61
+ DEFAULT_LOOKBACK = 24.hours
62
+ CURSOR_REGEX = %r{\A\w+/\d{4}/\d{2}/\d{2}/\d{4}\.gz\z}
63
+
64
+ attr_reader :cursor
65
+
66
+ def initialize(cursor)
67
+ raise 'Invalid cursor format!' unless cursor.nil? || cursor =~ CURSOR_REGEX
68
+ @cursor = cursor || default_cursor
69
+ end
40
70
 
41
- def get(source, klass)
42
- @cursor = source.latest_cursor
43
- Decoder.new(source.zip_files, klass).items.lazy
71
+ def files
72
+ @keys = load_keys
73
+ @cursor = keys.last unless keys.empty?
74
+ super
75
+ end
76
+
77
+ private
78
+
79
+ attr_reader :keys
80
+
81
+ def default_cursor
82
+ minute_cursor(folder, Time.now - DEFAULT_LOOKBACK)
83
+ end
84
+
85
+ def load_keys
86
+ # return all objects after the cursor timestamp
87
+ bucket.objects(marker: cursor, prefix: "#{folder}/").map(&:key)
88
+ end
44
89
  end
45
90
  end
46
91
  end
@@ -1,15 +1,15 @@
1
1
  # frozen_string_literal: true
2
2
  module BucketCake
3
3
  class Decoder
4
- def initialize(zip_files, klass)
5
- @zip_files = zip_files
4
+ def initialize(files, klass)
5
+ @files = files
6
6
  @klass = klass
7
7
  end
8
8
 
9
9
  def items
10
10
  Enumerator.new do |y|
11
- zip_files.each do |zip_data|
12
- unzip_elements(zip_data) do |element|
11
+ files.each do |gzdata|
12
+ unpack(gzdata) do |element|
13
13
  y << klass.decode(element)
14
14
  end
15
15
  end
@@ -18,15 +18,14 @@ module BucketCake
18
18
 
19
19
  private
20
20
 
21
- attr_reader :zip_files, :klass
21
+ attr_reader :files, :klass
22
22
 
23
- def unzip_elements(zip_data)
24
- zip_data_io = StringIO.new(zip_data)
25
- files = Zip::File.new(zip_data_io, true, true)
26
- files.read_from_stream(zip_data_io)
23
+ def unpack(gzdata)
24
+ Zlib::GzipReader.new(gzdata).each_line.each do |line|
25
+ line.strip!
26
+ next if line.empty?
27
27
 
28
- files.each do |entry|
29
- yield entry.get_input_stream.read
28
+ yield Base64.decode64(line)
30
29
  end
31
30
  end
32
31
  end
@@ -11,14 +11,9 @@ module BucketCake
11
11
  PROTOCLASS = -> { Cakeproto::Conversion }
12
12
  end
13
13
 
14
- class CapStates < Base::Range
14
+ class CapStatesLatest < Base::Latest
15
15
  FOLDER = 'cap_states'
16
16
  PROTOCLASS = -> { Cakeproto::CapState }
17
-
18
- class Latest < Base::Latest
19
- FOLDER = 'cap_states'
20
- PROTOCLASS = -> { Cakeproto::CapState }
21
- end
22
17
  end
23
18
  end
24
19
  end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+ module BucketCake
3
+ module Realtime
4
+ class Clicks < Base::Realtime
5
+ FOLDER = 'clicks_rt'
6
+ PROTOCLASS = -> { Cakeproto::Click }
7
+ end
8
+
9
+ class Conversions < Base::Realtime
10
+ FOLDER = 'conversions_rt'
11
+ PROTOCLASS = -> { Cakeproto::Conversion }
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+ module BucketCake
3
+ module TimeHelper
4
+ def hour_cursor(prefix, time)
5
+ "#{prefix}/#{time.utc.strftime('%Y/%m/%d/%H')}.gz"
6
+ end
7
+
8
+ def minute_cursor(prefix, time)
9
+ "#{prefix}/#{time.utc.strftime('%Y/%m/%d/%H%M')}.gz"
10
+ end
11
+
12
+ def hour_keys(prefix, start_time, end_time)
13
+ Enumerator.new do |y|
14
+ while start_time < end_time
15
+ y << hour_cursor(prefix, start_time)
16
+ start_time += 1.hour
17
+ end
18
+ end
19
+ end
20
+
21
+ def assert_time(time)
22
+ raise 'Invalid time: must be a Time object' unless time.is_a?(Time)
23
+ raise 'Invalid time: must be at start of hour' if time.beginning_of_hour != time
24
+ end
25
+ end
26
+ end
@@ -1,4 +1,4 @@
1
1
  # frozen_string_literal: true
2
2
  module BucketCake
3
- VERSION = '5.2.1'
3
+ VERSION = '6.0.0'
4
4
  end
data/lib/bucket_cake.rb CHANGED
@@ -1,11 +1,12 @@
1
1
  # frozen_string_literal: true
2
+ require 'base64'
3
+ require 'zlib'
2
4
  require 'aws-sdk'
3
- require 'zip'
4
5
  require 'google/protobuf'
5
6
  require 'active_support/time'
6
7
 
7
8
  require 'bucket_cake/version'
8
- require 'bucket_cake/source'
9
+ require 'bucket_cake/time_helper'
9
10
  require 'bucket_cake/decoder'
10
11
  require 'bucket_cake/base'
11
12
 
@@ -25,7 +26,4 @@ require 'bucket_cake/proto_ext/maybe_int'
25
26
  require 'bucket_cake/entities'
26
27
  require 'bucket_cake/values'
27
28
  require 'bucket_cake/facts'
28
-
29
- module BucketCake
30
- CURSOR_REGEXP = %r{\A\d{4}/\d{2}/\d{2}/\d{6}\z}
31
- end
29
+ require 'bucket_cake/realtime'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bucket_cake
3
3
  version: !ruby/object:Gem::Version
4
- version: 5.2.1
4
+ version: 6.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - ad2games developers
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-12-09 00:00:00.000000000 Z
11
+ date: 2016-12-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: aws-sdk
@@ -24,20 +24,6 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
- - !ruby/object:Gem::Dependency
28
- name: rubyzip
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :runtime
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
27
  - !ruby/object:Gem::Dependency
42
28
  name: google-protobuf
43
29
  requirement: !ruby/object:Gem::Requirement
@@ -187,7 +173,8 @@ files:
187
173
  - lib/bucket_cake/proto/offers_pb.rb
188
174
  - lib/bucket_cake/proto_ext/maybe_int.rb
189
175
  - lib/bucket_cake/proto_ext/time_with_zone.rb
190
- - lib/bucket_cake/source.rb
176
+ - lib/bucket_cake/realtime.rb
177
+ - lib/bucket_cake/time_helper.rb
191
178
  - lib/bucket_cake/values.rb
192
179
  - lib/bucket_cake/version.rb
193
180
  homepage: https://github.com/ad2games/bucket_cake
@@ -1,68 +0,0 @@
1
- # frozen_string_literal: true
2
- module BucketCake
3
- class Source
4
- def initialize(folder, cursor)
5
- @folder = folder
6
- @cursor = cursor
7
- end
8
-
9
- def zip_files
10
- Enumerator.new do |y|
11
- objects.each do |object|
12
- next unless check_success(object.key)
13
- next unless object.key =~ %r{/part\d{3}\.zip\z}
14
-
15
- io = object.get.body
16
-
17
- # we need to force binary encoding, otherwise binary zip data will be parsed as utf-8
18
- io.set_encoding('BINARY')
19
-
20
- y << io.read
21
- end
22
- end
23
- end
24
-
25
- def latest_cursor
26
- @latest_cursor ||= bucket.object("#{folder}/latest").get.body.read
27
- end
28
-
29
- private
30
-
31
- attr_reader :folder, :cursor
32
-
33
- def check_success(key)
34
- @success_files ||= Set.new
35
- @success_files << key if key.end_with?('/SUCCESS')
36
-
37
- # This relies on the fact that SUCCESS files always come before partNNN.zip files.
38
- # But that is fine because S3 guarantees alphabetical order.
39
- @success_files.include?(File.dirname(key) + '/SUCCESS')
40
- end
41
-
42
- def bucket
43
- @bucket ||= Aws::S3::Bucket.new(ENV.fetch('CAKE_DATA_BUCKET'))
44
- end
45
- end
46
-
47
- class Source::Latest < Source
48
- private
49
-
50
- def objects
51
- bucket.objects(prefix: "#{folder}/#{latest_cursor}/")
52
- end
53
- end
54
-
55
- class Source::Range < Source
56
- private
57
-
58
- def objects
59
- # If we don't have a timestamp in cursor, return everything
60
- return bucket.objects(prefix: "#{folder}/") if cursor.nil?
61
-
62
- # This returns all objects after the current timestamp.
63
- # By using the 'xxx' suffix, none of the objects of the given timestamp are returned.
64
- # S3 ensures alphabetical order of the results, so this works fine for date ranges.
65
- bucket.objects(marker: "#{folder}/#{cursor}/xxx", prefix: "#{folder}/")
66
- end
67
- end
68
- end