outatime 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9413cb4ec7dfd1657acd3b8f031fc03b74c9ca9c
4
- data.tar.gz: d880faa841f6e506d602c22cbb5ad76027f2f1ff
3
+ metadata.gz: ed16328dea0d2e300283d1982a891c17a23341bb
4
+ data.tar.gz: b2bd09379a2fe1ef252ab34894f88fbcb33d468a
5
5
  SHA512:
6
- metadata.gz: d258fd21d762e66e887507f5630af69e86cd1532633b64b115e1c6d78767e94db640c5bbf13923ad8f924ba3afd4a5032d26ad637bf8eec09b01961f02420326
7
- data.tar.gz: f8044e58e7f1ca9a4acec8551a663b48612f72837bed2d3d112557bd7cc765758673a171b329ed73d229f0988114c89943195cf014ae60a5c002ec2536be134e
6
+ metadata.gz: 30b90838d71c9d13f5dde1b1e1b512c324b7c5e9b349b6af99593d8e943834a84ebc1d13bb570dd549f21e0698bf8e9209d00cc12860c49d3a05fb1228e5955c
7
+ data.tar.gz: bb2152657dd2c26d0501b9814b97e5242f433aae73ea955a96e43218f0ecd1b6063b4deb1e3a3c2695ccf134fe1daf4ed34b11e47eafc53651c68c17d04ad2c1
data/lib/outatime/cli.rb CHANGED
@@ -22,7 +22,7 @@ module Outatime
22
22
  def run
23
23
  fetcher = Outatime::Fetcher.new(options)
24
24
 
25
- pb = ProgressBar.create(total: fetcher.total_size,
25
+ pb = ProgressBar.create(total: nil,
26
26
  format: "%t: |%B| %f %c/%C %R MB/sec",
27
27
  rate_scale: lambda { |rate| rate / 1024 / 1024 },
28
28
  throttle_rate: 0.5)
@@ -1,3 +1,45 @@
1
+ require 'thread/pool'
2
+
3
+ # Outatime::Fetcher is responsible for finding the exact revision for each file
4
+ # for a given time.
5
+ #
6
+ # AWS S3 API lists all file revisions in a very particular order and this
7
+ # class takes advantage of that to quickly parse and find the revision.
8
+ #
9
+ # The 'GET Bucket Object Versions'
10
+ # (http://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketGETVersion.html)
11
+ # returns all file revisions ordered by key (path + filename) and last
12
+ # modified time.
13
+ #
14
+ # For example:
15
+ #
16
+ # id: 12456, key: "src/file1", last_modified: 7 Feb 11:38
17
+ # id: 12357, key: "src/file1", last_modified: 7 Feb 11:37
18
+ # id: 12357, key: "src/file1", last_modified: 7 Feb 11:00
19
+ # id: 22222, key: "src/file2", last_modified: 7 Feb 11:39
20
+ # ...
21
+ #
22
+ # Keep in mind that this response is paginated, where the max amount of
23
+ # revisions per page is 1000.
24
+ #
25
+ # When a versioned bucket contains a huge amount of files and revisions,
26
+ # fetching all data may take a long time. So Outatime::Fetcher starts
27
+ # downloading the correct revision even before all file revisions have
27
+ # been fetched, i.e. while it is still downloading the revisions and their
29
+ # data. That accelerates the process.
30
+ #
31
+ # So how do we know when we have all the information we need before starting to download
32
+ # a specific file revision? As the response is ordered by key (filename) and
33
+ # timestamps, fetcher can start downloading if the current response page
34
+ # contains all possible revisions for a file, i.e. its revisions aren't
35
+ # paginated.
36
+ #
37
+ # When the response for a given key (filename) is paginated, the file is not
38
+ # downloaded until all revisions are fetched in the next response page.
39
+ #
40
+ # This algorithm removes the need to fetch all file revisions (which may take
41
+ # several requests) for a versioned bucket before starting to download its files,
42
+ # but acts on each individual file instead.
1
43
  module Outatime
2
44
  class Fetcher
3
45
  attr_accessor :options
@@ -15,10 +57,10 @@ module Outatime
15
57
  #
16
58
  def initialize(options = {})
17
59
  @options = options
18
- @files_mutex = Mutex.new
19
60
  @fetch_block_mutex = Mutex.new
20
61
  @s3_client = options[:s3_client] if options[:s3_client]
21
- @from = ::Chronic.parse(@options[:from]) if @options[:from]
62
+ @from = ::Chronic.parse(@options[:from]) if @options[:from]
63
+ @pool = Thread.pool(@options.fetch(:threads, 20))
22
64
 
23
65
  # raise if the date/time was not parsed
24
66
  raise ArgumentError, "The from time was not parseable." if @from.nil?
@@ -31,7 +73,11 @@ module Outatime
31
73
  #
32
74
  # Returns nothing.
33
75
  def fetch!(&block)
34
- fetch_objects(object_versions, &block)
76
+ object_versions do |object_version|
77
+ fetch_object(object_version, &block)
78
+ end
79
+
80
+ @pool.wait(:done)
35
81
  end
36
82
 
37
83
  # Public: Returns the objects total size.
@@ -45,34 +91,29 @@ module Outatime
45
91
  #
46
92
  # Returns an Array of Aws::S3::Types::ObjectVersion.
47
93
  def object_versions
48
- puts "fetching object versions from #{@from}" if verbose?
49
- @files ||= begin
50
- versions = []
51
- delete_markers = []
52
-
53
- s3_client.list_object_versions(bucket: @options[:bucket],
54
- prefix: @options[:prefix]).each do |response|
55
-
56
- versions.concat(filter_future_items(response.versions))
57
- delete_markers.concat(filter_future_items(response.delete_markers))
58
- end
59
-
60
- # keep only the latest versions
61
- # AWS lists the latest versions first, so it should be OK to use uniq here.
62
- versions.uniq! { |obj| obj.key }
63
- delete_markers.uniq! { |obj| obj.key }
64
-
65
- delete_marker_keys = delete_markers.map { |dm| dm.key }
66
-
67
- # check versions to see if we have newer delete_markers
68
- # if so, delete those versions
69
- versions.delete_if do |version|
70
- if dm_index = delete_marker_keys.index(version.key)
71
- if version.last_modified <= delete_markers[dm_index].last_modified
72
- true
94
+ remaining_versions = []
95
+ remaining_delete_markers = []
96
+
97
+ s3_client.list_object_versions(bucket: @options[:bucket],
98
+ prefix: @options[:prefix]).each do |response|
99
+
100
+ versions = remaining_versions.concat(response.versions)
101
+ versions_by_key = versions.group_by {|v| v.key }
102
+ delete_markers = remaining_delete_markers.concat(response.delete_markers)
103
+ delete_markers_by_key = delete_markers.group_by {|v| v.key }
104
+
105
+ versions_by_key.each do |key, versions|
106
+ next if response.next_key_marker == key
107
+ filter_items(versions).each do |version|
108
+ dl_marker = filter_items(Array(delete_markers_by_key[version.key])).first
109
+ if dl_marker.nil? || (version.last_modified > dl_marker.last_modified)
110
+ yield version
73
111
  end
74
112
  end
75
113
  end
114
+
115
+ remaining_versions = Array(versions_by_key[response.next_key_marker])
116
+ remaining_delete_markers = Array(delete_markers_by_key[response.next_key_marker])
76
117
  end
77
118
  end
78
119
 
@@ -90,33 +131,24 @@ module Outatime
90
131
  # files - an Array of Aws::S3::Types::ObjectVersion.
91
132
  #
92
133
  # Returns nothing.
93
- def fetch_objects(files)
94
- threads = []
95
-
96
- @options[:threads].times do
97
- threads << Thread.new do
98
- while !(file = @files_mutex.synchronize { files.pop }).nil? do
99
- dest = Pathname.new("#{@options[:destination]}/#{file.key}")
100
-
101
- if file.key.end_with?("/")
102
- puts "Creating s3 subdirectory #{file.key} - #{Time.now}" if verbose?
103
- dest.mkpath
104
- else
105
- dest.dirname.mkpath
106
-
107
- puts "Copying from s3 #{file.key} - #{Time.now}" if verbose?
108
- s3_client.get_object(response_target: "#{dest}",
109
- bucket: @options[:bucket],
110
- key: file.key,
111
- version_id: file.version_id)
112
- end
113
-
114
- @fetch_block_mutex.synchronize { yield file } if block_given?
115
- end
134
+ def fetch_object(file)
135
+ @pool.process do
136
+ dest = Pathname.new("#{@options[:destination]}/#{file.key}")
137
+ if file.key.end_with?("/")
138
+ puts "Creating s3 subdirectory #{file.key} - #{Time.now}" if verbose?
139
+ dest.mkpath
140
+ else
141
+ dest.dirname.mkpath
142
+
143
+ puts "Copying from s3 #{file.key} - #{Time.now}" if verbose?
144
+ s3_client.get_object(response_target: "#{dest}",
145
+ bucket: @options[:bucket],
146
+ key: file.key,
147
+ version_id: file.version_id)
116
148
  end
117
- end
118
149
 
119
- threads.map(&:join)
150
+ @fetch_block_mutex.synchronize { yield file } if block_given?
151
+ end
120
152
  end
121
153
 
122
154
  # Private: Creates the S3 client instance.
@@ -132,8 +164,8 @@ module Outatime
132
164
  # items - An Array of objects. Object must respond to #last_modified.
133
165
  #
134
166
  # Returns Array.
135
- def filter_future_items(items)
136
- items.keep_if { |obj| obj.last_modified <= @from }
167
+ def filter_items(items)
168
+ items.keep_if { |obj| obj.last_modified <= @from }.uniq {|obj| obj.key }
137
169
  end
138
170
  end
139
171
  end
@@ -1,3 +1,3 @@
1
1
  module Outatime
2
- VERSION = "0.2.1"
2
+ VERSION = "0.3.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: outatime
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Justin Mazzi
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2017-02-01 00:00:00.000000000 Z
13
+ date: 2017-02-07 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: aws-sdk