outatime 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9413cb4ec7dfd1657acd3b8f031fc03b74c9ca9c
4
- data.tar.gz: d880faa841f6e506d602c22cbb5ad76027f2f1ff
3
+ metadata.gz: ed16328dea0d2e300283d1982a891c17a23341bb
4
+ data.tar.gz: b2bd09379a2fe1ef252ab34894f88fbcb33d468a
5
5
  SHA512:
6
- metadata.gz: d258fd21d762e66e887507f5630af69e86cd1532633b64b115e1c6d78767e94db640c5bbf13923ad8f924ba3afd4a5032d26ad637bf8eec09b01961f02420326
7
- data.tar.gz: f8044e58e7f1ca9a4acec8551a663b48612f72837bed2d3d112557bd7cc765758673a171b329ed73d229f0988114c89943195cf014ae60a5c002ec2536be134e
6
+ metadata.gz: 30b90838d71c9d13f5dde1b1e1b512c324b7c5e9b349b6af99593d8e943834a84ebc1d13bb570dd549f21e0698bf8e9209d00cc12860c49d3a05fb1228e5955c
7
+ data.tar.gz: bb2152657dd2c26d0501b9814b97e5242f433aae73ea955a96e43218f0ecd1b6063b4deb1e3a3c2695ccf134fe1daf4ed34b11e47eafc53651c68c17d04ad2c1
data/lib/outatime/cli.rb CHANGED
@@ -22,7 +22,7 @@ module Outatime
22
22
  def run
23
23
  fetcher = Outatime::Fetcher.new(options)
24
24
 
25
- pb = ProgressBar.create(total: fetcher.total_size,
25
+ pb = ProgressBar.create(total: nil,
26
26
  format: "%t: |%B| %f %c/%C %R MB/sec",
27
27
  rate_scale: lambda { |rate| rate / 1024 / 1024 },
28
28
  throttle_rate: 0.5)
@@ -1,3 +1,45 @@
1
+ require 'thread/pool'
2
+
3
+ # Outatime::Fetcher is responsible for finding the exact revision for each file
4
+ # for a given time.
5
+ #
6
+ # AWS S3 API lists all file revisions in a very particular order and this
7
+ # class takes advantage of that to quickly parse and find the revision.
8
+ #
9
+ # The 'GET Bucket Object Versions'
10
+ # (http://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketGETVersion.html)
11
+ # returns all file revisions ordered by key (path + filename) and last
12
+ # modified time.
13
+ #
14
+ # For example:
15
+ #
16
+ # id: 12456, key: "src/file1", last_modified: 7 Feb 11:38
17
+ # id: 12357, key: "src/file1", last_modified: 7 Feb 11:37
18
+ # id: 12357, key: "src/file1", last_modified: 7 Feb 11:00
19
+ # id: 22222, key: "src/file2", last_modified: 7 Feb 11:39
20
+ # ...
21
+ #
22
+ # Keep in mind that this response is paginated, where the max amount of
23
+ # revisions per page is 1000.
24
+ #
25
+ # When a versioned bucket contains a huge amount of files and revisions,
26
+ # fetching all data may take a long time. So Outatime::Fetcher starts
27
+ # downloading the correct revision even when not all file revisions have
28
+ # been fetched yet, i.e. when it is still downloading the revisions and their
29
+ # data. That accelerates the process.
30
+ #
31
+ # So how do we know when we have all the information we need before we start downloading
32
+ # a specific file revision? As the response is ordered by key (filename) and
33
+ # timestamps, the fetcher can start downloading if the current response page
34
+ # contains all possible revisions for a file, i.e. its revisions aren't
35
+ # paginated.
36
+ #
37
+ # When the response for a given key (filename) is paginated, the file is not
38
+ # downloaded until all revisions are fetched in the next response page.
39
+ #
40
+ # This algorithm removes the need to fetch all file revisions (which may take
41
+ # several requests) for a versioned bucket before starting to download its files,
42
+ # but acts on each individual file instead.
1
43
  module Outatime
2
44
  class Fetcher
3
45
  attr_accessor :options
@@ -15,10 +57,10 @@ module Outatime
15
57
  #
16
58
  def initialize(options = {})
17
59
  @options = options
18
- @files_mutex = Mutex.new
19
60
  @fetch_block_mutex = Mutex.new
20
61
  @s3_client = options[:s3_client] if options[:s3_client]
21
- @from = ::Chronic.parse(@options[:from]) if @options[:from]
62
+ @from = ::Chronic.parse(@options[:from]) if @options[:from]
63
+ @pool = Thread.pool(@options.fetch(:threads, 20))
22
64
 
23
65
  # raise if the date/time was not parsed
24
66
  raise ArgumentError, "The from time was not parseable." if @from.nil?
@@ -31,7 +73,11 @@ module Outatime
31
73
  #
32
74
  # Returns nothing.
33
75
  def fetch!(&block)
34
- fetch_objects(object_versions, &block)
76
+ object_versions do |object_version|
77
+ fetch_object(object_version, &block)
78
+ end
79
+
80
+ @pool.wait(:done)
35
81
  end
36
82
 
37
83
  # Public: Returns the objects total size.
@@ -45,34 +91,29 @@ module Outatime
45
91
  #
46
92
  # Returns an Array of Aws::S3::Types::ObjectVersion.
47
93
  def object_versions
48
- puts "fetching object versions from #{@from}" if verbose?
49
- @files ||= begin
50
- versions = []
51
- delete_markers = []
52
-
53
- s3_client.list_object_versions(bucket: @options[:bucket],
54
- prefix: @options[:prefix]).each do |response|
55
-
56
- versions.concat(filter_future_items(response.versions))
57
- delete_markers.concat(filter_future_items(response.delete_markers))
58
- end
59
-
60
- # keep only the latest versions
61
- # AWS lists the latest versions first, so it should be OK to use uniq here.
62
- versions.uniq! { |obj| obj.key }
63
- delete_markers.uniq! { |obj| obj.key }
64
-
65
- delete_marker_keys = delete_markers.map { |dm| dm.key }
66
-
67
- # check versions to see if we have newer delete_markers
68
- # if so, delete those versions
69
- versions.delete_if do |version|
70
- if dm_index = delete_marker_keys.index(version.key)
71
- if version.last_modified <= delete_markers[dm_index].last_modified
72
- true
94
+ remaining_versions = []
95
+ remaining_delete_markers = []
96
+
97
+ s3_client.list_object_versions(bucket: @options[:bucket],
98
+ prefix: @options[:prefix]).each do |response|
99
+
100
+ versions = remaining_versions.concat(response.versions)
101
+ versions_by_key = versions.group_by {|v| v.key }
102
+ delete_markers = remaining_delete_markers.concat(response.delete_markers)
103
+ delete_markers_by_key = delete_markers.group_by {|v| v.key }
104
+
105
+ versions_by_key.each do |key, versions|
106
+ next if response.next_key_marker == key
107
+ filter_items(versions).each do |version|
108
+ dl_marker = filter_items(Array(delete_markers_by_key[version.key])).first
109
+ if dl_marker.nil? || (version.last_modified > dl_marker.last_modified)
110
+ yield version
73
111
  end
74
112
  end
75
113
  end
114
+
115
+ remaining_versions = Array(versions_by_key[response.next_key_marker])
116
+ remaining_delete_markers = Array(delete_markers_by_key[response.next_key_marker])
76
117
  end
77
118
  end
78
119
 
@@ -90,33 +131,24 @@ module Outatime
90
131
  # files - an Array of Aws::S3::Types::ObjectVersion.
91
132
  #
92
133
  # Returns nothing.
93
- def fetch_objects(files)
94
- threads = []
95
-
96
- @options[:threads].times do
97
- threads << Thread.new do
98
- while !(file = @files_mutex.synchronize { files.pop }).nil? do
99
- dest = Pathname.new("#{@options[:destination]}/#{file.key}")
100
-
101
- if file.key.end_with?("/")
102
- puts "Creating s3 subdirectory #{file.key} - #{Time.now}" if verbose?
103
- dest.mkpath
104
- else
105
- dest.dirname.mkpath
106
-
107
- puts "Copying from s3 #{file.key} - #{Time.now}" if verbose?
108
- s3_client.get_object(response_target: "#{dest}",
109
- bucket: @options[:bucket],
110
- key: file.key,
111
- version_id: file.version_id)
112
- end
113
-
114
- @fetch_block_mutex.synchronize { yield file } if block_given?
115
- end
134
+ def fetch_object(file)
135
+ @pool.process do
136
+ dest = Pathname.new("#{@options[:destination]}/#{file.key}")
137
+ if file.key.end_with?("/")
138
+ puts "Creating s3 subdirectory #{file.key} - #{Time.now}" if verbose?
139
+ dest.mkpath
140
+ else
141
+ dest.dirname.mkpath
142
+
143
+ puts "Copying from s3 #{file.key} - #{Time.now}" if verbose?
144
+ s3_client.get_object(response_target: "#{dest}",
145
+ bucket: @options[:bucket],
146
+ key: file.key,
147
+ version_id: file.version_id)
116
148
  end
117
- end
118
149
 
119
- threads.map(&:join)
150
+ @fetch_block_mutex.synchronize { yield file } if block_given?
151
+ end
120
152
  end
121
153
 
122
154
  # Private: Creates the S3 client instance.
@@ -132,8 +164,8 @@ module Outatime
132
164
  # items - An Array of objects. Object must respond to #last_modified.
133
165
  #
134
166
  # Returns Array.
135
- def filter_future_items(items)
136
- items.keep_if { |obj| obj.last_modified <= @from }
167
+ def filter_items(items)
168
+ items.keep_if { |obj| obj.last_modified <= @from }.uniq {|obj| obj.key }
137
169
  end
138
170
  end
139
171
  end
@@ -1,3 +1,3 @@
1
1
  module Outatime
2
- VERSION = "0.2.1"
2
+ VERSION = "0.3.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: outatime
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Justin Mazzi
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2017-02-01 00:00:00.000000000 Z
13
+ date: 2017-02-07 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: aws-sdk