gh-archive 0.8 → 0.12

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/gh-archive.rb +132 -7
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a4a9b89fb02620499e8b51f5c5e4ba34d0f3bc8b8f3ae4e2e69cba1e027bdb49
4
- data.tar.gz: 6b707eb1bcb37b8a9b03ce36a2d4304e6760def0cc139ffad748a1596046f82f
3
+ metadata.gz: 9e36482fd8eeb76b12db28c1e68ad836d77a5985af6aa0e570a025bcb664b1a0
4
+ data.tar.gz: a91e869a8e3f614e8280f03749bcd31ce0de3c7c4e9dc9dd3d777c1a3f1e0d3f
5
5
  SHA512:
6
- metadata.gz: '0975e354e028e768fb5bc4c17c19cddbc44b706394f60d8e22a64537a34342965bb98fa00365c329c30d189729dadba271ec6b101ff86593affef7e1d34848b3'
7
- data.tar.gz: 95935fb3c27841760a68696c832e02108940f9eef535f5ed89afea9613a7988c622f2649f3feb77b22706250a30cfba30b5f36f2ada617f532e86a0426e052ce
6
+ metadata.gz: 4423afb5e0538be2abbe4ac21aeba3919081f0c61943ad9801add90b85b6f2e9df907836c13e00c6b300b28766c65c5cb2e78bc040393ebd38c9618092f2957d
7
+ data.tar.gz: c0caf8f4e47419744f7694748ed608c2e7da8fc0094477ce96bad211cf0b398ef5d43550fdcfd25ba1bc820cafd470ebc4f7526c39c171ef991bfc5fd2399882
data/lib/gh-archive.rb CHANGED
@@ -9,6 +9,71 @@ require 'thread/promise'
9
9
 
10
10
  require_relative File.expand_path('../gh-archive/events', __FILE__)
11
11
 
12
+ module GHArchive
13
+ class ThreadPool
14
+ def initialize(size)
15
+ @size = size
16
+ @threads = []
17
+ @queue = []
18
+ @mutex = Mutex.new
19
+
20
+ @consumer_thread = Thread.start do
21
+ while !@shutdown || @threads.size > 0 || @queue.size > 0
22
+ sleep 0.1 if @queue.size == 0 || @threads.size == @size
23
+ @threads.delete_if { |t| !t.alive? }
24
+
25
+ if @threads.size < @size && @queue.size > 0
26
+ @mutex.synchronize do
27
+ args, job = @queue.shift
28
+ @threads << Thread.start(*args, &job)
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
34
+
35
+ def process(*args, &block)
36
+ raise "Block expected" unless block_given?
37
+ raise "Can not add jobs while shutting down" if @shutdown
38
+
39
+ @mutex.synchronize do
40
+ @queue << [args, block]
41
+ end
42
+
43
+ return self.enqueued
44
+ end
45
+
46
+ def shutdown
47
+ @shutdown = true
48
+ end
49
+
50
+ def shutdown!
51
+ self.shutdown
52
+ @mutex.synchronize do
53
+ @queue.clear
54
+ end
55
+ end
56
+
57
+ def enqueued
58
+ return @queue.size
59
+ end
60
+
61
+ def shutdown?
62
+ @shutdown
63
+ end
64
+
65
+ def alive?
66
+ @consumer_thread.alive?
67
+ end
68
+
69
+ def wait
70
+ while alive?
71
+ sleep 0.1
72
+ end
73
+ end
74
+ end
75
+ end
76
+
12
77
  module GHAUtils
13
78
  def get_gha_filename(date)
14
79
  return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
@@ -22,9 +87,10 @@ module GHAUtils
22
87
  end
23
88
 
24
89
  def read_gha_file(file)
25
- if file.path.end_with?(".json")
90
+
91
+ if !file.is_a?(StringIO) && file.path.end_with?(".json")
26
92
  content = file.read
27
- elsif file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
93
+ elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
28
94
  content = read_gha_file_content(file)
29
95
  else
30
96
  raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
@@ -56,9 +122,16 @@ class GHAProvider
56
122
  @includes = {}
57
123
  @excludes = {}
58
124
 
125
+ @checkpoint_name = nil
59
126
  @use_json = true
60
127
  end
61
128
 
129
+ def use_checkpoint(filename)
130
+ @checkpoint_name = filename
131
+
132
+ return self
133
+ end
134
+
62
135
  def parse_events
63
136
  @use_json = false
64
137
 
@@ -67,7 +140,10 @@ class GHAProvider
67
140
 
68
141
  def logger=(logger)
69
142
  @logger = logger
143
+
144
+ return self
70
145
  end
146
+ alias :use_logger :logger=
71
147
 
72
148
  def get(date)
73
149
  raise "Not implemented"
@@ -91,11 +167,45 @@ class GHAProvider
91
167
  return self
92
168
  end
93
169
 
170
+ def restore_checkpoint(from)
171
+ if @checkpoint_name && FileTest.exist?(@checkpoint_name)
172
+ # Note that this throws an exception if the file is not readable. This is the intended behavior.
173
+ # As opposed to that, failing to save the checkpoint information just results in a warning on the log.
174
+ loaded_from = Marshal.load(File.read(@checkpoint_name))
175
+ raise "The loaded checkpoint (#{loaded_from}) occurs before the current from date (#{from})" if loaded_from < from
176
+
177
+ @logger.info("Valid checkpoint loaded. Restored execution from #{loaded_from}.")
178
+
179
+ return loaded_from
180
+ else
181
+ return from
182
+ end
183
+ end
184
+
185
+ def update_checkpoint(current_time)
186
+ if @checkpoint_name
187
+ begin
188
+ File.open(@checkpoint_name, "wb") do |f|
189
+ f.write(Marshal.dump(current_time))
190
+ end
191
+ rescue
192
+ @logger.warn(
193
+ "Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
194
+ )
195
+ end
196
+ end
197
+ end
198
+
94
199
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
95
200
  exceptions = []
96
201
 
202
+ from = restore_checkpoint(from)
203
+
97
204
  self.each_time(from, to) do |current_time|
98
205
  events = []
206
+
207
+ update_checkpoint(current_time)
208
+
99
209
  begin
100
210
  events = self.get(current_time)
101
211
  rescue GHAException => e
@@ -131,6 +241,8 @@ class GHAProvider
131
241
  GC.start
132
242
  end
133
243
 
244
+ update_checkpoint(to)
245
+
134
246
  return exceptions
135
247
  end
136
248
 
@@ -142,13 +254,25 @@ class OnlineGHAProvider < GHAProvider
142
254
  def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
143
255
  super()
144
256
 
145
- @max_retries = max_retries
146
- @proactive = proactive
147
- @proactive_pool_size = proactive_pool_size
148
- @pool = Thread.pool(proactive_pool_size)
257
+ self.max_retries(max_retries)
258
+ self.proactive(proactive_pool_size) if proactive
259
+
149
260
  @cache = Cache.new
150
261
  end
151
262
 
263
+ def max_retries(n)
264
+ @max_retries = n
265
+
266
+ return self
267
+ end
268
+
269
+ def proactive(pool_size = 10)
270
+ @proactive = true
271
+ @pool = GHArchive::ThreadPool.new(pool_size)
272
+
273
+ return self
274
+ end
275
+
152
276
  def get(current_time)
153
277
  @max_retries.times do
154
278
  begin
@@ -214,10 +338,11 @@ class OnlineGHAProvider < GHAProvider
214
338
 
215
339
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
216
340
  if @proactive
341
+ real_from = restore_checkpoint(from)
217
342
  any_ready = Thread.promise
218
343
 
219
344
  @logger.info("Proactively scheduling download tasks...")
220
- self.each_time(from, to) do |current_time|
345
+ self.each_time(real_from, to) do |current_time|
221
346
  @pool.process(current_time) do |current_time|
222
347
  cache(current_time)
223
348
  any_ready << true
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gh-archive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.8'
4
+ version: '0.12'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Simone Scalabrino
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-08-14 00:00:00.000000000 Z
11
+ date: 2021-08-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: code-assertions