gh-archive 0.8 → 0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/gh-archive.rb +132 -7
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a4a9b89fb02620499e8b51f5c5e4ba34d0f3bc8b8f3ae4e2e69cba1e027bdb49
4
- data.tar.gz: 6b707eb1bcb37b8a9b03ce36a2d4304e6760def0cc139ffad748a1596046f82f
3
+ metadata.gz: 9e36482fd8eeb76b12db28c1e68ad836d77a5985af6aa0e570a025bcb664b1a0
4
+ data.tar.gz: a91e869a8e3f614e8280f03749bcd31ce0de3c7c4e9dc9dd3d777c1a3f1e0d3f
5
5
  SHA512:
6
- metadata.gz: '0975e354e028e768fb5bc4c17c19cddbc44b706394f60d8e22a64537a34342965bb98fa00365c329c30d189729dadba271ec6b101ff86593affef7e1d34848b3'
7
- data.tar.gz: 95935fb3c27841760a68696c832e02108940f9eef535f5ed89afea9613a7988c622f2649f3feb77b22706250a30cfba30b5f36f2ada617f532e86a0426e052ce
6
+ metadata.gz: 4423afb5e0538be2abbe4ac21aeba3919081f0c61943ad9801add90b85b6f2e9df907836c13e00c6b300b28766c65c5cb2e78bc040393ebd38c9618092f2957d
7
+ data.tar.gz: c0caf8f4e47419744f7694748ed608c2e7da8fc0094477ce96bad211cf0b398ef5d43550fdcfd25ba1bc820cafd470ebc4f7526c39c171ef991bfc5fd2399882
data/lib/gh-archive.rb CHANGED
@@ -9,6 +9,71 @@ require 'thread/promise'
9
9
 
10
10
  require_relative File.expand_path('../gh-archive/events', __FILE__)
11
11
 
12
+ module GHArchive
13
+ class ThreadPool
14
+ def initialize(size)
15
+ @size = size
16
+ @threads = []
17
+ @queue = []
18
+ @mutex = Mutex.new
19
+
20
+ @consumer_thread = Thread.start do
21
+ while !@shutdown || @threads.size > 0 || @queue.size > 0
22
+ sleep 0.1 if @queue.size == 0 || @threads.size == @size
23
+ @threads.delete_if { |t| !t.alive? }
24
+
25
+ if @threads.size < @size && @queue.size > 0
26
+ @mutex.synchronize do
27
+ args, job = @queue.shift
28
+ @threads << Thread.start(*args, &job)
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
34
+
35
+ def process(*args, &block)
36
+ raise "Block expected" unless block_given?
37
+ raise "Can not add jobs while shutting down" if @shutdown
38
+
39
+ @mutex.synchronize do
40
+ @queue << [args, block]
41
+ end
42
+
43
+ return self.enqueued
44
+ end
45
+
46
+ def shutdown
47
+ @shutdown = true
48
+ end
49
+
50
+ def shutdown!
51
+ self.shutdown
52
+ @mutex.synchronize do
53
+ @queue.clear
54
+ end
55
+ end
56
+
57
+ def enqueued
58
+ return @queue.size
59
+ end
60
+
61
+ def shutdown?
62
+ @shutdown
63
+ end
64
+
65
+ def alive?
66
+ @consumer_thread.alive?
67
+ end
68
+
69
+ def wait
70
+ while alive?
71
+ sleep 0.1
72
+ end
73
+ end
74
+ end
75
+ end
76
+
12
77
  module GHAUtils
13
78
  def get_gha_filename(date)
14
79
  return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
@@ -22,9 +87,10 @@ module GHAUtils
22
87
  end
23
88
 
24
89
  def read_gha_file(file)
25
- if file.path.end_with?(".json")
90
+
91
+ if !file.is_a?(StringIO) && file.path.end_with?(".json")
26
92
  content = file.read
27
- elsif file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
93
+ elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
28
94
  content = read_gha_file_content(file)
29
95
  else
30
96
  raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
@@ -56,9 +122,16 @@ class GHAProvider
56
122
  @includes = {}
57
123
  @excludes = {}
58
124
 
125
+ @checkpoint_name = nil
59
126
  @use_json = true
60
127
  end
61
128
 
129
+ def use_checkpoint(filename)
130
+ @checkpoint_name = filename
131
+
132
+ return self
133
+ end
134
+
62
135
  def parse_events
63
136
  @use_json = false
64
137
 
@@ -67,7 +140,10 @@ class GHAProvider
67
140
 
68
141
  def logger=(logger)
69
142
  @logger = logger
143
+
144
+ return self
70
145
  end
146
+ alias :use_logger :logger=
71
147
 
72
148
  def get(date)
73
149
  raise "Not implemented"
@@ -91,11 +167,45 @@ class GHAProvider
91
167
  return self
92
168
  end
93
169
 
170
+ def restore_checkpoint(from)
171
+ if @checkpoint_name && FileTest.exist?(@checkpoint_name)
172
+ # Note that this throws an exception if the file is not readable. This is the intended behavior.
173
+ # As opposed to that, failing to save the checkpoint information just results in a warning on the log.
174
+ loaded_from = Marshal.load(File.read(@checkpoint_name))
175
+ raise "The loaded checkpoint (#{loaded_from}) occurs before the current from date (#{from})" if loaded_from < from
176
+
177
+ @logger.info("Valid checkpoint loaded. Restored execution from #{loaded_from}.")
178
+
179
+ return loaded_from
180
+ else
181
+ return from
182
+ end
183
+ end
184
+
185
+ def update_checkpoint(current_time)
186
+ if @checkpoint_name
187
+ begin
188
+ File.open(@checkpoint_name, "wb") do |f|
189
+ f.write(Marshal.dump(current_time))
190
+ end
191
+ rescue
192
+ @logger.warn(
193
+ "Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
194
+ )
195
+ end
196
+ end
197
+ end
198
+
94
199
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
95
200
  exceptions = []
96
201
 
202
+ from = restore_checkpoint(from)
203
+
97
204
  self.each_time(from, to) do |current_time|
98
205
  events = []
206
+
207
+ update_checkpoint(current_time)
208
+
99
209
  begin
100
210
  events = self.get(current_time)
101
211
  rescue GHAException => e
@@ -131,6 +241,8 @@ class GHAProvider
131
241
  GC.start
132
242
  end
133
243
 
244
+ update_checkpoint(to)
245
+
134
246
  return exceptions
135
247
  end
136
248
 
@@ -142,13 +254,25 @@ class OnlineGHAProvider < GHAProvider
142
254
  def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
143
255
  super()
144
256
 
145
- @max_retries = max_retries
146
- @proactive = proactive
147
- @proactive_pool_size = proactive_pool_size
148
- @pool = Thread.pool(proactive_pool_size)
257
+ self.max_retries(max_retries)
258
+ self.proactive(proactive_pool_size) if proactive
259
+
149
260
  @cache = Cache.new
150
261
  end
151
262
 
263
+ def max_retries(n)
264
+ @max_retries = n
265
+
266
+ return self
267
+ end
268
+
269
+ def proactive(pool_size = 10)
270
+ @proactive = true
271
+ @pool = GHArchive::ThreadPool.new(pool_size)
272
+
273
+ return self
274
+ end
275
+
152
276
  def get(current_time)
153
277
  @max_retries.times do
154
278
  begin
@@ -214,10 +338,11 @@ class OnlineGHAProvider < GHAProvider
214
338
 
215
339
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
216
340
  if @proactive
341
+ real_from = restore_checkpoint(from)
217
342
  any_ready = Thread.promise
218
343
 
219
344
  @logger.info("Proactively scheduling download tasks...")
220
- self.each_time(from, to) do |current_time|
345
+ self.each_time(real_from, to) do |current_time|
221
346
  @pool.process(current_time) do |current_time|
222
347
  cache(current_time)
223
348
  any_ready << true
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gh-archive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.8'
4
+ version: '0.12'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Simone Scalabrino
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-08-14 00:00:00.000000000 Z
11
+ date: 2021-08-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: code-assertions