gh-archive 0.8 → 0.12
- checksums.yaml +4 -4
- data/lib/gh-archive.rb +132 -7
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9e36482fd8eeb76b12db28c1e68ad836d77a5985af6aa0e570a025bcb664b1a0
+  data.tar.gz: a91e869a8e3f614e8280f03749bcd31ce0de3c7c4e9dc9dd3d777c1a3f1e0d3f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4423afb5e0538be2abbe4ac21aeba3919081f0c61943ad9801add90b85b6f2e9df907836c13e00c6b300b28766c65c5cb2e78bc040393ebd38c9618092f2957d
+  data.tar.gz: c0caf8f4e47419744f7694748ed608c2e7da8fc0094477ce96bad211cf0b398ef5d43550fdcfd25ba1bc820cafd470ebc4f7526c39c171ef991bfc5fd2399882
data/lib/gh-archive.rb
CHANGED
@@ -9,6 +9,71 @@ require 'thread/promise'
 
 require_relative File.expand_path('../gh-archive/events', __FILE__)
 
+module GHArchive
+    class ThreadPool
+        def initialize(size)
+            @size = size
+            @threads = []
+            @queue = []
+            @mutex = Mutex.new
+            
+            @consumer_thread = Thread.start do
+                while !@shutdown || @threads.size > 0 || @queue.size > 0
+                    sleep 0.1 if @queue.size == 0 || @threads.size == @size
+                    @threads.delete_if { |t| !t.alive? }
+                    
+                    if @threads.size < @size && @queue.size > 0
+                        @mutex.synchronize do
+                            args, job = @queue.shift
+                            @threads << Thread.start(*args, &job)
+                        end
+                    end
+                end
+            end
+        end
+        
+        def process(*args, &block)
+            raise "Block expected" unless block_given?
+            raise "Can not add jobs while shutting down" if @shutdown
+            
+            @mutex.synchronize do
+                @queue << [args, block]
+            end
+            
+            return self.enqueued
+        end
+        
+        def shutdown
+            @shutdown = true
+        end
+        
+        def shutdown!
+            self.shutdown
+            @mutex.synchronize do
+                @queue.clear
+            end
+        end
+        
+        def enqueued
+            return @queue.size
+        end
+        
+        def shutdown?
+            @shutdown
+        end
+        
+        def alive?
+            @consumer_thread.alive?
+        end
+        
+        def wait
+            while alive?
+                sleep 0.1
+            end
+        end
+    end
+end
+
 module GHAUtils
     def get_gha_filename(date)
         return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
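The GHArchive::ThreadPool added above runs queued jobs on at most size concurrent threads and takes over the role of the old Thread.pool from the thread gem for proactive downloads. A minimal usage sketch; the pool size and the jobs are illustrative, only the methods shown in the diff are assumed:

require 'gh-archive'

pool = GHArchive::ThreadPool.new(4)   # at most 4 jobs run at the same time

10.times do |i|
    pool.process(i) do |i|            # extra arguments are forwarded to the job block
        sleep 1
        puts "job #{i} finished"
    end
end

pool.shutdown                         # stop accepting new jobs; queued ones still run
pool.wait                             # block until the consumer thread exits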
@@ -22,9 +87,10 @@ module GHAUtils
     end
     
     def read_gha_file(file)
-        if file.path.end_with?(".json")
+        
+        if !file.is_a?(StringIO) && file.path.end_with?(".json")
             content = file.read
-        elsif file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
+        elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
             content = read_gha_file_content(file)
         else
             raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
@@ -56,9 +122,16 @@ class GHAProvider
         @includes = {}
         @excludes = {}
         
+        @checkpoint_name = nil
         @use_json = true
     end
     
+    def use_checkpoint(filename)
+        @checkpoint_name = filename
+        
+        return self
+    end
+    
     def parse_events
         @use_json = false
         
@@ -67,7 +140,10 @@ class GHAProvider
     
     def logger=(logger)
         @logger = logger
+        
+        return self
     end
+    alias :use_logger :logger=
     
     def get(date)
         raise "Not implemented"
@@ -91,11 +167,45 @@ class GHAProvider
         return self
     end
     
+    def restore_checkpoint(from)
+        if @checkpoint_name && FileTest.exist?(@checkpoint_name)
+            # Note that this throws an exception if the file is not readable. This is the intended behavior.
+            # As opposed to that, failing to save the checkpoint information just results in a warning on the log.
+            loaded_from = Marshal.load(File.read(@checkpoint_name))
+            raise "The loaded checkpoint (#{loaded_from}) occurs before the current from date (#{from})" if loaded_from < from
+            
+            @logger.info("Valid checkpoint loaded. Restored execution from #{loaded_from}.")
+            
+            return loaded_from
+        else
+            return from
+        end
+    end
+    
+    def update_checkpoint(current_time)
+        if @checkpoint_name
+            begin
+                File.open(@checkpoint_name, "wb") do |f|
+                    f.write(Marshal.dump(current_time))
+                end
+            rescue
+                @logger.warn(
+                    "Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
+                )
+            end
+        end
+    end
+    
     def each(from = Time.gm(2015, 1, 1), to = Time.now)
         exceptions = []
         
+        from = restore_checkpoint(from)
+        
         self.each_time(from, to) do |current_time|
             events = []
+            
+            update_checkpoint(current_time)
+            
             begin
                 events = self.get(current_time)
             rescue GHAException => e
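The checkpoint methods above let a long-running iteration resume after an interruption: restore_checkpoint moves the start date forward to the last Marshal-dumped timestamp, and update_checkpoint rewrites that file as each hour is processed. A hedged sketch of driving a provider with a checkpoint; the class and method names follow the diff, while the checkpoint file name and date range are only examples:

require 'gh-archive'

provider = OnlineGHAProvider.new
provider.use_checkpoint('gha.checkpoint')   # timestamp is saved here with Marshal.dump

# On a restart, each starts from the saved timestamp instead of the given from date.
provider.each(Time.gm(2021, 1, 1), Time.gm(2021, 1, 2)) do |event|
    # handle one event
end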
@@ -131,6 +241,8 @@ class GHAProvider
             GC.start
         end
         
+        update_checkpoint(to)
+        
         return exceptions
     end
     
@@ -142,13 +254,25 @@ class OnlineGHAProvider < GHAProvider
     def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
         super()
         
-
-
-
-        @pool = Thread.pool(proactive_pool_size)
+        self.max_retries(max_retries)
+        self.proactive(proactive_pool_size) if proactive
+        
         @cache = Cache.new
     end
     
+    def max_retries(n)
+        @max_retries = n
+        
+        return self
+    end
+    
+    def proactive(pool_size = 10)
+        @proactive = true
+        @pool = GHArchive::ThreadPool.new(pool_size)
+        
+        return self
+    end
+    
     def get(current_time)
         @max_retries.times do
             begin
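Because initialize now delegates to the new max_retries and proactive setters, and those setters return self, an OnlineGHAProvider can also be configured fluently after construction. A sketch of that builder-style setup; the concrete values and the logger are illustrative:

require 'gh-archive'
require 'logger'

# Retry failed downloads up to 5 times, pre-download hours with a 20-thread
# GHArchive::ThreadPool, and log to standard output (use_logger is the
# alias for logger= added above).
provider = OnlineGHAProvider.new
    .max_retries(5)
    .proactive(20)
    .use_logger(Logger.new(STDOUT))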
@@ -214,10 +338,11 @@ class OnlineGHAProvider < GHAProvider
     
     def each(from = Time.gm(2015, 1, 1), to = Time.now)
         if @proactive
+            real_from = restore_checkpoint(from)
             any_ready = Thread.promise
             
             @logger.info("Proactively scheduling download tasks...")
-            self.each_time(from, to) do |current_time|
+            self.each_time(real_from, to) do |current_time|
                 @pool.process(current_time) do |current_time|
                     cache(current_time)
                     any_ready << true
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: gh-archive
 version: !ruby/object:Gem::Version
-  version: '0.8'
+  version: '0.12'
 platform: ruby
 authors:
 - Simone Scalabrino
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-08-
+date: 2021-08-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: code-assertions