gh-archive 0.8 → 0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/gh-archive.rb +132 -7
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9e36482fd8eeb76b12db28c1e68ad836d77a5985af6aa0e570a025bcb664b1a0
|
4
|
+
data.tar.gz: a91e869a8e3f614e8280f03749bcd31ce0de3c7c4e9dc9dd3d777c1a3f1e0d3f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4423afb5e0538be2abbe4ac21aeba3919081f0c61943ad9801add90b85b6f2e9df907836c13e00c6b300b28766c65c5cb2e78bc040393ebd38c9618092f2957d
|
7
|
+
data.tar.gz: c0caf8f4e47419744f7694748ed608c2e7da8fc0094477ce96bad211cf0b398ef5d43550fdcfd25ba1bc820cafd470ebc4f7526c39c171ef991bfc5fd2399882
|
data/lib/gh-archive.rb
CHANGED
@@ -9,6 +9,71 @@ require 'thread/promise'
|
|
9
9
|
|
10
10
|
require_relative File.expand_path('../gh-archive/events', __FILE__)
|
11
11
|
|
12
|
+
module GHArchive
  # Minimal fixed-size thread pool.
  #
  # Jobs are queued with #process and started by a single background
  # "consumer" thread, which keeps at most +size+ worker threads alive at
  # any time. The consumer thread exits once #shutdown (or #shutdown!) has
  # been requested, the queue is empty and every worker has finished.
  class ThreadPool
    # @param size [Integer] maximum number of worker threads running at once
    def initialize(size)
      @size = size
      @threads = []
      @queue = []
      @mutex = Mutex.new
      # Set explicitly before the consumer starts so that #shutdown? answers
      # +false+ (not +nil+) on a fresh pool.
      @shutdown = false

      @consumer_thread = Thread.start do
        while !@shutdown || @threads.size > 0 || @queue.size > 0
          # Nothing to start right now: avoid spinning.
          sleep 0.1 if @queue.size == 0 || @threads.size == @size
          @threads.delete_if { |t| !t.alive? }

          next unless @threads.size < @size

          # Take the job under the lock and re-check what was actually taken:
          # #shutdown! may have emptied the queue since the size was read, in
          # which case +shift+ returns nil and there is nothing to start.
          entry = @mutex.synchronize { @queue.shift }
          next unless entry

          args, job = entry
          @threads << Thread.start(*args, &job)
        end
      end
    end

    # Enqueues a job. +args+ are passed to the block when its worker thread
    # is started.
    #
    # @return [Integer] number of jobs currently waiting in the queue
    # @raise [RuntimeError] if no block is given or the pool is shutting down
    def process(*args, &block)
      raise "Block expected" unless block_given?
      raise "Can not add jobs while shutting down" if @shutdown

      @mutex.synchronize do
        @queue << [args, block]
      end

      return self.enqueued
    end

    # Stops accepting new jobs; jobs already queued are still executed.
    def shutdown
      @shutdown = true
    end

    # Stops accepting new jobs and discards the jobs that have not been
    # started yet. Workers that are already running are not interrupted.
    def shutdown!
      self.shutdown
      @mutex.synchronize do
        @queue.clear
      end
    end

    # @return [Integer] number of jobs waiting to be started
    def enqueued
      return @queue.size
    end

    # @return [Boolean] whether a shutdown has been requested
    def shutdown?
      @shutdown
    end

    # @return [Boolean] whether the consumer thread is still running
    def alive?
      @consumer_thread.alive?
    end

    # Blocks, polling every 0.1 seconds, until the consumer thread has
    # terminated. This only happens after a shutdown has been requested.
    def wait
      while alive?
        sleep 0.1
      end
    end
  end
end
|
76
|
+
|
12
77
|
module GHAUtils
|
13
78
|
def get_gha_filename(date)
|
14
79
|
return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
|
@@ -22,9 +87,10 @@ module GHAUtils
|
|
22
87
|
end
|
23
88
|
|
24
89
|
def read_gha_file(file)
|
25
|
-
|
90
|
+
|
91
|
+
if !file.is_a?(StringIO) && file.path.end_with?(".json")
|
26
92
|
content = file.read
|
27
|
-
elsif file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
|
93
|
+
elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
|
28
94
|
content = read_gha_file_content(file)
|
29
95
|
else
|
30
96
|
raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
|
@@ -56,9 +122,16 @@ class GHAProvider
|
|
56
122
|
@includes = {}
|
57
123
|
@excludes = {}
|
58
124
|
|
125
|
+
@checkpoint_name = nil
|
59
126
|
@use_json = true
|
60
127
|
end
|
61
128
|
|
129
|
+
# Sets the path of the checkpoint file consulted by restore_checkpoint and
# written by update_checkpoint.
#
# @param filename [String] path of the checkpoint file
# @return [self] the provider, so that configuration calls can be chained
def use_checkpoint(filename)
  @checkpoint_name = filename
  self
end
|
134
|
+
|
62
135
|
def parse_events
|
63
136
|
@use_json = false
|
64
137
|
|
@@ -67,7 +140,10 @@ class GHAProvider
|
|
67
140
|
|
68
141
|
# Sets the logger used by the provider.
#
# When invoked through the +use_logger+ alias the provider itself is
# returned, so the call can be chained; the attribute-assignment form
# (+provider.logger = x+) evaluates to the assigned value, as every Ruby
# assignment does.
#
# @param logger [Object] logger object to use
# @return [self]
def logger=(logger)
  @logger = logger
  self
end
alias use_logger logger=
|
71
147
|
|
72
148
|
def get(date)
|
73
149
|
raise "Not implemented"
|
@@ -91,11 +167,45 @@ class GHAProvider
|
|
91
167
|
return self
|
92
168
|
end
|
93
169
|
|
170
|
+
# Determines the time from which processing should start.
#
# When a checkpoint file has been configured and exists, the time stored in
# it is loaded and returned; otherwise +from+ is returned unchanged.
#
# @param from [Time] start time requested by the caller
# @return [Time] the checkpointed time, or +from+ when no checkpoint applies
# @raise [RuntimeError] if the checkpointed time is earlier than +from+
def restore_checkpoint(from)
  return from unless @checkpoint_name && FileTest.exist?(@checkpoint_name)

  # Note that this throws an exception if the file is not readable. This is the intended behavior.
  # As opposed to that, failing to save the checkpoint information just results in a warning on the log.
  #
  # The checkpoint is written in binary mode, so it is read back in binary
  # mode as well: a text-mode read may alter the Marshal payload on platforms
  # that translate newlines or stop at an EOF character.
  #
  # NOTE(review): Marshal.load can instantiate arbitrary objects; the
  # checkpoint file must only ever come from a trusted location.
  loaded_from = Marshal.load(File.binread(@checkpoint_name))
  raise "The loaded checkpoint (#{loaded_from}) occurs before the current from date (#{from})" if loaded_from < from

  @logger.info("Valid checkpoint loaded. Restored execution from #{loaded_from}.")

  loaded_from
end
|
184
|
+
|
185
|
+
# Saves +current_time+ to the checkpoint file, if one has been configured.
#
# Saving is best-effort: any StandardError raised while writing is turned
# into a warning on the logger instead of being propagated.
#
# @param current_time [Time] value to serialize with Marshal
def update_checkpoint(current_time)
  return unless @checkpoint_name

  begin
    File.binwrite(@checkpoint_name, Marshal.dump(current_time))
  rescue
    location = File.expand_path(@checkpoint_name)
    @logger.warn("Unable to save the checkpoint at the specified location (#{location}).")
  end
end
|
198
|
+
|
94
199
|
def each(from = Time.gm(2015, 1, 1), to = Time.now)
|
95
200
|
exceptions = []
|
96
201
|
|
202
|
+
from = restore_checkpoint(from)
|
203
|
+
|
97
204
|
self.each_time(from, to) do |current_time|
|
98
205
|
events = []
|
206
|
+
|
207
|
+
update_checkpoint(current_time)
|
208
|
+
|
99
209
|
begin
|
100
210
|
events = self.get(current_time)
|
101
211
|
rescue GHAException => e
|
@@ -131,6 +241,8 @@ class GHAProvider
|
|
131
241
|
GC.start
|
132
242
|
end
|
133
243
|
|
244
|
+
update_checkpoint(to)
|
245
|
+
|
134
246
|
return exceptions
|
135
247
|
end
|
136
248
|
|
@@ -142,13 +254,25 @@ class OnlineGHAProvider < GHAProvider
|
|
142
254
|
def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
|
143
255
|
super()
|
144
256
|
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
@pool = Thread.pool(proactive_pool_size)
|
257
|
+
self.max_retries(max_retries)
|
258
|
+
self.proactive(proactive_pool_size) if proactive
|
259
|
+
|
149
260
|
@cache = Cache.new
|
150
261
|
end
|
151
262
|
|
263
|
+
# Sets how many times +get+ attempts to obtain the data for a given time.
#
# @param n [Integer] number of attempts
# @return [self] the provider, so that configuration calls can be chained
def max_retries(n)
  @max_retries = n
  self
end
|
268
|
+
|
269
|
+
# Turns on proactive mode and creates the thread pool on which +each+
# schedules its download tasks.
#
# @param pool_size [Integer] size passed to GHArchive::ThreadPool
# @return [self] the provider, so that configuration calls can be chained
def proactive(pool_size = 10)
  @proactive = true
  @pool = GHArchive::ThreadPool.new(pool_size)
  self
end
|
275
|
+
|
152
276
|
def get(current_time)
|
153
277
|
@max_retries.times do
|
154
278
|
begin
|
@@ -214,10 +338,11 @@ class OnlineGHAProvider < GHAProvider
|
|
214
338
|
|
215
339
|
def each(from = Time.gm(2015, 1, 1), to = Time.now)
|
216
340
|
if @proactive
|
341
|
+
real_from = restore_checkpoint(from)
|
217
342
|
any_ready = Thread.promise
|
218
343
|
|
219
344
|
@logger.info("Proactively scheduling download tasks...")
|
220
|
-
self.each_time(from, to) do |current_time|
|
345
|
+
self.each_time(real_from, to) do |current_time|
|
221
346
|
@pool.process(current_time) do |current_time|
|
222
347
|
cache(current_time)
|
223
348
|
any_ready << true
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gh-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.8'
|
4
|
+
version: '0.12'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Simone Scalabrino
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-08-
|
11
|
+
date: 2021-08-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: code-assertions
|