gh-archive 0.9 → 0.13
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/gh-archive.rb +117 -34
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 74e3037ee1115173176aa974f453f49b7649743f41f83118e9ee180bd620c095
|
4
|
+
data.tar.gz: c4c1ca30210ba39204b28b4b3854e2e93c69003ef1ead0f85506f2fe213f0ee9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 289b568dce07aa1f0182c75d26f7ab286a2b48dbf31c9ee63c6a1ef77bf5a59823b17136984380d6cca123c0be70e6cca3cc3dba216a935354d9312eb93aa2fa
|
7
|
+
data.tar.gz: 5d9ef4ec34a106e3fb2db37ab173c0b0876637941263f356f2e43440c506ad0471906cd4fa0fbd12bf98c84e44ee2bae0408a96ccdef0a5946c13487e5441204
|
data/lib/gh-archive.rb
CHANGED
@@ -9,6 +9,71 @@ require 'thread/promise'
|
|
9
9
|
|
10
10
|
require_relative File.expand_path('../gh-archive/events', __FILE__)
|
11
11
|
|
12
|
+
module GHArchive
|
13
|
+
class ThreadPool
|
14
|
+
def initialize(size)
|
15
|
+
@size = size
|
16
|
+
@threads = []
|
17
|
+
@queue = []
|
18
|
+
@mutex = Mutex.new
|
19
|
+
|
20
|
+
@consumer_thread = Thread.start do
|
21
|
+
while !@shutdown || @threads.size > 0 || @queue.size > 0
|
22
|
+
sleep 0.1 if @queue.size == 0 || @threads.size == @size
|
23
|
+
@threads.delete_if { |t| !t.alive? }
|
24
|
+
|
25
|
+
if @threads.size < @size && @queue.size > 0
|
26
|
+
@mutex.synchronize do
|
27
|
+
args, job = @queue.shift
|
28
|
+
@threads << Thread.start(*args, &job)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
def process(*args, &block)
|
36
|
+
raise "Block expected" unless block_given?
|
37
|
+
raise "Can not add jobs while shutting down" if @shutdown
|
38
|
+
|
39
|
+
@mutex.synchronize do
|
40
|
+
@queue << [args, block]
|
41
|
+
end
|
42
|
+
|
43
|
+
return self.enqueued
|
44
|
+
end
|
45
|
+
|
46
|
+
def shutdown
|
47
|
+
@shutdown = true
|
48
|
+
end
|
49
|
+
|
50
|
+
def shutdown!
|
51
|
+
self.shutdown
|
52
|
+
@mutex.synchronize do
|
53
|
+
@queue.clear
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
def enqueued
|
58
|
+
return @queue.size
|
59
|
+
end
|
60
|
+
|
61
|
+
def shutdown?
|
62
|
+
@shutdown
|
63
|
+
end
|
64
|
+
|
65
|
+
def alive?
|
66
|
+
@consumer_thread.alive?
|
67
|
+
end
|
68
|
+
|
69
|
+
def wait
|
70
|
+
while alive?
|
71
|
+
sleep 0.1
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
12
77
|
module GHAUtils
|
13
78
|
def get_gha_filename(date)
|
14
79
|
return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
|
@@ -22,9 +87,10 @@ module GHAUtils
|
|
22
87
|
end
|
23
88
|
|
24
89
|
def read_gha_file(file)
|
25
|
-
|
90
|
+
|
91
|
+
if !file.is_a?(StringIO) && file.path.end_with?(".json")
|
26
92
|
content = file.read
|
27
|
-
elsif file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
|
93
|
+
elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
|
28
94
|
content = read_gha_file_content(file)
|
29
95
|
else
|
30
96
|
raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
|
@@ -101,9 +167,7 @@ class GHAProvider
|
|
101
167
|
return self
|
102
168
|
end
|
103
169
|
|
104
|
-
def each(from = Time.gm(2015, 1, 1), to = Time.now)
|
105
|
-
exceptions = []
|
106
|
-
|
170
|
+
def restore_checkpoint(from)
|
107
171
|
if @checkpoint_name && FileTest.exist?(@checkpoint_name)
|
108
172
|
# Note that this throws an exception if the file is not readable. This is the intended behavior.
|
109
173
|
# As opposed to that, failing to save the checkpoint information just results in a warning on the log.
|
@@ -111,11 +175,37 @@ class GHAProvider
|
|
111
175
|
raise "The loaded checkpoint (#{loaded_from}) occurs before the current from date (#{from})" if loaded_from < from
|
112
176
|
|
113
177
|
@logger.info("Valid checkpoint loaded. Restored execution from #{loaded_from}.")
|
114
|
-
|
178
|
+
|
179
|
+
return loaded_from
|
180
|
+
else
|
181
|
+
return from
|
115
182
|
end
|
183
|
+
end
|
184
|
+
|
185
|
+
def update_checkpoint(current_time)
|
186
|
+
if @checkpoint_name
|
187
|
+
begin
|
188
|
+
File.open(@checkpoint_name, "wb") do |f|
|
189
|
+
f.write(Marshal.dump(current_time))
|
190
|
+
end
|
191
|
+
rescue
|
192
|
+
@logger.warn(
|
193
|
+
"Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
|
194
|
+
)
|
195
|
+
end
|
196
|
+
end
|
197
|
+
end
|
198
|
+
|
199
|
+
def each(from = Time.gm(2015, 1, 1), to = Time.now)
|
200
|
+
exceptions = []
|
201
|
+
|
202
|
+
from = restore_checkpoint(from)
|
116
203
|
|
117
204
|
self.each_time(from, to) do |current_time|
|
118
205
|
events = []
|
206
|
+
|
207
|
+
update_checkpoint(current_time)
|
208
|
+
|
119
209
|
begin
|
120
210
|
events = self.get(current_time)
|
121
211
|
rescue GHAException => e
|
@@ -127,18 +217,6 @@ class GHAProvider
|
|
127
217
|
next
|
128
218
|
end
|
129
219
|
|
130
|
-
if @checkpoint_name
|
131
|
-
begin
|
132
|
-
File.open(@checkpoint_name, "wb") do |f|
|
133
|
-
f.write(Marshal.dump(current_time))
|
134
|
-
end
|
135
|
-
rescue
|
136
|
-
@logger.warn(
|
137
|
-
"Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
|
138
|
-
)
|
139
|
-
end
|
140
|
-
end
|
141
|
-
|
142
220
|
events.each do |event|
|
143
221
|
skip = false
|
144
222
|
@includes.each do |key, value|
|
@@ -163,17 +241,7 @@ class GHAProvider
|
|
163
241
|
GC.start
|
164
242
|
end
|
165
243
|
|
166
|
-
|
167
|
-
begin
|
168
|
-
File.open(@checkpoint_name, "wb") do |f|
|
169
|
-
f.write(Marshal.dump(to))
|
170
|
-
end
|
171
|
-
rescue
|
172
|
-
@logger.warn(
|
173
|
-
"Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
|
174
|
-
)
|
175
|
-
end
|
176
|
-
end
|
244
|
+
update_checkpoint(to)
|
177
245
|
|
178
246
|
return exceptions
|
179
247
|
end
|
@@ -186,13 +254,25 @@ class OnlineGHAProvider < GHAProvider
|
|
186
254
|
def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
|
187
255
|
super()
|
188
256
|
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
@pool = Thread.pool(proactive_pool_size)
|
257
|
+
self.max_retries(max_retries)
|
258
|
+
self.proactive(proactive_pool_size) if proactive
|
259
|
+
|
193
260
|
@cache = Cache.new
|
194
261
|
end
|
195
262
|
|
263
|
+
def max_retries(n)
|
264
|
+
@max_retries = n
|
265
|
+
|
266
|
+
return self
|
267
|
+
end
|
268
|
+
|
269
|
+
def proactive(pool_size = 10)
|
270
|
+
@proactive = true
|
271
|
+
@pool = GHArchive::ThreadPool.new(pool_size)
|
272
|
+
|
273
|
+
return self
|
274
|
+
end
|
275
|
+
|
196
276
|
def get(current_time)
|
197
277
|
@max_retries.times do
|
198
278
|
begin
|
@@ -252,16 +332,19 @@ class OnlineGHAProvider < GHAProvider
|
|
252
332
|
else
|
253
333
|
raise e
|
254
334
|
end
|
335
|
+
rescue Zlib::GzipFile::Error => e
|
336
|
+
@logger.warn("Could not unzip, cache and analyze the zip at #{current_time}: " + e.message)
|
255
337
|
end
|
256
338
|
end
|
257
339
|
end
|
258
340
|
|
259
341
|
def each(from = Time.gm(2015, 1, 1), to = Time.now)
|
260
342
|
if @proactive
|
343
|
+
real_from = restore_checkpoint(from)
|
261
344
|
any_ready = Thread.promise
|
262
345
|
|
263
346
|
@logger.info("Proactively scheduling download tasks...")
|
264
|
-
self.each_time(from, to) do |current_time|
|
347
|
+
self.each_time(real_from, to) do |current_time|
|
265
348
|
@pool.process(current_time) do |current_time|
|
266
349
|
cache(current_time)
|
267
350
|
any_ready << true
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gh-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.9'
|
4
|
+
version: '0.13'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Simone Scalabrino
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-08-
|
11
|
+
date: 2021-08-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: code-assertions
|
@@ -63,7 +63,7 @@ homepage: https://github.com/intersimone999/gh-archive
|
|
63
63
|
licenses:
|
64
64
|
- GPL-3.0-only
|
65
65
|
metadata: {}
|
66
|
-
post_install_message:
|
66
|
+
post_install_message:
|
67
67
|
rdoc_options: []
|
68
68
|
require_paths:
|
69
69
|
- lib
|
@@ -78,8 +78,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
78
78
|
- !ruby/object:Gem::Version
|
79
79
|
version: '0'
|
80
80
|
requirements: []
|
81
|
-
rubygems_version: 3.2.
|
82
|
-
signing_key:
|
81
|
+
rubygems_version: 3.2.21
|
82
|
+
signing_key:
|
83
83
|
specification_version: 4
|
84
84
|
summary: GitHub Archive mining utility
|
85
85
|
test_files: []
|