gh-archive 0.9 → 0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/gh-archive.rb +117 -34
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 74e3037ee1115173176aa974f453f49b7649743f41f83118e9ee180bd620c095
|
4
|
+
data.tar.gz: c4c1ca30210ba39204b28b4b3854e2e93c69003ef1ead0f85506f2fe213f0ee9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 289b568dce07aa1f0182c75d26f7ab286a2b48dbf31c9ee63c6a1ef77bf5a59823b17136984380d6cca123c0be70e6cca3cc3dba216a935354d9312eb93aa2fa
|
7
|
+
data.tar.gz: 5d9ef4ec34a106e3fb2db37ab173c0b0876637941263f356f2e43440c506ad0471906cd4fa0fbd12bf98c84e44ee2bae0408a96ccdef0a5946c13487e5441204
|
data/lib/gh-archive.rb
CHANGED
@@ -9,6 +9,71 @@ require 'thread/promise'
|
|
9
9
|
|
10
10
|
require_relative File.expand_path('../gh-archive/events', __FILE__)
|
11
11
|
|
12
|
+
module GHArchive
|
13
|
+
class ThreadPool
|
14
|
+
def initialize(size)
|
15
|
+
@size = size
|
16
|
+
@threads = []
|
17
|
+
@queue = []
|
18
|
+
@mutex = Mutex.new
|
19
|
+
|
20
|
+
@consumer_thread = Thread.start do
|
21
|
+
while !@shutdown || @threads.size > 0 || @queue.size > 0
|
22
|
+
sleep 0.1 if @queue.size == 0 || @threads.size == @size
|
23
|
+
@threads.delete_if { |t| !t.alive? }
|
24
|
+
|
25
|
+
if @threads.size < @size && @queue.size > 0
|
26
|
+
@mutex.synchronize do
|
27
|
+
args, job = @queue.shift
|
28
|
+
@threads << Thread.start(*args, &job)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
def process(*args, &block)
|
36
|
+
raise "Block expected" unless block_given?
|
37
|
+
raise "Can not add jobs while shutting down" if @shutdown
|
38
|
+
|
39
|
+
@mutex.synchronize do
|
40
|
+
@queue << [args, block]
|
41
|
+
end
|
42
|
+
|
43
|
+
return self.enqueued
|
44
|
+
end
|
45
|
+
|
46
|
+
def shutdown
|
47
|
+
@shutdown = true
|
48
|
+
end
|
49
|
+
|
50
|
+
def shutdown!
|
51
|
+
self.shutdown
|
52
|
+
@mutex.synchronize do
|
53
|
+
@queue.clear
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
def enqueued
|
58
|
+
return @queue.size
|
59
|
+
end
|
60
|
+
|
61
|
+
def shutdown?
|
62
|
+
@shutdown
|
63
|
+
end
|
64
|
+
|
65
|
+
def alive?
|
66
|
+
@consumer_thread.alive?
|
67
|
+
end
|
68
|
+
|
69
|
+
def wait
|
70
|
+
while alive?
|
71
|
+
sleep 0.1
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
12
77
|
module GHAUtils
|
13
78
|
def get_gha_filename(date)
|
14
79
|
return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
|
@@ -22,9 +87,10 @@ module GHAUtils
|
|
22
87
|
end
|
23
88
|
|
24
89
|
def read_gha_file(file)
|
25
|
-
|
90
|
+
|
91
|
+
if !file.is_a?(StringIO) && file.path.end_with?(".json")
|
26
92
|
content = file.read
|
27
|
-
elsif file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
|
93
|
+
elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
|
28
94
|
content = read_gha_file_content(file)
|
29
95
|
else
|
30
96
|
raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
|
@@ -101,9 +167,7 @@ class GHAProvider
|
|
101
167
|
return self
|
102
168
|
end
|
103
169
|
|
104
|
-
def each(from = Time.gm(2015, 1, 1), to = Time.now)
|
105
|
-
exceptions = []
|
106
|
-
|
170
|
+
def restore_checkpoint(from)
|
107
171
|
if @checkpoint_name && FileTest.exist?(@checkpoint_name)
|
108
172
|
# Note that this throws an exception if the file is not readable. This is the intended behavior.
|
109
173
|
# As opposed to that, failing to save the checkpoint information just results in a warning on the log.
|
@@ -111,11 +175,37 @@ class GHAProvider
|
|
111
175
|
raise "The loaded checkpoint (#{loaded_from}) occurs before the current from date (#{from})" if loaded_from < from
|
112
176
|
|
113
177
|
@logger.info("Valid checkpoint loaded. Restored execution from #{loaded_from}.")
|
114
|
-
|
178
|
+
|
179
|
+
return loaded_from
|
180
|
+
else
|
181
|
+
return from
|
115
182
|
end
|
183
|
+
end
|
184
|
+
|
185
|
+
def update_checkpoint(current_time)
|
186
|
+
if @checkpoint_name
|
187
|
+
begin
|
188
|
+
File.open(@checkpoint_name, "wb") do |f|
|
189
|
+
f.write(Marshal.dump(current_time))
|
190
|
+
end
|
191
|
+
rescue
|
192
|
+
@logger.warn(
|
193
|
+
"Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
|
194
|
+
)
|
195
|
+
end
|
196
|
+
end
|
197
|
+
end
|
198
|
+
|
199
|
+
def each(from = Time.gm(2015, 1, 1), to = Time.now)
|
200
|
+
exceptions = []
|
201
|
+
|
202
|
+
from = restore_checkpoint(from)
|
116
203
|
|
117
204
|
self.each_time(from, to) do |current_time|
|
118
205
|
events = []
|
206
|
+
|
207
|
+
update_checkpoint(current_time)
|
208
|
+
|
119
209
|
begin
|
120
210
|
events = self.get(current_time)
|
121
211
|
rescue GHAException => e
|
@@ -127,18 +217,6 @@ class GHAProvider
|
|
127
217
|
next
|
128
218
|
end
|
129
219
|
|
130
|
-
if @checkpoint_name
|
131
|
-
begin
|
132
|
-
File.open(@checkpoint_name, "wb") do |f|
|
133
|
-
f.write(Marshal.dump(current_time))
|
134
|
-
end
|
135
|
-
rescue
|
136
|
-
@logger.warn(
|
137
|
-
"Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
|
138
|
-
)
|
139
|
-
end
|
140
|
-
end
|
141
|
-
|
142
220
|
events.each do |event|
|
143
221
|
skip = false
|
144
222
|
@includes.each do |key, value|
|
@@ -163,17 +241,7 @@ class GHAProvider
|
|
163
241
|
GC.start
|
164
242
|
end
|
165
243
|
|
166
|
-
|
167
|
-
begin
|
168
|
-
File.open(@checkpoint_name, "wb") do |f|
|
169
|
-
f.write(Marshal.dump(to))
|
170
|
-
end
|
171
|
-
rescue
|
172
|
-
@logger.warn(
|
173
|
-
"Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
|
174
|
-
)
|
175
|
-
end
|
176
|
-
end
|
244
|
+
update_checkpoint(to)
|
177
245
|
|
178
246
|
return exceptions
|
179
247
|
end
|
@@ -186,13 +254,25 @@ class OnlineGHAProvider < GHAProvider
|
|
186
254
|
def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
|
187
255
|
super()
|
188
256
|
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
@pool = Thread.pool(proactive_pool_size)
|
257
|
+
self.max_retries(max_retries)
|
258
|
+
self.proactive(proactive_pool_size) if proactive
|
259
|
+
|
193
260
|
@cache = Cache.new
|
194
261
|
end
|
195
262
|
|
263
|
+
def max_retries(n)
|
264
|
+
@max_retries = n
|
265
|
+
|
266
|
+
return self
|
267
|
+
end
|
268
|
+
|
269
|
+
def proactive(pool_size = 10)
|
270
|
+
@proactive = true
|
271
|
+
@pool = GHArchive::ThreadPool.new(pool_size)
|
272
|
+
|
273
|
+
return self
|
274
|
+
end
|
275
|
+
|
196
276
|
def get(current_time)
|
197
277
|
@max_retries.times do
|
198
278
|
begin
|
@@ -252,16 +332,19 @@ class OnlineGHAProvider < GHAProvider
|
|
252
332
|
else
|
253
333
|
raise e
|
254
334
|
end
|
335
|
+
rescue Zlib::GzipFile::Error => e
|
336
|
+
@logger.warn("Could not unzip, cache and analyze the zip at #{current_time}: " + e.message)
|
255
337
|
end
|
256
338
|
end
|
257
339
|
end
|
258
340
|
|
259
341
|
def each(from = Time.gm(2015, 1, 1), to = Time.now)
|
260
342
|
if @proactive
|
343
|
+
real_from = restore_checkpoint(from)
|
261
344
|
any_ready = Thread.promise
|
262
345
|
|
263
346
|
@logger.info("Proactively scheduling download tasks...")
|
264
|
-
self.each_time(from, to) do |current_time|
|
347
|
+
self.each_time(real_from, to) do |current_time|
|
265
348
|
@pool.process(current_time) do |current_time|
|
266
349
|
cache(current_time)
|
267
350
|
any_ready << true
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gh-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.9'
|
4
|
+
version: '0.13'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Simone Scalabrino
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-08-
|
11
|
+
date: 2021-08-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: code-assertions
|
@@ -63,7 +63,7 @@ homepage: https://github.com/intersimone999/gh-archive
|
|
63
63
|
licenses:
|
64
64
|
- GPL-3.0-only
|
65
65
|
metadata: {}
|
66
|
-
post_install_message:
|
66
|
+
post_install_message:
|
67
67
|
rdoc_options: []
|
68
68
|
require_paths:
|
69
69
|
- lib
|
@@ -78,8 +78,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
78
78
|
- !ruby/object:Gem::Version
|
79
79
|
version: '0'
|
80
80
|
requirements: []
|
81
|
-
rubygems_version: 3.2.
|
82
|
-
signing_key:
|
81
|
+
rubygems_version: 3.2.21
|
82
|
+
signing_key:
|
83
83
|
specification_version: 4
|
84
84
|
summary: GitHub Archive mining utility
|
85
85
|
test_files: []
|