gh-archive 0.16 → 0.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/gh-archive/core.rb +112 -0
- data/lib/gh-archive/downloader.rb +62 -0
- data/lib/gh-archive/entities.rb +1 -0
- data/lib/gh-archive/events.rb +1 -1
- data/lib/gh-archive/job.rb +22 -0
- data/lib/gh-archive/legacy.rb +31 -0
- data/lib/gh-archive/providers.rb +326 -0
- data/lib/gh-archive.rb +6 -496
- metadata +8 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c17a920e176289a00fb21fff7b712dc634c4491241ebe11a3b06f0ddd112706d
+  data.tar.gz: 58e7e4fdc6442d4a0955bc70d0fb10eec05b437350c255e34731021b2d714deb
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b9066428d42acd77376fe72082adc62517c21bf8154b60b0006d7f0ab61fd0679af5775444c180d5cc8e842c99b0c207266e25ea28d4f0cdef33e49259339bb3
+  data.tar.gz: 036eaa0ead55db627ee8bb4f4a7421a525ab0ae73041045ef6d5b13307b7ed54d36e70532658ec3853cac2f21a7e5d2e80313c2fb688723ea87cd689ea469c9c
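For readers who want to check a locally fetched copy of the gem against the digests above, a minimal sketch (the local filenames are assumptions; the published checksums cover the gem's inner metadata.gz and data.tar.gz members, which you can extract from the .gem tarball first):

    require 'digest'

    # Hypothetical paths to the extracted members of gh-archive-0.17.gem.
    %w[metadata.gz data.tar.gz].each do |member|
        puts "#{member} SHA256: #{Digest::SHA256.file(member).hexdigest}"
        puts "#{member} SHA512: #{Digest::SHA512.file(member).hexdigest}"
    end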
data/lib/gh-archive/core.rb ADDED
@@ -0,0 +1,112 @@
+require 'code-assertions'
+require 'json'
+require 'open-uri'
+require 'zlib'
+require 'logger'
+require 'tmpdir'
+require 'thread/pool'
+require 'thread/promise'
+
+module GHArchive
+    class ThreadPool
+        def initialize(size)
+            @size = size
+            @threads = []
+            @queue = []
+            @mutex = Mutex.new
+
+            @consumer_thread = Thread.start do
+                while !@shutdown || @threads.size > 0 || @queue.size > 0
+                    sleep 0.1 if @queue.size == 0 || @threads.size == @size
+                    @threads.delete_if { |t| !t.alive? }
+
+                    if @threads.size < @size && @queue.size > 0
+                        @mutex.synchronize do
+                            args, job = @queue.shift
+                            @threads << Thread.start(*args, &job)
+                        end
+                    end
+                end
+            end
+        end
+
+        def process(*args, &block)
+            raise "Block expected" unless block_given?
+            raise "Can not add jobs while shutting down" if @shutdown
+
+            @mutex.synchronize do
+                @queue << [args, block]
+            end
+
+            return self.enqueued
+        end
+
+        def shutdown
+            @shutdown = true
+        end
+
+        def shutdown!
+            self.shutdown
+            @mutex.synchronize do
+                @queue.clear
+            end
+        end
+
+        def enqueued
+            return @queue.size
+        end
+
+        def shutdown?
+            @shutdown
+        end
+
+        def alive?
+            @consumer_thread.alive?
+        end
+
+        def wait
+            while alive?
+                sleep 0.1
+            end
+        end
+    end
+
+    module Utils
+        def get_gha_filename(date)
+            return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
+        end
+
+        def read_gha_file_content(gz)
+            gzip = Zlib::GzipReader.new(gz)
+            return gzip.read
+        ensure
+            gzip.close if gzip
+        end
+
+        def read_gha_file(file)
+
+            if !file.is_a?(StringIO) && file.path.end_with?(".json")
+                content = file.read
+            elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
+                content = read_gha_file_content(file)
+            else
+                raise "Invalid file extension for #{file.path}: expected `.json.gz` or `.json`."
+            end
+
+            result = []
+            content.lines.each do |line|
+                result << JSON.parse(line)
+            end
+
+            return result
+        end
+
+        def each_time(from, to)
+            current_time = from
+            while current_time < to
+                yield current_time
+                current_time += 3600
+            end
+        end
+    end
+end
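To orient readers to the extracted core, a minimal usage sketch of the two pieces it defines, the Utils mixin and the bundled ThreadPool (the Example class is illustrative, not part of the gem):

    require 'gh-archive'

    class Example
        include GHArchive::Utils
    end

    # Hourly archive filename for a given UTC hour.
    puts Example.new.get_gha_filename(Time.gm(2021, 12, 11, 15))  # => "2021-12-11-15.json.gz"

    # The pool runs at most `size` jobs at a time; shutdown + wait drains it.
    pool = GHArchive::ThreadPool.new(2)
    4.times { |i| pool.process(i) { |n| sleep 0.2; puts "job #{n} done" } }
    pool.shutdown
    pool.wait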
data/lib/gh-archive/downloader.rb ADDED
@@ -0,0 +1,62 @@
+require_relative 'core'
+
+module GHArchive
+    class Downloader
+        include Utils
+
+        def initialize(folder, decompress = false)
+            @logger = Logger.new(STDERR)
+            @decompress = decompress
+            @folder = folder
+            @max = nil
+
+            Dir.mkdir(@folder) unless FileTest.exist?(@folder)
+            raise "A file exists with the desired folder name #{folder}" unless FileTest.directory?(@folder)
+        end
+
+        def max(max)
+            @max = max
+            return self
+        end
+
+        def logger=(logger)
+            @logger = logger
+        end
+
+        def download(from = Time.gm(2015, 1, 1), to = Time.now)
+            archive = []
+            self.each_time(from, to) do |current_time|
+                filename = self.get_gha_filename(current_time)
+                out_filename = filename.clone
+                out_filename.gsub!(".json.gz", ".json") if @decompress
+
+                target_file = File.join(@folder, out_filename)
+                if FileTest.exist?(target_file)
+                    @logger.info("Skipping existing file for #{current_time}")
+                    next
+                else
+                    @logger.info("Downloading file for #{current_time}")
+                end
+
+                File.open(target_file, 'w') do |f|
+                    URI.open("http://data.gharchive.org/#{filename}") do |gz|
+                        if @decompress
+                            f << self.read_gha_file_content(gz)
+                        else
+                            f << gz.read
+                        end
+                    end
+                end
+                archive << target_file
+
+                if @max && archive.size > @max
+                    last = archive.shift
+                    @logger.info("Removing local file #{last}")
+                    File.unlink(last)
+                end
+
+                yield filename if block_given?
+            end
+        end
+    end
+end
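A minimal usage sketch for the extracted downloader (the target folder and time window are assumptions):

    require 'gh-archive'

    # Mirror two hours of the archive into ./gha as plain JSON, pruning the
    # oldest local files once more than 24 accumulate.
    downloader = GHArchive::Downloader.new('gha', true).max(24)
    downloader.download(Time.gm(2021, 12, 11, 0), Time.gm(2021, 12, 11, 2)) do |filename|
        puts "fetched #{filename}"
    end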
data/lib/gh-archive/entities.rb CHANGED
data/lib/gh-archive/events.rb CHANGED
data/lib/gh-archive/job.rb ADDED
@@ -0,0 +1,22 @@
+require_relative 'core'
+require_relative 'providers'
+
+module GHArchive
+    class Job
+        def initialize(provider, from, to)
+            @provider = provider
+            @from = from
+            @to = to
+        end
+
+        def start
+            @provider.each(@from, @to) do |event, time|
+                run(event, time)
+            end
+        end
+
+        def run(event, time)
+            raise Provider::GHAException, "This is an abstract job, it should be implemented before running"
+        end
+    end
+end
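Job is a template: subclass it and override run. A hedged sketch (CounterJob is illustrative; note that the top-level gh-archive.rb does not require job.rb, so it has to be loaded explicitly):

    require 'gh-archive'
    require 'gh-archive/job'  # not pulled in by the top-level require

    # Count push events in a one-hour window by overriding the abstract #run.
    class CounterJob < GHArchive::Job
        def initialize(*args)
            super
            @count = 0
        end

        attr_reader :count

        def run(event, time)
            @count += 1 if event['type'] == 'PushEvent'
        end
    end

    job = CounterJob.new(GHArchive::OnlineProvider.new, Time.gm(2021, 12, 11, 0), Time.gm(2021, 12, 11, 1))
    job.start
    puts job.count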
data/lib/gh-archive/legacy.rb ADDED
@@ -0,0 +1,31 @@
+require_relative 'core'
+
+GHAUtils = GHArchive::Utils
+
+class GHAProvider < GHArchive::Provider
+    def initialize(*args)
+        warn "GHAProvider is deprecated. Please use GHArchive::Provider instead."
+        super
+    end
+end
+
+class OnlineGHAProvider < GHArchive::OnlineProvider
+    def initialize(*args)
+        warn "OnlineGHAProvider is deprecated. Please use GHArchive::OnlineProvider instead."
+        super
+    end
+end
+
+class FolderGHAProvider < GHArchive::FolderProvider
+    def initialize(*args)
+        warn "FolderGHAProvider is deprecated. Please use GHArchive::FolderProvider instead."
+        super
+    end
+end
+
+class GHADownloader < GHArchive::Downloader
+    def initialize(*args)
+        warn "GHADownloader is deprecated. Please use GHArchive::Downloader instead."
+        super
+    end
+end
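The shims keep 0.16 call sites working while steering users toward the namespaced classes; a short sketch:

    require 'gh-archive'

    provider = OnlineGHAProvider.new          # 0.16 name: still works, warns on stderr
    provider = GHArchive::OnlineProvider.new  # equivalent 0.17 spelling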
data/lib/gh-archive/providers.rb ADDED
@@ -0,0 +1,326 @@
+require_relative 'core'
+
+module GHArchive
+    class Provider
+        include Utils
+
+        def initialize
+            @logger = Logger.new(STDOUT)
+
+            @includes = {}
+            @excludes = {}
+
+            @checkpoint_name = nil
+            @use_json = true
+        end
+
+        def use_checkpoint(filename)
+            @checkpoint_name = filename
+
+            return self
+        end
+
+        def parse_events
+            @use_json = false
+
+            return self
+        end
+
+        def logger=(logger)
+            @logger = logger
+
+            return self
+        end
+        alias :use_logger :logger=
+
+        def get(date)
+            raise "Not implemented"
+        end
+
+        def include(**args)
+            args.each do |key, value|
+                @includes[key.to_s] = [] unless @includes[key.to_s]
+                @includes[key.to_s] << value
+            end
+
+            return self
+        end
+
+        def exclude(**args)
+            args.each do |key, value|
+                @excludes[key.to_s] = [] unless @excludes[key.to_s]
+                @excludes[key.to_s] << value
+            end
+
+            return self
+        end
+
+        def restore_checkpoint(from)
+            if @checkpoint_name && FileTest.exist?(@checkpoint_name)
+                # Note that this throws an exception if the file is not readable. This is the intended behavior.
+                # As opposed to that, failing to save the checkpoint information just results in a warning on the log.
+                loaded_from = Marshal.load(File.read(@checkpoint_name))
+                raise "The loaded checkpoint (#{loaded_from}) occurs before the current from date (#{from})" if loaded_from < from
+
+                @logger.info("Valid checkpoint loaded. Restored execution from #{loaded_from}.")
+
+                return loaded_from
+            else
+                return from
+            end
+        end
+
+        def update_checkpoint(current_time)
+            if @checkpoint_name
+                begin
+                    File.open(@checkpoint_name, "wb") do |f|
+                        f.write(Marshal.dump(current_time))
+                    end
+                rescue
+                    @logger.warn(
+                        "Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
+                    )
+                end
+            end
+        end
+
+        def each(from = Time.gm(2015, 1, 1), to = Time.now)
+            exceptions = []
+
+            from = restore_checkpoint(from)
+
+            self.each_time(from, to) do |current_time|
+                events = []
+
+                update_checkpoint(current_time)
+
+                begin
+                    events = self.get(current_time)
+                rescue GHAException => e
+                    @logger.warn(e.message)
+                    next
+                rescue => e
+                    @logger.error("An exception occurred for #{current_time}: #{e.message}")
+                    exceptions << e
+                    next
+                end
+
+                events.each do |event|
+                    skip = false
+                    @includes.each do |key, value|
+                        skip = true unless value.include?(event[key])
+                    end
+
+                    @excludes.each do |key, value|
+                        skip = true if value.include?(event[key])
+                    end
+                    next if skip
+
+                    if @use_json
+                        yield event, current_time
+                    else
+                        yield GHArchive::Event.parse(event), current_time
+                    end
+                end
+
+                @logger.info("Scanned #{current_time}")
+
+                events.clear
+                GC.start
+            end
+
+            update_checkpoint(to)
+
+            return exceptions
+        end
+
+        class GHAException < Exception
+        end
+    end
+
+    class OnlineProvider < Provider
+        def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
+            super()
+
+            self.max_retries(max_retries)
+            self.proactive(proactive_pool_size) if proactive
+
+            @cache = Cache.new
+        end
+
+        def max_retries(n)
+            @max_retries = n
+
+            return self
+        end
+
+        def proactive(pool_size = 10)
+            @proactive = true
+            @pool = GHArchive::ThreadPool.new(pool_size)
+
+            return self
+        end
+
+        def get(current_time)
+            @max_retries.times do
+                begin
+                    filename = self.get_gha_filename(current_time)
+
+                    if @proactive
+                        @logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
+
+                        while !@cache.has?(filename)
+                            sleep 1
+                        end
+
+                        data = @cache.get(filename)
+                        if data
+                            return data
+                        else
+                            raise DownloadArchiveException, "Could not scan #{filename}: data unavailable."
+                        end
+                    else
+                        URI.open("http://data.gharchive.org/#{filename}") do |gz|
+                            return self.read_gha_file(gz)
+                        end
+                    end
+                rescue Errno::ECONNRESET => e
+                    @logger.warn("A server error temporarily prevented the download of #{current_time}: " + e.message)
+                    next
+                rescue OpenURI::HTTPError => e
+                    code = e.io.status[0]
+                    if code.start_with?("5")
+                        @logger.warn("A server error temporarily prevented the download of #{current_time}: " + e.message)
+                        next
+                    else
+                        raise e
+                    end
+                end
+            end
+
+            raise DownloadArchiveException, "Exceeded maximum number of tentative downloads for #{current_time}."
+        end
+
+        def cache(current_time)
+            @logger.info("Full cache. Waiting for some free slot...") if @cache.full?
+            while @cache.full?
+                sleep 1
+            end
+
+            filename = self.get_gha_filename(current_time)
+            @max_retries.times do
+                begin
+                    URI.open("http://data.gharchive.org/#{filename}") do |gz|
+                        content = self.read_gha_file(gz)
+                        @cache.put(filename, content)
+                        return
+                    end
+                rescue Errno::ECONNRESET => e
+                    @logger.warn("A server error temporarily prevented the download of #{current_time}: " + e.message)
+                    next
+                rescue OpenURI::HTTPError => e
+                    code = e.io.status[0]
+                    if code.start_with?("5")
+                        @logger.warn("A server error temporarily prevented the download of #{current_time}: " + e.message)
+                        next
+                    elsif code == "404"
+                        @logger.error("File for #{current_time} not found. Skipping because: " + e.message)
+                    else
+                        raise e
+                    end
+                rescue Zlib::GzipFile::Error => e
+                    @logger.warn("Could not unzip, cache and analyze the zip at #{current_time}: " + e.message)
+                end
+            end
+
+            @cache.put(filename, nil) unless @cache.has?(filename)
+        end
+
+        def each(from = Time.gm(2015, 1, 1), to = Time.now)
+            if @proactive
+                real_from = restore_checkpoint(from)
+                any_ready = Thread.promise
+
+                @logger.info("Proactively scheduling download tasks...")
+                self.each_time(real_from, to) do |current_time|
+                    @pool.process(current_time) do |current_time|
+                        cache(current_time)
+                        any_ready << true
+                        @logger.info("Proactively cached #{current_time}. Cache size: #{@cache.size}")
+                    end
+                end
+
+                ~any_ready
+                @logger.info("Download tasks successfully scheduled!")
+            end
+
+            super
+        end
+
+        class Cache
+            def initialize(max_size = 10)
+                @cache = {}
+                @max_size = max_size
+                @mutex = Mutex.new
+            end
+
+            def put(name, content)
+                @mutex.synchronize do
+                    @cache[name] = content
+                end
+            end
+
+            def get(name)
+                @mutex.synchronize do
+                    return @cache.delete(name)
+                end
+            end
+
+            def size
+                @mutex.synchronize do
+                    return @cache.size
+                end
+            end
+
+            def has?(name)
+                @mutex.synchronize do
+                    return @cache.has_key?(name)
+                end
+            end
+
+            def full?
+                self.size >= @max_size
+            end
+        end
+
+        class DownloadArchiveException < Provider::GHAException
+        end
+    end
+
+    class FolderProvider < Provider
+        def initialize(folder)
+            super()
+
+            @folder = folder
+        end
+
+        def get(current_time)
+            filename = self.get_gha_filename(current_time)
+            complete_filename = File.join(@folder, filename)
+            mode = "rb"
+
+            unless FileTest.exist?(complete_filename)
+                complete_filename = complete_filename.sub(".gz", "")
+                mode = "r"
+            end
+
+            unless FileTest.exist?(complete_filename)
+                raise GHAException.new("Cannot find any file (neither `.json.gz` nor `.json`) for #{current_time}")
+            end
+
+            File.open(complete_filename, mode) do |file|
+                return self.read_gha_file(file)
+            end
+        end
+    end
+end
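A usage sketch of the provider pipeline as a whole (the checkpoint filename and time window are assumptions):

    require 'gh-archive'

    # Proactive mode pre-downloads upcoming hours on a pool of 10 threads.
    provider = GHArchive::OnlineProvider.new(3, true, 10)
    provider.include(type: 'PullRequestEvent')
    provider.use_checkpoint('checkpoint.dat')

    exceptions = provider.each(Time.gm(2021, 12, 11, 0), Time.gm(2021, 12, 11, 3)) do |event, time|
        puts "#{time}: PR event on #{event['repo']['name']}"
    end
    warn "#{exceptions.size} hour(s) could not be scanned" unless exceptions.empty?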
data/lib/gh-archive.rb CHANGED
@@ -1,496 +1,6 @@
-require 'code-assertions'
-require 'json'
-require 'open-uri'
-require 'zlib'
-require 'logger'
-require 'tmpdir'
-require 'thread/pool'
-require 'thread/promise'
-
-require_relative File.expand_path('../gh-archive/events', __FILE__)
-
-module GHArchive
-    class ThreadPool
-        def initialize(size)
-            @size = size
-            @threads = []
-            @queue = []
-            @mutex = Mutex.new
-
-            @consumer_thread = Thread.start do
-                while !@shutdown || @threads.size > 0 || @queue.size > 0
-                    sleep 0.1 if @queue.size == 0 || @threads.size == @size
-                    @threads.delete_if { |t| !t.alive? }
-
-                    if @threads.size < @size && @queue.size > 0
-                        @mutex.synchronize do
-                            args, job = @queue.shift
-                            @threads << Thread.start(*args, &job)
-                        end
-                    end
-                end
-            end
-        end
-
-        def process(*args, &block)
-            raise "Block expected" unless block_given?
-            raise "Can not add jobs while shutting down" if @shutdown
-
-            @mutex.synchronize do
-                @queue << [args, block]
-            end
-
-            return self.enqueued
-        end
-
-        def shutdown
-            @shutdown = true
-        end
-
-        def shutdown!
-            self.shutdown
-            @mutex.synchronize do
-                @queue.clear
-            end
-        end
-
-        def enqueued
-            return @queue.size
-        end
-
-        def shutdown?
-            @shutdown
-        end
-
-        def alive?
-            @consumer_thread.alive?
-        end
-
-        def wait
-            while alive?
-                sleep 0.1
-            end
-        end
-    end
-end
-
-module GHAUtils
-    def get_gha_filename(date)
-        return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
-    end
-
-    def read_gha_file_content(gz)
-        gzip = Zlib::GzipReader.new(gz)
-        return gzip.read
-    ensure
-        gzip.close if gzip
-    end
-
-    def read_gha_file(file)
-
-        if !file.is_a?(StringIO) && file.path.end_with?(".json")
-            content = file.read
-        elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
-            content = read_gha_file_content(file)
-        else
-            raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
-        end
-
-        result = []
-        content.lines.each do |line|
-            result << JSON.parse(line)
-        end
-
-        return result
-    end
-
-    def each_time(from, to)
-        current_time = from
-        while current_time < to
-            yield current_time
-            current_time += 3600
-        end
-    end
-end
-
-class GHAProvider
-    include GHAUtils
-
-    def initialize
-        @logger = Logger.new(STDOUT)
-
-        @includes = {}
-        @excludes = {}
-
-        @checkpoint_name = nil
-        @use_json = true
-    end
-
-    def use_checkpoint(filename)
-        @checkpoint_name = filename
-
-        return self
-    end
-
-    def parse_events
-        @use_json = false
-
-        return self
-    end
-
-    def logger=(logger)
-        @logger = logger
-
-        return self
-    end
-    alias :use_logger :logger=
-
-    def get(date)
-        raise "Not implemented"
-    end
-
-    def include(**args)
-        args.each do |key, value|
-            @includes[key.to_s] = [] unless @includes[key.to_s]
-            @includes[key.to_s] << value
-        end
-
-        return self
-    end
-
-    def exclude(**args)
-        args.each do |key, value|
-            @excludes[key.to_s] = [] unless @excludes[key.to_s]
-            @excludes[key.to_s] << value
-        end
-
-        return self
-    end
-
-    def restore_checkpoint(from)
-        if @checkpoint_name && FileTest.exist?(@checkpoint_name)
-            # Note that this throws an exception if the file is not readable. This is the intended behavior.
-            # As opposed to that, failing to save the checkpoint information just results in a warning on the log.
-            loaded_from = Marshal.load(File.read(@checkpoint_name))
-            raise "The loaded checkpoint (#{loaded_from}) occurs before the current from date (#{from})" if loaded_from < from
-
-            @logger.info("Valid checkpoint loaded. Restored execution from #{loaded_from}.")
-
-            return loaded_from
-        else
-            return from
-        end
-    end
-
-    def update_checkpoint(current_time)
-        if @checkpoint_name
-            begin
-                File.open(@checkpoint_name, "wb") do |f|
-                    f.write(Marshal.dump(current_time))
-                end
-            rescue
-                @logger.warn(
-                    "Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
-                )
-            end
-        end
-    end
-
-    def each(from = Time.gm(2015, 1, 1), to = Time.now)
-        exceptions = []
-
-        from = restore_checkpoint(from)
-
-        self.each_time(from, to) do |current_time|
-            events = []
-
-            update_checkpoint(current_time)
-
-            begin
-                events = self.get(current_time)
-            rescue GHAException => e
-                @logger.warn(e.message)
-                next
-            rescue => e
-                @logger.error("An exception occurred for #{current_time}: #{e.message}")
-                exceptions << e
-                next
-            end
-
-            events.each do |event|
-                skip = false
-                @includes.each do |key, value|
-                    skip = true unless value.include?(event[key])
-                end
-
-                @excludes.each do |key, value|
-                    skip = true if value.include?(event[key])
-                end
-                next if skip
-
-                if @use_json
-                    yield event, current_time
-                else
-                    yield GHArchive::Event.parse(event), current_time
-                end
-            end
-
-            @logger.info("Scanned #{current_time}")
-
-            events.clear
-            GC.start
-        end
-
-        update_checkpoint(to)
-
-        return exceptions
-    end
-
-    class GHAException < Exception
-    end
-end
-
-class OnlineGHAProvider < GHAProvider
-    def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
-        super()
-
-        self.max_retries(max_retries)
-        self.proactive(proactive_pool_size) if proactive
-
-        @cache = Cache.new
-    end
-
-    def max_retries(n)
-        @max_retries = n
-
-        return self
-    end
-
-    def proactive(pool_size = 10)
-        @proactive = true
-        @pool = GHArchive::ThreadPool.new(pool_size)
-
-        return self
-    end
-
-    def get(current_time)
-        @max_retries.times do
-            begin
-                filename = self.get_gha_filename(current_time)
-
-                if @proactive
-                    @logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
-
-                    while !@cache.has?(filename)
-                        sleep 1
-                    end
-
-                    data = @cache.get(filename)
-                    if data
-                        return data
-                    else
-                        raise DownloadArchiveException, "Could not scan #{filename}: data unavailable."
-                    end
-                else
-                    URI.open("http://data.gharchive.org/#{filename}") do |gz|
-                        return self.read_gha_file(gz)
-                    end
-                end
-            rescue Errno::ECONNRESET => e
-                @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
-                next
-            rescue OpenURI::HTTPError => e
-                code = e.io.status[0]
-                if code.start_with?("5")
-                    @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
-                    next
-                else
-                    raise e
-                end
-            end
-        end
-
-        raise DownloadArchiveException, "Exceeded maximum number of tentative downloads for #{current_time}."
-    end
-
-    def cache(current_time)
-        @logger.info("Full cache. Waiting for some free slot...") if @cache.full?
-        while @cache.full?
-            sleep 1
-        end
-
-        filename = self.get_gha_filename(current_time)
-        @max_retries.times do
-            begin
-                URI.open("http://data.gharchive.org/#{filename}") do |gz|
-                    content = self.read_gha_file(gz)
-                    @cache.put(filename, content)
-                    return
-                end
-            rescue Errno::ECONNRESET => e
-                @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
-                next
-            rescue OpenURI::HTTPError => e
-                code = e.io.status[0]
-                if code.start_with?("5")
-                    @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
-                    next
-                elsif code == "404"
-                    @logger.error("File for #{current_time} not found. Skipping because: " + e.message)
-                else
-                    raise e
-                end
-            rescue Zlib::GzipFile::Error => e
-                @logger.warn("Could not unzip, cache and analyze the zip at #{current_time}: " + e.message)
-            end
-        end
-
-        @cache.put(filename, nil) unless @cache.has?(filename)
-    end
-
-    def each(from = Time.gm(2015, 1, 1), to = Time.now)
-        if @proactive
-            real_from = restore_checkpoint(from)
-            any_ready = Thread.promise
-
-            @logger.info("Proactively scheduling download tasks...")
-            self.each_time(real_from, to) do |current_time|
-                @pool.process(current_time) do |current_time|
-                    cache(current_time)
-                    any_ready << true
-                    @logger.info("Proactively cached #{current_time}. Cache size: #{@cache.size}")
-                end
-            end
-
-            ~any_ready
-            @logger.info("Download tasks successfully scheduled!")
-        end
-
-        super
-    end
-
-    class Cache
-        def initialize(max_size = 10)
-            @cache = {}
-            @max_size = max_size
-            @mutex = Mutex.new
-        end
-
-        def put(name, content)
-            @mutex.synchronize do
-                @cache[name] = content
-            end
-        end
-
-        def get(name)
-            @mutex.synchronize do
-                return @cache.delete(name)
-            end
-        end
-
-        def size
-            @mutex.synchronize do
-                return @cache.size
-            end
-        end
-
-        def has?(name)
-            @mutex.synchronize do
-                return @cache.has_key?(name)
-            end
-        end
-
-        def full?
-            self.size >= @max_size
-        end
-    end
-
-    class DownloadArchiveException < GHAProvider::GHAException
-    end
-end
-
-class FolderGHAProvider < GHAProvider
-    def initialize(folder)
-        super()
-
-        @folder = folder
-    end
-
-    def get(current_time)
-        filename = self.get_gha_filename(current_time)
-        complete_filename = File.join(@folder, filename)
-        mode = "rb"
-
-        unless FileTest.exist?(complete_filename)
-            complete_filename = complete_filename.sub(".gz", "")
-            mode = "r"
-        end
-
-        unless FileTest.exist?(complete_filename)
-            raise GHAException.new("Cannot find any file (neither `.json.gz` nor `.json`) for #{current_time}")
-        end
-
-        File.open(complete_filename, mode) do |file|
-            return self.read_gha_file(file)
-        end
-    end
-end
-
-class GHADownloader
-    include GHAUtils
-
-    def initialize(folder, decompress = false)
-        @logger = Logger.new(STDERR)
-        @decompress = decompress
-        @folder = folder
-        @max = nil
-
-        Dir.mkdir(@folder) unless FileTest.exist?(@folder)
-        raise "A file exist with the desired folder name #{folder}" unless FileTest.directory?(@folder)
-    end
-
-    def max(max)
-        @max = max
-        return self
-    end
-
-    def logger=(logger)
-        @logger = logger
-    end
-
-    def download(from = Time.gm(2015, 1, 1), to = Time.now)
-        archive = []
-        self.each_time(from, to) do |current_time|
-            filename = self.get_gha_filename(current_time)
-            out_filename = filename.clone
-            out_filename.gsub!(".json.gz", ".json") if @decompress
-
-            target_file = File.join(@folder, out_filename)
-            if FileTest.exist?(target_file)
-                @logger.info("Skipping existing file for #{current_time}")
-                next
-            else
-                @logger.info("Downloading file for #{current_time}")
-            end
-
-            File.open(target_file, 'w') do |f|
-                URI.open("http://data.gharchive.org/#{filename}") do |gz|
-                    if @decompress
-                        f << self.read_gha_file_content(gz)
-                    else
-                        f << gz.read
-                    end
-                end
-            end
-            archive << target_file
-
-            if @max && archive.size > @max
-                last = archive.shift
-                @logger.info("Removing local file #{last}")
-                File.unlink(last)
-            end
-
-            yield filename if block_given?
-        end
-    end
-end
+require_relative 'gh-archive/core'
+require_relative 'gh-archive/providers'
+require_relative 'gh-archive/downloader'
+require_relative 'gh-archive/events'
+require_relative 'gh-archive/entities'
+require_relative 'gh-archive/legacy'
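A minimal check that the slimmed entry point still exposes both the new namespace and the legacy shims (loaded last, since they subclass the namespaced classes):

    require 'gh-archive'

    puts defined?(GHArchive::Provider)  # => "constant"
    puts defined?(GHAProvider)          # => "constant" (legacy shim)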
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: gh-archive
 version: !ruby/object:Gem::Version
-  version: '0.16'
+  version: '0.17'
 platform: ruby
 authors:
 - Simone Scalabrino
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-
+date: 2021-12-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: code-assertions
@@ -57,8 +57,13 @@ extensions: []
 extra_rdoc_files: []
 files:
 - lib/gh-archive.rb
+- lib/gh-archive/core.rb
+- lib/gh-archive/downloader.rb
 - lib/gh-archive/entities.rb
 - lib/gh-archive/events.rb
+- lib/gh-archive/job.rb
+- lib/gh-archive/legacy.rb
+- lib/gh-archive/providers.rb
 homepage: https://github.com/intersimone999/gh-archive
 licenses:
 - GPL-3.0-only
@@ -78,7 +83,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.2.
+rubygems_version: 3.2.29
 signing_key:
 specification_version: 4
 summary: GitHub Archive mining utility
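To pin this release in a project, a one-line Gemfile sketch:

    # Gemfile
    gem 'gh-archive', '~> 0.17'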