gh-archive 0.16 → 0.17
- checksums.yaml +4 -4
- data/lib/gh-archive/core.rb +112 -0
- data/lib/gh-archive/downloader.rb +62 -0
- data/lib/gh-archive/entities.rb +1 -0
- data/lib/gh-archive/events.rb +1 -1
- data/lib/gh-archive/job.rb +22 -0
- data/lib/gh-archive/legacy.rb +31 -0
- data/lib/gh-archive/providers.rb +326 -0
- data/lib/gh-archive.rb +6 -496
- metadata +8 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c17a920e176289a00fb21fff7b712dc634c4491241ebe11a3b06f0ddd112706d
+  data.tar.gz: 58e7e4fdc6442d4a0955bc70d0fb10eec05b437350c255e34731021b2d714deb
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b9066428d42acd77376fe72082adc62517c21bf8154b60b0006d7f0ab61fd0679af5775444c180d5cc8e842c99b0c207266e25ea28d4f0cdef33e49259339bb3
+  data.tar.gz: 036eaa0ead55db627ee8bb4f4a7421a525ab0ae73041045ef6d5b13307b7ed54d36e70532658ec3853cac2f21a7e5d2e80313c2fb688723ea87cd689ea469c9c
data/lib/gh-archive/core.rb
ADDED
@@ -0,0 +1,112 @@
+require 'code-assertions'
+require 'json'
+require 'open-uri'
+require 'zlib'
+require 'logger'
+require 'tmpdir'
+require 'thread/pool'
+require 'thread/promise'
+
+module GHArchive
+    class ThreadPool
+        def initialize(size)
+            @size = size
+            @threads = []
+            @queue = []
+            @mutex = Mutex.new
+
+            @consumer_thread = Thread.start do
+                while !@shutdown || @threads.size > 0 || @queue.size > 0
+                    sleep 0.1 if @queue.size == 0 || @threads.size == @size
+                    @threads.delete_if { |t| !t.alive? }
+
+                    if @threads.size < @size && @queue.size > 0
+                        @mutex.synchronize do
+                            args, job = @queue.shift
+                            @threads << Thread.start(*args, &job)
+                        end
+                    end
+                end
+            end
+        end
+
+        def process(*args, &block)
+            raise "Block expected" unless block_given?
+            raise "Can not add jobs while shutting down" if @shutdown
+
+            @mutex.synchronize do
+                @queue << [args, block]
+            end
+
+            return self.enqueued
+        end
+
+        def shutdown
+            @shutdown = true
+        end
+
+        def shutdown!
+            self.shutdown
+            @mutex.synchronize do
+                @queue.clear
+            end
+        end
+
+        def enqueued
+            return @queue.size
+        end
+
+        def shutdown?
+            @shutdown
+        end
+
+        def alive?
+            @consumer_thread.alive?
+        end
+
+        def wait
+            while alive?
+                sleep 0.1
+            end
+        end
+    end
+
+    module Utils
+        def get_gha_filename(date)
+            return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
+        end
+
+        def read_gha_file_content(gz)
+            gzip = Zlib::GzipReader.new(gz)
+            return gzip.read
+        ensure
+            gzip.close if gzip
+        end
+
+        def read_gha_file(file)
+
+            if !file.is_a?(StringIO) && file.path.end_with?(".json")
+                content = file.read
+            elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
+                content = read_gha_file_content(file)
+            else
+                raise "Invalid file extension for #{file.path}: expected `.json.gz` or `.json`."
+            end
+
+            result = []
+            content.lines.each do |line|
+                result << JSON.parse(line)
+            end
+
+            return result
+        end
+
+        def each_time(from, to)
+            current_time = from
+            while current_time < to
+                yield current_time
+                current_time += 3600
+            end
+        end
+    end
+end
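The ThreadPool above is a small consumer-driven pool: `process` enqueues a job, a single consumer thread spawns workers up to the configured size, and `shutdown` followed by `wait` drains the queue. A minimal usage sketch (the pool size, job count, and `sleep` are stand-ins for a real workload):

require 'gh-archive'

pool = GHArchive::ThreadPool.new(4)   # run at most 4 jobs concurrently

10.times do |i|
    pool.process(i) do |i|            # arguments are forwarded to the worker block
        sleep 0.5                     # placeholder for real work
        puts "job #{i} done"
    end
end

pool.shutdown                         # refuse new jobs; queued ones still run
pool.wait                             # block until the consumer thread exits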
data/lib/gh-archive/downloader.rb
ADDED
@@ -0,0 +1,62 @@
+require_relative 'core'
+
+module GHArchive
+    class Downloader
+        include Utils
+
+        def initialize(folder, decompress = false)
+            @logger = Logger.new(STDERR)
+            @decompress = decompress
+            @folder = folder
+            @max = nil
+
+            Dir.mkdir(@folder) unless FileTest.exist?(@folder)
+            raise "A file exists with the desired folder name #{folder}" unless FileTest.directory?(@folder)
+        end
+
+        def max(max)
+            @max = max
+            return self
+        end
+
+        def logger=(logger)
+            @logger = logger
+        end
+
+        def download(from = Time.gm(2015, 1, 1), to = Time.now)
+            archive = []
+            self.each_time(from, to) do |current_time|
+                filename = self.get_gha_filename(current_time)
+                out_filename = filename.clone
+                out_filename.gsub!(".json.gz", ".json") if @decompress
+
+                target_file = File.join(@folder, out_filename)
+                if FileTest.exist?(target_file)
+                    @logger.info("Skipping existing file for #{current_time}")
+                    next
+                else
+                    @logger.info("Downloading file for #{current_time}")
+                end
+
+                File.open(target_file, 'w') do |f|
+                    URI.open("http://data.gharchive.org/#{filename}") do |gz|
+                        if @decompress
+                            f << self.read_gha_file_content(gz)
+                        else
+                            f << gz.read
+                        end
+                    end
+                end
+                archive << target_file
+
+                if @max && archive.size > @max
+                    last = archive.shift
+                    @logger.info("Removing local file #{last}")
+                    File.unlink(last)
+                end
+
+                yield filename if block_given?
+            end
+        end
+    end
+end
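A usage sketch for the extracted Downloader (the folder name and time range are arbitrary; `max` bounds how many files are kept on disk, deleting the oldest first):

require 'gh-archive'

downloader = GHArchive::Downloader.new("gha-data", true)   # decompress to plain .json
downloader.max(100)                                        # keep at most 100 files locally

# Download one day of hourly archives; the block sees each remote filename.
downloader.download(Time.gm(2021, 1, 1), Time.gm(2021, 1, 2)) do |filename|
    puts "fetched #{filename}"
end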
data/lib/gh-archive/entities.rb
CHANGED
data/lib/gh-archive/events.rb
CHANGED
data/lib/gh-archive/job.rb
ADDED
@@ -0,0 +1,22 @@
+require_relative 'core'
+require_relative 'providers'
+
+module GHArchive
+    class Job
+        def initialize(provider, from, to)
+            @provider = provider
+            @from = from
+            @to = to
+        end
+
+        def start
+            @provider.each(@from, @to) do |event, time|
+                run(event, time)
+            end
+        end
+
+        def run(event, time)
+            raise GHAException, "This is an abstract job, it should be implemented before running"
+        end
+    end
+end
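GHArchive::Job is an abstract base: `start` streams `(event, time)` pairs from the given provider and dispatches each to `run`, which subclasses must override. A hedged sketch (PushEventCounter is an invented name; by default events arrive as parsed JSON hashes):

class PushEventCounter < GHArchive::Job
    def initialize(provider, from, to)
        super
        @count = 0
    end

    attr_reader :count

    def run(event, time)
        @count += 1 if event["type"] == "PushEvent"
    end
end

job = PushEventCounter.new(GHArchive::OnlineProvider.new,
                           Time.gm(2021, 1, 1), Time.gm(2021, 1, 1, 2))
job.start
puts "push events seen: #{job.count}"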
data/lib/gh-archive/legacy.rb
ADDED
@@ -0,0 +1,31 @@
+require_relative 'core'
+
+GHAUtils = GHArchive::Utils
+
+class GHAProvider < GHArchive::Provider
+    def initialize(*args)
+        warn "GHAProvider is deprecated. Please use GHArchive::Provider instead."
+        super
+    end
+end
+
+class OnlineGHAProvider < GHArchive::OnlineProvider
+    def initialize(*args)
+        warn "OnlineGHAProvider is deprecated. Please use GHArchive::OnlineProvider instead."
+        super
+    end
+end
+
+class FolderGHAProvider < GHArchive::FolderProvider
+    def initialize(*args)
+        warn "FolderGHAProvider is deprecated. Please use GHArchive::FolderProvider instead."
+        super
+    end
+end
+
+class GHADownloader < GHArchive::Downloader
+    def initialize(*args)
+        warn "GHADownloader is deprecated. Please use GHArchive::Downloader instead."
+        super
+    end
+end
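The legacy shims keep 0.16-era scripts working unchanged: each old top-level class now subclasses its GHArchive counterpart and prints a deprecation warning when constructed. For example:

provider = OnlineGHAProvider.new           # warns: "OnlineGHAProvider is deprecated. ..."
provider.is_a?(GHArchive::OnlineProvider)  # => true, so all new methods are available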
data/lib/gh-archive/providers.rb
ADDED
@@ -0,0 +1,326 @@
+require_relative 'core'
+
+module GHArchive
+    class Provider
+        include Utils
+
+        def initialize
+            @logger = Logger.new(STDOUT)
+
+            @includes = {}
+            @excludes = {}
+
+            @checkpoint_name = nil
+            @use_json = true
+        end
+
+        def use_checkpoint(filename)
+            @checkpoint_name = filename
+
+            return self
+        end
+
+        def parse_events
+            @use_json = false
+
+            return self
+        end
+
+        def logger=(logger)
+            @logger = logger
+
+            return self
+        end
+        alias :use_logger :logger=
+
+        def get(date)
+            raise "Not implemented"
+        end
+
+        def include(**args)
+            args.each do |key, value|
+                @includes[key.to_s] = [] unless @includes[key.to_s]
+                @includes[key.to_s] << value
+            end
+
+            return self
+        end
+
+        def exclude(**args)
+            args.each do |key, value|
+                @excludes[key.to_s] = [] unless @excludes[key.to_s]
+                @excludes[key.to_s] << value
+            end
+
+            return self
+        end
+
+        def restore_checkpoint(from)
+            if @checkpoint_name && FileTest.exist?(@checkpoint_name)
+                # Note that this throws an exception if the file is not readable. This is the intended behavior.
+                # As opposed to that, failing to save the checkpoint information just results in a warning on the log.
+                loaded_from = Marshal.load(File.read(@checkpoint_name))
+                raise "The loaded checkpoint (#{loaded_from}) occurs before the current from date (#{from})" if loaded_from < from
+
+                @logger.info("Valid checkpoint loaded. Restored execution from #{loaded_from}.")
+
+                return loaded_from
+            else
+                return from
+            end
+        end
+
+        def update_checkpoint(current_time)
+            if @checkpoint_name
+                begin
+                    File.open(@checkpoint_name, "wb") do |f|
+                        f.write(Marshal.dump(current_time))
+                    end
+                rescue
+                    @logger.warn(
+                        "Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
+                    )
+                end
+            end
+        end
+
+        def each(from = Time.gm(2015, 1, 1), to = Time.now)
+            exceptions = []
+
+            from = restore_checkpoint(from)
+
+            self.each_time(from, to) do |current_time|
+                events = []
+
+                update_checkpoint(current_time)
+
+                begin
+                    events = self.get(current_time)
+                rescue GHAException => e
+                    @logger.warn(e.message)
+                    next
+                rescue => e
+                    @logger.error("An exception occurred for #{current_time}: #{e.message}")
+                    exceptions << e
+                    next
+                end
+
+                events.each do |event|
+                    skip = false
+                    @includes.each do |key, value|
+                        skip = true unless value.include?(event[key])
+                    end
+
+                    @excludes.each do |key, value|
+                        skip = true if value.include?(event[key])
+                    end
+                    next if skip
+
+                    if @use_json
+                        yield event, current_time
+                    else
+                        yield GHArchive::Event.parse(event), current_time
+                    end
+                end
+
+                @logger.info("Scanned #{current_time}")
+
+                events.clear
+                GC.start
+            end
+
+            update_checkpoint(to)
+
+            return exceptions
+        end
+
+        class GHAException < Exception
+        end
+    end
+
+    class OnlineProvider < Provider
+        def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
+            super()
+
+            self.max_retries(max_retries)
+            self.proactive(proactive_pool_size) if proactive
+
+            @cache = Cache.new
+        end
+
+        def max_retries(n)
+            @max_retries = n
+
+            return self
+        end
+
+        def proactive(pool_size = 10)
+            @proactive = true
+            @pool = GHArchive::ThreadPool.new(pool_size)
+
+            return self
+        end
+
+        def get(current_time)
+            @max_retries.times do
+                begin
+                    filename = self.get_gha_filename(current_time)
+
+                    if @proactive
+                        @logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
+
+                        while !@cache.has?(filename)
+                            sleep 1
+                        end
+
+                        data = @cache.get(filename)
+                        if data
+                            return data
+                        else
+                            raise DownloadArchiveException, "Could not scan #{filename}: data unavailable."
+                        end
+                    else
+                        URI.open("http://data.gharchive.org/#{filename}") do |gz|
+                            return self.read_gha_file(gz)
+                        end
+                    end
+                rescue Errno::ECONNRESET => e
+                    @logger.warn("A server error temporarily prevented the download of #{current_time}: " + e.message)
+                    next
+                rescue OpenURI::HTTPError => e
+                    code = e.io.status[0]
+                    if code.start_with?("5")
+                        @logger.warn("A server error temporarily prevented the download of #{current_time}: " + e.message)
+                        next
+                    else
+                        raise e
+                    end
+                end
+            end
+
+            raise DownloadArchiveException, "Exceeded maximum number of tentative downloads for #{current_time}."
+        end
+
+        def cache(current_time)
+            @logger.info("Full cache. Waiting for some free slot...") if @cache.full?
+            while @cache.full?
+                sleep 1
+            end
+
+            filename = self.get_gha_filename(current_time)
+            @max_retries.times do
+                begin
+                    URI.open("http://data.gharchive.org/#{filename}") do |gz|
+                        content = self.read_gha_file(gz)
+                        @cache.put(filename, content)
+                        return
+                    end
+                rescue Errno::ECONNRESET => e
+                    @logger.warn("A server error temporarily prevented the download of #{current_time}: " + e.message)
+                    next
+                rescue OpenURI::HTTPError => e
+                    code = e.io.status[0]
+                    if code.start_with?("5")
+                        @logger.warn("A server error temporarily prevented the download of #{current_time}: " + e.message)
+                        next
+                    elsif code == "404"
+                        @logger.error("File for #{current_time} not found. Skipping because: " + e.message)
+                    else
+                        raise e
+                    end
+                rescue Zlib::GzipFile::Error => e
+                    @logger.warn("Could not unzip, cache and analyze the zip at #{current_time}: " + e.message)
+                end
+            end
+
+            @cache.put(filename, nil) unless @cache.has?(filename)
+        end
+
+        def each(from = Time.gm(2015, 1, 1), to = Time.now)
+            if @proactive
+                real_from = restore_checkpoint(from)
+                any_ready = Thread.promise
+
+                @logger.info("Proactively scheduling download tasks...")
+                self.each_time(real_from, to) do |current_time|
+                    @pool.process(current_time) do |current_time|
+                        cache(current_time)
+                        any_ready << true
+                        @logger.info("Proactively cached #{current_time}. Cache size: #{@cache.size}")
+                    end
+                end
+
+                ~any_ready
+                @logger.info("Download tasks successfully scheduled!")
+            end
+
+            super
+        end
+
+        class Cache
+            def initialize(max_size = 10)
+                @cache = {}
+                @max_size = max_size
+                @mutex = Mutex.new
+            end
+
+            def put(name, content)
+                @mutex.synchronize do
+                    @cache[name] = content
+                end
+            end
+
+            def get(name)
+                @mutex.synchronize do
+                    return @cache.delete(name)
+                end
+            end
+
+            def size
+                @mutex.synchronize do
+                    return @cache.size
+                end
+            end
+
+            def has?(name)
+                @mutex.synchronize do
+                    return @cache.has_key?(name)
+                end
+            end
+
+            def full?
+                self.size >= @max_size
+            end
+        end
+
+        class DownloadArchiveException < Provider::GHAException
+        end
+    end
+
+    class FolderProvider < Provider
+        def initialize(folder)
+            super()
+
+            @folder = folder
+        end
+
+        def get(current_time)
+            filename = self.get_gha_filename(current_time)
+            complete_filename = File.join(@folder, filename)
+            mode = "rb"
+
+            unless FileTest.exist?(complete_filename)
+                complete_filename = complete_filename.sub(".gz", "")
+                mode = "r"
+            end
+
+            unless FileTest.exist?(complete_filename)
+                raise GHAException.new("Cannot find any file (neither `.json.gz` nor `.json`) for #{current_time}")
+            end
+
+            File.open(complete_filename, mode) do |file|
+                return self.read_gha_file(file)
+            end
+        end
+    end
+end
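Putting the provider API together, a usage sketch (the method names are those defined above; the checkpoint path, filter, and time range are illustrative, and the event layout is GH Archive's raw JSON):

require 'gh-archive'

provider = GHArchive::OnlineProvider.new(3, true, 10)   # 3 retries, proactive pool of 10
provider.include(type: "PushEvent")                     # filter on the JSON "type" field
        .use_checkpoint("gha.checkpoint")               # resume from here after a crash

exceptions = provider.each(Time.gm(2021, 1, 1), Time.gm(2021, 1, 1, 6)) do |event, time|
    puts "#{time}: #{event['repo']['name']}"
end
warn "#{exceptions.size} hour(s) could not be scanned" unless exceptions.empty?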
data/lib/gh-archive.rb
CHANGED
@@ -1,496 +1,6 @@
-[496 lines removed: the former monolithic implementation (GHArchive::ThreadPool, GHAUtils, GHAProvider, OnlineGHAProvider, FolderGHAProvider, GHADownloader), which now lives, renamed into the GHArchive namespace, in the core.rb, providers.rb, downloader.rb, and legacy.rb files shown above]
+require_relative 'gh-archive/core'
+require_relative 'gh-archive/providers'
+require_relative 'gh-archive/downloader'
+require_relative 'gh-archive/events'
+require_relative 'gh-archive/entities'
+require_relative 'gh-archive/legacy'
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: gh-archive
 version: !ruby/object:Gem::Version
-  version: '0.16'
+  version: '0.17'
 platform: ruby
 authors:
 - Simone Scalabrino
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-
+date: 2021-12-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: code-assertions
@@ -57,8 +57,13 @@ extensions: []
 extra_rdoc_files: []
 files:
 - lib/gh-archive.rb
+- lib/gh-archive/core.rb
+- lib/gh-archive/downloader.rb
 - lib/gh-archive/entities.rb
 - lib/gh-archive/events.rb
+- lib/gh-archive/job.rb
+- lib/gh-archive/legacy.rb
+- lib/gh-archive/providers.rb
 homepage: https://github.com/intersimone999/gh-archive
 licenses:
 - GPL-3.0-only
@@ -78,7 +83,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.2.
+rubygems_version: 3.2.29
 signing_key:
 specification_version: 4
 summary: GitHub Archive mining utility