gh-archive 0.2 → 0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/gh-archive.rb +111 -68
- metadata +29 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 91b0e957c5176b791d4f49e382680865405e7a6b2b29b349bcbb78b92d884e02
|
4
|
+
data.tar.gz: f8ddae3d80e80a24931d8632c9798c8f01520e1fe5ac8a85079c0d0e85eadcbc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f7b24be932f58142b36887671b4265e25631345e7b81cc36b264be4a018fc0c4a88b853ae384dc8472876bf0996e904bf499007adc3091ddb511f28c828090fc
|
7
|
+
data.tar.gz: 5cbb83495b9bb397a41022cb1bf4bce0344c735d16f9f43fb181b4b109948ac7a75bd44c693338f02b1ff17eeeeb5b83a6add9deeeedbd379a8848614041a3f5
|
data/lib/gh-archive.rb
CHANGED
@@ -3,6 +3,9 @@ require 'json'
|
|
3
3
|
require 'open-uri'
|
4
4
|
require 'zlib'
|
5
5
|
require 'logger'
|
6
|
+
require 'tmpdir'
|
7
|
+
require 'thread/pool'
|
8
|
+
require 'thread/promise'
|
6
9
|
|
7
10
|
module GHAUtils
|
8
11
|
def get_gha_filename(date)
|
@@ -11,14 +14,19 @@ module GHAUtils
|
|
11
14
|
|
12
15
|
def read_gha_file_content(gz)
|
13
16
|
gzip = Zlib::GzipReader.new(gz)
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
return content
|
17
|
+
return gzip.read
|
18
|
+
ensure
|
19
|
+
gzip.close if gzip
|
18
20
|
end
|
19
21
|
|
20
|
-
def read_gha_file(
|
21
|
-
|
22
|
+
def read_gha_file(file)
|
23
|
+
if file.path.end_with?(".json")
|
24
|
+
content = file.read
|
25
|
+
elsif file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
|
26
|
+
content = read_gha_file_content(file)
|
27
|
+
else
|
28
|
+
raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
|
29
|
+
end
|
22
30
|
|
23
31
|
result = []
|
24
32
|
content.lines.each do |line|
|
@@ -28,11 +36,11 @@ module GHAUtils
|
|
28
36
|
return result
|
29
37
|
end
|
30
38
|
|
31
|
-
def
|
32
|
-
|
33
|
-
while
|
34
|
-
yield
|
35
|
-
|
39
|
+
def each_time(from, to)
|
40
|
+
current_time = from
|
41
|
+
while current_time < to
|
42
|
+
yield current_time
|
43
|
+
current_time += 3600
|
36
44
|
end
|
37
45
|
end
|
38
46
|
end
|
@@ -70,13 +78,18 @@ class GHAProvider
|
|
70
78
|
end
|
71
79
|
|
72
80
|
def each(from = Time.gm(2015, 1, 1), to = Time.now)
|
73
|
-
|
81
|
+
exceptions = []
|
82
|
+
|
83
|
+
self.each_time(from, to) do |current_time|
|
74
84
|
events = []
|
75
85
|
begin
|
76
|
-
events = self.get(
|
77
|
-
|
78
|
-
|
79
|
-
|
86
|
+
events = self.get(current_time)
|
87
|
+
rescue GHAException => e
|
88
|
+
@logger.warn(e.message)
|
89
|
+
next
|
90
|
+
rescue => e
|
91
|
+
@logger.error("An exception occurred for #{current_time}: #{e.message}")
|
92
|
+
exceptions << e
|
80
93
|
next
|
81
94
|
end
|
82
95
|
|
@@ -91,21 +104,30 @@ class GHAProvider
|
|
91
104
|
end
|
92
105
|
next if skip
|
93
106
|
|
94
|
-
yield event
|
107
|
+
yield event, current_time
|
95
108
|
end
|
96
109
|
|
110
|
+
@logger.info("Scanned #{current_time}")
|
111
|
+
|
97
112
|
events.clear
|
98
113
|
GC.start
|
99
114
|
end
|
115
|
+
|
116
|
+
return exceptions
|
117
|
+
end
|
118
|
+
|
119
|
+
class GHAException < Exception
|
100
120
|
end
|
101
121
|
end
|
102
122
|
|
103
123
|
class OnlineGHAProvider < GHAProvider
|
104
|
-
def initialize(max_retries = 3, proactive = false)
|
124
|
+
def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
|
105
125
|
super()
|
106
126
|
|
107
127
|
@max_retries = max_retries
|
108
128
|
@proactive = proactive
|
129
|
+
@proactive_pool_size = proactive_pool_size
|
130
|
+
@pool = Thread.pool(proactive_pool_size)
|
109
131
|
@cache = Cache.new
|
110
132
|
end
|
111
133
|
|
@@ -114,97 +136,106 @@ class OnlineGHAProvider < GHAProvider
|
|
114
136
|
begin
|
115
137
|
filename = self.get_gha_filename(current_time)
|
116
138
|
|
117
|
-
if @
|
118
|
-
|
139
|
+
if @proactive
|
140
|
+
@logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
|
141
|
+
|
142
|
+
while !@cache.has?(filename)
|
143
|
+
sleep 1
|
144
|
+
end
|
145
|
+
|
146
|
+
return @cache.get(filename)
|
119
147
|
else
|
120
148
|
URI.open("http://data.gharchive.org/#{filename}") do |gz|
|
121
|
-
# Save to cache
|
122
149
|
return self.read_gha_file(gz)
|
123
150
|
end
|
124
151
|
end
|
125
|
-
rescue
|
126
|
-
@logger.
|
152
|
+
rescue Errno::ECONNRESET => e
|
153
|
+
@logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
|
154
|
+
next
|
155
|
+
rescue OpenURI::HTTPError => e
|
156
|
+
code = e.io.status[0]
|
157
|
+
if code.start_with?("5")
|
158
|
+
@logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
|
159
|
+
next
|
160
|
+
else
|
161
|
+
raise e
|
162
|
+
end
|
127
163
|
end
|
128
164
|
end
|
129
165
|
|
130
|
-
raise DownloadArchiveException, "Exceeded maximum number of tentative downloads."
|
166
|
+
raise DownloadArchiveException, "Exceeded maximum number of tentative downloads for #{current_time}."
|
131
167
|
end
|
132
168
|
|
133
169
|
def cache(current_time)
|
170
|
+
@logger.info("Full cache. Waiting for some free slot...") if @cache.full?
|
171
|
+
while @cache.full?
|
172
|
+
sleep 1
|
173
|
+
end
|
134
174
|
@max_retries.times do
|
135
175
|
begin
|
136
176
|
filename = self.get_gha_filename(current_time)
|
137
|
-
|
138
177
|
URI.open("http://data.gharchive.org/#{filename}") do |gz|
|
139
|
-
|
178
|
+
content = self.read_gha_file(gz)
|
179
|
+
@cache.put(filename, content)
|
140
180
|
return
|
141
181
|
end
|
142
|
-
rescue
|
182
|
+
rescue Errno::ECONNRESET => e
|
183
|
+
@logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
|
184
|
+
next
|
185
|
+
rescue OpenURI::HTTPError => e
|
186
|
+
code = e.io.status[0]
|
187
|
+
if code.start_with?("5")
|
188
|
+
@logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
|
189
|
+
next
|
190
|
+
else
|
191
|
+
raise e
|
192
|
+
end
|
143
193
|
end
|
144
194
|
end
|
145
195
|
end
|
146
196
|
|
147
197
|
def each(from = Time.gm(2015, 1, 1), to = Time.now)
|
148
198
|
if @proactive
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
end
|
158
|
-
|
159
|
-
while @cache.full?
|
160
|
-
sleep 1
|
161
|
-
end
|
199
|
+
any_ready = Thread.promise
|
200
|
+
|
201
|
+
@logger.info("Proactively scheduling download tasks...")
|
202
|
+
self.each_time(from, to) do |current_time|
|
203
|
+
@pool.process(current_time) do |current_time|
|
204
|
+
cache(current_time)
|
205
|
+
any_ready << true
|
206
|
+
@logger.info("Proactively cached #{current_time}. Cache size: #{@cache.size}")
|
162
207
|
end
|
163
208
|
end
|
209
|
+
|
210
|
+
~any_ready
|
211
|
+
@logger.info("Download tasks successfully scheduled!")
|
164
212
|
end
|
165
213
|
|
166
214
|
super
|
167
215
|
end
|
168
216
|
|
169
217
|
class Cache
|
170
|
-
def initialize(
|
218
|
+
def initialize(max_size = 10)
|
171
219
|
@cache = {}
|
172
220
|
@max_size = max_size
|
173
|
-
@folder = folder
|
174
221
|
@mutex = Mutex.new
|
175
222
|
end
|
176
223
|
|
177
224
|
def put(name, content)
|
178
|
-
File.open("#@folder/#{name}", 'w') do |f|
|
179
|
-
f << content
|
180
|
-
end
|
181
|
-
|
182
225
|
@mutex.synchronize do
|
183
|
-
@cache[name] =
|
226
|
+
@cache[name] = content
|
184
227
|
end
|
185
228
|
end
|
186
229
|
|
187
230
|
def get(name)
|
188
231
|
@mutex.synchronize do
|
189
|
-
return
|
190
|
-
end
|
191
|
-
ensure
|
192
|
-
self.unload(name)
|
193
|
-
end
|
194
|
-
|
195
|
-
def unload(name)
|
196
|
-
File.unlink(@cache[name])
|
197
|
-
|
198
|
-
@mutex.synchronize do
|
199
|
-
@cache.delete(name)
|
232
|
+
return @cache.delete(name)
|
200
233
|
end
|
201
|
-
|
202
|
-
return true
|
203
234
|
end
|
204
235
|
|
205
236
|
def size
|
206
237
|
@mutex.synchronize do
|
207
|
-
@cache.size
|
238
|
+
return @cache.size
|
208
239
|
end
|
209
240
|
end
|
210
241
|
|
@@ -217,7 +248,7 @@ class OnlineGHAProvider < GHAProvider
|
|
217
248
|
end
|
218
249
|
end
|
219
250
|
|
220
|
-
class DownloadArchiveException <
|
251
|
+
class DownloadArchiveException < GHAProvider::GHAException
|
221
252
|
end
|
222
253
|
end
|
223
254
|
|
@@ -230,8 +261,20 @@ class FolderGHAProvider < GHAProvider
|
|
230
261
|
|
231
262
|
def get(current_time)
|
232
263
|
filename = self.get_gha_filename(current_time)
|
233
|
-
File.
|
234
|
-
|
264
|
+
complete_filename = File.join(@folder, filename)
|
265
|
+
mode = "rb"
|
266
|
+
|
267
|
+
unless FileTest.exist?(complete_filename)
|
268
|
+
complete_filename = complete_filename.sub(".gz", "")
|
269
|
+
mode = "r"
|
270
|
+
end
|
271
|
+
|
272
|
+
unless FileTest.exist?(complete_filename)
|
273
|
+
raise GHAException.new("Cannot find any file (neither `.json.gz` nor `.json`) for #{current_time}")
|
274
|
+
end
|
275
|
+
|
276
|
+
File.open(complete_filename, mode) do |file|
|
277
|
+
return self.read_gha_file(file)
|
235
278
|
end
|
236
279
|
end
|
237
280
|
end
|
@@ -260,17 +303,17 @@ class GHADownloader
|
|
260
303
|
|
261
304
|
def download(from = Time.gm(2015, 1, 1), to = Time.now)
|
262
305
|
archive = []
|
263
|
-
self.
|
264
|
-
filename = self.get_gha_filename(
|
306
|
+
self.each_time(from, to) do |current_time|
|
307
|
+
filename = self.get_gha_filename(current_time)
|
265
308
|
out_filename = filename.clone
|
266
309
|
out_filename.gsub!(".json.gz", ".json") if @decompress
|
267
310
|
|
268
311
|
target_file = File.join(@folder, out_filename)
|
269
312
|
if FileTest.exist?(target_file)
|
270
|
-
@logger.info("Skipping existing file for #{
|
313
|
+
@logger.info("Skipping existing file for #{current_time}")
|
271
314
|
next
|
272
315
|
else
|
273
|
-
@logger.info("Downloading file for #{
|
316
|
+
@logger.info("Downloading file for #{current_time}")
|
274
317
|
end
|
275
318
|
|
276
319
|
File.open(target_file, 'w') do |f|
|
metadata
CHANGED
@@ -1,35 +1,55 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gh-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.
|
4
|
+
version: '0.6'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Simone Scalabrino
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-08-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: code-assertions
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: 1.1.2
|
20
|
-
- - "
|
20
|
+
- - ">="
|
21
21
|
- !ruby/object:Gem::Version
|
22
22
|
version: 1.1.2
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
26
26
|
requirements:
|
27
|
+
- - "~>"
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 1.1.2
|
27
30
|
- - ">="
|
28
31
|
- !ruby/object:Gem::Version
|
29
32
|
version: 1.1.2
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: thread
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
30
37
|
- - "~>"
|
31
38
|
- !ruby/object:Gem::Version
|
32
|
-
version:
|
39
|
+
version: 0.2.2
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: 0.2.2
|
43
|
+
type: :runtime
|
44
|
+
prerelease: false
|
45
|
+
version_requirements: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - "~>"
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 0.2.2
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 0.2.2
|
33
53
|
description: Download and analyze the GitHub events stored at GitHub archive
|
34
54
|
email: s.scalabrino9@gmail.com
|
35
55
|
executables: []
|
@@ -41,7 +61,7 @@ homepage: https://github.com/intersimone999/gh-archive
|
|
41
61
|
licenses:
|
42
62
|
- GPL-3.0-only
|
43
63
|
metadata: {}
|
44
|
-
post_install_message:
|
64
|
+
post_install_message:
|
45
65
|
rdoc_options: []
|
46
66
|
require_paths:
|
47
67
|
- lib
|
@@ -56,8 +76,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
56
76
|
- !ruby/object:Gem::Version
|
57
77
|
version: '0'
|
58
78
|
requirements: []
|
59
|
-
rubygems_version: 3.
|
60
|
-
signing_key:
|
79
|
+
rubygems_version: 3.2.21
|
80
|
+
signing_key:
|
61
81
|
specification_version: 4
|
62
82
|
summary: GitHub Archive mining utility
|
63
83
|
test_files: []
|