gh-archive 0.2 → 0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/gh-archive.rb +111 -68
- metadata +29 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 91b0e957c5176b791d4f49e382680865405e7a6b2b29b349bcbb78b92d884e02
|
4
|
+
data.tar.gz: f8ddae3d80e80a24931d8632c9798c8f01520e1fe5ac8a85079c0d0e85eadcbc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f7b24be932f58142b36887671b4265e25631345e7b81cc36b264be4a018fc0c4a88b853ae384dc8472876bf0996e904bf499007adc3091ddb511f28c828090fc
|
7
|
+
data.tar.gz: 5cbb83495b9bb397a41022cb1bf4bce0344c735d16f9f43fb181b4b109948ac7a75bd44c693338f02b1ff17eeeeb5b83a6add9deeeedbd379a8848614041a3f5
|
data/lib/gh-archive.rb
CHANGED
@@ -3,6 +3,9 @@ require 'json'
|
|
3
3
|
require 'open-uri'
|
4
4
|
require 'zlib'
|
5
5
|
require 'logger'
|
6
|
+
require 'tmpdir'
|
7
|
+
require 'thread/pool'
|
8
|
+
require 'thread/promise'
|
6
9
|
|
7
10
|
module GHAUtils
|
8
11
|
def get_gha_filename(date)
|
@@ -11,14 +14,19 @@ module GHAUtils
|
|
11
14
|
|
12
15
|
def read_gha_file_content(gz)
|
13
16
|
gzip = Zlib::GzipReader.new(gz)
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
return content
|
17
|
+
return gzip.read
|
18
|
+
ensure
|
19
|
+
gzip.close if gzip
|
18
20
|
end
|
19
21
|
|
20
|
-
def read_gha_file(
|
21
|
-
|
22
|
+
def read_gha_file(file)
|
23
|
+
if file.path.end_with?(".json")
|
24
|
+
content = file.read
|
25
|
+
elsif file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
|
26
|
+
content = read_gha_file_content(file)
|
27
|
+
else
|
28
|
+
raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
|
29
|
+
end
|
22
30
|
|
23
31
|
result = []
|
24
32
|
content.lines.each do |line|
|
@@ -28,11 +36,11 @@ module GHAUtils
|
|
28
36
|
return result
|
29
37
|
end
|
30
38
|
|
31
|
-
def
|
32
|
-
|
33
|
-
while
|
34
|
-
yield
|
35
|
-
|
39
|
+
def each_time(from, to)
|
40
|
+
current_time = from
|
41
|
+
while current_time < to
|
42
|
+
yield current_time
|
43
|
+
current_time += 3600
|
36
44
|
end
|
37
45
|
end
|
38
46
|
end
|
@@ -70,13 +78,18 @@ class GHAProvider
|
|
70
78
|
end
|
71
79
|
|
72
80
|
def each(from = Time.gm(2015, 1, 1), to = Time.now)
|
73
|
-
|
81
|
+
exceptions = []
|
82
|
+
|
83
|
+
self.each_time(from, to) do |current_time|
|
74
84
|
events = []
|
75
85
|
begin
|
76
|
-
events = self.get(
|
77
|
-
|
78
|
-
|
79
|
-
|
86
|
+
events = self.get(current_time)
|
87
|
+
rescue GHAException => e
|
88
|
+
@logger.warn(e.message)
|
89
|
+
next
|
90
|
+
rescue => e
|
91
|
+
@logger.error("An exception occurred for #{current_time}: #{e.message}")
|
92
|
+
exceptions << e
|
80
93
|
next
|
81
94
|
end
|
82
95
|
|
@@ -91,21 +104,30 @@ class GHAProvider
|
|
91
104
|
end
|
92
105
|
next if skip
|
93
106
|
|
94
|
-
yield event
|
107
|
+
yield event, current_time
|
95
108
|
end
|
96
109
|
|
110
|
+
@logger.info("Scanned #{current_time}")
|
111
|
+
|
97
112
|
events.clear
|
98
113
|
GC.start
|
99
114
|
end
|
115
|
+
|
116
|
+
return exceptions
|
117
|
+
end
|
118
|
+
|
119
|
+
class GHAException < Exception
|
100
120
|
end
|
101
121
|
end
|
102
122
|
|
103
123
|
class OnlineGHAProvider < GHAProvider
|
104
|
-
def initialize(max_retries = 3, proactive = false)
|
124
|
+
def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
|
105
125
|
super()
|
106
126
|
|
107
127
|
@max_retries = max_retries
|
108
128
|
@proactive = proactive
|
129
|
+
@proactive_pool_size = proactive_pool_size
|
130
|
+
@pool = Thread.pool(proactive_pool_size)
|
109
131
|
@cache = Cache.new
|
110
132
|
end
|
111
133
|
|
@@ -114,97 +136,106 @@ class OnlineGHAProvider < GHAProvider
|
|
114
136
|
begin
|
115
137
|
filename = self.get_gha_filename(current_time)
|
116
138
|
|
117
|
-
if @
|
118
|
-
|
139
|
+
if @proactive
|
140
|
+
@logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
|
141
|
+
|
142
|
+
while !@cache.has?(filename)
|
143
|
+
sleep 1
|
144
|
+
end
|
145
|
+
|
146
|
+
return @cache.get(filename)
|
119
147
|
else
|
120
148
|
URI.open("http://data.gharchive.org/#{filename}") do |gz|
|
121
|
-
# Save to cache
|
122
149
|
return self.read_gha_file(gz)
|
123
150
|
end
|
124
151
|
end
|
125
|
-
rescue
|
126
|
-
@logger.
|
152
|
+
rescue Errno::ECONNRESET => e
|
153
|
+
@logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
|
154
|
+
next
|
155
|
+
rescue OpenURI::HTTPError => e
|
156
|
+
code = e.io.status[0]
|
157
|
+
if code.start_with?("5")
|
158
|
+
@logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
|
159
|
+
next
|
160
|
+
else
|
161
|
+
raise e
|
162
|
+
end
|
127
163
|
end
|
128
164
|
end
|
129
165
|
|
130
|
-
raise DownloadArchiveException, "Exceeded maximum number of tentative downloads."
|
166
|
+
raise DownloadArchiveException, "Exceeded maximum number of tentative downloads for #{current_time}."
|
131
167
|
end
|
132
168
|
|
133
169
|
def cache(current_time)
|
170
|
+
@logger.info("Full cache. Waiting for some free slot...") if @cache.full?
|
171
|
+
while @cache.full?
|
172
|
+
sleep 1
|
173
|
+
end
|
134
174
|
@max_retries.times do
|
135
175
|
begin
|
136
176
|
filename = self.get_gha_filename(current_time)
|
137
|
-
|
138
177
|
URI.open("http://data.gharchive.org/#{filename}") do |gz|
|
139
|
-
|
178
|
+
content = self.read_gha_file(gz)
|
179
|
+
@cache.put(filename, content)
|
140
180
|
return
|
141
181
|
end
|
142
|
-
rescue
|
182
|
+
rescue Errno::ECONNRESET => e
|
183
|
+
@logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
|
184
|
+
next
|
185
|
+
rescue OpenURI::HTTPError => e
|
186
|
+
code = e.io.status[0]
|
187
|
+
if code.start_with?("5")
|
188
|
+
@logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
|
189
|
+
next
|
190
|
+
else
|
191
|
+
raise e
|
192
|
+
end
|
143
193
|
end
|
144
194
|
end
|
145
195
|
end
|
146
196
|
|
147
197
|
def each(from = Time.gm(2015, 1, 1), to = Time.now)
|
148
198
|
if @proactive
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
end
|
158
|
-
|
159
|
-
while @cache.full?
|
160
|
-
sleep 1
|
161
|
-
end
|
199
|
+
any_ready = Thread.promise
|
200
|
+
|
201
|
+
@logger.info("Proactively scheduling download tasks...")
|
202
|
+
self.each_time(from, to) do |current_time|
|
203
|
+
@pool.process(current_time) do |current_time|
|
204
|
+
cache(current_time)
|
205
|
+
any_ready << true
|
206
|
+
@logger.info("Proactively cached #{current_time}. Cache size: #{@cache.size}")
|
162
207
|
end
|
163
208
|
end
|
209
|
+
|
210
|
+
~any_ready
|
211
|
+
@logger.info("Download tasks successfully scheduled!")
|
164
212
|
end
|
165
213
|
|
166
214
|
super
|
167
215
|
end
|
168
216
|
|
169
217
|
class Cache
|
170
|
-
def initialize(
|
218
|
+
def initialize(max_size = 10)
|
171
219
|
@cache = {}
|
172
220
|
@max_size = max_size
|
173
|
-
@folder = folder
|
174
221
|
@mutex = Mutex.new
|
175
222
|
end
|
176
223
|
|
177
224
|
def put(name, content)
|
178
|
-
File.open("#@folder/#{name}", 'w') do |f|
|
179
|
-
f << content
|
180
|
-
end
|
181
|
-
|
182
225
|
@mutex.synchronize do
|
183
|
-
@cache[name] =
|
226
|
+
@cache[name] = content
|
184
227
|
end
|
185
228
|
end
|
186
229
|
|
187
230
|
def get(name)
|
188
231
|
@mutex.synchronize do
|
189
|
-
return
|
190
|
-
end
|
191
|
-
ensure
|
192
|
-
self.unload(name)
|
193
|
-
end
|
194
|
-
|
195
|
-
def unload(name)
|
196
|
-
File.unlink(@cache[name])
|
197
|
-
|
198
|
-
@mutex.synchronize do
|
199
|
-
@cache.delete(name)
|
232
|
+
return @cache.delete(name)
|
200
233
|
end
|
201
|
-
|
202
|
-
return true
|
203
234
|
end
|
204
235
|
|
205
236
|
def size
|
206
237
|
@mutex.synchronize do
|
207
|
-
@cache.size
|
238
|
+
return @cache.size
|
208
239
|
end
|
209
240
|
end
|
210
241
|
|
@@ -217,7 +248,7 @@ class OnlineGHAProvider < GHAProvider
|
|
217
248
|
end
|
218
249
|
end
|
219
250
|
|
220
|
-
class DownloadArchiveException <
|
251
|
+
class DownloadArchiveException < GHAProvider::GHAException
|
221
252
|
end
|
222
253
|
end
|
223
254
|
|
@@ -230,8 +261,20 @@ class FolderGHAProvider < GHAProvider
|
|
230
261
|
|
231
262
|
def get(current_time)
|
232
263
|
filename = self.get_gha_filename(current_time)
|
233
|
-
File.
|
234
|
-
|
264
|
+
complete_filename = File.join(@folder, filename)
|
265
|
+
mode = "rb"
|
266
|
+
|
267
|
+
unless FileTest.exist?(complete_filename)
|
268
|
+
complete_filename = complete_filename.sub(".gz", "")
|
269
|
+
mode = "r"
|
270
|
+
end
|
271
|
+
|
272
|
+
unless FileTest.exist?(complete_filename)
|
273
|
+
raise GHAException.new("Cannot find any file (neither `.json.gz` nor `.json`) for #{current_time}")
|
274
|
+
end
|
275
|
+
|
276
|
+
File.open(complete_filename, mode) do |file|
|
277
|
+
return self.read_gha_file(file)
|
235
278
|
end
|
236
279
|
end
|
237
280
|
end
|
@@ -260,17 +303,17 @@ class GHADownloader
|
|
260
303
|
|
261
304
|
def download(from = Time.gm(2015, 1, 1), to = Time.now)
|
262
305
|
archive = []
|
263
|
-
self.
|
264
|
-
filename = self.get_gha_filename(
|
306
|
+
self.each_time(from, to) do |current_time|
|
307
|
+
filename = self.get_gha_filename(current_time)
|
265
308
|
out_filename = filename.clone
|
266
309
|
out_filename.gsub!(".json.gz", ".json") if @decompress
|
267
310
|
|
268
311
|
target_file = File.join(@folder, out_filename)
|
269
312
|
if FileTest.exist?(target_file)
|
270
|
-
@logger.info("Skipping existing file for #{
|
313
|
+
@logger.info("Skipping existing file for #{current_time}")
|
271
314
|
next
|
272
315
|
else
|
273
|
-
@logger.info("Downloading file for #{
|
316
|
+
@logger.info("Downloading file for #{current_time}")
|
274
317
|
end
|
275
318
|
|
276
319
|
File.open(target_file, 'w') do |f|
|
metadata
CHANGED
@@ -1,35 +1,55 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gh-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.2'
|
4
|
+
version: '0.6'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Simone Scalabrino
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-08-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: code-assertions
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: 1.1.2
|
20
|
-
- - "
|
20
|
+
- - ">="
|
21
21
|
- !ruby/object:Gem::Version
|
22
22
|
version: 1.1.2
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
26
26
|
requirements:
|
27
|
+
- - "~>"
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 1.1.2
|
27
30
|
- - ">="
|
28
31
|
- !ruby/object:Gem::Version
|
29
32
|
version: 1.1.2
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: thread
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
30
37
|
- - "~>"
|
31
38
|
- !ruby/object:Gem::Version
|
32
|
-
version:
|
39
|
+
version: 0.2.2
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: 0.2.2
|
43
|
+
type: :runtime
|
44
|
+
prerelease: false
|
45
|
+
version_requirements: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - "~>"
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 0.2.2
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 0.2.2
|
33
53
|
description: Download and analyze the GitHub events stored at GitHub archive
|
34
54
|
email: s.scalabrino9@gmail.com
|
35
55
|
executables: []
|
@@ -41,7 +61,7 @@ homepage: https://github.com/intersimone999/gh-archive
|
|
41
61
|
licenses:
|
42
62
|
- GPL-3.0-only
|
43
63
|
metadata: {}
|
44
|
-
post_install_message:
|
64
|
+
post_install_message:
|
45
65
|
rdoc_options: []
|
46
66
|
require_paths:
|
47
67
|
- lib
|
@@ -56,8 +76,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
56
76
|
- !ruby/object:Gem::Version
|
57
77
|
version: '0'
|
58
78
|
requirements: []
|
59
|
-
rubygems_version: 3.
|
60
|
-
signing_key:
|
79
|
+
rubygems_version: 3.2.21
|
80
|
+
signing_key:
|
61
81
|
specification_version: 4
|
62
82
|
summary: GitHub Archive mining utility
|
63
83
|
test_files: []
|