gh-archive 0.1 → 0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/gh-archive.rb +60 -41
- metadata +23 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 011777addb798b172d58ffaac2b509ecf85288ee90cd28726c6303d14d39db1b
|
4
|
+
data.tar.gz: d8714b155567039e5de81f5ae36473c291f0af86701afaebf4527ab962dca240
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: db6a72c3e6e31490c0a3b574ee0edf8f8995434f7ba32b6eb93c4ff35b3a8b0bd3e35c85ea207cb000b399fa71b524578067dc923752d5951f156b0f0d21df23
|
7
|
+
data.tar.gz: a0bac6036c2147e0bd933209f458cb272f6a20d669f2616fc4aa2e6b0a257354e704bcf48f3d658241d81d10d09a94ae07c899223862177fbe5315b9719b4874
|
data/lib/gh-archive.rb
CHANGED
@@ -3,6 +3,9 @@ require 'json'
|
|
3
3
|
require 'open-uri'
|
4
4
|
require 'zlib'
|
5
5
|
require 'logger'
|
6
|
+
require 'tmpdir'
|
7
|
+
require 'thread/pool'
|
8
|
+
require 'thread/promise'
|
6
9
|
|
7
10
|
module GHAUtils
|
8
11
|
def get_gha_filename(date)
|
@@ -91,7 +94,7 @@ class GHAProvider
|
|
91
94
|
end
|
92
95
|
next if skip
|
93
96
|
|
94
|
-
yield event
|
97
|
+
yield event, current_date
|
95
98
|
end
|
96
99
|
|
97
100
|
events.clear
|
@@ -101,11 +104,13 @@ class GHAProvider
|
|
101
104
|
end
|
102
105
|
|
103
106
|
class OnlineGHAProvider < GHAProvider
|
104
|
-
def initialize(max_retries = 3, proactive = false)
|
107
|
+
def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
|
105
108
|
super()
|
106
109
|
|
107
110
|
@max_retries = max_retries
|
108
111
|
@proactive = proactive
|
112
|
+
@proactive_pool_size = proactive_pool_size
|
113
|
+
@pool = Thread.pool(proactive_pool_size)
|
109
114
|
@cache = Cache.new
|
110
115
|
end
|
111
116
|
|
@@ -114,16 +119,25 @@ class OnlineGHAProvider < GHAProvider
|
|
114
119
|
begin
|
115
120
|
filename = self.get_gha_filename(current_time)
|
116
121
|
|
117
|
-
if @
|
118
|
-
|
122
|
+
if @proactive
|
123
|
+
@logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
|
124
|
+
|
125
|
+
while !@cache.has?(filename)
|
126
|
+
sleep 1
|
127
|
+
end
|
128
|
+
|
129
|
+
return @cache.get(filename)
|
119
130
|
else
|
120
131
|
URI.open("http://data.gharchive.org/#{filename}") do |gz|
|
121
|
-
# Save to cache
|
122
132
|
return self.read_gha_file(gz)
|
123
133
|
end
|
124
134
|
end
|
135
|
+
rescue Errno::ECONNRESET
|
136
|
+
next
|
137
|
+
rescue Zlib::GzipFile::Error
|
138
|
+
raise $!
|
125
139
|
rescue
|
126
|
-
@logger.
|
140
|
+
@logger.warn($!)
|
127
141
|
end
|
128
142
|
end
|
129
143
|
|
@@ -131,80 +145,70 @@ class OnlineGHAProvider < GHAProvider
|
|
131
145
|
end
|
132
146
|
|
133
147
|
def cache(current_time)
|
148
|
+
@logger.info("Full cache. Waiting for some free slot...") if @cache.full?
|
149
|
+
while @cache.full?
|
150
|
+
sleep 1
|
151
|
+
end
|
134
152
|
@max_retries.times do
|
135
153
|
begin
|
136
154
|
filename = self.get_gha_filename(current_time)
|
137
|
-
|
138
155
|
URI.open("http://data.gharchive.org/#{filename}") do |gz|
|
139
|
-
|
156
|
+
content = self.read_gha_file(gz)
|
157
|
+
@cache.put(filename, content)
|
140
158
|
return
|
141
159
|
end
|
160
|
+
rescue Errno::ECONNRESET
|
161
|
+
next
|
162
|
+
rescue Zlib::GzipFile::Error
|
163
|
+
raise $!
|
142
164
|
rescue
|
165
|
+
@logger.warn($!)
|
143
166
|
end
|
144
167
|
end
|
145
168
|
end
|
146
169
|
|
147
170
|
def each(from = Time.gm(2015, 1, 1), to = Time.now)
|
148
171
|
if @proactive
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
172
|
+
any_ready = Thread.promise
|
173
|
+
|
174
|
+
@logger.info("Proactively scheduling download tasks...")
|
175
|
+
self.each_date(from, to) do |current_date|
|
176
|
+
@pool.process(current_date) do |current_date|
|
177
|
+
cache(current_date)
|
178
|
+
any_ready << true
|
153
179
|
@logger.info("Proactively cached #{current_date}. Cache size: #{@cache.size}")
|
154
|
-
|
155
|
-
if @cache.full?
|
156
|
-
@logger.info("Full cache. Waiting...")
|
157
|
-
end
|
158
|
-
|
159
|
-
while @cache.full?
|
160
|
-
sleep 1
|
161
|
-
end
|
162
180
|
end
|
163
181
|
end
|
182
|
+
|
183
|
+
~any_ready
|
184
|
+
@logger.info("Download tasks successfully scheduled!")
|
164
185
|
end
|
165
186
|
|
166
187
|
super
|
167
188
|
end
|
168
189
|
|
169
190
|
class Cache
|
170
|
-
def initialize(
|
191
|
+
def initialize(max_size = 10)
|
171
192
|
@cache = {}
|
172
193
|
@max_size = max_size
|
173
|
-
@folder = folder
|
174
194
|
@mutex = Mutex.new
|
175
195
|
end
|
176
196
|
|
177
197
|
def put(name, content)
|
178
|
-
File.open("#@folder/#{name}", 'w') do |f|
|
179
|
-
f << content
|
180
|
-
end
|
181
|
-
|
182
198
|
@mutex.synchronize do
|
183
|
-
@cache[name] =
|
199
|
+
@cache[name] = content
|
184
200
|
end
|
185
201
|
end
|
186
202
|
|
187
203
|
def get(name)
|
188
204
|
@mutex.synchronize do
|
189
|
-
return
|
190
|
-
end
|
191
|
-
ensure
|
192
|
-
self.unload(name)
|
193
|
-
end
|
194
|
-
|
195
|
-
def unload(name)
|
196
|
-
File.unlink(@cache[name])
|
197
|
-
|
198
|
-
@mutex.synchronize do
|
199
|
-
@cache.delete(name)
|
205
|
+
return @cache.delete(name)
|
200
206
|
end
|
201
|
-
|
202
|
-
return true
|
203
207
|
end
|
204
208
|
|
205
209
|
def size
|
206
210
|
@mutex.synchronize do
|
207
|
-
@cache.size
|
211
|
+
return @cache.size
|
208
212
|
end
|
209
213
|
end
|
210
214
|
|
@@ -243,15 +247,23 @@ class GHADownloader
|
|
243
247
|
@logger = Logger.new(STDERR)
|
244
248
|
@decompress = decompress
|
245
249
|
@folder = folder
|
250
|
+
@max = nil
|
251
|
+
|
246
252
|
Dir.mkdir(@folder) unless FileTest.exist?(@folder)
|
247
253
|
raise "A file exist with the desired folder name #{folder}" unless FileTest.directory?(@folder)
|
248
254
|
end
|
249
255
|
|
256
|
+
def max(max)
|
257
|
+
@max = max
|
258
|
+
return self
|
259
|
+
end
|
260
|
+
|
250
261
|
def logger=(logger)
|
251
262
|
@logger = logger
|
252
263
|
end
|
253
264
|
|
254
265
|
def download(from = Time.gm(2015, 1, 1), to = Time.now)
|
266
|
+
archive = []
|
255
267
|
self.each_date(from, to) do |current_date|
|
256
268
|
filename = self.get_gha_filename(current_date)
|
257
269
|
out_filename = filename.clone
|
@@ -274,6 +286,13 @@ class GHADownloader
|
|
274
286
|
end
|
275
287
|
end
|
276
288
|
end
|
289
|
+
archive << target_file
|
290
|
+
|
291
|
+
if @max && archive.size > @max
|
292
|
+
last = archive.shift
|
293
|
+
@logger.info("Removing local file #{last}")
|
294
|
+
File.unlink(last)
|
295
|
+
end
|
277
296
|
|
278
297
|
yield filename if block_given?
|
279
298
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gh-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.
|
4
|
+
version: '0.5'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Simone Scalabrino
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-08-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: code-assertions
|
@@ -30,6 +30,26 @@ dependencies:
|
|
30
30
|
- - ">="
|
31
31
|
- !ruby/object:Gem::Version
|
32
32
|
version: 1.1.2
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: thread
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: 0.2.2
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: 0.2.2
|
43
|
+
type: :runtime
|
44
|
+
prerelease: false
|
45
|
+
version_requirements: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - "~>"
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 0.2.2
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 0.2.2
|
33
53
|
description: Download and analyze the GitHub events stored at GitHub archive
|
34
54
|
email: s.scalabrino9@gmail.com
|
35
55
|
executables: []
|
@@ -56,7 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
56
76
|
- !ruby/object:Gem::Version
|
57
77
|
version: '0'
|
58
78
|
requirements: []
|
59
|
-
rubygems_version: 3.
|
79
|
+
rubygems_version: 3.2.21
|
60
80
|
signing_key:
|
61
81
|
specification_version: 4
|
62
82
|
summary: GitHub Archive mining utility
|