gh-archive 0.1 → 0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/gh-archive.rb +60 -41
- metadata +23 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 011777addb798b172d58ffaac2b509ecf85288ee90cd28726c6303d14d39db1b
|
4
|
+
data.tar.gz: d8714b155567039e5de81f5ae36473c291f0af86701afaebf4527ab962dca240
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: db6a72c3e6e31490c0a3b574ee0edf8f8995434f7ba32b6eb93c4ff35b3a8b0bd3e35c85ea207cb000b399fa71b524578067dc923752d5951f156b0f0d21df23
|
7
|
+
data.tar.gz: a0bac6036c2147e0bd933209f458cb272f6a20d669f2616fc4aa2e6b0a257354e704bcf48f3d658241d81d10d09a94ae07c899223862177fbe5315b9719b4874
|
data/lib/gh-archive.rb
CHANGED
@@ -3,6 +3,9 @@ require 'json'
|
|
3
3
|
require 'open-uri'
|
4
4
|
require 'zlib'
|
5
5
|
require 'logger'
|
6
|
+
require 'tmpdir'
|
7
|
+
require 'thread/pool'
|
8
|
+
require 'thread/promise'
|
6
9
|
|
7
10
|
module GHAUtils
|
8
11
|
def get_gha_filename(date)
|
@@ -91,7 +94,7 @@ class GHAProvider
|
|
91
94
|
end
|
92
95
|
next if skip
|
93
96
|
|
94
|
-
yield event
|
97
|
+
yield event, current_date
|
95
98
|
end
|
96
99
|
|
97
100
|
events.clear
|
@@ -101,11 +104,13 @@ class GHAProvider
|
|
101
104
|
end
|
102
105
|
|
103
106
|
class OnlineGHAProvider < GHAProvider
|
104
|
-
def initialize(max_retries = 3, proactive = false)
|
107
|
+
def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
|
105
108
|
super()
|
106
109
|
|
107
110
|
@max_retries = max_retries
|
108
111
|
@proactive = proactive
|
112
|
+
@proactive_pool_size = proactive_pool_size
|
113
|
+
@pool = Thread.pool(proactive_pool_size)
|
109
114
|
@cache = Cache.new
|
110
115
|
end
|
111
116
|
|
@@ -114,16 +119,25 @@ class OnlineGHAProvider < GHAProvider
|
|
114
119
|
begin
|
115
120
|
filename = self.get_gha_filename(current_time)
|
116
121
|
|
117
|
-
if @
|
118
|
-
|
122
|
+
if @proactive
|
123
|
+
@logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
|
124
|
+
|
125
|
+
while !@cache.has?(filename)
|
126
|
+
sleep 1
|
127
|
+
end
|
128
|
+
|
129
|
+
return @cache.get(filename)
|
119
130
|
else
|
120
131
|
URI.open("http://data.gharchive.org/#{filename}") do |gz|
|
121
|
-
# Save to cache
|
122
132
|
return self.read_gha_file(gz)
|
123
133
|
end
|
124
134
|
end
|
135
|
+
rescue Errno::ECONNRESET
|
136
|
+
next
|
137
|
+
rescue Zlib::GzipFile::Error
|
138
|
+
raise $!
|
125
139
|
rescue
|
126
|
-
@logger.
|
140
|
+
@logger.warn($!)
|
127
141
|
end
|
128
142
|
end
|
129
143
|
|
@@ -131,80 +145,70 @@ class OnlineGHAProvider < GHAProvider
|
|
131
145
|
end
|
132
146
|
|
133
147
|
def cache(current_time)
|
148
|
+
@logger.info("Full cache. Waiting for some free slot...") if @cache.full?
|
149
|
+
while @cache.full?
|
150
|
+
sleep 1
|
151
|
+
end
|
134
152
|
@max_retries.times do
|
135
153
|
begin
|
136
154
|
filename = self.get_gha_filename(current_time)
|
137
|
-
|
138
155
|
URI.open("http://data.gharchive.org/#{filename}") do |gz|
|
139
|
-
|
156
|
+
content = self.read_gha_file(gz)
|
157
|
+
@cache.put(filename, content)
|
140
158
|
return
|
141
159
|
end
|
160
|
+
rescue Errno::ECONNRESET
|
161
|
+
next
|
162
|
+
rescue Zlib::GzipFile::Error
|
163
|
+
raise $!
|
142
164
|
rescue
|
165
|
+
@logger.warn($!)
|
143
166
|
end
|
144
167
|
end
|
145
168
|
end
|
146
169
|
|
147
170
|
def each(from = Time.gm(2015, 1, 1), to = Time.now)
|
148
171
|
if @proactive
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
172
|
+
any_ready = Thread.promise
|
173
|
+
|
174
|
+
@logger.info("Proactively scheduling download tasks...")
|
175
|
+
self.each_date(from, to) do |current_date|
|
176
|
+
@pool.process(current_date) do |current_date|
|
177
|
+
cache(current_date)
|
178
|
+
any_ready << true
|
153
179
|
@logger.info("Proactively cached #{current_date}. Cache size: #{@cache.size}")
|
154
|
-
|
155
|
-
if @cache.full?
|
156
|
-
@logger.info("Full cache. Waiting...")
|
157
|
-
end
|
158
|
-
|
159
|
-
while @cache.full?
|
160
|
-
sleep 1
|
161
|
-
end
|
162
180
|
end
|
163
181
|
end
|
182
|
+
|
183
|
+
~any_ready
|
184
|
+
@logger.info("Download tasks successfully scheduled!")
|
164
185
|
end
|
165
186
|
|
166
187
|
super
|
167
188
|
end
|
168
189
|
|
169
190
|
class Cache
|
170
|
-
def initialize(
|
191
|
+
def initialize(max_size = 10)
|
171
192
|
@cache = {}
|
172
193
|
@max_size = max_size
|
173
|
-
@folder = folder
|
174
194
|
@mutex = Mutex.new
|
175
195
|
end
|
176
196
|
|
177
197
|
def put(name, content)
|
178
|
-
File.open("#@folder/#{name}", 'w') do |f|
|
179
|
-
f << content
|
180
|
-
end
|
181
|
-
|
182
198
|
@mutex.synchronize do
|
183
|
-
@cache[name] =
|
199
|
+
@cache[name] = content
|
184
200
|
end
|
185
201
|
end
|
186
202
|
|
187
203
|
def get(name)
|
188
204
|
@mutex.synchronize do
|
189
|
-
return
|
190
|
-
end
|
191
|
-
ensure
|
192
|
-
self.unload(name)
|
193
|
-
end
|
194
|
-
|
195
|
-
def unload(name)
|
196
|
-
File.unlink(@cache[name])
|
197
|
-
|
198
|
-
@mutex.synchronize do
|
199
|
-
@cache.delete(name)
|
205
|
+
return @cache.delete(name)
|
200
206
|
end
|
201
|
-
|
202
|
-
return true
|
203
207
|
end
|
204
208
|
|
205
209
|
def size
|
206
210
|
@mutex.synchronize do
|
207
|
-
@cache.size
|
211
|
+
return @cache.size
|
208
212
|
end
|
209
213
|
end
|
210
214
|
|
@@ -243,15 +247,23 @@ class GHADownloader
|
|
243
247
|
@logger = Logger.new(STDERR)
|
244
248
|
@decompress = decompress
|
245
249
|
@folder = folder
|
250
|
+
@max = nil
|
251
|
+
|
246
252
|
Dir.mkdir(@folder) unless FileTest.exist?(@folder)
|
247
253
|
raise "A file exist with the desired folder name #{folder}" unless FileTest.directory?(@folder)
|
248
254
|
end
|
249
255
|
|
256
|
+
def max(max)
|
257
|
+
@max = max
|
258
|
+
return self
|
259
|
+
end
|
260
|
+
|
250
261
|
def logger=(logger)
|
251
262
|
@logger = logger
|
252
263
|
end
|
253
264
|
|
254
265
|
def download(from = Time.gm(2015, 1, 1), to = Time.now)
|
266
|
+
archive = []
|
255
267
|
self.each_date(from, to) do |current_date|
|
256
268
|
filename = self.get_gha_filename(current_date)
|
257
269
|
out_filename = filename.clone
|
@@ -274,6 +286,13 @@ class GHADownloader
|
|
274
286
|
end
|
275
287
|
end
|
276
288
|
end
|
289
|
+
archive << target_file
|
290
|
+
|
291
|
+
if @max && archive.size > @max
|
292
|
+
last = archive.shift
|
293
|
+
@logger.info("Removing local file #{last}")
|
294
|
+
File.unlink(last)
|
295
|
+
end
|
277
296
|
|
278
297
|
yield filename if block_given?
|
279
298
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gh-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.1'
|
4
|
+
version: '0.5'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Simone Scalabrino
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-08-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: code-assertions
|
@@ -30,6 +30,26 @@ dependencies:
|
|
30
30
|
- - ">="
|
31
31
|
- !ruby/object:Gem::Version
|
32
32
|
version: 1.1.2
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: thread
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: 0.2.2
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: 0.2.2
|
43
|
+
type: :runtime
|
44
|
+
prerelease: false
|
45
|
+
version_requirements: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - "~>"
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 0.2.2
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 0.2.2
|
33
53
|
description: Download and analyze the GitHub events stored at GitHub archive
|
34
54
|
email: s.scalabrino9@gmail.com
|
35
55
|
executables: []
|
@@ -56,7 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
56
76
|
- !ruby/object:Gem::Version
|
57
77
|
version: '0'
|
58
78
|
requirements: []
|
59
|
-
rubygems_version: 3.
|
79
|
+
rubygems_version: 3.2.21
|
60
80
|
signing_key:
|
61
81
|
specification_version: 4
|
62
82
|
summary: GitHub Archive mining utility
|