gh-archive 0.4 → 0.5
This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/gh-archive.rb +36 -39
- metadata +22 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 011777addb798b172d58ffaac2b509ecf85288ee90cd28726c6303d14d39db1b
|
4
|
+
data.tar.gz: d8714b155567039e5de81f5ae36473c291f0af86701afaebf4527ab962dca240
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: db6a72c3e6e31490c0a3b574ee0edf8f8995434f7ba32b6eb93c4ff35b3a8b0bd3e35c85ea207cb000b399fa71b524578067dc923752d5951f156b0f0d21df23
|
7
|
+
data.tar.gz: a0bac6036c2147e0bd933209f458cb272f6a20d669f2616fc4aa2e6b0a257354e704bcf48f3d658241d81d10d09a94ae07c899223862177fbe5315b9719b4874
|
data/lib/gh-archive.rb
CHANGED
@@ -4,6 +4,8 @@ require 'open-uri'
|
|
4
4
|
require 'zlib'
|
5
5
|
require 'logger'
|
6
6
|
require 'tmpdir'
|
7
|
+
require 'thread/pool'
|
8
|
+
require 'thread/promise'
|
7
9
|
|
8
10
|
module GHAUtils
|
9
11
|
def get_gha_filename(date)
|
@@ -108,6 +110,7 @@ class OnlineGHAProvider < GHAProvider
|
|
108
110
|
@max_retries = max_retries
|
109
111
|
@proactive = proactive
|
110
112
|
@proactive_pool_size = proactive_pool_size
|
113
|
+
@pool = Thread.pool(proactive_pool_size)
|
111
114
|
@cache = Cache.new
|
112
115
|
end
|
113
116
|
|
@@ -116,14 +119,23 @@ class OnlineGHAProvider < GHAProvider
|
|
116
119
|
begin
|
117
120
|
filename = self.get_gha_filename(current_time)
|
118
121
|
|
119
|
-
if @
|
120
|
-
|
122
|
+
if @proactive
|
123
|
+
@logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
|
124
|
+
|
125
|
+
while !@cache.has?(filename)
|
126
|
+
sleep 1
|
127
|
+
end
|
128
|
+
|
129
|
+
return @cache.get(filename)
|
121
130
|
else
|
122
131
|
URI.open("http://data.gharchive.org/#{filename}") do |gz|
|
123
|
-
# Save to cache
|
124
132
|
return self.read_gha_file(gz)
|
125
133
|
end
|
126
134
|
end
|
135
|
+
rescue Errno::ECONNRESET
|
136
|
+
next
|
137
|
+
rescue Zlib::GzipFile::Error
|
138
|
+
raise $!
|
127
139
|
rescue
|
128
140
|
@logger.warn($!)
|
129
141
|
end
|
@@ -133,59 +145,56 @@ class OnlineGHAProvider < GHAProvider
|
|
133
145
|
end
|
134
146
|
|
135
147
|
def cache(current_time)
|
148
|
+
@logger.info("Full cache. Waiting for some free slot...") if @cache.full?
|
149
|
+
while @cache.full?
|
150
|
+
sleep 1
|
151
|
+
end
|
136
152
|
@max_retries.times do
|
137
153
|
begin
|
138
154
|
filename = self.get_gha_filename(current_time)
|
139
|
-
|
140
155
|
URI.open("http://data.gharchive.org/#{filename}") do |gz|
|
141
156
|
content = self.read_gha_file(gz)
|
142
157
|
@cache.put(filename, content)
|
143
158
|
return
|
144
159
|
end
|
160
|
+
rescue Errno::ECONNRESET
|
161
|
+
next
|
162
|
+
rescue Zlib::GzipFile::Error
|
163
|
+
raise $!
|
145
164
|
rescue
|
146
|
-
|
165
|
+
@logger.warn($!)
|
147
166
|
end
|
148
167
|
end
|
149
168
|
end
|
150
169
|
|
151
170
|
def each(from = Time.gm(2015, 1, 1), to = Time.now)
|
152
171
|
if @proactive
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
pool << Thread.start do
|
163
|
-
self.cache(current_date)
|
164
|
-
@logger.info("Proactively cached #{current_date}. Cache size: #{@cache.size}")
|
165
|
-
end
|
166
|
-
|
167
|
-
pool.delete_if { |t| !t.alive? }
|
172
|
+
any_ready = Thread.promise
|
173
|
+
|
174
|
+
@logger.info("Proactively scheduling download tasks...")
|
175
|
+
self.each_date(from, to) do |current_date|
|
176
|
+
@pool.process(current_date) do |current_date|
|
177
|
+
cache(current_date)
|
178
|
+
any_ready << true
|
179
|
+
@logger.info("Proactively cached #{current_date}. Cache size: #{@cache.size}")
|
168
180
|
end
|
169
181
|
end
|
182
|
+
|
183
|
+
~any_ready
|
184
|
+
@logger.info("Download tasks successfully scheduled!")
|
170
185
|
end
|
171
186
|
|
172
187
|
super
|
173
188
|
end
|
174
189
|
|
175
190
|
class Cache
|
176
|
-
def initialize(
|
191
|
+
def initialize(max_size = 10)
|
177
192
|
@cache = {}
|
178
193
|
@max_size = max_size
|
179
|
-
@folder = folder
|
180
194
|
@mutex = Mutex.new
|
181
195
|
end
|
182
196
|
|
183
197
|
def put(name, content)
|
184
|
-
#filename = "#@folder/#{name}"
|
185
|
-
#File.open(filename, 'w') do |f|
|
186
|
-
#f << content
|
187
|
-
#end
|
188
|
-
|
189
198
|
@mutex.synchronize do
|
190
199
|
@cache[name] = content
|
191
200
|
end
|
@@ -195,18 +204,6 @@ class OnlineGHAProvider < GHAProvider
|
|
195
204
|
@mutex.synchronize do
|
196
205
|
return @cache.delete(name)
|
197
206
|
end
|
198
|
-
ensure
|
199
|
-
#self.unload(name)
|
200
|
-
end
|
201
|
-
|
202
|
-
def unload(name)
|
203
|
-
File.unlink(@cache[name])
|
204
|
-
|
205
|
-
@mutex.synchronize do
|
206
|
-
@cache.delete(name)
|
207
|
-
end
|
208
|
-
|
209
|
-
return true
|
210
207
|
end
|
211
208
|
|
212
209
|
def size
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gh-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.4'
|
4
|
+
version: '0.5'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Simone Scalabrino
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-08-
|
11
|
+
date: 2021-08-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: code-assertions
|
@@ -30,6 +30,26 @@ dependencies:
|
|
30
30
|
- - ">="
|
31
31
|
- !ruby/object:Gem::Version
|
32
32
|
version: 1.1.2
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: thread
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: 0.2.2
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: 0.2.2
|
43
|
+
type: :runtime
|
44
|
+
prerelease: false
|
45
|
+
version_requirements: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - "~>"
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 0.2.2
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 0.2.2
|
33
53
|
description: Download and analyze the GitHub events stored at GitHub archive
|
34
54
|
email: s.scalabrino9@gmail.com
|
35
55
|
executables: []
|