gh-archive 0.1 → 0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3):
  1. checksums.yaml +4 -4
  2. data/lib/gh-archive.rb +60 -41
  3. metadata +23 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0c493a4b789b2fc8b8b544287614667417f668c080e086a98e142eb70e5ada40
4
- data.tar.gz: ab043dd0e56f9fa884405de70e86d1e8ff7b0091cbd980cd4c7be8694fcc3d77
3
+ metadata.gz: 011777addb798b172d58ffaac2b509ecf85288ee90cd28726c6303d14d39db1b
4
+ data.tar.gz: d8714b155567039e5de81f5ae36473c291f0af86701afaebf4527ab962dca240
5
5
  SHA512:
6
- metadata.gz: 663226f4cd9b6dd51d679848877f1b93f64069a839c35d2149511135a960a07736589e8aa69b01d60c9ebdc295db25fb3cde6fa24f1a11885958aba9ffac0af1
7
- data.tar.gz: f8889d87fb7853ae54871ccf192527d6021535d12c614fc4059a235e1df9c6245516d5ac9478f53c613932ca4a0655dd2e5c4a29955b8cacd843b38f99ddcba2
6
+ metadata.gz: db6a72c3e6e31490c0a3b574ee0edf8f8995434f7ba32b6eb93c4ff35b3a8b0bd3e35c85ea207cb000b399fa71b524578067dc923752d5951f156b0f0d21df23
7
+ data.tar.gz: a0bac6036c2147e0bd933209f458cb272f6a20d669f2616fc4aa2e6b0a257354e704bcf48f3d658241d81d10d09a94ae07c899223862177fbe5315b9719b4874
data/lib/gh-archive.rb CHANGED
@@ -3,6 +3,9 @@ require 'json'
3
3
  require 'open-uri'
4
4
  require 'zlib'
5
5
  require 'logger'
6
+ require 'tmpdir'
7
+ require 'thread/pool'
8
+ require 'thread/promise'
6
9
 
7
10
  module GHAUtils
8
11
  def get_gha_filename(date)
@@ -91,7 +94,7 @@ class GHAProvider
91
94
  end
92
95
  next if skip
93
96
 
94
- yield event
97
+ yield event, current_date
95
98
  end
96
99
 
97
100
  events.clear
@@ -101,11 +104,13 @@ class GHAProvider
101
104
  end
102
105
 
103
106
  class OnlineGHAProvider < GHAProvider
104
- def initialize(max_retries = 3, proactive = false)
107
+ def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
105
108
  super()
106
109
 
107
110
  @max_retries = max_retries
108
111
  @proactive = proactive
112
+ @proactive_pool_size = proactive_pool_size
113
+ @pool = Thread.pool(proactive_pool_size)
109
114
  @cache = Cache.new
110
115
  end
111
116
 
@@ -114,16 +119,25 @@ class OnlineGHAProvider < GHAProvider
114
119
  begin
115
120
  filename = self.get_gha_filename(current_time)
116
121
 
117
- if @cache.has?(filename)
118
- result = self.read_gha_file(@cache.get(filename))
122
+ if @proactive
123
+ @logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
124
+
125
+ while !@cache.has?(filename)
126
+ sleep 1
127
+ end
128
+
129
+ return @cache.get(filename)
119
130
  else
120
131
  URI.open("http://data.gharchive.org/#{filename}") do |gz|
121
- # Save to cache
122
132
  return self.read_gha_file(gz)
123
133
  end
124
134
  end
135
+ rescue Errno::ECONNRESET
136
+ next
137
+ rescue Zlib::GzipFile::Error
138
+ raise $!
125
139
  rescue
126
- @logger.warning($!)
140
+ @logger.warn($!)
127
141
  end
128
142
  end
129
143
 
@@ -131,80 +145,70 @@ class OnlineGHAProvider < GHAProvider
131
145
  end
132
146
 
133
147
  def cache(current_time)
148
+ @logger.info("Full cache. Waiting for some free slot...") if @cache.full?
149
+ while @cache.full?
150
+ sleep 1
151
+ end
134
152
  @max_retries.times do
135
153
  begin
136
154
  filename = self.get_gha_filename(current_time)
137
-
138
155
  URI.open("http://data.gharchive.org/#{filename}") do |gz|
139
- @cache.put(filename, gz.read)
156
+ content = self.read_gha_file(gz)
157
+ @cache.put(filename, content)
140
158
  return
141
159
  end
160
+ rescue Errno::ECONNRESET
161
+ next
162
+ rescue Zlib::GzipFile::Error
163
+ raise $!
142
164
  rescue
165
+ @logger.warn($!)
143
166
  end
144
167
  end
145
168
  end
146
169
 
147
170
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
148
171
  if @proactive
149
- @logger.info("Proactive download thread started")
150
- Thread.start do
151
- self.each_date(from, to) do |current_date|
152
- self.cache(current_date)
172
+ any_ready = Thread.promise
173
+
174
+ @logger.info("Proactively scheduling download tasks...")
175
+ self.each_date(from, to) do |current_date|
176
+ @pool.process(current_date) do |current_date|
177
+ cache(current_date)
178
+ any_ready << true
153
179
  @logger.info("Proactively cached #{current_date}. Cache size: #{@cache.size}")
154
-
155
- if @cache.full?
156
- @logger.info("Full cache. Waiting...")
157
- end
158
-
159
- while @cache.full?
160
- sleep 1
161
- end
162
180
  end
163
181
  end
182
+
183
+ ~any_ready
184
+ @logger.info("Download tasks successfully scheduled!")
164
185
  end
165
186
 
166
187
  super
167
188
  end
168
189
 
169
190
  class Cache
170
- def initialize(folder = Dir.mktmpdir, max_size = 100)
191
+ def initialize(max_size = 10)
171
192
  @cache = {}
172
193
  @max_size = max_size
173
- @folder = folder
174
194
  @mutex = Mutex.new
175
195
  end
176
196
 
177
197
  def put(name, content)
178
- File.open("#@folder/#{name}", 'w') do |f|
179
- f << content
180
- end
181
-
182
198
  @mutex.synchronize do
183
- @cache[name] = value
199
+ @cache[name] = content
184
200
  end
185
201
  end
186
202
 
187
203
  def get(name)
188
204
  @mutex.synchronize do
189
- return File.read(@cache[name])
190
- end
191
- ensure
192
- self.unload(name)
193
- end
194
-
195
- def unload(name)
196
- File.unlink(@cache[name])
197
-
198
- @mutex.synchronize do
199
- @cache.delete(name)
205
+ return @cache.delete(name)
200
206
  end
201
-
202
- return true
203
207
  end
204
208
 
205
209
  def size
206
210
  @mutex.synchronize do
207
- @cache.size
211
+ return @cache.size
208
212
  end
209
213
  end
210
214
 
@@ -243,15 +247,23 @@ class GHADownloader
243
247
  @logger = Logger.new(STDERR)
244
248
  @decompress = decompress
245
249
  @folder = folder
250
+ @max = nil
251
+
246
252
  Dir.mkdir(@folder) unless FileTest.exist?(@folder)
247
253
  raise "A file exist with the desired folder name #{folder}" unless FileTest.directory?(@folder)
248
254
  end
249
255
 
256
+ def max(max)
257
+ @max = max
258
+ return self
259
+ end
260
+
250
261
  def logger=(logger)
251
262
  @logger = logger
252
263
  end
253
264
 
254
265
  def download(from = Time.gm(2015, 1, 1), to = Time.now)
266
+ archive = []
255
267
  self.each_date(from, to) do |current_date|
256
268
  filename = self.get_gha_filename(current_date)
257
269
  out_filename = filename.clone
@@ -274,6 +286,13 @@ class GHADownloader
274
286
  end
275
287
  end
276
288
  end
289
+ archive << target_file
290
+
291
+ if @max && archive.size > @max
292
+ last = archive.shift
293
+ @logger.info("Removing local file #{last}")
294
+ File.unlink(last)
295
+ end
277
296
 
278
297
  yield filename if block_given?
279
298
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gh-archive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: '0.5'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Simone Scalabrino
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-26 00:00:00.000000000 Z
11
+ date: 2021-08-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: code-assertions
@@ -30,6 +30,26 @@ dependencies:
30
30
  - - ">="
31
31
  - !ruby/object:Gem::Version
32
32
  version: 1.1.2
33
+ - !ruby/object:Gem::Dependency
34
+ name: thread
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: 0.2.2
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: 0.2.2
43
+ type: :runtime
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: 0.2.2
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 0.2.2
33
53
  description: Download and analyze the GitHub events stored at GitHub archive
34
54
  email: s.scalabrino9@gmail.com
35
55
  executables: []
@@ -56,7 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
56
76
  - !ruby/object:Gem::Version
57
77
  version: '0'
58
78
  requirements: []
59
- rubygems_version: 3.1.4
79
+ rubygems_version: 3.2.21
60
80
  signing_key:
61
81
  specification_version: 4
62
82
  summary: GitHub Archive mining utility