gh-archive 0.1 → 0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/gh-archive.rb +60 -41
  3. metadata +23 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0c493a4b789b2fc8b8b544287614667417f668c080e086a98e142eb70e5ada40
4
- data.tar.gz: ab043dd0e56f9fa884405de70e86d1e8ff7b0091cbd980cd4c7be8694fcc3d77
3
+ metadata.gz: 011777addb798b172d58ffaac2b509ecf85288ee90cd28726c6303d14d39db1b
4
+ data.tar.gz: d8714b155567039e5de81f5ae36473c291f0af86701afaebf4527ab962dca240
5
5
  SHA512:
6
- metadata.gz: 663226f4cd9b6dd51d679848877f1b93f64069a839c35d2149511135a960a07736589e8aa69b01d60c9ebdc295db25fb3cde6fa24f1a11885958aba9ffac0af1
7
- data.tar.gz: f8889d87fb7853ae54871ccf192527d6021535d12c614fc4059a235e1df9c6245516d5ac9478f53c613932ca4a0655dd2e5c4a29955b8cacd843b38f99ddcba2
6
+ metadata.gz: db6a72c3e6e31490c0a3b574ee0edf8f8995434f7ba32b6eb93c4ff35b3a8b0bd3e35c85ea207cb000b399fa71b524578067dc923752d5951f156b0f0d21df23
7
+ data.tar.gz: a0bac6036c2147e0bd933209f458cb272f6a20d669f2616fc4aa2e6b0a257354e704bcf48f3d658241d81d10d09a94ae07c899223862177fbe5315b9719b4874
data/lib/gh-archive.rb CHANGED
@@ -3,6 +3,9 @@ require 'json'
3
3
  require 'open-uri'
4
4
  require 'zlib'
5
5
  require 'logger'
6
+ require 'tmpdir'
7
+ require 'thread/pool'
8
+ require 'thread/promise'
6
9
 
7
10
  module GHAUtils
8
11
  def get_gha_filename(date)
@@ -91,7 +94,7 @@ class GHAProvider
91
94
  end
92
95
  next if skip
93
96
 
94
- yield event
97
+ yield event, current_date
95
98
  end
96
99
 
97
100
  events.clear
@@ -101,11 +104,13 @@ class GHAProvider
101
104
  end
102
105
 
103
106
  class OnlineGHAProvider < GHAProvider
104
- def initialize(max_retries = 3, proactive = false)
107
+ def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
105
108
  super()
106
109
 
107
110
  @max_retries = max_retries
108
111
  @proactive = proactive
112
+ @proactive_pool_size = proactive_pool_size
113
+ @pool = Thread.pool(proactive_pool_size)
109
114
  @cache = Cache.new
110
115
  end
111
116
 
@@ -114,16 +119,25 @@ class OnlineGHAProvider < GHAProvider
114
119
  begin
115
120
  filename = self.get_gha_filename(current_time)
116
121
 
117
- if @cache.has?(filename)
118
- result = self.read_gha_file(@cache.get(filename))
122
+ if @proactive
123
+ @logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
124
+
125
+ while !@cache.has?(filename)
126
+ sleep 1
127
+ end
128
+
129
+ return @cache.get(filename)
119
130
  else
120
131
  URI.open("http://data.gharchive.org/#{filename}") do |gz|
121
- # Save to cache
122
132
  return self.read_gha_file(gz)
123
133
  end
124
134
  end
135
+ rescue Errno::ECONNRESET
136
+ next
137
+ rescue Zlib::GzipFile::Error
138
+ raise $!
125
139
  rescue
126
- @logger.warning($!)
140
+ @logger.warn($!)
127
141
  end
128
142
  end
129
143
 
@@ -131,80 +145,70 @@ class OnlineGHAProvider < GHAProvider
131
145
  end
132
146
 
133
147
  def cache(current_time)
148
+ @logger.info("Full cache. Waiting for some free slot...") if @cache.full?
149
+ while @cache.full?
150
+ sleep 1
151
+ end
134
152
  @max_retries.times do
135
153
  begin
136
154
  filename = self.get_gha_filename(current_time)
137
-
138
155
  URI.open("http://data.gharchive.org/#{filename}") do |gz|
139
- @cache.put(filename, gz.read)
156
+ content = self.read_gha_file(gz)
157
+ @cache.put(filename, content)
140
158
  return
141
159
  end
160
+ rescue Errno::ECONNRESET
161
+ next
162
+ rescue Zlib::GzipFile::Error
163
+ raise $!
142
164
  rescue
165
+ @logger.warn($!)
143
166
  end
144
167
  end
145
168
  end
146
169
 
147
170
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
148
171
  if @proactive
149
- @logger.info("Proactive download thread started")
150
- Thread.start do
151
- self.each_date(from, to) do |current_date|
152
- self.cache(current_date)
172
+ any_ready = Thread.promise
173
+
174
+ @logger.info("Proactively scheduling download tasks...")
175
+ self.each_date(from, to) do |current_date|
176
+ @pool.process(current_date) do |current_date|
177
+ cache(current_date)
178
+ any_ready << true
153
179
  @logger.info("Proactively cached #{current_date}. Cache size: #{@cache.size}")
154
-
155
- if @cache.full?
156
- @logger.info("Full cache. Waiting...")
157
- end
158
-
159
- while @cache.full?
160
- sleep 1
161
- end
162
180
  end
163
181
  end
182
+
183
+ ~any_ready
184
+ @logger.info("Download tasks successfully scheduled!")
164
185
  end
165
186
 
166
187
  super
167
188
  end
168
189
 
169
190
  class Cache
170
- def initialize(folder = Dir.mktmpdir, max_size = 100)
191
+ def initialize(max_size = 10)
171
192
  @cache = {}
172
193
  @max_size = max_size
173
- @folder = folder
174
194
  @mutex = Mutex.new
175
195
  end
176
196
 
177
197
  def put(name, content)
178
- File.open("#@folder/#{name}", 'w') do |f|
179
- f << content
180
- end
181
-
182
198
  @mutex.synchronize do
183
- @cache[name] = value
199
+ @cache[name] = content
184
200
  end
185
201
  end
186
202
 
187
203
  def get(name)
188
204
  @mutex.synchronize do
189
- return File.read(@cache[name])
190
- end
191
- ensure
192
- self.unload(name)
193
- end
194
-
195
- def unload(name)
196
- File.unlink(@cache[name])
197
-
198
- @mutex.synchronize do
199
- @cache.delete(name)
205
+ return @cache.delete(name)
200
206
  end
201
-
202
- return true
203
207
  end
204
208
 
205
209
  def size
206
210
  @mutex.synchronize do
207
- @cache.size
211
+ return @cache.size
208
212
  end
209
213
  end
210
214
 
@@ -243,15 +247,23 @@ class GHADownloader
243
247
  @logger = Logger.new(STDERR)
244
248
  @decompress = decompress
245
249
  @folder = folder
250
+ @max = nil
251
+
246
252
  Dir.mkdir(@folder) unless FileTest.exist?(@folder)
247
253
  raise "A file exist with the desired folder name #{folder}" unless FileTest.directory?(@folder)
248
254
  end
249
255
 
256
+ def max(max)
257
+ @max = max
258
+ return self
259
+ end
260
+
250
261
  def logger=(logger)
251
262
  @logger = logger
252
263
  end
253
264
 
254
265
  def download(from = Time.gm(2015, 1, 1), to = Time.now)
266
+ archive = []
255
267
  self.each_date(from, to) do |current_date|
256
268
  filename = self.get_gha_filename(current_date)
257
269
  out_filename = filename.clone
@@ -274,6 +286,13 @@ class GHADownloader
274
286
  end
275
287
  end
276
288
  end
289
+ archive << target_file
290
+
291
+ if @max && archive.size > @max
292
+ last = archive.shift
293
+ @logger.info("Removing local file #{last}")
294
+ File.unlink(last)
295
+ end
277
296
 
278
297
  yield filename if block_given?
279
298
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gh-archive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: '0.5'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Simone Scalabrino
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-26 00:00:00.000000000 Z
11
+ date: 2021-08-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: code-assertions
@@ -30,6 +30,26 @@ dependencies:
30
30
  - - ">="
31
31
  - !ruby/object:Gem::Version
32
32
  version: 1.1.2
33
+ - !ruby/object:Gem::Dependency
34
+ name: thread
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: 0.2.2
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: 0.2.2
43
+ type: :runtime
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: 0.2.2
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 0.2.2
33
53
  description: Download and analyze the GitHub events stored at GitHub archive
34
54
  email: s.scalabrino9@gmail.com
35
55
  executables: []
@@ -56,7 +76,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
56
76
  - !ruby/object:Gem::Version
57
77
  version: '0'
58
78
  requirements: []
59
- rubygems_version: 3.1.4
79
+ rubygems_version: 3.2.21
60
80
  signing_key:
61
81
  specification_version: 4
62
82
  summary: GitHub Archive mining utility