gh-archive 0.4 → 0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/gh-archive.rb +36 -39
  3. metadata +22 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4a7b3d8b97242ec787b56b409e4a81823473adc2b2915d5466e38a29015564a9
4
- data.tar.gz: 6c50ded9b466fedb0ea1ca22dfcf8bdf9ec65a83b4666a429122a7345d0ebdea
3
+ metadata.gz: 011777addb798b172d58ffaac2b509ecf85288ee90cd28726c6303d14d39db1b
4
+ data.tar.gz: d8714b155567039e5de81f5ae36473c291f0af86701afaebf4527ab962dca240
5
5
  SHA512:
6
- metadata.gz: 79bdb3c1649ff4fc86d711ce309e11055eb68720b8bf1d87f7e31b1a3a1586949cdd8d623630d446f7ec04288a2b0db7c598ca8bcf90c8a0c6c36edf6884805b
7
- data.tar.gz: b790981c3d3becd6e46cece1e4bc5fa4cc6b7a25ac5c0bdc9fbbf6ab3ea6cb188f0d5abf48036426fec82e2fd1543ba85928f32df3c6915d56d2300eb35a43ce
6
+ metadata.gz: db6a72c3e6e31490c0a3b574ee0edf8f8995434f7ba32b6eb93c4ff35b3a8b0bd3e35c85ea207cb000b399fa71b524578067dc923752d5951f156b0f0d21df23
7
+ data.tar.gz: a0bac6036c2147e0bd933209f458cb272f6a20d669f2616fc4aa2e6b0a257354e704bcf48f3d658241d81d10d09a94ae07c899223862177fbe5315b9719b4874
data/lib/gh-archive.rb CHANGED
@@ -4,6 +4,8 @@ require 'open-uri'
4
4
  require 'zlib'
5
5
  require 'logger'
6
6
  require 'tmpdir'
7
+ require 'thread/pool'
8
+ require 'thread/promise'
7
9
 
8
10
  module GHAUtils
9
11
  def get_gha_filename(date)
@@ -108,6 +110,7 @@ class OnlineGHAProvider < GHAProvider
108
110
  @max_retries = max_retries
109
111
  @proactive = proactive
110
112
  @proactive_pool_size = proactive_pool_size
113
+ @pool = Thread.pool(proactive_pool_size)
111
114
  @cache = Cache.new
112
115
  end
113
116
 
@@ -116,14 +119,23 @@ class OnlineGHAProvider < GHAProvider
116
119
  begin
117
120
  filename = self.get_gha_filename(current_time)
118
121
 
119
- if @cache.has?(filename)
120
- result = @cache.get(filename)
122
+ if @proactive
123
+ @logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
124
+
125
+ while !@cache.has?(filename)
126
+ sleep 1
127
+ end
128
+
129
+ return @cache.get(filename)
121
130
  else
122
131
  URI.open("http://data.gharchive.org/#{filename}") do |gz|
123
- # Save to cache
124
132
  return self.read_gha_file(gz)
125
133
  end
126
134
  end
135
+ rescue Errno::ECONNRESET
136
+ next
137
+ rescue Zlib::GzipFile::Error
138
+ raise $!
127
139
  rescue
128
140
  @logger.warn($!)
129
141
  end
@@ -133,59 +145,56 @@ class OnlineGHAProvider < GHAProvider
133
145
  end
134
146
 
135
147
  def cache(current_time)
148
+ @logger.info("Full cache. Waiting for some free slot...") if @cache.full?
149
+ while @cache.full?
150
+ sleep 1
151
+ end
136
152
  @max_retries.times do
137
153
  begin
138
154
  filename = self.get_gha_filename(current_time)
139
-
140
155
  URI.open("http://data.gharchive.org/#{filename}") do |gz|
141
156
  content = self.read_gha_file(gz)
142
157
  @cache.put(filename, content)
143
158
  return
144
159
  end
160
+ rescue Errno::ECONNRESET
161
+ next
162
+ rescue Zlib::GzipFile::Error
163
+ raise $!
145
164
  rescue
146
- p $!
165
+ @logger.warn($!)
147
166
  end
148
167
  end
149
168
  end
150
169
 
151
170
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
152
171
  if @proactive
153
- @logger.info("Proactive download thread started")
154
- Thread.start do
155
- pool = []
156
- self.each_date(from, to) do |current_date|
157
- while pool.size > @proactive_pool_size || @cache.full?
158
- pool.delete_if { |t| !t.alive? }
159
- sleep 0.1
160
- end
161
-
162
- pool << Thread.start do
163
- self.cache(current_date)
164
- @logger.info("Proactively cached #{current_date}. Cache size: #{@cache.size}")
165
- end
166
-
167
- pool.delete_if { |t| !t.alive? }
172
+ any_ready = Thread.promise
173
+
174
+ @logger.info("Proactively scheduling download tasks...")
175
+ self.each_date(from, to) do |current_date|
176
+ @pool.process(current_date) do |current_date|
177
+ cache(current_date)
178
+ any_ready << true
179
+ @logger.info("Proactively cached #{current_date}. Cache size: #{@cache.size}")
168
180
  end
169
181
  end
182
+
183
+ ~any_ready
184
+ @logger.info("Download tasks successfully scheduled!")
170
185
  end
171
186
 
172
187
  super
173
188
  end
174
189
 
175
190
  class Cache
176
- def initialize(folder = Dir.mktmpdir, max_size = 100)
191
+ def initialize(max_size = 10)
177
192
  @cache = {}
178
193
  @max_size = max_size
179
- @folder = folder
180
194
  @mutex = Mutex.new
181
195
  end
182
196
 
183
197
  def put(name, content)
184
- #filename = "#@folder/#{name}"
185
- #File.open(filename, 'w') do |f|
186
- #f << content
187
- #end
188
-
189
198
  @mutex.synchronize do
190
199
  @cache[name] = content
191
200
  end
@@ -195,18 +204,6 @@ class OnlineGHAProvider < GHAProvider
195
204
  @mutex.synchronize do
196
205
  return @cache.delete(name)
197
206
  end
198
- ensure
199
- #self.unload(name)
200
- end
201
-
202
- def unload(name)
203
- File.unlink(@cache[name])
204
-
205
- @mutex.synchronize do
206
- @cache.delete(name)
207
- end
208
-
209
- return true
210
207
  end
211
208
 
212
209
  def size
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gh-archive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.4'
4
+ version: '0.5'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Simone Scalabrino
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-08-11 00:00:00.000000000 Z
11
+ date: 2021-08-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: code-assertions
@@ -30,6 +30,26 @@ dependencies:
30
30
  - - ">="
31
31
  - !ruby/object:Gem::Version
32
32
  version: 1.1.2
33
+ - !ruby/object:Gem::Dependency
34
+ name: thread
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: 0.2.2
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: 0.2.2
43
+ type: :runtime
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: 0.2.2
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 0.2.2
33
53
  description: Download and analyze the GitHub events stored at GitHub archive
34
54
  email: s.scalabrino9@gmail.com
35
55
  executables: []