gh-archive 0.4 → 0.5

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/gh-archive.rb +36 -39
  3. metadata +22 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4a7b3d8b97242ec787b56b409e4a81823473adc2b2915d5466e38a29015564a9
4
- data.tar.gz: 6c50ded9b466fedb0ea1ca22dfcf8bdf9ec65a83b4666a429122a7345d0ebdea
3
+ metadata.gz: 011777addb798b172d58ffaac2b509ecf85288ee90cd28726c6303d14d39db1b
4
+ data.tar.gz: d8714b155567039e5de81f5ae36473c291f0af86701afaebf4527ab962dca240
5
5
  SHA512:
6
- metadata.gz: 79bdb3c1649ff4fc86d711ce309e11055eb68720b8bf1d87f7e31b1a3a1586949cdd8d623630d446f7ec04288a2b0db7c598ca8bcf90c8a0c6c36edf6884805b
7
- data.tar.gz: b790981c3d3becd6e46cece1e4bc5fa4cc6b7a25ac5c0bdc9fbbf6ab3ea6cb188f0d5abf48036426fec82e2fd1543ba85928f32df3c6915d56d2300eb35a43ce
6
+ metadata.gz: db6a72c3e6e31490c0a3b574ee0edf8f8995434f7ba32b6eb93c4ff35b3a8b0bd3e35c85ea207cb000b399fa71b524578067dc923752d5951f156b0f0d21df23
7
+ data.tar.gz: a0bac6036c2147e0bd933209f458cb272f6a20d669f2616fc4aa2e6b0a257354e704bcf48f3d658241d81d10d09a94ae07c899223862177fbe5315b9719b4874
data/lib/gh-archive.rb CHANGED
@@ -4,6 +4,8 @@ require 'open-uri'
4
4
  require 'zlib'
5
5
  require 'logger'
6
6
  require 'tmpdir'
7
+ require 'thread/pool'
8
+ require 'thread/promise'
7
9
 
8
10
  module GHAUtils
9
11
  def get_gha_filename(date)
@@ -108,6 +110,7 @@ class OnlineGHAProvider < GHAProvider
108
110
  @max_retries = max_retries
109
111
  @proactive = proactive
110
112
  @proactive_pool_size = proactive_pool_size
113
+ @pool = Thread.pool(proactive_pool_size)
111
114
  @cache = Cache.new
112
115
  end
113
116
 
@@ -116,14 +119,23 @@ class OnlineGHAProvider < GHAProvider
116
119
  begin
117
120
  filename = self.get_gha_filename(current_time)
118
121
 
119
- if @cache.has?(filename)
120
- result = @cache.get(filename)
122
+ if @proactive
123
+ @logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
124
+
125
+ while !@cache.has?(filename)
126
+ sleep 1
127
+ end
128
+
129
+ return @cache.get(filename)
121
130
  else
122
131
  URI.open("http://data.gharchive.org/#{filename}") do |gz|
123
- # Save to cache
124
132
  return self.read_gha_file(gz)
125
133
  end
126
134
  end
135
+ rescue Errno::ECONNRESET
136
+ next
137
+ rescue Zlib::GzipFile::Error
138
+ raise $!
127
139
  rescue
128
140
  @logger.warn($!)
129
141
  end
@@ -133,59 +145,56 @@ class OnlineGHAProvider < GHAProvider
133
145
  end
134
146
 
135
147
  def cache(current_time)
148
+ @logger.info("Full cache. Waiting for some free slot...") if @cache.full?
149
+ while @cache.full?
150
+ sleep 1
151
+ end
136
152
  @max_retries.times do
137
153
  begin
138
154
  filename = self.get_gha_filename(current_time)
139
-
140
155
  URI.open("http://data.gharchive.org/#{filename}") do |gz|
141
156
  content = self.read_gha_file(gz)
142
157
  @cache.put(filename, content)
143
158
  return
144
159
  end
160
+ rescue Errno::ECONNRESET
161
+ next
162
+ rescue Zlib::GzipFile::Error
163
+ raise $!
145
164
  rescue
146
- p $!
165
+ @logger.warn($!)
147
166
  end
148
167
  end
149
168
  end
150
169
 
151
170
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
152
171
  if @proactive
153
- @logger.info("Proactive download thread started")
154
- Thread.start do
155
- pool = []
156
- self.each_date(from, to) do |current_date|
157
- while pool.size > @proactive_pool_size || @cache.full?
158
- pool.delete_if { |t| !t.alive? }
159
- sleep 0.1
160
- end
161
-
162
- pool << Thread.start do
163
- self.cache(current_date)
164
- @logger.info("Proactively cached #{current_date}. Cache size: #{@cache.size}")
165
- end
166
-
167
- pool.delete_if { |t| !t.alive? }
172
+ any_ready = Thread.promise
173
+
174
+ @logger.info("Proactively scheduling download tasks...")
175
+ self.each_date(from, to) do |current_date|
176
+ @pool.process(current_date) do |current_date|
177
+ cache(current_date)
178
+ any_ready << true
179
+ @logger.info("Proactively cached #{current_date}. Cache size: #{@cache.size}")
168
180
  end
169
181
  end
182
+
183
+ ~any_ready
184
+ @logger.info("Download tasks successfully scheduled!")
170
185
  end
171
186
 
172
187
  super
173
188
  end
174
189
 
175
190
  class Cache
176
- def initialize(folder = Dir.mktmpdir, max_size = 100)
191
+ def initialize(max_size = 10)
177
192
  @cache = {}
178
193
  @max_size = max_size
179
- @folder = folder
180
194
  @mutex = Mutex.new
181
195
  end
182
196
 
183
197
  def put(name, content)
184
- #filename = "#@folder/#{name}"
185
- #File.open(filename, 'w') do |f|
186
- #f << content
187
- #end
188
-
189
198
  @mutex.synchronize do
190
199
  @cache[name] = content
191
200
  end
@@ -195,18 +204,6 @@ class OnlineGHAProvider < GHAProvider
195
204
  @mutex.synchronize do
196
205
  return @cache.delete(name)
197
206
  end
198
- ensure
199
- #self.unload(name)
200
- end
201
-
202
- def unload(name)
203
- File.unlink(@cache[name])
204
-
205
- @mutex.synchronize do
206
- @cache.delete(name)
207
- end
208
-
209
- return true
210
207
  end
211
208
 
212
209
  def size
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gh-archive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.4'
4
+ version: '0.5'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Simone Scalabrino
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-08-11 00:00:00.000000000 Z
11
+ date: 2021-08-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: code-assertions
@@ -30,6 +30,26 @@ dependencies:
30
30
  - - ">="
31
31
  - !ruby/object:Gem::Version
32
32
  version: 1.1.2
33
+ - !ruby/object:Gem::Dependency
34
+ name: thread
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: 0.2.2
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: 0.2.2
43
+ type: :runtime
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: 0.2.2
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 0.2.2
33
53
  description: Download and analyze the GitHub events stored at GitHub archive
34
54
  email: s.scalabrino9@gmail.com
35
55
  executables: []