gh-archive 0.2 → 0.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/gh-archive.rb +111 -68
  3. metadata +29 -9
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c90ec7c3f14e2f57de8a145ad50c1f7730869c5aad0b1de33286accec99642c7
4
- data.tar.gz: a16c29db393499905695d411a62ca0dfb6e319d7fd509c104d41b2d0959b3414
3
+ metadata.gz: 91b0e957c5176b791d4f49e382680865405e7a6b2b29b349bcbb78b92d884e02
4
+ data.tar.gz: f8ddae3d80e80a24931d8632c9798c8f01520e1fe5ac8a85079c0d0e85eadcbc
5
5
  SHA512:
6
- metadata.gz: f6a9feafaa7d0d06f75c489b147b2fc9b5485e88a28c15f98b1081c8b1f05fffc8218253ca817b0ecf64c1050f3e63ed4e1ad2d6aec9df5a4d362cf5ed2832bc
7
- data.tar.gz: 1bfa5ab2dbccc74bd2b162066bab6874fc3a2fc08b6d2ad9ec02c05a73078ee84abe1d069eb2b51b491134b35ea4ee0724b81aa3ffbd845b9997ba3860639c05
6
+ metadata.gz: f7b24be932f58142b36887671b4265e25631345e7b81cc36b264be4a018fc0c4a88b853ae384dc8472876bf0996e904bf499007adc3091ddb511f28c828090fc
7
+ data.tar.gz: 5cbb83495b9bb397a41022cb1bf4bce0344c735d16f9f43fb181b4b109948ac7a75bd44c693338f02b1ff17eeeeb5b83a6add9deeeedbd379a8848614041a3f5
data/lib/gh-archive.rb CHANGED
@@ -3,6 +3,9 @@ require 'json'
3
3
  require 'open-uri'
4
4
  require 'zlib'
5
5
  require 'logger'
6
+ require 'tmpdir'
7
+ require 'thread/pool'
8
+ require 'thread/promise'
6
9
 
7
10
  module GHAUtils
8
11
  def get_gha_filename(date)
@@ -11,14 +14,19 @@ module GHAUtils
11
14
 
12
15
  def read_gha_file_content(gz)
13
16
  gzip = Zlib::GzipReader.new(gz)
14
- content = gzip.read
15
- gzip.close
16
-
17
- return content
17
+ return gzip.read
18
+ ensure
19
+ gzip.close if gzip
18
20
  end
19
21
 
20
- def read_gha_file(gz)
21
- content = read_gha_file_content(gz)
22
+ def read_gha_file(file)
23
+ if file.path.end_with?(".json")
24
+ content = file.read
25
+ elsif file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
26
+ content = read_gha_file_content(file)
27
+ else
28
+ raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
29
+ end
22
30
 
23
31
  result = []
24
32
  content.lines.each do |line|
@@ -28,11 +36,11 @@ module GHAUtils
28
36
  return result
29
37
  end
30
38
 
31
- def each_date(from, to)
32
- current_date = from
33
- while current_date < to
34
- yield current_date
35
- current_date += 3600
39
+ def each_time(from, to)
40
+ current_time = from
41
+ while current_time < to
42
+ yield current_time
43
+ current_time += 3600
36
44
  end
37
45
  end
38
46
  end
@@ -70,13 +78,18 @@ class GHAProvider
70
78
  end
71
79
 
72
80
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
73
- self.each_date(from, to) do |current_date|
81
+ exceptions = []
82
+
83
+ self.each_time(from, to) do |current_time|
74
84
  events = []
75
85
  begin
76
- events = self.get(current_date)
77
- @logger.info("Scanned #{current_date}")
78
- rescue
79
- @logger.error($!)
86
+ events = self.get(current_time)
87
+ rescue GHAException => e
88
+ @logger.warn(e.message)
89
+ next
90
+ rescue => e
91
+ @logger.error("An exception occurred for #{current_time}: #{e.message}")
92
+ exceptions << e
80
93
  next
81
94
  end
82
95
 
@@ -91,21 +104,30 @@ class GHAProvider
91
104
  end
92
105
  next if skip
93
106
 
94
- yield event
107
+ yield event, current_time
95
108
  end
96
109
 
110
+ @logger.info("Scanned #{current_time}")
111
+
97
112
  events.clear
98
113
  GC.start
99
114
  end
115
+
116
+ return exceptions
117
+ end
118
+
119
+ class GHAException < Exception
100
120
  end
101
121
  end
102
122
 
103
123
  class OnlineGHAProvider < GHAProvider
104
- def initialize(max_retries = 3, proactive = false)
124
+ def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
105
125
  super()
106
126
 
107
127
  @max_retries = max_retries
108
128
  @proactive = proactive
129
+ @proactive_pool_size = proactive_pool_size
130
+ @pool = Thread.pool(proactive_pool_size)
109
131
  @cache = Cache.new
110
132
  end
111
133
 
@@ -114,97 +136,106 @@ class OnlineGHAProvider < GHAProvider
114
136
  begin
115
137
  filename = self.get_gha_filename(current_time)
116
138
 
117
- if @cache.has?(filename)
118
- result = self.read_gha_file(@cache.get(filename))
139
+ if @proactive
140
+ @logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
141
+
142
+ while !@cache.has?(filename)
143
+ sleep 1
144
+ end
145
+
146
+ return @cache.get(filename)
119
147
  else
120
148
  URI.open("http://data.gharchive.org/#{filename}") do |gz|
121
- # Save to cache
122
149
  return self.read_gha_file(gz)
123
150
  end
124
151
  end
125
- rescue
126
- @logger.warning($!)
152
+ rescue Errno::ECONNRESET => e
153
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
154
+ next
155
+ rescue OpenURI::HTTPError => e
156
+ code = e.io.status[0]
157
+ if code.start_with?("5")
158
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
159
+ next
160
+ else
161
+ raise e
162
+ end
127
163
  end
128
164
  end
129
165
 
130
- raise DownloadArchiveException, "Exceeded maximum number of tentative downloads."
166
+ raise DownloadArchiveException, "Exceeded maximum number of tentative downloads for #{current_time}."
131
167
  end
132
168
 
133
169
  def cache(current_time)
170
+ @logger.info("Full cache. Waiting for some free slot...") if @cache.full?
171
+ while @cache.full?
172
+ sleep 1
173
+ end
134
174
  @max_retries.times do
135
175
  begin
136
176
  filename = self.get_gha_filename(current_time)
137
-
138
177
  URI.open("http://data.gharchive.org/#{filename}") do |gz|
139
- @cache.put(filename, gz.read)
178
+ content = self.read_gha_file(gz)
179
+ @cache.put(filename, content)
140
180
  return
141
181
  end
142
- rescue
182
+ rescue Errno::ECONNRESET => e
183
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
184
+ next
185
+ rescue OpenURI::HTTPError => e
186
+ code = e.io.status[0]
187
+ if code.start_with?("5")
188
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
189
+ next
190
+ else
191
+ raise e
192
+ end
143
193
  end
144
194
  end
145
195
  end
146
196
 
147
197
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
148
198
  if @proactive
149
- @logger.info("Proactive download thread started")
150
- Thread.start do
151
- self.each_date(from, to) do |current_date|
152
- self.cache(current_date)
153
- @logger.info("Proactively cached #{current_date}. Cache size: #{@cache.size}")
154
-
155
- if @cache.full?
156
- @logger.info("Full cache. Waiting...")
157
- end
158
-
159
- while @cache.full?
160
- sleep 1
161
- end
199
+ any_ready = Thread.promise
200
+
201
+ @logger.info("Proactively scheduling download tasks...")
202
+ self.each_time(from, to) do |current_time|
203
+ @pool.process(current_time) do |current_time|
204
+ cache(current_time)
205
+ any_ready << true
206
+ @logger.info("Proactively cached #{current_time}. Cache size: #{@cache.size}")
162
207
  end
163
208
  end
209
+
210
+ ~any_ready
211
+ @logger.info("Download tasks successfully scheduled!")
164
212
  end
165
213
 
166
214
  super
167
215
  end
168
216
 
169
217
  class Cache
170
- def initialize(folder = Dir.mktmpdir, max_size = 100)
218
+ def initialize(max_size = 10)
171
219
  @cache = {}
172
220
  @max_size = max_size
173
- @folder = folder
174
221
  @mutex = Mutex.new
175
222
  end
176
223
 
177
224
  def put(name, content)
178
- File.open("#@folder/#{name}", 'w') do |f|
179
- f << content
180
- end
181
-
182
225
  @mutex.synchronize do
183
- @cache[name] = value
226
+ @cache[name] = content
184
227
  end
185
228
  end
186
229
 
187
230
  def get(name)
188
231
  @mutex.synchronize do
189
- return File.read(@cache[name])
190
- end
191
- ensure
192
- self.unload(name)
193
- end
194
-
195
- def unload(name)
196
- File.unlink(@cache[name])
197
-
198
- @mutex.synchronize do
199
- @cache.delete(name)
232
+ return @cache.delete(name)
200
233
  end
201
-
202
- return true
203
234
  end
204
235
 
205
236
  def size
206
237
  @mutex.synchronize do
207
- @cache.size
238
+ return @cache.size
208
239
  end
209
240
  end
210
241
 
@@ -217,7 +248,7 @@ class OnlineGHAProvider < GHAProvider
217
248
  end
218
249
  end
219
250
 
220
- class DownloadArchiveException < Exception
251
+ class DownloadArchiveException < GHAProvider::GHAException
221
252
  end
222
253
  end
223
254
 
@@ -230,8 +261,20 @@ class FolderGHAProvider < GHAProvider
230
261
 
231
262
  def get(current_time)
232
263
  filename = self.get_gha_filename(current_time)
233
- File.open(File.join(@folder, filename), "rb") do |gz|
234
- return self.read_gha_file(gz)
264
+ complete_filename = File.join(@folder, filename)
265
+ mode = "rb"
266
+
267
+ unless FileTest.exist?(complete_filename)
268
+ complete_filename = complete_filename.sub(".gz", "")
269
+ mode = "r"
270
+ end
271
+
272
+ unless FileTest.exist?(complete_filename)
273
+ raise GHAException.new("Cannot find any file (neither `.json.gz` nor `.json`) for #{current_time}")
274
+ end
275
+
276
+ File.open(complete_filename, mode) do |file|
277
+ return self.read_gha_file(file)
235
278
  end
236
279
  end
237
280
  end
@@ -260,17 +303,17 @@ class GHADownloader
260
303
 
261
304
  def download(from = Time.gm(2015, 1, 1), to = Time.now)
262
305
  archive = []
263
- self.each_date(from, to) do |current_date|
264
- filename = self.get_gha_filename(current_date)
306
+ self.each_time(from, to) do |current_time|
307
+ filename = self.get_gha_filename(current_time)
265
308
  out_filename = filename.clone
266
309
  out_filename.gsub!(".json.gz", ".json") if @decompress
267
310
 
268
311
  target_file = File.join(@folder, out_filename)
269
312
  if FileTest.exist?(target_file)
270
- @logger.info("Skipping existing file for #{current_date}")
313
+ @logger.info("Skipping existing file for #{current_time}")
271
314
  next
272
315
  else
273
- @logger.info("Downloading file for #{current_date}")
316
+ @logger.info("Downloading file for #{current_time}")
274
317
  end
275
318
 
276
319
  File.open(target_file, 'w') do |f|
metadata CHANGED
@@ -1,35 +1,55 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gh-archive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.2'
4
+ version: '0.6'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Simone Scalabrino
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-26 00:00:00.000000000 Z
11
+ date: 2021-08-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: code-assertions
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ">="
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: 1.1.2
20
- - - "~>"
20
+ - - ">="
21
21
  - !ruby/object:Gem::Version
22
22
  version: 1.1.2
23
23
  type: :runtime
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
26
26
  requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: 1.1.2
27
30
  - - ">="
28
31
  - !ruby/object:Gem::Version
29
32
  version: 1.1.2
33
+ - !ruby/object:Gem::Dependency
34
+ name: thread
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
30
37
  - - "~>"
31
38
  - !ruby/object:Gem::Version
32
- version: 1.1.2
39
+ version: 0.2.2
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: 0.2.2
43
+ type: :runtime
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: 0.2.2
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 0.2.2
33
53
  description: Download and analyze the GitHub events stored at GitHub archive
34
54
  email: s.scalabrino9@gmail.com
35
55
  executables: []
@@ -41,7 +61,7 @@ homepage: https://github.com/intersimone999/gh-archive
41
61
  licenses:
42
62
  - GPL-3.0-only
43
63
  metadata: {}
44
- post_install_message:
64
+ post_install_message:
45
65
  rdoc_options: []
46
66
  require_paths:
47
67
  - lib
@@ -56,8 +76,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
56
76
  - !ruby/object:Gem::Version
57
77
  version: '0'
58
78
  requirements: []
59
- rubygems_version: 3.0.3
60
- signing_key:
79
+ rubygems_version: 3.2.21
80
+ signing_key:
61
81
  specification_version: 4
62
82
  summary: GitHub Archive mining utility
63
83
  test_files: []