gh-archive 0.2 → 0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/gh-archive.rb +111 -68
  3. metadata +29 -9
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c90ec7c3f14e2f57de8a145ad50c1f7730869c5aad0b1de33286accec99642c7
4
- data.tar.gz: a16c29db393499905695d411a62ca0dfb6e319d7fd509c104d41b2d0959b3414
3
+ metadata.gz: 91b0e957c5176b791d4f49e382680865405e7a6b2b29b349bcbb78b92d884e02
4
+ data.tar.gz: f8ddae3d80e80a24931d8632c9798c8f01520e1fe5ac8a85079c0d0e85eadcbc
5
5
  SHA512:
6
- metadata.gz: f6a9feafaa7d0d06f75c489b147b2fc9b5485e88a28c15f98b1081c8b1f05fffc8218253ca817b0ecf64c1050f3e63ed4e1ad2d6aec9df5a4d362cf5ed2832bc
7
- data.tar.gz: 1bfa5ab2dbccc74bd2b162066bab6874fc3a2fc08b6d2ad9ec02c05a73078ee84abe1d069eb2b51b491134b35ea4ee0724b81aa3ffbd845b9997ba3860639c05
6
+ metadata.gz: f7b24be932f58142b36887671b4265e25631345e7b81cc36b264be4a018fc0c4a88b853ae384dc8472876bf0996e904bf499007adc3091ddb511f28c828090fc
7
+ data.tar.gz: 5cbb83495b9bb397a41022cb1bf4bce0344c735d16f9f43fb181b4b109948ac7a75bd44c693338f02b1ff17eeeeb5b83a6add9deeeedbd379a8848614041a3f5
data/lib/gh-archive.rb CHANGED
@@ -3,6 +3,9 @@ require 'json'
3
3
  require 'open-uri'
4
4
  require 'zlib'
5
5
  require 'logger'
6
+ require 'tmpdir'
7
+ require 'thread/pool'
8
+ require 'thread/promise'
6
9
 
7
10
  module GHAUtils
8
11
  def get_gha_filename(date)
@@ -11,14 +14,19 @@ module GHAUtils
11
14
 
12
15
  def read_gha_file_content(gz)
13
16
  gzip = Zlib::GzipReader.new(gz)
14
- content = gzip.read
15
- gzip.close
16
-
17
- return content
17
+ return gzip.read
18
+ ensure
19
+ gzip.close if gzip
18
20
  end
19
21
 
20
- def read_gha_file(gz)
21
- content = read_gha_file_content(gz)
22
+ def read_gha_file(file)
23
+ if file.path.end_with?(".json")
24
+ content = file.read
25
+ elsif file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
26
+ content = read_gha_file_content(file)
27
+ else
28
+ raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
29
+ end
22
30
 
23
31
  result = []
24
32
  content.lines.each do |line|
@@ -28,11 +36,11 @@ module GHAUtils
28
36
  return result
29
37
  end
30
38
 
31
- def each_date(from, to)
32
- current_date = from
33
- while current_date < to
34
- yield current_date
35
- current_date += 3600
39
+ def each_time(from, to)
40
+ current_time = from
41
+ while current_time < to
42
+ yield current_time
43
+ current_time += 3600
36
44
  end
37
45
  end
38
46
  end
@@ -70,13 +78,18 @@ class GHAProvider
70
78
  end
71
79
 
72
80
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
73
- self.each_date(from, to) do |current_date|
81
+ exceptions = []
82
+
83
+ self.each_time(from, to) do |current_time|
74
84
  events = []
75
85
  begin
76
- events = self.get(current_date)
77
- @logger.info("Scanned #{current_date}")
78
- rescue
79
- @logger.error($!)
86
+ events = self.get(current_time)
87
+ rescue GHAException => e
88
+ @logger.warn(e.message)
89
+ next
90
+ rescue => e
91
+ @logger.error("An exception occurred for #{current_time}: #{e.message}")
92
+ exceptions << e
80
93
  next
81
94
  end
82
95
 
@@ -91,21 +104,30 @@ class GHAProvider
91
104
  end
92
105
  next if skip
93
106
 
94
- yield event
107
+ yield event, current_time
95
108
  end
96
109
 
110
+ @logger.info("Scanned #{current_time}")
111
+
97
112
  events.clear
98
113
  GC.start
99
114
  end
115
+
116
+ return exceptions
117
+ end
118
+
119
+ class GHAException < Exception
100
120
  end
101
121
  end
102
122
 
103
123
  class OnlineGHAProvider < GHAProvider
104
- def initialize(max_retries = 3, proactive = false)
124
+ def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
105
125
  super()
106
126
 
107
127
  @max_retries = max_retries
108
128
  @proactive = proactive
129
+ @proactive_pool_size = proactive_pool_size
130
+ @pool = Thread.pool(proactive_pool_size)
109
131
  @cache = Cache.new
110
132
  end
111
133
 
@@ -114,97 +136,106 @@ class OnlineGHAProvider < GHAProvider
114
136
  begin
115
137
  filename = self.get_gha_filename(current_time)
116
138
 
117
- if @cache.has?(filename)
118
- result = self.read_gha_file(@cache.get(filename))
139
+ if @proactive
140
+ @logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
141
+
142
+ while !@cache.has?(filename)
143
+ sleep 1
144
+ end
145
+
146
+ return @cache.get(filename)
119
147
  else
120
148
  URI.open("http://data.gharchive.org/#{filename}") do |gz|
121
- # Save to cache
122
149
  return self.read_gha_file(gz)
123
150
  end
124
151
  end
125
- rescue
126
- @logger.warning($!)
152
+ rescue Errno::ECONNRESET => e
153
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
154
+ next
155
+ rescue OpenURI::HTTPError => e
156
+ code = e.io.status[0]
157
+ if code.start_with?("5")
158
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
159
+ next
160
+ else
161
+ raise e
162
+ end
127
163
  end
128
164
  end
129
165
 
130
- raise DownloadArchiveException, "Exceeded maximum number of tentative downloads."
166
+ raise DownloadArchiveException, "Exceeded maximum number of tentative downloads for #{current_time}."
131
167
  end
132
168
 
133
169
  def cache(current_time)
170
+ @logger.info("Full cache. Waiting for some free slot...") if @cache.full?
171
+ while @cache.full?
172
+ sleep 1
173
+ end
134
174
  @max_retries.times do
135
175
  begin
136
176
  filename = self.get_gha_filename(current_time)
137
-
138
177
  URI.open("http://data.gharchive.org/#{filename}") do |gz|
139
- @cache.put(filename, gz.read)
178
+ content = self.read_gha_file(gz)
179
+ @cache.put(filename, content)
140
180
  return
141
181
  end
142
- rescue
182
+ rescue Errno::ECONNRESET => e
183
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
184
+ next
185
+ rescue OpenURI::HTTPError => e
186
+ code = e.io.status[0]
187
+ if code.start_with?("5")
188
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
189
+ next
190
+ else
191
+ raise e
192
+ end
143
193
  end
144
194
  end
145
195
  end
146
196
 
147
197
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
148
198
  if @proactive
149
- @logger.info("Proactive download thread started")
150
- Thread.start do
151
- self.each_date(from, to) do |current_date|
152
- self.cache(current_date)
153
- @logger.info("Proactively cached #{current_date}. Cache size: #{@cache.size}")
154
-
155
- if @cache.full?
156
- @logger.info("Full cache. Waiting...")
157
- end
158
-
159
- while @cache.full?
160
- sleep 1
161
- end
199
+ any_ready = Thread.promise
200
+
201
+ @logger.info("Proactively scheduling download tasks...")
202
+ self.each_time(from, to) do |current_time|
203
+ @pool.process(current_time) do |current_time|
204
+ cache(current_time)
205
+ any_ready << true
206
+ @logger.info("Proactively cached #{current_time}. Cache size: #{@cache.size}")
162
207
  end
163
208
  end
209
+
210
+ ~any_ready
211
+ @logger.info("Download tasks successfully scheduled!")
164
212
  end
165
213
 
166
214
  super
167
215
  end
168
216
 
169
217
  class Cache
170
- def initialize(folder = Dir.mktmpdir, max_size = 100)
218
+ def initialize(max_size = 10)
171
219
  @cache = {}
172
220
  @max_size = max_size
173
- @folder = folder
174
221
  @mutex = Mutex.new
175
222
  end
176
223
 
177
224
  def put(name, content)
178
- File.open("#@folder/#{name}", 'w') do |f|
179
- f << content
180
- end
181
-
182
225
  @mutex.synchronize do
183
- @cache[name] = value
226
+ @cache[name] = content
184
227
  end
185
228
  end
186
229
 
187
230
  def get(name)
188
231
  @mutex.synchronize do
189
- return File.read(@cache[name])
190
- end
191
- ensure
192
- self.unload(name)
193
- end
194
-
195
- def unload(name)
196
- File.unlink(@cache[name])
197
-
198
- @mutex.synchronize do
199
- @cache.delete(name)
232
+ return @cache.delete(name)
200
233
  end
201
-
202
- return true
203
234
  end
204
235
 
205
236
  def size
206
237
  @mutex.synchronize do
207
- @cache.size
238
+ return @cache.size
208
239
  end
209
240
  end
210
241
 
@@ -217,7 +248,7 @@ class OnlineGHAProvider < GHAProvider
217
248
  end
218
249
  end
219
250
 
220
- class DownloadArchiveException < Exception
251
+ class DownloadArchiveException < GHAProvider::GHAException
221
252
  end
222
253
  end
223
254
 
@@ -230,8 +261,20 @@ class FolderGHAProvider < GHAProvider
230
261
 
231
262
  def get(current_time)
232
263
  filename = self.get_gha_filename(current_time)
233
- File.open(File.join(@folder, filename), "rb") do |gz|
234
- return self.read_gha_file(gz)
264
+ complete_filename = File.join(@folder, filename)
265
+ mode = "rb"
266
+
267
+ unless FileTest.exist?(complete_filename)
268
+ complete_filename = complete_filename.sub(".gz", "")
269
+ mode = "r"
270
+ end
271
+
272
+ unless FileTest.exist?(complete_filename)
273
+ raise GHAException.new("Cannot find any file (neither `.json.gz` nor `.json`) for #{current_time}")
274
+ end
275
+
276
+ File.open(complete_filename, mode) do |file|
277
+ return self.read_gha_file(file)
235
278
  end
236
279
  end
237
280
  end
@@ -260,17 +303,17 @@ class GHADownloader
260
303
 
261
304
  def download(from = Time.gm(2015, 1, 1), to = Time.now)
262
305
  archive = []
263
- self.each_date(from, to) do |current_date|
264
- filename = self.get_gha_filename(current_date)
306
+ self.each_time(from, to) do |current_time|
307
+ filename = self.get_gha_filename(current_time)
265
308
  out_filename = filename.clone
266
309
  out_filename.gsub!(".json.gz", ".json") if @decompress
267
310
 
268
311
  target_file = File.join(@folder, out_filename)
269
312
  if FileTest.exist?(target_file)
270
- @logger.info("Skipping existing file for #{current_date}")
313
+ @logger.info("Skipping existing file for #{current_time}")
271
314
  next
272
315
  else
273
- @logger.info("Downloading file for #{current_date}")
316
+ @logger.info("Downloading file for #{current_time}")
274
317
  end
275
318
 
276
319
  File.open(target_file, 'w') do |f|
metadata CHANGED
@@ -1,35 +1,55 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gh-archive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.2'
4
+ version: '0.6'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Simone Scalabrino
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-26 00:00:00.000000000 Z
11
+ date: 2021-08-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: code-assertions
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ">="
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: 1.1.2
20
- - - "~>"
20
+ - - ">="
21
21
  - !ruby/object:Gem::Version
22
22
  version: 1.1.2
23
23
  type: :runtime
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
26
26
  requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: 1.1.2
27
30
  - - ">="
28
31
  - !ruby/object:Gem::Version
29
32
  version: 1.1.2
33
+ - !ruby/object:Gem::Dependency
34
+ name: thread
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
30
37
  - - "~>"
31
38
  - !ruby/object:Gem::Version
32
- version: 1.1.2
39
+ version: 0.2.2
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: 0.2.2
43
+ type: :runtime
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: 0.2.2
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 0.2.2
33
53
  description: Download and analyze the GitHub events stored at GitHub archive
34
54
  email: s.scalabrino9@gmail.com
35
55
  executables: []
@@ -41,7 +61,7 @@ homepage: https://github.com/intersimone999/gh-archive
41
61
  licenses:
42
62
  - GPL-3.0-only
43
63
  metadata: {}
44
- post_install_message:
64
+ post_install_message:
45
65
  rdoc_options: []
46
66
  require_paths:
47
67
  - lib
@@ -56,8 +76,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
56
76
  - !ruby/object:Gem::Version
57
77
  version: '0'
58
78
  requirements: []
59
- rubygems_version: 3.0.3
60
- signing_key:
79
+ rubygems_version: 3.2.21
80
+ signing_key:
61
81
  specification_version: 4
62
82
  summary: GitHub Archive mining utility
63
83
  test_files: []