gh-archive 0.13 → 0.17

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 74e3037ee1115173176aa974f453f49b7649743f41f83118e9ee180bd620c095
4
- data.tar.gz: c4c1ca30210ba39204b28b4b3854e2e93c69003ef1ead0f85506f2fe213f0ee9
3
+ metadata.gz: c17a920e176289a00fb21fff7b712dc634c4491241ebe11a3b06f0ddd112706d
4
+ data.tar.gz: 58e7e4fdc6442d4a0955bc70d0fb10eec05b437350c255e34731021b2d714deb
5
5
  SHA512:
6
- metadata.gz: 289b568dce07aa1f0182c75d26f7ab286a2b48dbf31c9ee63c6a1ef77bf5a59823b17136984380d6cca123c0be70e6cca3cc3dba216a935354d9312eb93aa2fa
7
- data.tar.gz: 5d9ef4ec34a106e3fb2db37ab173c0b0876637941263f356f2e43440c506ad0471906cd4fa0fbd12bf98c84e44ee2bae0408a96ccdef0a5946c13487e5441204
6
+ metadata.gz: b9066428d42acd77376fe72082adc62517c21bf8154b60b0006d7f0ab61fd0679af5775444c180d5cc8e842c99b0c207266e25ea28d4f0cdef33e49259339bb3
7
+ data.tar.gz: 036eaa0ead55db627ee8bb4f4a7421a525ab0ae73041045ef6d5b13307b7ed54d36e70532658ec3853cac2f21a7e5d2e80313c2fb688723ea87cd689ea469c9c
@@ -0,0 +1,112 @@
1
+ require 'code-assertions'
2
+ require 'json'
3
+ require 'open-uri'
4
+ require 'zlib'
5
+ require 'logger'
6
+ require 'tmpdir'
7
+ require 'thread/pool'
8
+ require 'thread/promise'
9
+
10
+ module GHArchive
11
+ class ThreadPool
12
+ def initialize(size)
13
+ @size = size
14
+ @threads = []
15
+ @queue = []
16
+ @mutex = Mutex.new
17
+
18
+ @consumer_thread = Thread.start do
19
+ while !@shutdown || @threads.size > 0 || @queue.size > 0
20
+ sleep 0.1 if @queue.size == 0 || @threads.size == @size
21
+ @threads.delete_if { |t| !t.alive? }
22
+
23
+ if @threads.size < @size && @queue.size > 0
24
+ @mutex.synchronize do
25
+ args, job = @queue.shift
26
+ @threads << Thread.start(*args, &job)
27
+ end
28
+ end
29
+ end
30
+ end
31
+ end
32
+
33
+ def process(*args, &block)
34
+ raise "Block expected" unless block_given?
35
+ raise "Can not add jobs while shutting down" if @shutdown
36
+
37
+ @mutex.synchronize do
38
+ @queue << [args, block]
39
+ end
40
+
41
+ return self.enqueued
42
+ end
43
+
44
+ def shutdown
45
+ @shutdown = true
46
+ end
47
+
48
+ def shutdown!
49
+ self.shutdown
50
+ @mutex.synchronize do
51
+ @queue.clear
52
+ end
53
+ end
54
+
55
+ def enqueued
56
+ return @queue.size
57
+ end
58
+
59
+ def shutdown?
60
+ @shutdown
61
+ end
62
+
63
+ def alive?
64
+ @consumer_thread.alive?
65
+ end
66
+
67
+ def wait
68
+ while alive?
69
+ sleep 0.1
70
+ end
71
+ end
72
+ end
73
+
74
+ module Utils
75
+ def get_gha_filename(date)
76
+ return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
77
+ end
78
+
79
+ def read_gha_file_content(gz)
80
+ gzip = Zlib::GzipReader.new(gz)
81
+ return gzip.read
82
+ ensure
83
+ gzip.close if gzip
84
+ end
85
+
86
+ def read_gha_file(file)
87
+
88
+ if !file.is_a?(StringIO) && file.path.end_with?(".json")
89
+ content = file.read
90
+ elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
91
+ content = read_gha_file_content(file)
92
+ else
93
+ raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
94
+ end
95
+
96
+ result = []
97
+ content.lines.each do |line|
98
+ result << JSON.parse(line)
99
+ end
100
+
101
+ return result
102
+ end
103
+
104
+ def each_time(from, to)
105
+ current_time = from
106
+ while current_time < to
107
+ yield current_time
108
+ current_time += 3600
109
+ end
110
+ end
111
+ end
112
+ end
@@ -0,0 +1,62 @@
1
+ require_relative 'core'
2
+
3
+ module GHArchive
4
+ class Downloader
5
+ include Utils
6
+
7
+ def initialize(folder, decompress = false)
8
+ @logger = Logger.new(STDERR)
9
+ @decompress = decompress
10
+ @folder = folder
11
+ @max = nil
12
+
13
+ Dir.mkdir(@folder) unless FileTest.exist?(@folder)
14
+ raise "A file exist with the desired folder name #{folder}" unless FileTest.directory?(@folder)
15
+ end
16
+
17
+ def max(max)
18
+ @max = max
19
+ return self
20
+ end
21
+
22
+ def logger=(logger)
23
+ @logger = logger
24
+ end
25
+
26
+ def download(from = Time.gm(2015, 1, 1), to = Time.now)
27
+ archive = []
28
+ self.each_time(from, to) do |current_time|
29
+ filename = self.get_gha_filename(current_time)
30
+ out_filename = filename.clone
31
+ out_filename.gsub!(".json.gz", ".json") if @decompress
32
+
33
+ target_file = File.join(@folder, out_filename)
34
+ if FileTest.exist?(target_file)
35
+ @logger.info("Skipping existing file for #{current_time}")
36
+ next
37
+ else
38
+ @logger.info("Downloading file for #{current_time}")
39
+ end
40
+
41
+ File.open(target_file, 'w') do |f|
42
+ URI.open("http://data.gharchive.org/#{filename}") do |gz|
43
+ if @decompress
44
+ f << self.read_gha_file_content(gz)
45
+ else
46
+ f << gz.read
47
+ end
48
+ end
49
+ end
50
+ archive << target_file
51
+
52
+ if @max && archive.size > @max
53
+ last = archive.shift
54
+ @logger.info("Removing local file #{last}")
55
+ File.unlink(last)
56
+ end
57
+
58
+ yield filename if block_given?
59
+ end
60
+ end
61
+ end
62
+ end
@@ -1,4 +1,5 @@
1
1
  require 'time'
2
+ require_relative 'core'
2
3
 
3
4
  module GHArchive
4
5
  Repository = Struct.new(:id, :name, :url)
@@ -1,5 +1,5 @@
1
1
  require 'time'
2
- require_relative File.expand_path('../entities', __FILE__)
2
+ require_relative 'entities'
3
3
 
4
4
  module GHArchive
5
5
  class Event
@@ -0,0 +1,22 @@
1
+ require 'core'
2
+ require 'providers'
3
+
4
+ module GHArchive
5
+ class Job
6
+ def initialize(provider, from, to)
7
+ @provider = provider
8
+ @from = from
9
+ @to = to
10
+ end
11
+
12
+ def start
13
+ @provider.each(@from, @to) do |event, time|
14
+ run(event, time)
15
+ end
16
+ end
17
+
18
+ def run(event, time)
19
+ raise GHAException, "This is an abstract job, it should be implemented before running"
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,31 @@
1
+ require_relative 'core'
2
+
3
+ GHAUtils = GHArchive::Utils
4
+
5
+ class GHAProvider < GHArchive::Provider
6
+ def initialize(*args)
7
+ warn "GHAProvider is deprecated. Please use GHArchive::Provider instead."
8
+ super
9
+ end
10
+ end
11
+
12
+ class OnlineGHAProvider < GHArchive::OnlineProvider
13
+ def initialize(*args)
14
+ warn "OnlineGHAProvider is deprecated. Please use GHArchive::OnlineProvider instead."
15
+ super
16
+ end
17
+ end
18
+
19
+ class FolderGHAProvider < GHArchive::FolderProvider
20
+ def initialize(*args)
21
+ warn "FolderGHAProvider is deprecated. Please use GHArchive::FolderProvider instead."
22
+ super
23
+ end
24
+ end
25
+
26
+ class GHADownloader < GHArchive::Downloader
27
+ def initialize(*args)
28
+ warn "FolderGHAProvider is deprecated. Please use GHArchive::FolderProvider instead."
29
+ super
30
+ end
31
+ end
@@ -0,0 +1,326 @@
1
+ require_relative 'core'
2
+
3
+ module GHArchive
4
+ class Provider
5
+ include Utils
6
+
7
+ def initialize
8
+ @logger = Logger.new(STDOUT)
9
+
10
+ @includes = {}
11
+ @excludes = {}
12
+
13
+ @checkpoint_name = nil
14
+ @use_json = true
15
+ end
16
+
17
+ def use_checkpoint(filename)
18
+ @checkpoint_name = filename
19
+
20
+ return self
21
+ end
22
+
23
+ def parse_events
24
+ @use_json = false
25
+
26
+ return self
27
+ end
28
+
29
+ def logger=(logger)
30
+ @logger = logger
31
+
32
+ return self
33
+ end
34
+ alias :use_logger :logger=
35
+
36
+ def get(date)
37
+ raise "Not implemented"
38
+ end
39
+
40
+ def include(**args)
41
+ args.each do |key, value|
42
+ @includes[key.to_s] = [] unless @includes[key.to_s]
43
+ @includes[key.to_s] << value
44
+ end
45
+
46
+ return self
47
+ end
48
+
49
+ def exclude(**args)
50
+ args.each do |key, value|
51
+ @excludes[key.to_s] = [] unless @excludes[key.to_s]
52
+ @excludes[key.to_s] << value
53
+ end
54
+
55
+ return self
56
+ end
57
+
58
+ def restore_checkpoint(from)
59
+ if @checkpoint_name && FileTest.exist?(@checkpoint_name)
60
+ # Note that this throws an exception if the file is not readable. This is the intended behavior.
61
+ # As opposed to that, failing to save the checkpoint information just results in a warning on the log.
62
+ loaded_from = Marshal.load(File.read(@checkpoint_name))
63
+ raise "The loaded checkpoint (#{loaded_from}) occurs before the current from date (#{from})" if loaded_from < from
64
+
65
+ @logger.info("Valid checkpoint loaded. Restored execution from #{loaded_from}.")
66
+
67
+ return loaded_from
68
+ else
69
+ return from
70
+ end
71
+ end
72
+
73
+ def update_checkpoint(current_time)
74
+ if @checkpoint_name
75
+ begin
76
+ File.open(@checkpoint_name, "wb") do |f|
77
+ f.write(Marshal.dump(current_time))
78
+ end
79
+ rescue
80
+ @logger.warn(
81
+ "Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
82
+ )
83
+ end
84
+ end
85
+ end
86
+
87
+ def each(from = Time.gm(2015, 1, 1), to = Time.now)
88
+ exceptions = []
89
+
90
+ from = restore_checkpoint(from)
91
+
92
+ self.each_time(from, to) do |current_time|
93
+ events = []
94
+
95
+ update_checkpoint(current_time)
96
+
97
+ begin
98
+ events = self.get(current_time)
99
+ rescue GHAException => e
100
+ @logger.warn(e.message)
101
+ next
102
+ rescue => e
103
+ @logger.error("An exception occurred for #{current_time}: #{e.message}")
104
+ exceptions << e
105
+ next
106
+ end
107
+
108
+ events.each do |event|
109
+ skip = false
110
+ @includes.each do |key, value|
111
+ skip = true unless value.include?(event[key])
112
+ end
113
+
114
+ @excludes.each do |key, value|
115
+ skip = true if value.include?(event[key])
116
+ end
117
+ next if skip
118
+
119
+ if @use_json
120
+ yield event, current_time
121
+ else
122
+ yield GHArchive::Event.parse(event), current_time
123
+ end
124
+ end
125
+
126
+ @logger.info("Scanned #{current_time}")
127
+
128
+ events.clear
129
+ GC.start
130
+ end
131
+
132
+ update_checkpoint(to)
133
+
134
+ return exceptions
135
+ end
136
+
137
+ class GHAException < Exception
138
+ end
139
+ end
140
+
141
+ class OnlineProvider < Provider
142
+ def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
143
+ super()
144
+
145
+ self.max_retries(max_retries)
146
+ self.proactive(proactive_pool_size) if proactive
147
+
148
+ @cache = Cache.new
149
+ end
150
+
151
+ def max_retries(n)
152
+ @max_retries = n
153
+
154
+ return self
155
+ end
156
+
157
+ def proactive(pool_size = 10)
158
+ @proactive = true
159
+ @pool = GHArchive::ThreadPool.new(pool_size)
160
+
161
+ return self
162
+ end
163
+
164
+ def get(current_time)
165
+ @max_retries.times do
166
+ begin
167
+ filename = self.get_gha_filename(current_time)
168
+
169
+ if @proactive
170
+ @logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
171
+
172
+ while !@cache.has?(filename)
173
+ sleep 1
174
+ end
175
+
176
+ data = @cache.get(filename)
177
+ if data
178
+ return data
179
+ else
180
+ raise DownloadArchiveException, "Could not scan #{filename}: data unavailable."
181
+ end
182
+ else
183
+ URI.open("http://data.gharchive.org/#{filename}") do |gz|
184
+ return self.read_gha_file(gz)
185
+ end
186
+ end
187
+ rescue Errno::ECONNRESET => e
188
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
189
+ next
190
+ rescue OpenURI::HTTPError => e
191
+ code = e.io.status[0]
192
+ if code.start_with?("5")
193
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
194
+ next
195
+ else
196
+ raise e
197
+ end
198
+ end
199
+ end
200
+
201
+ raise DownloadArchiveException, "Exceeded maximum number of tentative downloads for #{current_time}."
202
+ end
203
+
204
+ def cache(current_time)
205
+ @logger.info("Full cache. Waiting for some free slot...") if @cache.full?
206
+ while @cache.full?
207
+ sleep 1
208
+ end
209
+
210
+ filename = self.get_gha_filename(current_time)
211
+ @max_retries.times do
212
+ begin
213
+ URI.open("http://data.gharchive.org/#{filename}") do |gz|
214
+ content = self.read_gha_file(gz)
215
+ @cache.put(filename, content)
216
+ return
217
+ end
218
+ rescue Errno::ECONNRESET => e
219
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
220
+ next
221
+ rescue OpenURI::HTTPError => e
222
+ code = e.io.status[0]
223
+ if code.start_with?("5")
224
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
225
+ next
226
+ elsif code == "404"
227
+ @logger.error("File for #{current_time} not found. Skipping because: " + e.message)
228
+ else
229
+ raise e
230
+ end
231
+ rescue Zlib::GzipFile::Error => e
232
+ @logger.warn("Could not unzip, cache and analyze the zip at #{current_time}: " + e.message)
233
+ end
234
+ end
235
+
236
+ @cache.put(filename, nil) unless @cache.has?(filename)
237
+ end
238
+
239
+ def each(from = Time.gm(2015, 1, 1), to = Time.now)
240
+ if @proactive
241
+ real_from = restore_checkpoint(from)
242
+ any_ready = Thread.promise
243
+
244
+ @logger.info("Proactively scheduling download tasks...")
245
+ self.each_time(real_from, to) do |current_time|
246
+ @pool.process(current_time) do |current_time|
247
+ cache(current_time)
248
+ any_ready << true
249
+ @logger.info("Proactively cached #{current_time}. Cache size: #{@cache.size}")
250
+ end
251
+ end
252
+
253
+ ~any_ready
254
+ @logger.info("Download tasks successfully scheduled!")
255
+ end
256
+
257
+ super
258
+ end
259
+
260
+ class Cache
261
+ def initialize(max_size = 10)
262
+ @cache = {}
263
+ @max_size = max_size
264
+ @mutex = Mutex.new
265
+ end
266
+
267
+ def put(name, content)
268
+ @mutex.synchronize do
269
+ @cache[name] = content
270
+ end
271
+ end
272
+
273
+ def get(name)
274
+ @mutex.synchronize do
275
+ return @cache.delete(name)
276
+ end
277
+ end
278
+
279
+ def size
280
+ @mutex.synchronize do
281
+ return @cache.size
282
+ end
283
+ end
284
+
285
+ def has?(name)
286
+ @mutex.synchronize do
287
+ return @cache.has_key?(name)
288
+ end
289
+ end
290
+
291
+ def full?
292
+ self.size >= @max_size
293
+ end
294
+ end
295
+
296
+ class DownloadArchiveException < Provider::GHAException
297
+ end
298
+ end
299
+
300
+ class FolderProvider < Provider
301
+ def initialize(folder)
302
+ super()
303
+
304
+ @folder = folder
305
+ end
306
+
307
+ def get(current_time)
308
+ filename = self.get_gha_filename(current_time)
309
+ complete_filename = File.join(@folder, filename)
310
+ mode = "rb"
311
+
312
+ unless FileTest.exist?(complete_filename)
313
+ complete_filename = complete_filename.sub(".gz", "")
314
+ mode = "r"
315
+ end
316
+
317
+ unless FileTest.exist?(complete_filename)
318
+ raise GHAException.new("Cannot find any file (neither `.json.gz` nor `.json`) for #{current_time}")
319
+ end
320
+
321
+ File.open(complete_filename, mode) do |file|
322
+ return self.read_gha_file(file)
323
+ end
324
+ end
325
+ end
326
+ end
data/lib/gh-archive.rb CHANGED
@@ -1,484 +1,6 @@
1
- require 'code-assertions'
2
- require 'json'
3
- require 'open-uri'
4
- require 'zlib'
5
- require 'logger'
6
- require 'tmpdir'
7
- require 'thread/pool'
8
- require 'thread/promise'
9
-
10
- require_relative File.expand_path('../gh-archive/events', __FILE__)
11
-
12
- module GHArchive
13
- class ThreadPool
14
- def initialize(size)
15
- @size = size
16
- @threads = []
17
- @queue = []
18
- @mutex = Mutex.new
19
-
20
- @consumer_thread = Thread.start do
21
- while !@shutdown || @threads.size > 0 || @queue.size > 0
22
- sleep 0.1 if @queue.size == 0 || @threads.size == @size
23
- @threads.delete_if { |t| !t.alive? }
24
-
25
- if @threads.size < @size && @queue.size > 0
26
- @mutex.synchronize do
27
- args, job = @queue.shift
28
- @threads << Thread.start(*args, &job)
29
- end
30
- end
31
- end
32
- end
33
- end
34
-
35
- def process(*args, &block)
36
- raise "Block expected" unless block_given?
37
- raise "Can not add jobs while shutting down" if @shutdown
38
-
39
- @mutex.synchronize do
40
- @queue << [args, block]
41
- end
42
-
43
- return self.enqueued
44
- end
45
-
46
- def shutdown
47
- @shutdown = true
48
- end
49
-
50
- def shutdown!
51
- self.shutdown
52
- @mutex.synchronize do
53
- @queue.clear
54
- end
55
- end
56
-
57
- def enqueued
58
- return @queue.size
59
- end
60
-
61
- def shutdown?
62
- @shutdown
63
- end
64
-
65
- def alive?
66
- @consumer_thread.alive?
67
- end
68
-
69
- def wait
70
- while alive?
71
- sleep 0.1
72
- end
73
- end
74
- end
75
- end
76
-
77
- module GHAUtils
78
- def get_gha_filename(date)
79
- return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
80
- end
81
-
82
- def read_gha_file_content(gz)
83
- gzip = Zlib::GzipReader.new(gz)
84
- return gzip.read
85
- ensure
86
- gzip.close if gzip
87
- end
88
-
89
- def read_gha_file(file)
90
-
91
- if !file.is_a?(StringIO) && file.path.end_with?(".json")
92
- content = file.read
93
- elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
94
- content = read_gha_file_content(file)
95
- else
96
- raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
97
- end
98
-
99
- result = []
100
- content.lines.each do |line|
101
- result << JSON.parse(line)
102
- end
103
-
104
- return result
105
- end
106
-
107
- def each_time(from, to)
108
- current_time = from
109
- while current_time < to
110
- yield current_time
111
- current_time += 3600
112
- end
113
- end
114
- end
115
-
116
- class GHAProvider
117
- include GHAUtils
118
-
119
- def initialize
120
- @logger = Logger.new(STDOUT)
121
-
122
- @includes = {}
123
- @excludes = {}
124
-
125
- @checkpoint_name = nil
126
- @use_json = true
127
- end
128
-
129
- def use_checkpoint(filename)
130
- @checkpoint_name = filename
131
-
132
- return self
133
- end
134
-
135
- def parse_events
136
- @use_json = false
137
-
138
- return self
139
- end
140
-
141
- def logger=(logger)
142
- @logger = logger
143
-
144
- return self
145
- end
146
- alias :use_logger :logger=
147
-
148
- def get(date)
149
- raise "Not implemented"
150
- end
151
-
152
- def include(**args)
153
- args.each do |key, value|
154
- @includes[key.to_s] = [] unless @includes[key.to_s]
155
- @includes[key.to_s] << value
156
- end
157
-
158
- return self
159
- end
160
-
161
- def exclude(**args)
162
- args.each do |key, value|
163
- @excludes[key.to_s] = [] unless @excludes[key.to_s]
164
- @excludes[key.to_s] << value
165
- end
166
-
167
- return self
168
- end
169
-
170
- def restore_checkpoint(from)
171
- if @checkpoint_name && FileTest.exist?(@checkpoint_name)
172
- # Note that this throws an exception if the file is not readable. This is the intended behavior.
173
- # As opposed to that, failing to save the checkpoint information just results in a warning on the log.
174
- loaded_from = Marshal.load(File.read(@checkpoint_name))
175
- raise "The loaded checkpoint (#{loaded_from}) occurs before the current from date (#{from})" if loaded_from < from
176
-
177
- @logger.info("Valid checkpoint loaded. Restored execution from #{loaded_from}.")
178
-
179
- return loaded_from
180
- else
181
- return from
182
- end
183
- end
184
-
185
- def update_checkpoint(current_time)
186
- if @checkpoint_name
187
- begin
188
- File.open(@checkpoint_name, "wb") do |f|
189
- f.write(Marshal.dump(current_time))
190
- end
191
- rescue
192
- @logger.warn(
193
- "Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
194
- )
195
- end
196
- end
197
- end
198
-
199
- def each(from = Time.gm(2015, 1, 1), to = Time.now)
200
- exceptions = []
201
-
202
- from = restore_checkpoint(from)
203
-
204
- self.each_time(from, to) do |current_time|
205
- events = []
206
-
207
- update_checkpoint(current_time)
208
-
209
- begin
210
- events = self.get(current_time)
211
- rescue GHAException => e
212
- @logger.warn(e.message)
213
- next
214
- rescue => e
215
- @logger.error("An exception occurred for #{current_time}: #{e.message}")
216
- exceptions << e
217
- next
218
- end
219
-
220
- events.each do |event|
221
- skip = false
222
- @includes.each do |key, value|
223
- skip = true unless value.include?(event[key])
224
- end
225
-
226
- @excludes.each do |key, value|
227
- skip = true if value.include?(event[key])
228
- end
229
- next if skip
230
-
231
- if @use_json
232
- yield event, current_time
233
- else
234
- yield GHArchive::Event.parse(event), current_time
235
- end
236
- end
237
-
238
- @logger.info("Scanned #{current_time}")
239
-
240
- events.clear
241
- GC.start
242
- end
243
-
244
- update_checkpoint(to)
245
-
246
- return exceptions
247
- end
248
-
249
- class GHAException < Exception
250
- end
251
- end
252
-
253
- class OnlineGHAProvider < GHAProvider
254
- def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
255
- super()
256
-
257
- self.max_retries(max_retries)
258
- self.proactive(proactive_pool_size) if proactive
259
-
260
- @cache = Cache.new
261
- end
262
-
263
- def max_retries(n)
264
- @max_retries = n
265
-
266
- return self
267
- end
268
-
269
- def proactive(pool_size = 10)
270
- @proactive = true
271
- @pool = GHArchive::ThreadPool.new(pool_size)
272
-
273
- return self
274
- end
275
-
276
- def get(current_time)
277
- @max_retries.times do
278
- begin
279
- filename = self.get_gha_filename(current_time)
280
-
281
- if @proactive
282
- @logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
283
-
284
- while !@cache.has?(filename)
285
- sleep 1
286
- end
287
-
288
- return @cache.get(filename)
289
- else
290
- URI.open("http://data.gharchive.org/#{filename}") do |gz|
291
- return self.read_gha_file(gz)
292
- end
293
- end
294
- rescue Errno::ECONNRESET => e
295
- @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
296
- next
297
- rescue OpenURI::HTTPError => e
298
- code = e.io.status[0]
299
- if code.start_with?("5")
300
- @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
301
- next
302
- else
303
- raise e
304
- end
305
- end
306
- end
307
-
308
- raise DownloadArchiveException, "Exceeded maximum number of tentative downloads for #{current_time}."
309
- end
310
-
311
- def cache(current_time)
312
- @logger.info("Full cache. Waiting for some free slot...") if @cache.full?
313
- while @cache.full?
314
- sleep 1
315
- end
316
- @max_retries.times do
317
- begin
318
- filename = self.get_gha_filename(current_time)
319
- URI.open("http://data.gharchive.org/#{filename}") do |gz|
320
- content = self.read_gha_file(gz)
321
- @cache.put(filename, content)
322
- return
323
- end
324
- rescue Errno::ECONNRESET => e
325
- @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
326
- next
327
- rescue OpenURI::HTTPError => e
328
- code = e.io.status[0]
329
- if code.start_with?("5")
330
- @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
331
- next
332
- else
333
- raise e
334
- end
335
- rescue Zlib::GzipFile::Error => e
336
- @logger.warn("Could not unzip, cache and analyze the zip at #{current_time}: " + e.message)
337
- end
338
- end
339
- end
340
-
341
- def each(from = Time.gm(2015, 1, 1), to = Time.now)
342
- if @proactive
343
- real_from = restore_checkpoint(from)
344
- any_ready = Thread.promise
345
-
346
- @logger.info("Proactively scheduling download tasks...")
347
- self.each_time(real_from, to) do |current_time|
348
- @pool.process(current_time) do |current_time|
349
- cache(current_time)
350
- any_ready << true
351
- @logger.info("Proactively cached #{current_time}. Cache size: #{@cache.size}")
352
- end
353
- end
354
-
355
- ~any_ready
356
- @logger.info("Download tasks successfully scheduled!")
357
- end
358
-
359
- super
360
- end
361
-
362
- class Cache
363
- def initialize(max_size = 10)
364
- @cache = {}
365
- @max_size = max_size
366
- @mutex = Mutex.new
367
- end
368
-
369
- def put(name, content)
370
- @mutex.synchronize do
371
- @cache[name] = content
372
- end
373
- end
374
-
375
- def get(name)
376
- @mutex.synchronize do
377
- return @cache.delete(name)
378
- end
379
- end
380
-
381
- def size
382
- @mutex.synchronize do
383
- return @cache.size
384
- end
385
- end
386
-
387
- def has?(name)
388
- return @cache.has_key?(name)
389
- end
390
-
391
- def full?
392
- self.size >= @max_size
393
- end
394
- end
395
-
396
- class DownloadArchiveException < GHAProvider::GHAException
397
- end
398
- end
399
-
400
- class FolderGHAProvider < GHAProvider
401
- def initialize(folder)
402
- super()
403
-
404
- @folder = folder
405
- end
406
-
407
- def get(current_time)
408
- filename = self.get_gha_filename(current_time)
409
- complete_filename = File.join(@folder, filename)
410
- mode = "rb"
411
-
412
- unless FileTest.exist?(complete_filename)
413
- complete_filename = complete_filename.sub(".gz", "")
414
- mode = "r"
415
- end
416
-
417
- unless FileTest.exist?(complete_filename)
418
- raise GHAException.new("Cannot find any file (neither `.json.gz` nor `.json`) for #{current_time}")
419
- end
420
-
421
- File.open(complete_filename, mode) do |file|
422
- return self.read_gha_file(file)
423
- end
424
- end
425
- end
426
-
427
- class GHADownloader
428
- include GHAUtils
429
-
430
- def initialize(folder, decompress = false)
431
- @logger = Logger.new(STDERR)
432
- @decompress = decompress
433
- @folder = folder
434
- @max = nil
435
-
436
- Dir.mkdir(@folder) unless FileTest.exist?(@folder)
437
- raise "A file exist with the desired folder name #{folder}" unless FileTest.directory?(@folder)
438
- end
439
-
440
- def max(max)
441
- @max = max
442
- return self
443
- end
444
-
445
- def logger=(logger)
446
- @logger = logger
447
- end
448
-
449
- def download(from = Time.gm(2015, 1, 1), to = Time.now)
450
- archive = []
451
- self.each_time(from, to) do |current_time|
452
- filename = self.get_gha_filename(current_time)
453
- out_filename = filename.clone
454
- out_filename.gsub!(".json.gz", ".json") if @decompress
455
-
456
- target_file = File.join(@folder, out_filename)
457
- if FileTest.exist?(target_file)
458
- @logger.info("Skipping existing file for #{current_time}")
459
- next
460
- else
461
- @logger.info("Downloading file for #{current_time}")
462
- end
463
-
464
- File.open(target_file, 'w') do |f|
465
- URI.open("http://data.gharchive.org/#{filename}") do |gz|
466
- if @decompress
467
- f << self.read_gha_file_content(gz)
468
- else
469
- f << gz.read
470
- end
471
- end
472
- end
473
- archive << target_file
474
-
475
- if @max && archive.size > @max
476
- last = archive.shift
477
- @logger.info("Removing local file #{last}")
478
- File.unlink(last)
479
- end
480
-
481
- yield filename if block_given?
482
- end
483
- end
484
- end
1
+ require_relative 'gh-archive/core'
2
+ require_relative 'gh-archive/providers'
3
+ require_relative 'gh-archive/downloader'
4
+ require_relative 'gh-archive/events'
5
+ require_relative 'gh-archive/entities'
6
+ require_relative 'gh-archive/legacy'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gh-archive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.13'
4
+ version: '0.17'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Simone Scalabrino
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-08-15 00:00:00.000000000 Z
11
+ date: 2021-12-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: code-assertions
@@ -57,8 +57,13 @@ extensions: []
57
57
  extra_rdoc_files: []
58
58
  files:
59
59
  - lib/gh-archive.rb
60
+ - lib/gh-archive/core.rb
61
+ - lib/gh-archive/downloader.rb
60
62
  - lib/gh-archive/entities.rb
61
63
  - lib/gh-archive/events.rb
64
+ - lib/gh-archive/job.rb
65
+ - lib/gh-archive/legacy.rb
66
+ - lib/gh-archive/providers.rb
62
67
  homepage: https://github.com/intersimone999/gh-archive
63
68
  licenses:
64
69
  - GPL-3.0-only
@@ -78,7 +83,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
78
83
  - !ruby/object:Gem::Version
79
84
  version: '0'
80
85
  requirements: []
81
- rubygems_version: 3.2.21
86
+ rubygems_version: 3.2.29
82
87
  signing_key:
83
88
  specification_version: 4
84
89
  summary: GitHub Archive mining utility