gh-archive 0.16 → 0.17

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1f339fca5ebd3f7ee085fa257b47567993ab776efd3ba143e424a6bab2ca1712
4
- data.tar.gz: bd3709e9067fbc5ba0a7b92f156c3fa6f87d26bc81b0a969f7020526f31b5264
3
+ metadata.gz: c17a920e176289a00fb21fff7b712dc634c4491241ebe11a3b06f0ddd112706d
4
+ data.tar.gz: 58e7e4fdc6442d4a0955bc70d0fb10eec05b437350c255e34731021b2d714deb
5
5
  SHA512:
6
- metadata.gz: 9d56ecf4dc4101cf162d02e49f62dc77592bc652b9933b7413cb40f56eb25c595a20eed8fbc42e1f96830048d16520299972094b0d29b14f400befc5c21e1672
7
- data.tar.gz: 5a7e8c158271b1b540e76e1b68f93ece7cebed8d29057728c6ac93b536f3f3417b89fffddcbe57c80e75269cda9ff719fef84e5bb0558dd52fd8842f80e44f55
6
+ metadata.gz: b9066428d42acd77376fe72082adc62517c21bf8154b60b0006d7f0ab61fd0679af5775444c180d5cc8e842c99b0c207266e25ea28d4f0cdef33e49259339bb3
7
+ data.tar.gz: 036eaa0ead55db627ee8bb4f4a7421a525ab0ae73041045ef6d5b13307b7ed54d36e70532658ec3853cac2f21a7e5d2e80313c2fb688723ea87cd689ea469c9c
@@ -0,0 +1,112 @@
1
+ require 'code-assertions'
2
+ require 'json'
3
+ require 'open-uri'
4
+ require 'zlib'
5
+ require 'logger'
6
+ require 'tmpdir'
7
+ require 'thread/pool'
8
+ require 'thread/promise'
9
+
10
+ module GHArchive
11
+ class ThreadPool
12
+ def initialize(size)
13
+ @size = size
14
+ @threads = []
15
+ @queue = []
16
+ @mutex = Mutex.new
17
+
18
+ @consumer_thread = Thread.start do
19
+ while !@shutdown || @threads.size > 0 || @queue.size > 0
20
+ sleep 0.1 if @queue.size == 0 || @threads.size == @size
21
+ @threads.delete_if { |t| !t.alive? }
22
+
23
+ if @threads.size < @size && @queue.size > 0
24
+ @mutex.synchronize do
25
+ args, job = @queue.shift
26
+ @threads << Thread.start(*args, &job)
27
+ end
28
+ end
29
+ end
30
+ end
31
+ end
32
+
33
+ def process(*args, &block)
34
+ raise "Block expected" unless block_given?
35
+ raise "Can not add jobs while shutting down" if @shutdown
36
+
37
+ @mutex.synchronize do
38
+ @queue << [args, block]
39
+ end
40
+
41
+ return self.enqueued
42
+ end
43
+
44
+ def shutdown
45
+ @shutdown = true
46
+ end
47
+
48
+ def shutdown!
49
+ self.shutdown
50
+ @mutex.synchronize do
51
+ @queue.clear
52
+ end
53
+ end
54
+
55
+ def enqueued
56
+ return @queue.size
57
+ end
58
+
59
+ def shutdown?
60
+ @shutdown
61
+ end
62
+
63
+ def alive?
64
+ @consumer_thread.alive?
65
+ end
66
+
67
+ def wait
68
+ while alive?
69
+ sleep 0.1
70
+ end
71
+ end
72
+ end
73
+
74
+ module Utils
75
+ def get_gha_filename(date)
76
+ return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
77
+ end
78
+
79
+ def read_gha_file_content(gz)
80
+ gzip = Zlib::GzipReader.new(gz)
81
+ return gzip.read
82
+ ensure
83
+ gzip.close if gzip
84
+ end
85
+
86
+ def read_gha_file(file)
87
+
88
+ if !file.is_a?(StringIO) && file.path.end_with?(".json")
89
+ content = file.read
90
+ elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
91
+ content = read_gha_file_content(file)
92
+ else
93
+ raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
94
+ end
95
+
96
+ result = []
97
+ content.lines.each do |line|
98
+ result << JSON.parse(line)
99
+ end
100
+
101
+ return result
102
+ end
103
+
104
+ def each_time(from, to)
105
+ current_time = from
106
+ while current_time < to
107
+ yield current_time
108
+ current_time += 3600
109
+ end
110
+ end
111
+ end
112
+ end
@@ -0,0 +1,62 @@
1
+ require_relative 'core'
2
+
3
+ module GHArchive
4
+ class Downloader
5
+ include Utils
6
+
7
+ def initialize(folder, decompress = false)
8
+ @logger = Logger.new(STDERR)
9
+ @decompress = decompress
10
+ @folder = folder
11
+ @max = nil
12
+
13
+ Dir.mkdir(@folder) unless FileTest.exist?(@folder)
14
+ raise "A file exist with the desired folder name #{folder}" unless FileTest.directory?(@folder)
15
+ end
16
+
17
+ def max(max)
18
+ @max = max
19
+ return self
20
+ end
21
+
22
+ def logger=(logger)
23
+ @logger = logger
24
+ end
25
+
26
+ def download(from = Time.gm(2015, 1, 1), to = Time.now)
27
+ archive = []
28
+ self.each_time(from, to) do |current_time|
29
+ filename = self.get_gha_filename(current_time)
30
+ out_filename = filename.clone
31
+ out_filename.gsub!(".json.gz", ".json") if @decompress
32
+
33
+ target_file = File.join(@folder, out_filename)
34
+ if FileTest.exist?(target_file)
35
+ @logger.info("Skipping existing file for #{current_time}")
36
+ next
37
+ else
38
+ @logger.info("Downloading file for #{current_time}")
39
+ end
40
+
41
+ File.open(target_file, 'w') do |f|
42
+ URI.open("http://data.gharchive.org/#{filename}") do |gz|
43
+ if @decompress
44
+ f << self.read_gha_file_content(gz)
45
+ else
46
+ f << gz.read
47
+ end
48
+ end
49
+ end
50
+ archive << target_file
51
+
52
+ if @max && archive.size > @max
53
+ last = archive.shift
54
+ @logger.info("Removing local file #{last}")
55
+ File.unlink(last)
56
+ end
57
+
58
+ yield filename if block_given?
59
+ end
60
+ end
61
+ end
62
+ end
@@ -1,4 +1,5 @@
1
1
  require 'time'
2
+ require_relative 'core'
2
3
 
3
4
  module GHArchive
4
5
  Repository = Struct.new(:id, :name, :url)
@@ -1,5 +1,5 @@
1
1
  require 'time'
2
- require_relative File.expand_path('../entities', __FILE__)
2
+ require_relative 'entities'
3
3
 
4
4
  module GHArchive
5
5
  class Event
@@ -0,0 +1,22 @@
1
+ require 'core'
2
+ require 'providers'
3
+
4
+ module GHArchive
5
+ class Job
6
+ def initialize(provider, from, to)
7
+ @provider = provider
8
+ @from = from
9
+ @to = to
10
+ end
11
+
12
+ def start
13
+ @provider.each(@from, @to) do |event, time|
14
+ run(event, time)
15
+ end
16
+ end
17
+
18
+ def run(event, time)
19
+ raise GHAException, "This is an abstract job, it should be implemented before running"
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,31 @@
1
+ require_relative 'core'
2
+
3
+ GHAUtils = GHArchive::Utils
4
+
5
+ class GHAProvider < GHArchive::Provider
6
+ def initialize(*args)
7
+ warn "GHAProvider is deprecated. Please use GHArchive::Provider instead."
8
+ super
9
+ end
10
+ end
11
+
12
+ class OnlineGHAProvider < GHArchive::OnlineProvider
13
+ def initialize(*args)
14
+ warn "OnlineGHAProvider is deprecated. Please use GHArchive::OnlineProvider instead."
15
+ super
16
+ end
17
+ end
18
+
19
+ class FolderGHAProvider < GHArchive::FolderProvider
20
+ def initialize(*args)
21
+ warn "FolderGHAProvider is deprecated. Please use GHArchive::FolderProvider instead."
22
+ super
23
+ end
24
+ end
25
+
26
+ class GHADownloader < GHArchive::Downloader
27
+ def initialize(*args)
28
+ warn "FolderGHAProvider is deprecated. Please use GHArchive::FolderProvider instead."
29
+ super
30
+ end
31
+ end
@@ -0,0 +1,326 @@
1
+ require_relative 'core'
2
+
3
+ module GHArchive
4
+ class Provider
5
+ include Utils
6
+
7
+ def initialize
8
+ @logger = Logger.new(STDOUT)
9
+
10
+ @includes = {}
11
+ @excludes = {}
12
+
13
+ @checkpoint_name = nil
14
+ @use_json = true
15
+ end
16
+
17
+ def use_checkpoint(filename)
18
+ @checkpoint_name = filename
19
+
20
+ return self
21
+ end
22
+
23
+ def parse_events
24
+ @use_json = false
25
+
26
+ return self
27
+ end
28
+
29
+ def logger=(logger)
30
+ @logger = logger
31
+
32
+ return self
33
+ end
34
+ alias :use_logger :logger=
35
+
36
+ def get(date)
37
+ raise "Not implemented"
38
+ end
39
+
40
+ def include(**args)
41
+ args.each do |key, value|
42
+ @includes[key.to_s] = [] unless @includes[key.to_s]
43
+ @includes[key.to_s] << value
44
+ end
45
+
46
+ return self
47
+ end
48
+
49
+ def exclude(**args)
50
+ args.each do |key, value|
51
+ @excludes[key.to_s] = [] unless @excludes[key.to_s]
52
+ @excludes[key.to_s] << value
53
+ end
54
+
55
+ return self
56
+ end
57
+
58
+ def restore_checkpoint(from)
59
+ if @checkpoint_name && FileTest.exist?(@checkpoint_name)
60
+ # Note that this throws an exception if the file is not readable. This is the intended behavior.
61
+ # As opposed to that, failing to save the checkpoint information just results in a warning on the log.
62
+ loaded_from = Marshal.load(File.read(@checkpoint_name))
63
+ raise "The loaded checkpoint (#{loaded_from}) occurs before the current from date (#{from})" if loaded_from < from
64
+
65
+ @logger.info("Valid checkpoint loaded. Restored execution from #{loaded_from}.")
66
+
67
+ return loaded_from
68
+ else
69
+ return from
70
+ end
71
+ end
72
+
73
+ def update_checkpoint(current_time)
74
+ if @checkpoint_name
75
+ begin
76
+ File.open(@checkpoint_name, "wb") do |f|
77
+ f.write(Marshal.dump(current_time))
78
+ end
79
+ rescue
80
+ @logger.warn(
81
+ "Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
82
+ )
83
+ end
84
+ end
85
+ end
86
+
87
+ def each(from = Time.gm(2015, 1, 1), to = Time.now)
88
+ exceptions = []
89
+
90
+ from = restore_checkpoint(from)
91
+
92
+ self.each_time(from, to) do |current_time|
93
+ events = []
94
+
95
+ update_checkpoint(current_time)
96
+
97
+ begin
98
+ events = self.get(current_time)
99
+ rescue GHAException => e
100
+ @logger.warn(e.message)
101
+ next
102
+ rescue => e
103
+ @logger.error("An exception occurred for #{current_time}: #{e.message}")
104
+ exceptions << e
105
+ next
106
+ end
107
+
108
+ events.each do |event|
109
+ skip = false
110
+ @includes.each do |key, value|
111
+ skip = true unless value.include?(event[key])
112
+ end
113
+
114
+ @excludes.each do |key, value|
115
+ skip = true if value.include?(event[key])
116
+ end
117
+ next if skip
118
+
119
+ if @use_json
120
+ yield event, current_time
121
+ else
122
+ yield GHArchive::Event.parse(event), current_time
123
+ end
124
+ end
125
+
126
+ @logger.info("Scanned #{current_time}")
127
+
128
+ events.clear
129
+ GC.start
130
+ end
131
+
132
+ update_checkpoint(to)
133
+
134
+ return exceptions
135
+ end
136
+
137
+ class GHAException < Exception
138
+ end
139
+ end
140
+
141
+ class OnlineProvider < Provider
142
+ def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
143
+ super()
144
+
145
+ self.max_retries(max_retries)
146
+ self.proactive(proactive_pool_size) if proactive
147
+
148
+ @cache = Cache.new
149
+ end
150
+
151
+ def max_retries(n)
152
+ @max_retries = n
153
+
154
+ return self
155
+ end
156
+
157
+ def proactive(pool_size = 10)
158
+ @proactive = true
159
+ @pool = GHArchive::ThreadPool.new(pool_size)
160
+
161
+ return self
162
+ end
163
+
164
+ def get(current_time)
165
+ @max_retries.times do
166
+ begin
167
+ filename = self.get_gha_filename(current_time)
168
+
169
+ if @proactive
170
+ @logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
171
+
172
+ while !@cache.has?(filename)
173
+ sleep 1
174
+ end
175
+
176
+ data = @cache.get(filename)
177
+ if data
178
+ return data
179
+ else
180
+ raise DownloadArchiveException, "Could not scan #{filename}: data unavailable."
181
+ end
182
+ else
183
+ URI.open("http://data.gharchive.org/#{filename}") do |gz|
184
+ return self.read_gha_file(gz)
185
+ end
186
+ end
187
+ rescue Errno::ECONNRESET => e
188
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
189
+ next
190
+ rescue OpenURI::HTTPError => e
191
+ code = e.io.status[0]
192
+ if code.start_with?("5")
193
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
194
+ next
195
+ else
196
+ raise e
197
+ end
198
+ end
199
+ end
200
+
201
+ raise DownloadArchiveException, "Exceeded maximum number of tentative downloads for #{current_time}."
202
+ end
203
+
204
+ def cache(current_time)
205
+ @logger.info("Full cache. Waiting for some free slot...") if @cache.full?
206
+ while @cache.full?
207
+ sleep 1
208
+ end
209
+
210
+ filename = self.get_gha_filename(current_time)
211
+ @max_retries.times do
212
+ begin
213
+ URI.open("http://data.gharchive.org/#{filename}") do |gz|
214
+ content = self.read_gha_file(gz)
215
+ @cache.put(filename, content)
216
+ return
217
+ end
218
+ rescue Errno::ECONNRESET => e
219
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
220
+ next
221
+ rescue OpenURI::HTTPError => e
222
+ code = e.io.status[0]
223
+ if code.start_with?("5")
224
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
225
+ next
226
+ elsif code == "404"
227
+ @logger.error("File for #{current_time} not found. Skipping because: " + e.message)
228
+ else
229
+ raise e
230
+ end
231
+ rescue Zlib::GzipFile::Error => e
232
+ @logger.warn("Could not unzip, cache and analyze the zip at #{current_time}: " + e.message)
233
+ end
234
+ end
235
+
236
+ @cache.put(filename, nil) unless @cache.has?(filename)
237
+ end
238
+
239
+ def each(from = Time.gm(2015, 1, 1), to = Time.now)
240
+ if @proactive
241
+ real_from = restore_checkpoint(from)
242
+ any_ready = Thread.promise
243
+
244
+ @logger.info("Proactively scheduling download tasks...")
245
+ self.each_time(real_from, to) do |current_time|
246
+ @pool.process(current_time) do |current_time|
247
+ cache(current_time)
248
+ any_ready << true
249
+ @logger.info("Proactively cached #{current_time}. Cache size: #{@cache.size}")
250
+ end
251
+ end
252
+
253
+ ~any_ready
254
+ @logger.info("Download tasks successfully scheduled!")
255
+ end
256
+
257
+ super
258
+ end
259
+
260
+ class Cache
261
+ def initialize(max_size = 10)
262
+ @cache = {}
263
+ @max_size = max_size
264
+ @mutex = Mutex.new
265
+ end
266
+
267
+ def put(name, content)
268
+ @mutex.synchronize do
269
+ @cache[name] = content
270
+ end
271
+ end
272
+
273
+ def get(name)
274
+ @mutex.synchronize do
275
+ return @cache.delete(name)
276
+ end
277
+ end
278
+
279
+ def size
280
+ @mutex.synchronize do
281
+ return @cache.size
282
+ end
283
+ end
284
+
285
+ def has?(name)
286
+ @mutex.synchronize do
287
+ return @cache.has_key?(name)
288
+ end
289
+ end
290
+
291
+ def full?
292
+ self.size >= @max_size
293
+ end
294
+ end
295
+
296
+ class DownloadArchiveException < Provider::GHAException
297
+ end
298
+ end
299
+
300
+ class FolderProvider < Provider
301
+ def initialize(folder)
302
+ super()
303
+
304
+ @folder = folder
305
+ end
306
+
307
+ def get(current_time)
308
+ filename = self.get_gha_filename(current_time)
309
+ complete_filename = File.join(@folder, filename)
310
+ mode = "rb"
311
+
312
+ unless FileTest.exist?(complete_filename)
313
+ complete_filename = complete_filename.sub(".gz", "")
314
+ mode = "r"
315
+ end
316
+
317
+ unless FileTest.exist?(complete_filename)
318
+ raise GHAException.new("Cannot find any file (neither `.json.gz` nor `.json`) for #{current_time}")
319
+ end
320
+
321
+ File.open(complete_filename, mode) do |file|
322
+ return self.read_gha_file(file)
323
+ end
324
+ end
325
+ end
326
+ end
data/lib/gh-archive.rb CHANGED
@@ -1,496 +1,6 @@
1
- require 'code-assertions'
2
- require 'json'
3
- require 'open-uri'
4
- require 'zlib'
5
- require 'logger'
6
- require 'tmpdir'
7
- require 'thread/pool'
8
- require 'thread/promise'
9
-
10
- require_relative File.expand_path('../gh-archive/events', __FILE__)
11
-
12
- module GHArchive
13
- class ThreadPool
14
- def initialize(size)
15
- @size = size
16
- @threads = []
17
- @queue = []
18
- @mutex = Mutex.new
19
-
20
- @consumer_thread = Thread.start do
21
- while !@shutdown || @threads.size > 0 || @queue.size > 0
22
- sleep 0.1 if @queue.size == 0 || @threads.size == @size
23
- @threads.delete_if { |t| !t.alive? }
24
-
25
- if @threads.size < @size && @queue.size > 0
26
- @mutex.synchronize do
27
- args, job = @queue.shift
28
- @threads << Thread.start(*args, &job)
29
- end
30
- end
31
- end
32
- end
33
- end
34
-
35
- def process(*args, &block)
36
- raise "Block expected" unless block_given?
37
- raise "Can not add jobs while shutting down" if @shutdown
38
-
39
- @mutex.synchronize do
40
- @queue << [args, block]
41
- end
42
-
43
- return self.enqueued
44
- end
45
-
46
- def shutdown
47
- @shutdown = true
48
- end
49
-
50
- def shutdown!
51
- self.shutdown
52
- @mutex.synchronize do
53
- @queue.clear
54
- end
55
- end
56
-
57
- def enqueued
58
- return @queue.size
59
- end
60
-
61
- def shutdown?
62
- @shutdown
63
- end
64
-
65
- def alive?
66
- @consumer_thread.alive?
67
- end
68
-
69
- def wait
70
- while alive?
71
- sleep 0.1
72
- end
73
- end
74
- end
75
- end
76
-
77
- module GHAUtils
78
- def get_gha_filename(date)
79
- return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
80
- end
81
-
82
- def read_gha_file_content(gz)
83
- gzip = Zlib::GzipReader.new(gz)
84
- return gzip.read
85
- ensure
86
- gzip.close if gzip
87
- end
88
-
89
- def read_gha_file(file)
90
-
91
- if !file.is_a?(StringIO) && file.path.end_with?(".json")
92
- content = file.read
93
- elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
94
- content = read_gha_file_content(file)
95
- else
96
- raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
97
- end
98
-
99
- result = []
100
- content.lines.each do |line|
101
- result << JSON.parse(line)
102
- end
103
-
104
- return result
105
- end
106
-
107
- def each_time(from, to)
108
- current_time = from
109
- while current_time < to
110
- yield current_time
111
- current_time += 3600
112
- end
113
- end
114
- end
115
-
116
- class GHAProvider
117
- include GHAUtils
118
-
119
- def initialize
120
- @logger = Logger.new(STDOUT)
121
-
122
- @includes = {}
123
- @excludes = {}
124
-
125
- @checkpoint_name = nil
126
- @use_json = true
127
- end
128
-
129
- def use_checkpoint(filename)
130
- @checkpoint_name = filename
131
-
132
- return self
133
- end
134
-
135
- def parse_events
136
- @use_json = false
137
-
138
- return self
139
- end
140
-
141
- def logger=(logger)
142
- @logger = logger
143
-
144
- return self
145
- end
146
- alias :use_logger :logger=
147
-
148
- def get(date)
149
- raise "Not implemented"
150
- end
151
-
152
- def include(**args)
153
- args.each do |key, value|
154
- @includes[key.to_s] = [] unless @includes[key.to_s]
155
- @includes[key.to_s] << value
156
- end
157
-
158
- return self
159
- end
160
-
161
- def exclude(**args)
162
- args.each do |key, value|
163
- @excludes[key.to_s] = [] unless @excludes[key.to_s]
164
- @excludes[key.to_s] << value
165
- end
166
-
167
- return self
168
- end
169
-
170
- def restore_checkpoint(from)
171
- if @checkpoint_name && FileTest.exist?(@checkpoint_name)
172
- # Note that this throws an exception if the file is not readable. This is the intended behavior.
173
- # As opposed to that, failing to save the checkpoint information just results in a warning on the log.
174
- loaded_from = Marshal.load(File.read(@checkpoint_name))
175
- raise "The loaded checkpoint (#{loaded_from}) occurs before the current from date (#{from})" if loaded_from < from
176
-
177
- @logger.info("Valid checkpoint loaded. Restored execution from #{loaded_from}.")
178
-
179
- return loaded_from
180
- else
181
- return from
182
- end
183
- end
184
-
185
- def update_checkpoint(current_time)
186
- if @checkpoint_name
187
- begin
188
- File.open(@checkpoint_name, "wb") do |f|
189
- f.write(Marshal.dump(current_time))
190
- end
191
- rescue
192
- @logger.warn(
193
- "Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
194
- )
195
- end
196
- end
197
- end
198
-
199
- def each(from = Time.gm(2015, 1, 1), to = Time.now)
200
- exceptions = []
201
-
202
- from = restore_checkpoint(from)
203
-
204
- self.each_time(from, to) do |current_time|
205
- events = []
206
-
207
- update_checkpoint(current_time)
208
-
209
- begin
210
- events = self.get(current_time)
211
- rescue GHAException => e
212
- @logger.warn(e.message)
213
- next
214
- rescue => e
215
- @logger.error("An exception occurred for #{current_time}: #{e.message}")
216
- exceptions << e
217
- next
218
- end
219
-
220
- events.each do |event|
221
- skip = false
222
- @includes.each do |key, value|
223
- skip = true unless value.include?(event[key])
224
- end
225
-
226
- @excludes.each do |key, value|
227
- skip = true if value.include?(event[key])
228
- end
229
- next if skip
230
-
231
- if @use_json
232
- yield event, current_time
233
- else
234
- yield GHArchive::Event.parse(event), current_time
235
- end
236
- end
237
-
238
- @logger.info("Scanned #{current_time}")
239
-
240
- events.clear
241
- GC.start
242
- end
243
-
244
- update_checkpoint(to)
245
-
246
- return exceptions
247
- end
248
-
249
- class GHAException < Exception
250
- end
251
- end
252
-
253
- class OnlineGHAProvider < GHAProvider
254
- def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
255
- super()
256
-
257
- self.max_retries(max_retries)
258
- self.proactive(proactive_pool_size) if proactive
259
-
260
- @cache = Cache.new
261
- end
262
-
263
- def max_retries(n)
264
- @max_retries = n
265
-
266
- return self
267
- end
268
-
269
- def proactive(pool_size = 10)
270
- @proactive = true
271
- @pool = GHArchive::ThreadPool.new(pool_size)
272
-
273
- return self
274
- end
275
-
276
- def get(current_time)
277
- @max_retries.times do
278
- begin
279
- filename = self.get_gha_filename(current_time)
280
-
281
- if @proactive
282
- @logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
283
-
284
- while !@cache.has?(filename)
285
- sleep 1
286
- end
287
-
288
- data = @cache.get(filename)
289
- if data
290
- return data
291
- else
292
- raise DownloadArchiveException, "Could not scan #{filename}: data unavailable."
293
- end
294
- else
295
- URI.open("http://data.gharchive.org/#{filename}") do |gz|
296
- return self.read_gha_file(gz)
297
- end
298
- end
299
- rescue Errno::ECONNRESET => e
300
- @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
301
- next
302
- rescue OpenURI::HTTPError => e
303
- code = e.io.status[0]
304
- if code.start_with?("5")
305
- @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
306
- next
307
- else
308
- raise e
309
- end
310
- end
311
- end
312
-
313
- raise DownloadArchiveException, "Exceeded maximum number of tentative downloads for #{current_time}."
314
- end
315
-
316
- def cache(current_time)
317
- @logger.info("Full cache. Waiting for some free slot...") if @cache.full?
318
- while @cache.full?
319
- sleep 1
320
- end
321
-
322
- filename = self.get_gha_filename(current_time)
323
- @max_retries.times do
324
- begin
325
- URI.open("http://data.gharchive.org/#{filename}") do |gz|
326
- content = self.read_gha_file(gz)
327
- @cache.put(filename, content)
328
- return
329
- end
330
- rescue Errno::ECONNRESET => e
331
- @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
332
- next
333
- rescue OpenURI::HTTPError => e
334
- code = e.io.status[0]
335
- if code.start_with?("5")
336
- @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
337
- next
338
- elsif code == "404"
339
- @logger.error("File for #{current_time} not found. Skipping because: " + e.message)
340
- else
341
- raise e
342
- end
343
- rescue Zlib::GzipFile::Error => e
344
- @logger.warn("Could not unzip, cache and analyze the zip at #{current_time}: " + e.message)
345
- end
346
- end
347
-
348
- @cache.put(filename, nil) unless @cache.has?(filename)
349
- end
350
-
351
- def each(from = Time.gm(2015, 1, 1), to = Time.now)
352
- if @proactive
353
- real_from = restore_checkpoint(from)
354
- any_ready = Thread.promise
355
-
356
- @logger.info("Proactively scheduling download tasks...")
357
- self.each_time(real_from, to) do |current_time|
358
- @pool.process(current_time) do |current_time|
359
- cache(current_time)
360
- any_ready << true
361
- @logger.info("Proactively cached #{current_time}. Cache size: #{@cache.size}")
362
- end
363
- end
364
-
365
- ~any_ready
366
- @logger.info("Download tasks successfully scheduled!")
367
- end
368
-
369
- super
370
- end
371
-
372
- class Cache
373
- def initialize(max_size = 10)
374
- @cache = {}
375
- @max_size = max_size
376
- @mutex = Mutex.new
377
- end
378
-
379
- def put(name, content)
380
- @mutex.synchronize do
381
- @cache[name] = content
382
- end
383
- end
384
-
385
- def get(name)
386
- @mutex.synchronize do
387
- return @cache.delete(name)
388
- end
389
- end
390
-
391
- def size
392
- @mutex.synchronize do
393
- return @cache.size
394
- end
395
- end
396
-
397
- def has?(name)
398
- @mutex.synchronize do
399
- return @cache.has_key?(name)
400
- end
401
- end
402
-
403
- def full?
404
- self.size >= @max_size
405
- end
406
- end
407
-
408
- class DownloadArchiveException < GHAProvider::GHAException
409
- end
410
- end
411
-
412
- class FolderGHAProvider < GHAProvider
413
- def initialize(folder)
414
- super()
415
-
416
- @folder = folder
417
- end
418
-
419
- def get(current_time)
420
- filename = self.get_gha_filename(current_time)
421
- complete_filename = File.join(@folder, filename)
422
- mode = "rb"
423
-
424
- unless FileTest.exist?(complete_filename)
425
- complete_filename = complete_filename.sub(".gz", "")
426
- mode = "r"
427
- end
428
-
429
- unless FileTest.exist?(complete_filename)
430
- raise GHAException.new("Cannot find any file (neither `.json.gz` nor `.json`) for #{current_time}")
431
- end
432
-
433
- File.open(complete_filename, mode) do |file|
434
- return self.read_gha_file(file)
435
- end
436
- end
437
- end
438
-
439
- class GHADownloader
440
- include GHAUtils
441
-
442
- def initialize(folder, decompress = false)
443
- @logger = Logger.new(STDERR)
444
- @decompress = decompress
445
- @folder = folder
446
- @max = nil
447
-
448
- Dir.mkdir(@folder) unless FileTest.exist?(@folder)
449
- raise "A file exist with the desired folder name #{folder}" unless FileTest.directory?(@folder)
450
- end
451
-
452
- def max(max)
453
- @max = max
454
- return self
455
- end
456
-
457
- def logger=(logger)
458
- @logger = logger
459
- end
460
-
461
- def download(from = Time.gm(2015, 1, 1), to = Time.now)
462
- archive = []
463
- self.each_time(from, to) do |current_time|
464
- filename = self.get_gha_filename(current_time)
465
- out_filename = filename.clone
466
- out_filename.gsub!(".json.gz", ".json") if @decompress
467
-
468
- target_file = File.join(@folder, out_filename)
469
- if FileTest.exist?(target_file)
470
- @logger.info("Skipping existing file for #{current_time}")
471
- next
472
- else
473
- @logger.info("Downloading file for #{current_time}")
474
- end
475
-
476
- File.open(target_file, 'w') do |f|
477
- URI.open("http://data.gharchive.org/#{filename}") do |gz|
478
- if @decompress
479
- f << self.read_gha_file_content(gz)
480
- else
481
- f << gz.read
482
- end
483
- end
484
- end
485
- archive << target_file
486
-
487
- if @max && archive.size > @max
488
- last = archive.shift
489
- @logger.info("Removing local file #{last}")
490
- File.unlink(last)
491
- end
492
-
493
- yield filename if block_given?
494
- end
495
- end
496
- end
1
+ require_relative 'gh-archive/core'
2
+ require_relative 'gh-archive/providers'
3
+ require_relative 'gh-archive/downloader'
4
+ require_relative 'gh-archive/events'
5
+ require_relative 'gh-archive/entities'
6
+ require_relative 'gh-archive/legacy'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gh-archive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.16'
4
+ version: '0.17'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Simone Scalabrino
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-10-01 00:00:00.000000000 Z
11
+ date: 2021-12-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: code-assertions
@@ -57,8 +57,13 @@ extensions: []
57
57
  extra_rdoc_files: []
58
58
  files:
59
59
  - lib/gh-archive.rb
60
+ - lib/gh-archive/core.rb
61
+ - lib/gh-archive/downloader.rb
60
62
  - lib/gh-archive/entities.rb
61
63
  - lib/gh-archive/events.rb
64
+ - lib/gh-archive/job.rb
65
+ - lib/gh-archive/legacy.rb
66
+ - lib/gh-archive/providers.rb
62
67
  homepage: https://github.com/intersimone999/gh-archive
63
68
  licenses:
64
69
  - GPL-3.0-only
@@ -78,7 +83,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
78
83
  - !ruby/object:Gem::Version
79
84
  version: '0'
80
85
  requirements: []
81
- rubygems_version: 3.2.21
86
+ rubygems_version: 3.2.29
82
87
  signing_key:
83
88
  specification_version: 4
84
89
  summary: GitHub Archive mining utility