gh-archive 0.13 → 0.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 74e3037ee1115173176aa974f453f49b7649743f41f83118e9ee180bd620c095
4
- data.tar.gz: c4c1ca30210ba39204b28b4b3854e2e93c69003ef1ead0f85506f2fe213f0ee9
3
+ metadata.gz: c17a920e176289a00fb21fff7b712dc634c4491241ebe11a3b06f0ddd112706d
4
+ data.tar.gz: 58e7e4fdc6442d4a0955bc70d0fb10eec05b437350c255e34731021b2d714deb
5
5
  SHA512:
6
- metadata.gz: 289b568dce07aa1f0182c75d26f7ab286a2b48dbf31c9ee63c6a1ef77bf5a59823b17136984380d6cca123c0be70e6cca3cc3dba216a935354d9312eb93aa2fa
7
- data.tar.gz: 5d9ef4ec34a106e3fb2db37ab173c0b0876637941263f356f2e43440c506ad0471906cd4fa0fbd12bf98c84e44ee2bae0408a96ccdef0a5946c13487e5441204
6
+ metadata.gz: b9066428d42acd77376fe72082adc62517c21bf8154b60b0006d7f0ab61fd0679af5775444c180d5cc8e842c99b0c207266e25ea28d4f0cdef33e49259339bb3
7
+ data.tar.gz: 036eaa0ead55db627ee8bb4f4a7421a525ab0ae73041045ef6d5b13307b7ed54d36e70532658ec3853cac2f21a7e5d2e80313c2fb688723ea87cd689ea469c9c
@@ -0,0 +1,112 @@
1
+ require 'code-assertions'
2
+ require 'json'
3
+ require 'open-uri'
4
+ require 'zlib'
5
+ require 'logger'
6
+ require 'tmpdir'
7
+ require 'thread/pool'
8
+ require 'thread/promise'
9
+
10
+ module GHArchive
11
+ class ThreadPool
12
+ def initialize(size)
13
+ @size = size
14
+ @threads = []
15
+ @queue = []
16
+ @mutex = Mutex.new
17
+
18
+ @consumer_thread = Thread.start do
19
+ while !@shutdown || @threads.size > 0 || @queue.size > 0
20
+ sleep 0.1 if @queue.size == 0 || @threads.size == @size
21
+ @threads.delete_if { |t| !t.alive? }
22
+
23
+ if @threads.size < @size && @queue.size > 0
24
+ @mutex.synchronize do
25
+ args, job = @queue.shift
26
+ @threads << Thread.start(*args, &job)
27
+ end
28
+ end
29
+ end
30
+ end
31
+ end
32
+
33
+ def process(*args, &block)
34
+ raise "Block expected" unless block_given?
35
+ raise "Can not add jobs while shutting down" if @shutdown
36
+
37
+ @mutex.synchronize do
38
+ @queue << [args, block]
39
+ end
40
+
41
+ return self.enqueued
42
+ end
43
+
44
+ def shutdown
45
+ @shutdown = true
46
+ end
47
+
48
+ def shutdown!
49
+ self.shutdown
50
+ @mutex.synchronize do
51
+ @queue.clear
52
+ end
53
+ end
54
+
55
+ def enqueued
56
+ return @queue.size
57
+ end
58
+
59
+ def shutdown?
60
+ @shutdown
61
+ end
62
+
63
+ def alive?
64
+ @consumer_thread.alive?
65
+ end
66
+
67
+ def wait
68
+ while alive?
69
+ sleep 0.1
70
+ end
71
+ end
72
+ end
73
+
74
+ module Utils
75
+ def get_gha_filename(date)
76
+ return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
77
+ end
78
+
79
+ def read_gha_file_content(gz)
80
+ gzip = Zlib::GzipReader.new(gz)
81
+ return gzip.read
82
+ ensure
83
+ gzip.close if gzip
84
+ end
85
+
86
+ def read_gha_file(file)
87
+
88
+ if !file.is_a?(StringIO) && file.path.end_with?(".json")
89
+ content = file.read
90
+ elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
91
+ content = read_gha_file_content(file)
92
+ else
93
+ raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
94
+ end
95
+
96
+ result = []
97
+ content.lines.each do |line|
98
+ result << JSON.parse(line)
99
+ end
100
+
101
+ return result
102
+ end
103
+
104
+ def each_time(from, to)
105
+ current_time = from
106
+ while current_time < to
107
+ yield current_time
108
+ current_time += 3600
109
+ end
110
+ end
111
+ end
112
+ end
@@ -0,0 +1,62 @@
1
+ require_relative 'core'
2
+
3
+ module GHArchive
4
+ class Downloader
5
+ include Utils
6
+
7
+ def initialize(folder, decompress = false)
8
+ @logger = Logger.new(STDERR)
9
+ @decompress = decompress
10
+ @folder = folder
11
+ @max = nil
12
+
13
+ Dir.mkdir(@folder) unless FileTest.exist?(@folder)
14
+ raise "A file exist with the desired folder name #{folder}" unless FileTest.directory?(@folder)
15
+ end
16
+
17
+ def max(max)
18
+ @max = max
19
+ return self
20
+ end
21
+
22
+ def logger=(logger)
23
+ @logger = logger
24
+ end
25
+
26
+ def download(from = Time.gm(2015, 1, 1), to = Time.now)
27
+ archive = []
28
+ self.each_time(from, to) do |current_time|
29
+ filename = self.get_gha_filename(current_time)
30
+ out_filename = filename.clone
31
+ out_filename.gsub!(".json.gz", ".json") if @decompress
32
+
33
+ target_file = File.join(@folder, out_filename)
34
+ if FileTest.exist?(target_file)
35
+ @logger.info("Skipping existing file for #{current_time}")
36
+ next
37
+ else
38
+ @logger.info("Downloading file for #{current_time}")
39
+ end
40
+
41
+ File.open(target_file, 'w') do |f|
42
+ URI.open("http://data.gharchive.org/#{filename}") do |gz|
43
+ if @decompress
44
+ f << self.read_gha_file_content(gz)
45
+ else
46
+ f << gz.read
47
+ end
48
+ end
49
+ end
50
+ archive << target_file
51
+
52
+ if @max && archive.size > @max
53
+ last = archive.shift
54
+ @logger.info("Removing local file #{last}")
55
+ File.unlink(last)
56
+ end
57
+
58
+ yield filename if block_given?
59
+ end
60
+ end
61
+ end
62
+ end
@@ -1,4 +1,5 @@
1
1
  require 'time'
2
+ require_relative 'core'
2
3
 
3
4
  module GHArchive
4
5
  Repository = Struct.new(:id, :name, :url)
@@ -1,5 +1,5 @@
1
1
  require 'time'
2
- require_relative File.expand_path('../entities', __FILE__)
2
+ require_relative 'entities'
3
3
 
4
4
  module GHArchive
5
5
  class Event
@@ -0,0 +1,22 @@
1
+ require 'core'
2
+ require 'providers'
3
+
4
+ module GHArchive
5
+ class Job
6
+ def initialize(provider, from, to)
7
+ @provider = provider
8
+ @from = from
9
+ @to = to
10
+ end
11
+
12
+ def start
13
+ @provider.each(@from, @to) do |event, time|
14
+ run(event, time)
15
+ end
16
+ end
17
+
18
+ def run(event, time)
19
+ raise GHAException, "This is an abstract job, it should be implemented before running"
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,31 @@
1
+ require_relative 'core'
2
+
3
+ GHAUtils = GHArchive::Utils
4
+
5
+ class GHAProvider < GHArchive::Provider
6
+ def initialize(*args)
7
+ warn "GHAProvider is deprecated. Please use GHArchive::Provider instead."
8
+ super
9
+ end
10
+ end
11
+
12
+ class OnlineGHAProvider < GHArchive::OnlineProvider
13
+ def initialize(*args)
14
+ warn "OnlineGHAProvider is deprecated. Please use GHArchive::OnlineProvider instead."
15
+ super
16
+ end
17
+ end
18
+
19
+ class FolderGHAProvider < GHArchive::FolderProvider
20
+ def initialize(*args)
21
+ warn "FolderGHAProvider is deprecated. Please use GHArchive::FolderProvider instead."
22
+ super
23
+ end
24
+ end
25
+
26
+ class GHADownloader < GHArchive::Downloader
27
+ def initialize(*args)
28
+ warn "FolderGHAProvider is deprecated. Please use GHArchive::FolderProvider instead."
29
+ super
30
+ end
31
+ end
@@ -0,0 +1,326 @@
1
+ require_relative 'core'
2
+
3
+ module GHArchive
4
+ class Provider
5
+ include Utils
6
+
7
+ def initialize
8
+ @logger = Logger.new(STDOUT)
9
+
10
+ @includes = {}
11
+ @excludes = {}
12
+
13
+ @checkpoint_name = nil
14
+ @use_json = true
15
+ end
16
+
17
+ def use_checkpoint(filename)
18
+ @checkpoint_name = filename
19
+
20
+ return self
21
+ end
22
+
23
+ def parse_events
24
+ @use_json = false
25
+
26
+ return self
27
+ end
28
+
29
+ def logger=(logger)
30
+ @logger = logger
31
+
32
+ return self
33
+ end
34
+ alias :use_logger :logger=
35
+
36
+ def get(date)
37
+ raise "Not implemented"
38
+ end
39
+
40
+ def include(**args)
41
+ args.each do |key, value|
42
+ @includes[key.to_s] = [] unless @includes[key.to_s]
43
+ @includes[key.to_s] << value
44
+ end
45
+
46
+ return self
47
+ end
48
+
49
+ def exclude(**args)
50
+ args.each do |key, value|
51
+ @excludes[key.to_s] = [] unless @excludes[key.to_s]
52
+ @excludes[key.to_s] << value
53
+ end
54
+
55
+ return self
56
+ end
57
+
58
+ def restore_checkpoint(from)
59
+ if @checkpoint_name && FileTest.exist?(@checkpoint_name)
60
+ # Note that this throws an exception if the file is not readable. This is the intended behavior.
61
+ # As opposed to that, failing to save the checkpoint information just results in a warning on the log.
62
+ loaded_from = Marshal.load(File.read(@checkpoint_name))
63
+ raise "The loaded checkpoint (#{loaded_from}) occurs before the current from date (#{from})" if loaded_from < from
64
+
65
+ @logger.info("Valid checkpoint loaded. Restored execution from #{loaded_from}.")
66
+
67
+ return loaded_from
68
+ else
69
+ return from
70
+ end
71
+ end
72
+
73
+ def update_checkpoint(current_time)
74
+ if @checkpoint_name
75
+ begin
76
+ File.open(@checkpoint_name, "wb") do |f|
77
+ f.write(Marshal.dump(current_time))
78
+ end
79
+ rescue
80
+ @logger.warn(
81
+ "Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
82
+ )
83
+ end
84
+ end
85
+ end
86
+
87
+ def each(from = Time.gm(2015, 1, 1), to = Time.now)
88
+ exceptions = []
89
+
90
+ from = restore_checkpoint(from)
91
+
92
+ self.each_time(from, to) do |current_time|
93
+ events = []
94
+
95
+ update_checkpoint(current_time)
96
+
97
+ begin
98
+ events = self.get(current_time)
99
+ rescue GHAException => e
100
+ @logger.warn(e.message)
101
+ next
102
+ rescue => e
103
+ @logger.error("An exception occurred for #{current_time}: #{e.message}")
104
+ exceptions << e
105
+ next
106
+ end
107
+
108
+ events.each do |event|
109
+ skip = false
110
+ @includes.each do |key, value|
111
+ skip = true unless value.include?(event[key])
112
+ end
113
+
114
+ @excludes.each do |key, value|
115
+ skip = true if value.include?(event[key])
116
+ end
117
+ next if skip
118
+
119
+ if @use_json
120
+ yield event, current_time
121
+ else
122
+ yield GHArchive::Event.parse(event), current_time
123
+ end
124
+ end
125
+
126
+ @logger.info("Scanned #{current_time}")
127
+
128
+ events.clear
129
+ GC.start
130
+ end
131
+
132
+ update_checkpoint(to)
133
+
134
+ return exceptions
135
+ end
136
+
137
+ class GHAException < Exception
138
+ end
139
+ end
140
+
141
+ class OnlineProvider < Provider
142
+ def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
143
+ super()
144
+
145
+ self.max_retries(max_retries)
146
+ self.proactive(proactive_pool_size) if proactive
147
+
148
+ @cache = Cache.new
149
+ end
150
+
151
+ def max_retries(n)
152
+ @max_retries = n
153
+
154
+ return self
155
+ end
156
+
157
+ def proactive(pool_size = 10)
158
+ @proactive = true
159
+ @pool = GHArchive::ThreadPool.new(pool_size)
160
+
161
+ return self
162
+ end
163
+
164
+ def get(current_time)
165
+ @max_retries.times do
166
+ begin
167
+ filename = self.get_gha_filename(current_time)
168
+
169
+ if @proactive
170
+ @logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
171
+
172
+ while !@cache.has?(filename)
173
+ sleep 1
174
+ end
175
+
176
+ data = @cache.get(filename)
177
+ if data
178
+ return data
179
+ else
180
+ raise DownloadArchiveException, "Could not scan #{filename}: data unavailable."
181
+ end
182
+ else
183
+ URI.open("http://data.gharchive.org/#{filename}") do |gz|
184
+ return self.read_gha_file(gz)
185
+ end
186
+ end
187
+ rescue Errno::ECONNRESET => e
188
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
189
+ next
190
+ rescue OpenURI::HTTPError => e
191
+ code = e.io.status[0]
192
+ if code.start_with?("5")
193
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
194
+ next
195
+ else
196
+ raise e
197
+ end
198
+ end
199
+ end
200
+
201
+ raise DownloadArchiveException, "Exceeded maximum number of tentative downloads for #{current_time}."
202
+ end
203
+
204
+ def cache(current_time)
205
+ @logger.info("Full cache. Waiting for some free slot...") if @cache.full?
206
+ while @cache.full?
207
+ sleep 1
208
+ end
209
+
210
+ filename = self.get_gha_filename(current_time)
211
+ @max_retries.times do
212
+ begin
213
+ URI.open("http://data.gharchive.org/#{filename}") do |gz|
214
+ content = self.read_gha_file(gz)
215
+ @cache.put(filename, content)
216
+ return
217
+ end
218
+ rescue Errno::ECONNRESET => e
219
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
220
+ next
221
+ rescue OpenURI::HTTPError => e
222
+ code = e.io.status[0]
223
+ if code.start_with?("5")
224
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
225
+ next
226
+ elsif code == "404"
227
+ @logger.error("File for #{current_time} not found. Skipping because: " + e.message)
228
+ else
229
+ raise e
230
+ end
231
+ rescue Zlib::GzipFile::Error => e
232
+ @logger.warn("Could not unzip, cache and analyze the zip at #{current_time}: " + e.message)
233
+ end
234
+ end
235
+
236
+ @cache.put(filename, nil) unless @cache.has?(filename)
237
+ end
238
+
239
+ def each(from = Time.gm(2015, 1, 1), to = Time.now)
240
+ if @proactive
241
+ real_from = restore_checkpoint(from)
242
+ any_ready = Thread.promise
243
+
244
+ @logger.info("Proactively scheduling download tasks...")
245
+ self.each_time(real_from, to) do |current_time|
246
+ @pool.process(current_time) do |current_time|
247
+ cache(current_time)
248
+ any_ready << true
249
+ @logger.info("Proactively cached #{current_time}. Cache size: #{@cache.size}")
250
+ end
251
+ end
252
+
253
+ ~any_ready
254
+ @logger.info("Download tasks successfully scheduled!")
255
+ end
256
+
257
+ super
258
+ end
259
+
260
+ class Cache
261
+ def initialize(max_size = 10)
262
+ @cache = {}
263
+ @max_size = max_size
264
+ @mutex = Mutex.new
265
+ end
266
+
267
+ def put(name, content)
268
+ @mutex.synchronize do
269
+ @cache[name] = content
270
+ end
271
+ end
272
+
273
+ def get(name)
274
+ @mutex.synchronize do
275
+ return @cache.delete(name)
276
+ end
277
+ end
278
+
279
+ def size
280
+ @mutex.synchronize do
281
+ return @cache.size
282
+ end
283
+ end
284
+
285
+ def has?(name)
286
+ @mutex.synchronize do
287
+ return @cache.has_key?(name)
288
+ end
289
+ end
290
+
291
+ def full?
292
+ self.size >= @max_size
293
+ end
294
+ end
295
+
296
+ class DownloadArchiveException < Provider::GHAException
297
+ end
298
+ end
299
+
300
+ class FolderProvider < Provider
301
+ def initialize(folder)
302
+ super()
303
+
304
+ @folder = folder
305
+ end
306
+
307
+ def get(current_time)
308
+ filename = self.get_gha_filename(current_time)
309
+ complete_filename = File.join(@folder, filename)
310
+ mode = "rb"
311
+
312
+ unless FileTest.exist?(complete_filename)
313
+ complete_filename = complete_filename.sub(".gz", "")
314
+ mode = "r"
315
+ end
316
+
317
+ unless FileTest.exist?(complete_filename)
318
+ raise GHAException.new("Cannot find any file (neither `.json.gz` nor `.json`) for #{current_time}")
319
+ end
320
+
321
+ File.open(complete_filename, mode) do |file|
322
+ return self.read_gha_file(file)
323
+ end
324
+ end
325
+ end
326
+ end
data/lib/gh-archive.rb CHANGED
@@ -1,484 +1,6 @@
1
- require 'code-assertions'
2
- require 'json'
3
- require 'open-uri'
4
- require 'zlib'
5
- require 'logger'
6
- require 'tmpdir'
7
- require 'thread/pool'
8
- require 'thread/promise'
9
-
10
- require_relative File.expand_path('../gh-archive/events', __FILE__)
11
-
12
- module GHArchive
13
- class ThreadPool
14
- def initialize(size)
15
- @size = size
16
- @threads = []
17
- @queue = []
18
- @mutex = Mutex.new
19
-
20
- @consumer_thread = Thread.start do
21
- while !@shutdown || @threads.size > 0 || @queue.size > 0
22
- sleep 0.1 if @queue.size == 0 || @threads.size == @size
23
- @threads.delete_if { |t| !t.alive? }
24
-
25
- if @threads.size < @size && @queue.size > 0
26
- @mutex.synchronize do
27
- args, job = @queue.shift
28
- @threads << Thread.start(*args, &job)
29
- end
30
- end
31
- end
32
- end
33
- end
34
-
35
- def process(*args, &block)
36
- raise "Block expected" unless block_given?
37
- raise "Can not add jobs while shutting down" if @shutdown
38
-
39
- @mutex.synchronize do
40
- @queue << [args, block]
41
- end
42
-
43
- return self.enqueued
44
- end
45
-
46
- def shutdown
47
- @shutdown = true
48
- end
49
-
50
- def shutdown!
51
- self.shutdown
52
- @mutex.synchronize do
53
- @queue.clear
54
- end
55
- end
56
-
57
- def enqueued
58
- return @queue.size
59
- end
60
-
61
- def shutdown?
62
- @shutdown
63
- end
64
-
65
- def alive?
66
- @consumer_thread.alive?
67
- end
68
-
69
- def wait
70
- while alive?
71
- sleep 0.1
72
- end
73
- end
74
- end
75
- end
76
-
77
- module GHAUtils
78
- def get_gha_filename(date)
79
- return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
80
- end
81
-
82
- def read_gha_file_content(gz)
83
- gzip = Zlib::GzipReader.new(gz)
84
- return gzip.read
85
- ensure
86
- gzip.close if gzip
87
- end
88
-
89
- def read_gha_file(file)
90
-
91
- if !file.is_a?(StringIO) && file.path.end_with?(".json")
92
- content = file.read
93
- elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
94
- content = read_gha_file_content(file)
95
- else
96
- raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
97
- end
98
-
99
- result = []
100
- content.lines.each do |line|
101
- result << JSON.parse(line)
102
- end
103
-
104
- return result
105
- end
106
-
107
- def each_time(from, to)
108
- current_time = from
109
- while current_time < to
110
- yield current_time
111
- current_time += 3600
112
- end
113
- end
114
- end
115
-
116
- class GHAProvider
117
- include GHAUtils
118
-
119
- def initialize
120
- @logger = Logger.new(STDOUT)
121
-
122
- @includes = {}
123
- @excludes = {}
124
-
125
- @checkpoint_name = nil
126
- @use_json = true
127
- end
128
-
129
- def use_checkpoint(filename)
130
- @checkpoint_name = filename
131
-
132
- return self
133
- end
134
-
135
- def parse_events
136
- @use_json = false
137
-
138
- return self
139
- end
140
-
141
- def logger=(logger)
142
- @logger = logger
143
-
144
- return self
145
- end
146
- alias :use_logger :logger=
147
-
148
- def get(date)
149
- raise "Not implemented"
150
- end
151
-
152
- def include(**args)
153
- args.each do |key, value|
154
- @includes[key.to_s] = [] unless @includes[key.to_s]
155
- @includes[key.to_s] << value
156
- end
157
-
158
- return self
159
- end
160
-
161
- def exclude(**args)
162
- args.each do |key, value|
163
- @excludes[key.to_s] = [] unless @excludes[key.to_s]
164
- @excludes[key.to_s] << value
165
- end
166
-
167
- return self
168
- end
169
-
170
- def restore_checkpoint(from)
171
- if @checkpoint_name && FileTest.exist?(@checkpoint_name)
172
- # Note that this throws an exception if the file is not readable. This is the intended behavior.
173
- # As opposed to that, failing to save the checkpoint information just results in a warning on the log.
174
- loaded_from = Marshal.load(File.read(@checkpoint_name))
175
- raise "The loaded checkpoint (#{loaded_from}) occurs before the current from date (#{from})" if loaded_from < from
176
-
177
- @logger.info("Valid checkpoint loaded. Restored execution from #{loaded_from}.")
178
-
179
- return loaded_from
180
- else
181
- return from
182
- end
183
- end
184
-
185
- def update_checkpoint(current_time)
186
- if @checkpoint_name
187
- begin
188
- File.open(@checkpoint_name, "wb") do |f|
189
- f.write(Marshal.dump(current_time))
190
- end
191
- rescue
192
- @logger.warn(
193
- "Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
194
- )
195
- end
196
- end
197
- end
198
-
199
- def each(from = Time.gm(2015, 1, 1), to = Time.now)
200
- exceptions = []
201
-
202
- from = restore_checkpoint(from)
203
-
204
- self.each_time(from, to) do |current_time|
205
- events = []
206
-
207
- update_checkpoint(current_time)
208
-
209
- begin
210
- events = self.get(current_time)
211
- rescue GHAException => e
212
- @logger.warn(e.message)
213
- next
214
- rescue => e
215
- @logger.error("An exception occurred for #{current_time}: #{e.message}")
216
- exceptions << e
217
- next
218
- end
219
-
220
- events.each do |event|
221
- skip = false
222
- @includes.each do |key, value|
223
- skip = true unless value.include?(event[key])
224
- end
225
-
226
- @excludes.each do |key, value|
227
- skip = true if value.include?(event[key])
228
- end
229
- next if skip
230
-
231
- if @use_json
232
- yield event, current_time
233
- else
234
- yield GHArchive::Event.parse(event), current_time
235
- end
236
- end
237
-
238
- @logger.info("Scanned #{current_time}")
239
-
240
- events.clear
241
- GC.start
242
- end
243
-
244
- update_checkpoint(to)
245
-
246
- return exceptions
247
- end
248
-
249
- class GHAException < Exception
250
- end
251
- end
252
-
253
- class OnlineGHAProvider < GHAProvider
254
- def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
255
- super()
256
-
257
- self.max_retries(max_retries)
258
- self.proactive(proactive_pool_size) if proactive
259
-
260
- @cache = Cache.new
261
- end
262
-
263
- def max_retries(n)
264
- @max_retries = n
265
-
266
- return self
267
- end
268
-
269
- def proactive(pool_size = 10)
270
- @proactive = true
271
- @pool = GHArchive::ThreadPool.new(pool_size)
272
-
273
- return self
274
- end
275
-
276
- def get(current_time)
277
- @max_retries.times do
278
- begin
279
- filename = self.get_gha_filename(current_time)
280
-
281
- if @proactive
282
- @logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
283
-
284
- while !@cache.has?(filename)
285
- sleep 1
286
- end
287
-
288
- return @cache.get(filename)
289
- else
290
- URI.open("http://data.gharchive.org/#{filename}") do |gz|
291
- return self.read_gha_file(gz)
292
- end
293
- end
294
- rescue Errno::ECONNRESET => e
295
- @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
296
- next
297
- rescue OpenURI::HTTPError => e
298
- code = e.io.status[0]
299
- if code.start_with?("5")
300
- @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
301
- next
302
- else
303
- raise e
304
- end
305
- end
306
- end
307
-
308
- raise DownloadArchiveException, "Exceeded maximum number of tentative downloads for #{current_time}."
309
- end
310
-
311
- def cache(current_time)
312
- @logger.info("Full cache. Waiting for some free slot...") if @cache.full?
313
- while @cache.full?
314
- sleep 1
315
- end
316
- @max_retries.times do
317
- begin
318
- filename = self.get_gha_filename(current_time)
319
- URI.open("http://data.gharchive.org/#{filename}") do |gz|
320
- content = self.read_gha_file(gz)
321
- @cache.put(filename, content)
322
- return
323
- end
324
- rescue Errno::ECONNRESET => e
325
- @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
326
- next
327
- rescue OpenURI::HTTPError => e
328
- code = e.io.status[0]
329
- if code.start_with?("5")
330
- @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
331
- next
332
- else
333
- raise e
334
- end
335
- rescue Zlib::GzipFile::Error => e
336
- @logger.warn("Could not unzip, cache and analyze the zip at #{current_time}: " + e.message)
337
- end
338
- end
339
- end
340
-
341
- def each(from = Time.gm(2015, 1, 1), to = Time.now)
342
- if @proactive
343
- real_from = restore_checkpoint(from)
344
- any_ready = Thread.promise
345
-
346
- @logger.info("Proactively scheduling download tasks...")
347
- self.each_time(real_from, to) do |current_time|
348
- @pool.process(current_time) do |current_time|
349
- cache(current_time)
350
- any_ready << true
351
- @logger.info("Proactively cached #{current_time}. Cache size: #{@cache.size}")
352
- end
353
- end
354
-
355
- ~any_ready
356
- @logger.info("Download tasks successfully scheduled!")
357
- end
358
-
359
- super
360
- end
361
-
362
- class Cache
363
- def initialize(max_size = 10)
364
- @cache = {}
365
- @max_size = max_size
366
- @mutex = Mutex.new
367
- end
368
-
369
- def put(name, content)
370
- @mutex.synchronize do
371
- @cache[name] = content
372
- end
373
- end
374
-
375
- def get(name)
376
- @mutex.synchronize do
377
- return @cache.delete(name)
378
- end
379
- end
380
-
381
- def size
382
- @mutex.synchronize do
383
- return @cache.size
384
- end
385
- end
386
-
387
- def has?(name)
388
- return @cache.has_key?(name)
389
- end
390
-
391
- def full?
392
- self.size >= @max_size
393
- end
394
- end
395
-
396
- class DownloadArchiveException < GHAProvider::GHAException
397
- end
398
- end
399
-
400
- class FolderGHAProvider < GHAProvider
401
- def initialize(folder)
402
- super()
403
-
404
- @folder = folder
405
- end
406
-
407
- def get(current_time)
408
- filename = self.get_gha_filename(current_time)
409
- complete_filename = File.join(@folder, filename)
410
- mode = "rb"
411
-
412
- unless FileTest.exist?(complete_filename)
413
- complete_filename = complete_filename.sub(".gz", "")
414
- mode = "r"
415
- end
416
-
417
- unless FileTest.exist?(complete_filename)
418
- raise GHAException.new("Cannot find any file (neither `.json.gz` nor `.json`) for #{current_time}")
419
- end
420
-
421
- File.open(complete_filename, mode) do |file|
422
- return self.read_gha_file(file)
423
- end
424
- end
425
- end
426
-
427
- class GHADownloader
428
- include GHAUtils
429
-
430
- def initialize(folder, decompress = false)
431
- @logger = Logger.new(STDERR)
432
- @decompress = decompress
433
- @folder = folder
434
- @max = nil
435
-
436
- Dir.mkdir(@folder) unless FileTest.exist?(@folder)
437
- raise "A file exist with the desired folder name #{folder}" unless FileTest.directory?(@folder)
438
- end
439
-
440
- def max(max)
441
- @max = max
442
- return self
443
- end
444
-
445
- def logger=(logger)
446
- @logger = logger
447
- end
448
-
449
- def download(from = Time.gm(2015, 1, 1), to = Time.now)
450
- archive = []
451
- self.each_time(from, to) do |current_time|
452
- filename = self.get_gha_filename(current_time)
453
- out_filename = filename.clone
454
- out_filename.gsub!(".json.gz", ".json") if @decompress
455
-
456
- target_file = File.join(@folder, out_filename)
457
- if FileTest.exist?(target_file)
458
- @logger.info("Skipping existing file for #{current_time}")
459
- next
460
- else
461
- @logger.info("Downloading file for #{current_time}")
462
- end
463
-
464
- File.open(target_file, 'w') do |f|
465
- URI.open("http://data.gharchive.org/#{filename}") do |gz|
466
- if @decompress
467
- f << self.read_gha_file_content(gz)
468
- else
469
- f << gz.read
470
- end
471
- end
472
- end
473
- archive << target_file
474
-
475
- if @max && archive.size > @max
476
- last = archive.shift
477
- @logger.info("Removing local file #{last}")
478
- File.unlink(last)
479
- end
480
-
481
- yield filename if block_given?
482
- end
483
- end
484
- end
1
+ require_relative 'gh-archive/core'
2
+ require_relative 'gh-archive/providers'
3
+ require_relative 'gh-archive/downloader'
4
+ require_relative 'gh-archive/events'
5
+ require_relative 'gh-archive/entities'
6
+ require_relative 'gh-archive/legacy'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gh-archive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.13'
4
+ version: '0.17'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Simone Scalabrino
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-08-15 00:00:00.000000000 Z
11
+ date: 2021-12-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: code-assertions
@@ -57,8 +57,13 @@ extensions: []
57
57
  extra_rdoc_files: []
58
58
  files:
59
59
  - lib/gh-archive.rb
60
+ - lib/gh-archive/core.rb
61
+ - lib/gh-archive/downloader.rb
60
62
  - lib/gh-archive/entities.rb
61
63
  - lib/gh-archive/events.rb
64
+ - lib/gh-archive/job.rb
65
+ - lib/gh-archive/legacy.rb
66
+ - lib/gh-archive/providers.rb
62
67
  homepage: https://github.com/intersimone999/gh-archive
63
68
  licenses:
64
69
  - GPL-3.0-only
@@ -78,7 +83,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
78
83
  - !ruby/object:Gem::Version
79
84
  version: '0'
80
85
  requirements: []
81
- rubygems_version: 3.2.21
86
+ rubygems_version: 3.2.29
82
87
  signing_key:
83
88
  specification_version: 4
84
89
  summary: GitHub Archive mining utility