gh-archive 0.16 → 0.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1f339fca5ebd3f7ee085fa257b47567993ab776efd3ba143e424a6bab2ca1712
4
- data.tar.gz: bd3709e9067fbc5ba0a7b92f156c3fa6f87d26bc81b0a969f7020526f31b5264
3
+ metadata.gz: c17a920e176289a00fb21fff7b712dc634c4491241ebe11a3b06f0ddd112706d
4
+ data.tar.gz: 58e7e4fdc6442d4a0955bc70d0fb10eec05b437350c255e34731021b2d714deb
5
5
  SHA512:
6
- metadata.gz: 9d56ecf4dc4101cf162d02e49f62dc77592bc652b9933b7413cb40f56eb25c595a20eed8fbc42e1f96830048d16520299972094b0d29b14f400befc5c21e1672
7
- data.tar.gz: 5a7e8c158271b1b540e76e1b68f93ece7cebed8d29057728c6ac93b536f3f3417b89fffddcbe57c80e75269cda9ff719fef84e5bb0558dd52fd8842f80e44f55
6
+ metadata.gz: b9066428d42acd77376fe72082adc62517c21bf8154b60b0006d7f0ab61fd0679af5775444c180d5cc8e842c99b0c207266e25ea28d4f0cdef33e49259339bb3
7
+ data.tar.gz: 036eaa0ead55db627ee8bb4f4a7421a525ab0ae73041045ef6d5b13307b7ed54d36e70532658ec3853cac2f21a7e5d2e80313c2fb688723ea87cd689ea469c9c
data/lib/gh-archive/core.rb ADDED
@@ -0,0 +1,112 @@
1
+ require 'code-assertions'
2
+ require 'json'
3
+ require 'open-uri'
4
+ require 'zlib'
5
+ require 'logger'
6
+ require 'tmpdir'
7
+ require 'thread/pool'
8
+ require 'thread/promise'
9
+
10
+ module GHArchive
11
+ class ThreadPool
12
+ def initialize(size)
13
+ @size = size
14
+ @threads = []
15
+ @queue = []
16
+ @mutex = Mutex.new
17
+
18
+ @consumer_thread = Thread.start do
19
+ while !@shutdown || @threads.size > 0 || @queue.size > 0
20
+ sleep 0.1 if @queue.size == 0 || @threads.size == @size
21
+ @threads.delete_if { |t| !t.alive? }
22
+
23
+ if @threads.size < @size && @queue.size > 0
24
+ @mutex.synchronize do
25
+ args, job = @queue.shift
26
+ @threads << Thread.start(*args, &job)
27
+ end
28
+ end
29
+ end
30
+ end
31
+ end
32
+
33
+ def process(*args, &block)
34
+ raise "Block expected" unless block_given?
35
+ raise "Can not add jobs while shutting down" if @shutdown
36
+
37
+ @mutex.synchronize do
38
+ @queue << [args, block]
39
+ end
40
+
41
+ return self.enqueued
42
+ end
43
+
44
+ def shutdown
45
+ @shutdown = true
46
+ end
47
+
48
+ def shutdown!
49
+ self.shutdown
50
+ @mutex.synchronize do
51
+ @queue.clear
52
+ end
53
+ end
54
+
55
+ def enqueued
56
+ return @queue.size
57
+ end
58
+
59
+ def shutdown?
60
+ @shutdown
61
+ end
62
+
63
+ def alive?
64
+ @consumer_thread.alive?
65
+ end
66
+
67
+ def wait
68
+ while alive?
69
+ sleep 0.1
70
+ end
71
+ end
72
+ end
73
+
74
+ module Utils
75
+ def get_gha_filename(date)
76
+ return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
77
+ end
78
+
79
+ def read_gha_file_content(gz)
80
+ gzip = Zlib::GzipReader.new(gz)
81
+ return gzip.read
82
+ ensure
83
+ gzip.close if gzip
84
+ end
85
+
86
+ def read_gha_file(file)
87
+
88
+ if !file.is_a?(StringIO) && file.path.end_with?(".json")
89
+ content = file.read
90
+ elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
91
+ content = read_gha_file_content(file)
92
+ else
93
+ raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
94
+ end
95
+
96
+ result = []
97
+ content.lines.each do |line|
98
+ result << JSON.parse(line)
99
+ end
100
+
101
+ return result
102
+ end
103
+
104
+ def each_time(from, to)
105
+ current_time = from
106
+ while current_time < to
107
+ yield current_time
108
+ current_time += 3600
109
+ end
110
+ end
111
+ end
112
+ end
data/lib/gh-archive/downloader.rb ADDED
@@ -0,0 +1,62 @@
1
+ require_relative 'core'
2
+
3
+ module GHArchive
4
+ class Downloader
5
+ include Utils
6
+
7
+ def initialize(folder, decompress = false)
8
+ @logger = Logger.new(STDERR)
9
+ @decompress = decompress
10
+ @folder = folder
11
+ @max = nil
12
+
13
+ Dir.mkdir(@folder) unless FileTest.exist?(@folder)
14
+ raise "A file exist with the desired folder name #{folder}" unless FileTest.directory?(@folder)
15
+ end
16
+
17
+ def max(max)
18
+ @max = max
19
+ return self
20
+ end
21
+
22
+ def logger=(logger)
23
+ @logger = logger
24
+ end
25
+
26
+ def download(from = Time.gm(2015, 1, 1), to = Time.now)
27
+ archive = []
28
+ self.each_time(from, to) do |current_time|
29
+ filename = self.get_gha_filename(current_time)
30
+ out_filename = filename.clone
31
+ out_filename.gsub!(".json.gz", ".json") if @decompress
32
+
33
+ target_file = File.join(@folder, out_filename)
34
+ if FileTest.exist?(target_file)
35
+ @logger.info("Skipping existing file for #{current_time}")
36
+ next
37
+ else
38
+ @logger.info("Downloading file for #{current_time}")
39
+ end
40
+
41
+ File.open(target_file, 'w') do |f|
42
+ URI.open("http://data.gharchive.org/#{filename}") do |gz|
43
+ if @decompress
44
+ f << self.read_gha_file_content(gz)
45
+ else
46
+ f << gz.read
47
+ end
48
+ end
49
+ end
50
+ archive << target_file
51
+
52
+ if @max && archive.size > @max
53
+ last = archive.shift
54
+ @logger.info("Removing local file #{last}")
55
+ File.unlink(last)
56
+ end
57
+
58
+ yield filename if block_given?
59
+ end
60
+ end
61
+ end
62
+ end
data/lib/gh-archive/entities.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require 'time'
2
+ require_relative 'core'
2
3
 
3
4
  module GHArchive
4
5
  Repository = Struct.new(:id, :name, :url)
data/lib/gh-archive/events.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  require 'time'
2
- require_relative File.expand_path('../entities', __FILE__)
2
+ require_relative 'entities'
3
3
 
4
4
  module GHArchive
5
5
  class Event
data/lib/gh-archive/job.rb ADDED
@@ -0,0 +1,22 @@
1
+ require 'core'
2
+ require 'providers'
3
+
4
+ module GHArchive
5
+ class Job
6
+ def initialize(provider, from, to)
7
+ @provider = provider
8
+ @from = from
9
+ @to = to
10
+ end
11
+
12
+ def start
13
+ @provider.each(@from, @to) do |event, time|
14
+ run(event, time)
15
+ end
16
+ end
17
+
18
+ def run(event, time)
19
+ raise GHAException, "This is an abstract job, it should be implemented before running"
20
+ end
21
+ end
22
+ end
data/lib/gh-archive/legacy.rb ADDED
@@ -0,0 +1,31 @@
1
+ require_relative 'core'
2
+
3
+ GHAUtils = GHArchive::Utils
4
+
5
+ class GHAProvider < GHArchive::Provider
6
+ def initialize(*args)
7
+ warn "GHAProvider is deprecated. Please use GHArchive::Provider instead."
8
+ super
9
+ end
10
+ end
11
+
12
+ class OnlineGHAProvider < GHArchive::OnlineProvider
13
+ def initialize(*args)
14
+ warn "OnlineGHAProvider is deprecated. Please use GHArchive::OnlineProvider instead."
15
+ super
16
+ end
17
+ end
18
+
19
+ class FolderGHAProvider < GHArchive::FolderProvider
20
+ def initialize(*args)
21
+ warn "FolderGHAProvider is deprecated. Please use GHArchive::FolderProvider instead."
22
+ super
23
+ end
24
+ end
25
+
26
+ class GHADownloader < GHArchive::Downloader
27
+ def initialize(*args)
28
+ warn "FolderGHAProvider is deprecated. Please use GHArchive::FolderProvider instead."
29
+ super
30
+ end
31
+ end
data/lib/gh-archive/providers.rb ADDED
@@ -0,0 +1,326 @@
1
+ require_relative 'core'
2
+
3
+ module GHArchive
4
+ class Provider
5
+ include Utils
6
+
7
+ def initialize
8
+ @logger = Logger.new(STDOUT)
9
+
10
+ @includes = {}
11
+ @excludes = {}
12
+
13
+ @checkpoint_name = nil
14
+ @use_json = true
15
+ end
16
+
17
+ def use_checkpoint(filename)
18
+ @checkpoint_name = filename
19
+
20
+ return self
21
+ end
22
+
23
+ def parse_events
24
+ @use_json = false
25
+
26
+ return self
27
+ end
28
+
29
+ def logger=(logger)
30
+ @logger = logger
31
+
32
+ return self
33
+ end
34
+ alias :use_logger :logger=
35
+
36
+ def get(date)
37
+ raise "Not implemented"
38
+ end
39
+
40
+ def include(**args)
41
+ args.each do |key, value|
42
+ @includes[key.to_s] = [] unless @includes[key.to_s]
43
+ @includes[key.to_s] << value
44
+ end
45
+
46
+ return self
47
+ end
48
+
49
+ def exclude(**args)
50
+ args.each do |key, value|
51
+ @excludes[key.to_s] = [] unless @excludes[key.to_s]
52
+ @excludes[key.to_s] << value
53
+ end
54
+
55
+ return self
56
+ end
57
+
58
+ def restore_checkpoint(from)
59
+ if @checkpoint_name && FileTest.exist?(@checkpoint_name)
60
+ # Note that this throws an exception if the file is not readable. This is the intended behavior.
61
+ # As opposed to that, failing to save the checkpoint information just results in a warning on the log.
62
+ loaded_from = Marshal.load(File.read(@checkpoint_name))
63
+ raise "The loaded checkpoint (#{loaded_from}) occurs before the current from date (#{from})" if loaded_from < from
64
+
65
+ @logger.info("Valid checkpoint loaded. Restored execution from #{loaded_from}.")
66
+
67
+ return loaded_from
68
+ else
69
+ return from
70
+ end
71
+ end
72
+
73
+ def update_checkpoint(current_time)
74
+ if @checkpoint_name
75
+ begin
76
+ File.open(@checkpoint_name, "wb") do |f|
77
+ f.write(Marshal.dump(current_time))
78
+ end
79
+ rescue
80
+ @logger.warn(
81
+ "Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
82
+ )
83
+ end
84
+ end
85
+ end
86
+
87
+ def each(from = Time.gm(2015, 1, 1), to = Time.now)
88
+ exceptions = []
89
+
90
+ from = restore_checkpoint(from)
91
+
92
+ self.each_time(from, to) do |current_time|
93
+ events = []
94
+
95
+ update_checkpoint(current_time)
96
+
97
+ begin
98
+ events = self.get(current_time)
99
+ rescue GHAException => e
100
+ @logger.warn(e.message)
101
+ next
102
+ rescue => e
103
+ @logger.error("An exception occurred for #{current_time}: #{e.message}")
104
+ exceptions << e
105
+ next
106
+ end
107
+
108
+ events.each do |event|
109
+ skip = false
110
+ @includes.each do |key, value|
111
+ skip = true unless value.include?(event[key])
112
+ end
113
+
114
+ @excludes.each do |key, value|
115
+ skip = true if value.include?(event[key])
116
+ end
117
+ next if skip
118
+
119
+ if @use_json
120
+ yield event, current_time
121
+ else
122
+ yield GHArchive::Event.parse(event), current_time
123
+ end
124
+ end
125
+
126
+ @logger.info("Scanned #{current_time}")
127
+
128
+ events.clear
129
+ GC.start
130
+ end
131
+
132
+ update_checkpoint(to)
133
+
134
+ return exceptions
135
+ end
136
+
137
+ class GHAException < Exception
138
+ end
139
+ end
140
+
141
+ class OnlineProvider < Provider
142
+ def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
143
+ super()
144
+
145
+ self.max_retries(max_retries)
146
+ self.proactive(proactive_pool_size) if proactive
147
+
148
+ @cache = Cache.new
149
+ end
150
+
151
+ def max_retries(n)
152
+ @max_retries = n
153
+
154
+ return self
155
+ end
156
+
157
+ def proactive(pool_size = 10)
158
+ @proactive = true
159
+ @pool = GHArchive::ThreadPool.new(pool_size)
160
+
161
+ return self
162
+ end
163
+
164
+ def get(current_time)
165
+ @max_retries.times do
166
+ begin
167
+ filename = self.get_gha_filename(current_time)
168
+
169
+ if @proactive
170
+ @logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
171
+
172
+ while !@cache.has?(filename)
173
+ sleep 1
174
+ end
175
+
176
+ data = @cache.get(filename)
177
+ if data
178
+ return data
179
+ else
180
+ raise DownloadArchiveException, "Could not scan #{filename}: data unavailable."
181
+ end
182
+ else
183
+ URI.open("http://data.gharchive.org/#{filename}") do |gz|
184
+ return self.read_gha_file(gz)
185
+ end
186
+ end
187
+ rescue Errno::ECONNRESET => e
188
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
189
+ next
190
+ rescue OpenURI::HTTPError => e
191
+ code = e.io.status[0]
192
+ if code.start_with?("5")
193
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
194
+ next
195
+ else
196
+ raise e
197
+ end
198
+ end
199
+ end
200
+
201
+ raise DownloadArchiveException, "Exceeded maximum number of tentative downloads for #{current_time}."
202
+ end
203
+
204
+ def cache(current_time)
205
+ @logger.info("Full cache. Waiting for some free slot...") if @cache.full?
206
+ while @cache.full?
207
+ sleep 1
208
+ end
209
+
210
+ filename = self.get_gha_filename(current_time)
211
+ @max_retries.times do
212
+ begin
213
+ URI.open("http://data.gharchive.org/#{filename}") do |gz|
214
+ content = self.read_gha_file(gz)
215
+ @cache.put(filename, content)
216
+ return
217
+ end
218
+ rescue Errno::ECONNRESET => e
219
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
220
+ next
221
+ rescue OpenURI::HTTPError => e
222
+ code = e.io.status[0]
223
+ if code.start_with?("5")
224
+ @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
225
+ next
226
+ elsif code == "404"
227
+ @logger.error("File for #{current_time} not found. Skipping because: " + e.message)
228
+ else
229
+ raise e
230
+ end
231
+ rescue Zlib::GzipFile::Error => e
232
+ @logger.warn("Could not unzip, cache and analyze the zip at #{current_time}: " + e.message)
233
+ end
234
+ end
235
+
236
+ @cache.put(filename, nil) unless @cache.has?(filename)
237
+ end
238
+
239
+ def each(from = Time.gm(2015, 1, 1), to = Time.now)
240
+ if @proactive
241
+ real_from = restore_checkpoint(from)
242
+ any_ready = Thread.promise
243
+
244
+ @logger.info("Proactively scheduling download tasks...")
245
+ self.each_time(real_from, to) do |current_time|
246
+ @pool.process(current_time) do |current_time|
247
+ cache(current_time)
248
+ any_ready << true
249
+ @logger.info("Proactively cached #{current_time}. Cache size: #{@cache.size}")
250
+ end
251
+ end
252
+
253
+ ~any_ready
254
+ @logger.info("Download tasks successfully scheduled!")
255
+ end
256
+
257
+ super
258
+ end
259
+
260
+ class Cache
261
+ def initialize(max_size = 10)
262
+ @cache = {}
263
+ @max_size = max_size
264
+ @mutex = Mutex.new
265
+ end
266
+
267
+ def put(name, content)
268
+ @mutex.synchronize do
269
+ @cache[name] = content
270
+ end
271
+ end
272
+
273
+ def get(name)
274
+ @mutex.synchronize do
275
+ return @cache.delete(name)
276
+ end
277
+ end
278
+
279
+ def size
280
+ @mutex.synchronize do
281
+ return @cache.size
282
+ end
283
+ end
284
+
285
+ def has?(name)
286
+ @mutex.synchronize do
287
+ return @cache.has_key?(name)
288
+ end
289
+ end
290
+
291
+ def full?
292
+ self.size >= @max_size
293
+ end
294
+ end
295
+
296
+ class DownloadArchiveException < Provider::GHAException
297
+ end
298
+ end
299
+
300
+ class FolderProvider < Provider
301
+ def initialize(folder)
302
+ super()
303
+
304
+ @folder = folder
305
+ end
306
+
307
+ def get(current_time)
308
+ filename = self.get_gha_filename(current_time)
309
+ complete_filename = File.join(@folder, filename)
310
+ mode = "rb"
311
+
312
+ unless FileTest.exist?(complete_filename)
313
+ complete_filename = complete_filename.sub(".gz", "")
314
+ mode = "r"
315
+ end
316
+
317
+ unless FileTest.exist?(complete_filename)
318
+ raise GHAException.new("Cannot find any file (neither `.json.gz` nor `.json`) for #{current_time}")
319
+ end
320
+
321
+ File.open(complete_filename, mode) do |file|
322
+ return self.read_gha_file(file)
323
+ end
324
+ end
325
+ end
326
+ end
data/lib/gh-archive.rb CHANGED
@@ -1,496 +1,6 @@
1
- require 'code-assertions'
2
- require 'json'
3
- require 'open-uri'
4
- require 'zlib'
5
- require 'logger'
6
- require 'tmpdir'
7
- require 'thread/pool'
8
- require 'thread/promise'
9
-
10
- require_relative File.expand_path('../gh-archive/events', __FILE__)
11
-
12
- module GHArchive
13
- class ThreadPool
14
- def initialize(size)
15
- @size = size
16
- @threads = []
17
- @queue = []
18
- @mutex = Mutex.new
19
-
20
- @consumer_thread = Thread.start do
21
- while !@shutdown || @threads.size > 0 || @queue.size > 0
22
- sleep 0.1 if @queue.size == 0 || @threads.size == @size
23
- @threads.delete_if { |t| !t.alive? }
24
-
25
- if @threads.size < @size && @queue.size > 0
26
- @mutex.synchronize do
27
- args, job = @queue.shift
28
- @threads << Thread.start(*args, &job)
29
- end
30
- end
31
- end
32
- end
33
- end
34
-
35
- def process(*args, &block)
36
- raise "Block expected" unless block_given?
37
- raise "Can not add jobs while shutting down" if @shutdown
38
-
39
- @mutex.synchronize do
40
- @queue << [args, block]
41
- end
42
-
43
- return self.enqueued
44
- end
45
-
46
- def shutdown
47
- @shutdown = true
48
- end
49
-
50
- def shutdown!
51
- self.shutdown
52
- @mutex.synchronize do
53
- @queue.clear
54
- end
55
- end
56
-
57
- def enqueued
58
- return @queue.size
59
- end
60
-
61
- def shutdown?
62
- @shutdown
63
- end
64
-
65
- def alive?
66
- @consumer_thread.alive?
67
- end
68
-
69
- def wait
70
- while alive?
71
- sleep 0.1
72
- end
73
- end
74
- end
75
- end
76
-
77
- module GHAUtils
78
- def get_gha_filename(date)
79
- return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
80
- end
81
-
82
- def read_gha_file_content(gz)
83
- gzip = Zlib::GzipReader.new(gz)
84
- return gzip.read
85
- ensure
86
- gzip.close if gzip
87
- end
88
-
89
- def read_gha_file(file)
90
-
91
- if !file.is_a?(StringIO) && file.path.end_with?(".json")
92
- content = file.read
93
- elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
94
- content = read_gha_file_content(file)
95
- else
96
- raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
97
- end
98
-
99
- result = []
100
- content.lines.each do |line|
101
- result << JSON.parse(line)
102
- end
103
-
104
- return result
105
- end
106
-
107
- def each_time(from, to)
108
- current_time = from
109
- while current_time < to
110
- yield current_time
111
- current_time += 3600
112
- end
113
- end
114
- end
115
-
116
- class GHAProvider
117
- include GHAUtils
118
-
119
- def initialize
120
- @logger = Logger.new(STDOUT)
121
-
122
- @includes = {}
123
- @excludes = {}
124
-
125
- @checkpoint_name = nil
126
- @use_json = true
127
- end
128
-
129
- def use_checkpoint(filename)
130
- @checkpoint_name = filename
131
-
132
- return self
133
- end
134
-
135
- def parse_events
136
- @use_json = false
137
-
138
- return self
139
- end
140
-
141
- def logger=(logger)
142
- @logger = logger
143
-
144
- return self
145
- end
146
- alias :use_logger :logger=
147
-
148
- def get(date)
149
- raise "Not implemented"
150
- end
151
-
152
- def include(**args)
153
- args.each do |key, value|
154
- @includes[key.to_s] = [] unless @includes[key.to_s]
155
- @includes[key.to_s] << value
156
- end
157
-
158
- return self
159
- end
160
-
161
- def exclude(**args)
162
- args.each do |key, value|
163
- @excludes[key.to_s] = [] unless @excludes[key.to_s]
164
- @excludes[key.to_s] << value
165
- end
166
-
167
- return self
168
- end
169
-
170
- def restore_checkpoint(from)
171
- if @checkpoint_name && FileTest.exist?(@checkpoint_name)
172
- # Note that this throws an exception if the file is not readable. This is the intended behavior.
173
- # As opposed to that, failing to save the checkpoint information just results in a warning on the log.
174
- loaded_from = Marshal.load(File.read(@checkpoint_name))
175
- raise "The loaded checkpoint (#{loaded_from}) occurs before the current from date (#{from})" if loaded_from < from
176
-
177
- @logger.info("Valid checkpoint loaded. Restored execution from #{loaded_from}.")
178
-
179
- return loaded_from
180
- else
181
- return from
182
- end
183
- end
184
-
185
- def update_checkpoint(current_time)
186
- if @checkpoint_name
187
- begin
188
- File.open(@checkpoint_name, "wb") do |f|
189
- f.write(Marshal.dump(current_time))
190
- end
191
- rescue
192
- @logger.warn(
193
- "Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
194
- )
195
- end
196
- end
197
- end
198
-
199
- def each(from = Time.gm(2015, 1, 1), to = Time.now)
200
- exceptions = []
201
-
202
- from = restore_checkpoint(from)
203
-
204
- self.each_time(from, to) do |current_time|
205
- events = []
206
-
207
- update_checkpoint(current_time)
208
-
209
- begin
210
- events = self.get(current_time)
211
- rescue GHAException => e
212
- @logger.warn(e.message)
213
- next
214
- rescue => e
215
- @logger.error("An exception occurred for #{current_time}: #{e.message}")
216
- exceptions << e
217
- next
218
- end
219
-
220
- events.each do |event|
221
- skip = false
222
- @includes.each do |key, value|
223
- skip = true unless value.include?(event[key])
224
- end
225
-
226
- @excludes.each do |key, value|
227
- skip = true if value.include?(event[key])
228
- end
229
- next if skip
230
-
231
- if @use_json
232
- yield event, current_time
233
- else
234
- yield GHArchive::Event.parse(event), current_time
235
- end
236
- end
237
-
238
- @logger.info("Scanned #{current_time}")
239
-
240
- events.clear
241
- GC.start
242
- end
243
-
244
- update_checkpoint(to)
245
-
246
- return exceptions
247
- end
248
-
249
- class GHAException < Exception
250
- end
251
- end
252
-
253
- class OnlineGHAProvider < GHAProvider
254
- def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
255
- super()
256
-
257
- self.max_retries(max_retries)
258
- self.proactive(proactive_pool_size) if proactive
259
-
260
- @cache = Cache.new
261
- end
262
-
263
- def max_retries(n)
264
- @max_retries = n
265
-
266
- return self
267
- end
268
-
269
- def proactive(pool_size = 10)
270
- @proactive = true
271
- @pool = GHArchive::ThreadPool.new(pool_size)
272
-
273
- return self
274
- end
275
-
276
- def get(current_time)
277
- @max_retries.times do
278
- begin
279
- filename = self.get_gha_filename(current_time)
280
-
281
- if @proactive
282
- @logger.info("Waiting for cache to have #{current_time}...") unless @cache.has?(filename)
283
-
284
- while !@cache.has?(filename)
285
- sleep 1
286
- end
287
-
288
- data = @cache.get(filename)
289
- if data
290
- return data
291
- else
292
- raise DownloadArchiveException, "Could not scan #{filename}: data unavailable."
293
- end
294
- else
295
- URI.open("http://data.gharchive.org/#{filename}") do |gz|
296
- return self.read_gha_file(gz)
297
- end
298
- end
299
- rescue Errno::ECONNRESET => e
300
- @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
301
- next
302
- rescue OpenURI::HTTPError => e
303
- code = e.io.status[0]
304
- if code.start_with?("5")
305
- @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
306
- next
307
- else
308
- raise e
309
- end
310
- end
311
- end
312
-
313
- raise DownloadArchiveException, "Exceeded maximum number of tentative downloads for #{current_time}."
314
- end
315
-
316
- def cache(current_time)
317
- @logger.info("Full cache. Waiting for some free slot...") if @cache.full?
318
- while @cache.full?
319
- sleep 1
320
- end
321
-
322
- filename = self.get_gha_filename(current_time)
323
- @max_retries.times do
324
- begin
325
- URI.open("http://data.gharchive.org/#{filename}") do |gz|
326
- content = self.read_gha_file(gz)
327
- @cache.put(filename, content)
328
- return
329
- end
330
- rescue Errno::ECONNRESET => e
331
- @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
332
- next
333
- rescue OpenURI::HTTPError => e
334
- code = e.io.status[0]
335
- if code.start_with?("5")
336
- @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
337
- next
338
- elsif code == "404"
339
- @logger.error("File for #{current_time} not found. Skipping because: " + e.message)
340
- else
341
- raise e
342
- end
343
- rescue Zlib::GzipFile::Error => e
344
- @logger.warn("Could not unzip, cache and analyze the zip at #{current_time}: " + e.message)
345
- end
346
- end
347
-
348
- @cache.put(filename, nil) unless @cache.has?(filename)
349
- end
350
-
351
- def each(from = Time.gm(2015, 1, 1), to = Time.now)
352
- if @proactive
353
- real_from = restore_checkpoint(from)
354
- any_ready = Thread.promise
355
-
356
- @logger.info("Proactively scheduling download tasks...")
357
- self.each_time(real_from, to) do |current_time|
358
- @pool.process(current_time) do |current_time|
359
- cache(current_time)
360
- any_ready << true
361
- @logger.info("Proactively cached #{current_time}. Cache size: #{@cache.size}")
362
- end
363
- end
364
-
365
- ~any_ready
366
- @logger.info("Download tasks successfully scheduled!")
367
- end
368
-
369
- super
370
- end
371
-
372
- class Cache
373
- def initialize(max_size = 10)
374
- @cache = {}
375
- @max_size = max_size
376
- @mutex = Mutex.new
377
- end
378
-
379
- def put(name, content)
380
- @mutex.synchronize do
381
- @cache[name] = content
382
- end
383
- end
384
-
385
- def get(name)
386
- @mutex.synchronize do
387
- return @cache.delete(name)
388
- end
389
- end
390
-
391
- def size
392
- @mutex.synchronize do
393
- return @cache.size
394
- end
395
- end
396
-
397
- def has?(name)
398
- @mutex.synchronize do
399
- return @cache.has_key?(name)
400
- end
401
- end
402
-
403
- def full?
404
- self.size >= @max_size
405
- end
406
- end
407
-
408
- class DownloadArchiveException < GHAProvider::GHAException
409
- end
410
- end
411
-
412
- class FolderGHAProvider < GHAProvider
413
- def initialize(folder)
414
- super()
415
-
416
- @folder = folder
417
- end
418
-
419
- def get(current_time)
420
- filename = self.get_gha_filename(current_time)
421
- complete_filename = File.join(@folder, filename)
422
- mode = "rb"
423
-
424
- unless FileTest.exist?(complete_filename)
425
- complete_filename = complete_filename.sub(".gz", "")
426
- mode = "r"
427
- end
428
-
429
- unless FileTest.exist?(complete_filename)
430
- raise GHAException.new("Cannot find any file (neither `.json.gz` nor `.json`) for #{current_time}")
431
- end
432
-
433
- File.open(complete_filename, mode) do |file|
434
- return self.read_gha_file(file)
435
- end
436
- end
437
- end
438
-
439
- class GHADownloader
440
- include GHAUtils
441
-
442
- def initialize(folder, decompress = false)
443
- @logger = Logger.new(STDERR)
444
- @decompress = decompress
445
- @folder = folder
446
- @max = nil
447
-
448
- Dir.mkdir(@folder) unless FileTest.exist?(@folder)
449
- raise "A file exist with the desired folder name #{folder}" unless FileTest.directory?(@folder)
450
- end
451
-
452
- def max(max)
453
- @max = max
454
- return self
455
- end
456
-
457
- def logger=(logger)
458
- @logger = logger
459
- end
460
-
461
- def download(from = Time.gm(2015, 1, 1), to = Time.now)
462
- archive = []
463
- self.each_time(from, to) do |current_time|
464
- filename = self.get_gha_filename(current_time)
465
- out_filename = filename.clone
466
- out_filename.gsub!(".json.gz", ".json") if @decompress
467
-
468
- target_file = File.join(@folder, out_filename)
469
- if FileTest.exist?(target_file)
470
- @logger.info("Skipping existing file for #{current_time}")
471
- next
472
- else
473
- @logger.info("Downloading file for #{current_time}")
474
- end
475
-
476
- File.open(target_file, 'w') do |f|
477
- URI.open("http://data.gharchive.org/#{filename}") do |gz|
478
- if @decompress
479
- f << self.read_gha_file_content(gz)
480
- else
481
- f << gz.read
482
- end
483
- end
484
- end
485
- archive << target_file
486
-
487
- if @max && archive.size > @max
488
- last = archive.shift
489
- @logger.info("Removing local file #{last}")
490
- File.unlink(last)
491
- end
492
-
493
- yield filename if block_given?
494
- end
495
- end
496
- end
1
+ require_relative 'gh-archive/core'
2
+ require_relative 'gh-archive/providers'
3
+ require_relative 'gh-archive/downloader'
4
+ require_relative 'gh-archive/events'
5
+ require_relative 'gh-archive/entities'
6
+ require_relative 'gh-archive/legacy'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gh-archive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.16'
4
+ version: '0.17'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Simone Scalabrino
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-10-01 00:00:00.000000000 Z
11
+ date: 2021-12-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: code-assertions
@@ -57,8 +57,13 @@ extensions: []
57
57
  extra_rdoc_files: []
58
58
  files:
59
59
  - lib/gh-archive.rb
60
+ - lib/gh-archive/core.rb
61
+ - lib/gh-archive/downloader.rb
60
62
  - lib/gh-archive/entities.rb
61
63
  - lib/gh-archive/events.rb
64
+ - lib/gh-archive/job.rb
65
+ - lib/gh-archive/legacy.rb
66
+ - lib/gh-archive/providers.rb
62
67
  homepage: https://github.com/intersimone999/gh-archive
63
68
  licenses:
64
69
  - GPL-3.0-only
@@ -78,7 +83,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
78
83
  - !ruby/object:Gem::Version
79
84
  version: '0'
80
85
  requirements: []
81
- rubygems_version: 3.2.21
86
+ rubygems_version: 3.2.29
82
87
  signing_key:
83
88
  specification_version: 4
84
89
  summary: GitHub Archive mining utility