gh-archive 0.10 → 0.14
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/gh-archive.rb +91 -8
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '09dad47f83d5fe7255be12f2aaa380331de9fc34380bc9c02d6288d7216c7778'
|
4
|
+
data.tar.gz: c65569657989ad1df63e16e0ca50dfb91c97674465a767099b8bcb207aad8d16
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5c4ad892837193a4665bb22241480db59bb1a3757b0cbde72cfef6e4ff242006c60591a83183b02215e6e05aa3efaa79d2ce71d6e802dbb5b5d210a1f0b462f7
|
7
|
+
data.tar.gz: 24161291f01c2fb86f367f7c3b03045e4a8853433d3a4d0075a131adf2dba65f337808b5cac9480d131c9afae4b346c5904d99f630b1aa8115bb4a285832aedf
|
data/lib/gh-archive.rb
CHANGED
@@ -9,6 +9,71 @@ require 'thread/promise'
|
|
9
9
|
|
10
10
|
require_relative File.expand_path('../gh-archive/events', __FILE__)
|
11
11
|
|
12
|
+
module GHArchive
|
13
|
+
class ThreadPool
|
14
|
+
def initialize(size)
|
15
|
+
@size = size
|
16
|
+
@threads = []
|
17
|
+
@queue = []
|
18
|
+
@mutex = Mutex.new
|
19
|
+
|
20
|
+
@consumer_thread = Thread.start do
|
21
|
+
while !@shutdown || @threads.size > 0 || @queue.size > 0
|
22
|
+
sleep 0.1 if @queue.size == 0 || @threads.size == @size
|
23
|
+
@threads.delete_if { |t| !t.alive? }
|
24
|
+
|
25
|
+
if @threads.size < @size && @queue.size > 0
|
26
|
+
@mutex.synchronize do
|
27
|
+
args, job = @queue.shift
|
28
|
+
@threads << Thread.start(*args, &job)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
def process(*args, &block)
|
36
|
+
raise "Block expected" unless block_given?
|
37
|
+
raise "Can not add jobs while shutting down" if @shutdown
|
38
|
+
|
39
|
+
@mutex.synchronize do
|
40
|
+
@queue << [args, block]
|
41
|
+
end
|
42
|
+
|
43
|
+
return self.enqueued
|
44
|
+
end
|
45
|
+
|
46
|
+
def shutdown
|
47
|
+
@shutdown = true
|
48
|
+
end
|
49
|
+
|
50
|
+
def shutdown!
|
51
|
+
self.shutdown
|
52
|
+
@mutex.synchronize do
|
53
|
+
@queue.clear
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
def enqueued
|
58
|
+
return @queue.size
|
59
|
+
end
|
60
|
+
|
61
|
+
def shutdown?
|
62
|
+
@shutdown
|
63
|
+
end
|
64
|
+
|
65
|
+
def alive?
|
66
|
+
@consumer_thread.alive?
|
67
|
+
end
|
68
|
+
|
69
|
+
def wait
|
70
|
+
while alive?
|
71
|
+
sleep 0.1
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
12
77
|
module GHAUtils
|
13
78
|
def get_gha_filename(date)
|
14
79
|
return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
|
@@ -22,9 +87,10 @@ module GHAUtils
|
|
22
87
|
end
|
23
88
|
|
24
89
|
def read_gha_file(file)
|
25
|
-
|
90
|
+
|
91
|
+
if !file.is_a?(StringIO) && file.path.end_with?(".json")
|
26
92
|
content = file.read
|
27
|
-
elsif file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
|
93
|
+
elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
|
28
94
|
content = read_gha_file_content(file)
|
29
95
|
else
|
30
96
|
raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
|
@@ -137,6 +203,9 @@ class GHAProvider
|
|
137
203
|
|
138
204
|
self.each_time(from, to) do |current_time|
|
139
205
|
events = []
|
206
|
+
|
207
|
+
update_checkpoint(current_time)
|
208
|
+
|
140
209
|
begin
|
141
210
|
events = self.get(current_time)
|
142
211
|
rescue GHAException => e
|
@@ -148,8 +217,6 @@ class GHAProvider
|
|
148
217
|
next
|
149
218
|
end
|
150
219
|
|
151
|
-
update_checkpoint(current_time)
|
152
|
-
|
153
220
|
events.each do |event|
|
154
221
|
skip = false
|
155
222
|
@includes.each do |key, value|
|
@@ -187,13 +254,25 @@ class OnlineGHAProvider < GHAProvider
|
|
187
254
|
def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
|
188
255
|
super()
|
189
256
|
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
@pool = Thread.pool(proactive_pool_size)
|
257
|
+
self.max_retries(max_retries)
|
258
|
+
self.proactive(proactive_pool_size) if proactive
|
259
|
+
|
194
260
|
@cache = Cache.new
|
195
261
|
end
|
196
262
|
|
263
|
+
def max_retries(n)
|
264
|
+
@max_retries = n
|
265
|
+
|
266
|
+
return self
|
267
|
+
end
|
268
|
+
|
269
|
+
def proactive(pool_size = 10)
|
270
|
+
@proactive = true
|
271
|
+
@pool = GHArchive::ThreadPool.new(pool_size)
|
272
|
+
|
273
|
+
return self
|
274
|
+
end
|
275
|
+
|
197
276
|
def get(current_time)
|
198
277
|
@max_retries.times do
|
199
278
|
begin
|
@@ -250,9 +329,13 @@ class OnlineGHAProvider < GHAProvider
|
|
250
329
|
if code.start_with?("5")
|
251
330
|
@logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
|
252
331
|
next
|
332
|
+
elsif code == "404"
|
333
|
+
@logger.error("File for #{current_time} not found. Skipping because: " + e.message)
|
253
334
|
else
|
254
335
|
raise e
|
255
336
|
end
|
337
|
+
rescue Zlib::GzipFile::Error => e
|
338
|
+
@logger.warn("Could not unzip, cache and analyze the zip at #{current_time}: " + e.message)
|
256
339
|
end
|
257
340
|
end
|
258
341
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gh-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.10'
|
4
|
+
version: '0.14'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Simone Scalabrino
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-09-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: code-assertions
|
@@ -63,7 +63,7 @@ homepage: https://github.com/intersimone999/gh-archive
|
|
63
63
|
licenses:
|
64
64
|
- GPL-3.0-only
|
65
65
|
metadata: {}
|
66
|
-
post_install_message:
|
66
|
+
post_install_message:
|
67
67
|
rdoc_options: []
|
68
68
|
require_paths:
|
69
69
|
- lib
|
@@ -78,8 +78,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
78
78
|
- !ruby/object:Gem::Version
|
79
79
|
version: '0'
|
80
80
|
requirements: []
|
81
|
-
rubygems_version: 3.2.
|
82
|
-
signing_key:
|
81
|
+
rubygems_version: 3.2.21
|
82
|
+
signing_key:
|
83
83
|
specification_version: 4
|
84
84
|
summary: GitHub Archive mining utility
|
85
85
|
test_files: []
|