gh-archive 0.9 → 0.13

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/gh-archive.rb +117 -34
  3. metadata +6 -6
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 498649edc20ceb260d6468cd7e9ff46781e148205c17cc2c6dd65cd8c7258e8f
4
- data.tar.gz: 6e14d05254a7dcddc3a5afdf1c5e749602fdffc508df70dd1a01ebc97642a78c
3
+ metadata.gz: 74e3037ee1115173176aa974f453f49b7649743f41f83118e9ee180bd620c095
4
+ data.tar.gz: c4c1ca30210ba39204b28b4b3854e2e93c69003ef1ead0f85506f2fe213f0ee9
5
5
  SHA512:
6
- metadata.gz: 6b3156cdeec56577f61961b53c5552670534aaba0de3391daba5a99afc23fd09deb79da47879e2fc326eae4e85bbb180c41ca170540f67aa428c73571e443fad
7
- data.tar.gz: 8129e09a30fd7abe02066ff890dcbd4d48f6965156f92d92582880883a11e508f397358aa39265fc65e7152c2e8204623a417c8689d3096c34e70df411e092f2
6
+ metadata.gz: 289b568dce07aa1f0182c75d26f7ab286a2b48dbf31c9ee63c6a1ef77bf5a59823b17136984380d6cca123c0be70e6cca3cc3dba216a935354d9312eb93aa2fa
7
+ data.tar.gz: 5d9ef4ec34a106e3fb2db37ab173c0b0876637941263f356f2e43440c506ad0471906cd4fa0fbd12bf98c84e44ee2bae0408a96ccdef0a5946c13487e5441204
data/lib/gh-archive.rb CHANGED
@@ -9,6 +9,71 @@ require 'thread/promise'
9
9
 
10
10
  require_relative File.expand_path('../gh-archive/events', __FILE__)
11
11
 
12
+ module GHArchive
13
+ class ThreadPool
14
+ def initialize(size)
15
+ @size = size
16
+ @threads = []
17
+ @queue = []
18
+ @mutex = Mutex.new
19
+
20
+ @consumer_thread = Thread.start do
21
+ while !@shutdown || @threads.size > 0 || @queue.size > 0
22
+ sleep 0.1 if @queue.size == 0 || @threads.size == @size
23
+ @threads.delete_if { |t| !t.alive? }
24
+
25
+ if @threads.size < @size && @queue.size > 0
26
+ @mutex.synchronize do
27
+ args, job = @queue.shift
28
+ @threads << Thread.start(*args, &job)
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
34
+
35
+ def process(*args, &block)
36
+ raise "Block expected" unless block_given?
37
+ raise "Can not add jobs while shutting down" if @shutdown
38
+
39
+ @mutex.synchronize do
40
+ @queue << [args, block]
41
+ end
42
+
43
+ return self.enqueued
44
+ end
45
+
46
+ def shutdown
47
+ @shutdown = true
48
+ end
49
+
50
+ def shutdown!
51
+ self.shutdown
52
+ @mutex.synchronize do
53
+ @queue.clear
54
+ end
55
+ end
56
+
57
+ def enqueued
58
+ return @queue.size
59
+ end
60
+
61
+ def shutdown?
62
+ @shutdown
63
+ end
64
+
65
+ def alive?
66
+ @consumer_thread.alive?
67
+ end
68
+
69
+ def wait
70
+ while alive?
71
+ sleep 0.1
72
+ end
73
+ end
74
+ end
75
+ end
76
+
12
77
  module GHAUtils
13
78
  def get_gha_filename(date)
14
79
  return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
@@ -22,9 +87,10 @@ module GHAUtils
22
87
  end
23
88
 
24
89
  def read_gha_file(file)
25
- if file.path.end_with?(".json")
90
+
91
+ if !file.is_a?(StringIO) && file.path.end_with?(".json")
26
92
  content = file.read
27
- elsif file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
93
+ elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
28
94
  content = read_gha_file_content(file)
29
95
  else
30
96
  raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
@@ -101,9 +167,7 @@ class GHAProvider
101
167
  return self
102
168
  end
103
169
 
104
- def each(from = Time.gm(2015, 1, 1), to = Time.now)
105
- exceptions = []
106
-
170
+ def restore_checkpoint(from)
107
171
  if @checkpoint_name && FileTest.exist?(@checkpoint_name)
108
172
  # Note that this throws an exception if the file is not readable. This is the intended behavior.
109
173
  # As opposed to that, failing to save the checkpoint information just results in a warning on the log.
@@ -111,11 +175,37 @@ class GHAProvider
111
175
  raise "The loaded checkpoint (#{loaded_from}) occurs before the current from date (#{from})" if loaded_from < from
112
176
 
113
177
  @logger.info("Valid checkpoint loaded. Restored execution from #{loaded_from}.")
114
- from = loaded_from
178
+
179
+ return loaded_from
180
+ else
181
+ return from
115
182
  end
183
+ end
184
+
185
+ def update_checkpoint(current_time)
186
+ if @checkpoint_name
187
+ begin
188
+ File.open(@checkpoint_name, "wb") do |f|
189
+ f.write(Marshal.dump(current_time))
190
+ end
191
+ rescue
192
+ @logger.warn(
193
+ "Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
194
+ )
195
+ end
196
+ end
197
+ end
198
+
199
+ def each(from = Time.gm(2015, 1, 1), to = Time.now)
200
+ exceptions = []
201
+
202
+ from = restore_checkpoint(from)
116
203
 
117
204
  self.each_time(from, to) do |current_time|
118
205
  events = []
206
+
207
+ update_checkpoint(current_time)
208
+
119
209
  begin
120
210
  events = self.get(current_time)
121
211
  rescue GHAException => e
@@ -127,18 +217,6 @@ class GHAProvider
127
217
  next
128
218
  end
129
219
 
130
- if @checkpoint_name
131
- begin
132
- File.open(@checkpoint_name, "wb") do |f|
133
- f.write(Marshal.dump(current_time))
134
- end
135
- rescue
136
- @logger.warn(
137
- "Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
138
- )
139
- end
140
- end
141
-
142
220
  events.each do |event|
143
221
  skip = false
144
222
  @includes.each do |key, value|
@@ -163,17 +241,7 @@ class GHAProvider
163
241
  GC.start
164
242
  end
165
243
 
166
- if @checkpoint_name
167
- begin
168
- File.open(@checkpoint_name, "wb") do |f|
169
- f.write(Marshal.dump(to))
170
- end
171
- rescue
172
- @logger.warn(
173
- "Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
174
- )
175
- end
176
- end
244
+ update_checkpoint(to)
177
245
 
178
246
  return exceptions
179
247
  end
@@ -186,13 +254,25 @@ class OnlineGHAProvider < GHAProvider
186
254
  def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
187
255
  super()
188
256
 
189
- @max_retries = max_retries
190
- @proactive = proactive
191
- @proactive_pool_size = proactive_pool_size
192
- @pool = Thread.pool(proactive_pool_size)
257
+ self.max_retries(max_retries)
258
+ self.proactive(proactive_pool_size) if proactive
259
+
193
260
  @cache = Cache.new
194
261
  end
195
262
 
263
+ def max_retries(n)
264
+ @max_retries = n
265
+
266
+ return self
267
+ end
268
+
269
+ def proactive(pool_size = 10)
270
+ @proactive = true
271
+ @pool = GHArchive::ThreadPool.new(pool_size)
272
+
273
+ return self
274
+ end
275
+
196
276
  def get(current_time)
197
277
  @max_retries.times do
198
278
  begin
@@ -252,16 +332,19 @@ class OnlineGHAProvider < GHAProvider
252
332
  else
253
333
  raise e
254
334
  end
335
+ rescue Zlib::GzipFile::Error => e
336
+ @logger.warn("Could not unzip, cache and analyze the zip at #{current_time}: " + e.message)
255
337
  end
256
338
  end
257
339
  end
258
340
 
259
341
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
260
342
  if @proactive
343
+ real_from = restore_checkpoint(from)
261
344
  any_ready = Thread.promise
262
345
 
263
346
  @logger.info("Proactively scheduling download tasks...")
264
- self.each_time(from, to) do |current_time|
347
+ self.each_time(real_from, to) do |current_time|
265
348
  @pool.process(current_time) do |current_time|
266
349
  cache(current_time)
267
350
  any_ready << true
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gh-archive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.9'
4
+ version: '0.13'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Simone Scalabrino
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-08-14 00:00:00.000000000 Z
11
+ date: 2021-08-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: code-assertions
@@ -63,7 +63,7 @@ homepage: https://github.com/intersimone999/gh-archive
63
63
  licenses:
64
64
  - GPL-3.0-only
65
65
  metadata: {}
66
- post_install_message:
66
+ post_install_message:
67
67
  rdoc_options: []
68
68
  require_paths:
69
69
  - lib
@@ -78,8 +78,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
78
78
  - !ruby/object:Gem::Version
79
79
  version: '0'
80
80
  requirements: []
81
- rubygems_version: 3.2.22
82
- signing_key:
81
+ rubygems_version: 3.2.21
82
+ signing_key:
83
83
  specification_version: 4
84
84
  summary: GitHub Archive mining utility
85
85
  test_files: []