gh-archive 0.9 → 0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/gh-archive.rb +117 -34
  3. metadata +6 -6
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 498649edc20ceb260d6468cd7e9ff46781e148205c17cc2c6dd65cd8c7258e8f
4
- data.tar.gz: 6e14d05254a7dcddc3a5afdf1c5e749602fdffc508df70dd1a01ebc97642a78c
3
+ metadata.gz: 74e3037ee1115173176aa974f453f49b7649743f41f83118e9ee180bd620c095
4
+ data.tar.gz: c4c1ca30210ba39204b28b4b3854e2e93c69003ef1ead0f85506f2fe213f0ee9
5
5
  SHA512:
6
- metadata.gz: 6b3156cdeec56577f61961b53c5552670534aaba0de3391daba5a99afc23fd09deb79da47879e2fc326eae4e85bbb180c41ca170540f67aa428c73571e443fad
7
- data.tar.gz: 8129e09a30fd7abe02066ff890dcbd4d48f6965156f92d92582880883a11e508f397358aa39265fc65e7152c2e8204623a417c8689d3096c34e70df411e092f2
6
+ metadata.gz: 289b568dce07aa1f0182c75d26f7ab286a2b48dbf31c9ee63c6a1ef77bf5a59823b17136984380d6cca123c0be70e6cca3cc3dba216a935354d9312eb93aa2fa
7
+ data.tar.gz: 5d9ef4ec34a106e3fb2db37ab173c0b0876637941263f356f2e43440c506ad0471906cd4fa0fbd12bf98c84e44ee2bae0408a96ccdef0a5946c13487e5441204
data/lib/gh-archive.rb CHANGED
@@ -9,6 +9,71 @@ require 'thread/promise'
9
9
 
10
10
  require_relative File.expand_path('../gh-archive/events', __FILE__)
11
11
 
12
+ module GHArchive
13
+ class ThreadPool
14
+ def initialize(size)
15
+ @size = size
16
+ @threads = []
17
+ @queue = []
18
+ @mutex = Mutex.new
19
+
20
+ @consumer_thread = Thread.start do
21
+ while !@shutdown || @threads.size > 0 || @queue.size > 0
22
+ sleep 0.1 if @queue.size == 0 || @threads.size == @size
23
+ @threads.delete_if { |t| !t.alive? }
24
+
25
+ if @threads.size < @size && @queue.size > 0
26
+ @mutex.synchronize do
27
+ args, job = @queue.shift
28
+ @threads << Thread.start(*args, &job)
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
34
+
35
+ def process(*args, &block)
36
+ raise "Block expected" unless block_given?
37
+ raise "Can not add jobs while shutting down" if @shutdown
38
+
39
+ @mutex.synchronize do
40
+ @queue << [args, block]
41
+ end
42
+
43
+ return self.enqueued
44
+ end
45
+
46
+ def shutdown
47
+ @shutdown = true
48
+ end
49
+
50
+ def shutdown!
51
+ self.shutdown
52
+ @mutex.synchronize do
53
+ @queue.clear
54
+ end
55
+ end
56
+
57
+ def enqueued
58
+ return @queue.size
59
+ end
60
+
61
+ def shutdown?
62
+ @shutdown
63
+ end
64
+
65
+ def alive?
66
+ @consumer_thread.alive?
67
+ end
68
+
69
+ def wait
70
+ while alive?
71
+ sleep 0.1
72
+ end
73
+ end
74
+ end
75
+ end
76
+
12
77
  module GHAUtils
13
78
  def get_gha_filename(date)
14
79
  return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
@@ -22,9 +87,10 @@ module GHAUtils
22
87
  end
23
88
 
24
89
  def read_gha_file(file)
25
- if file.path.end_with?(".json")
90
+
91
+ if !file.is_a?(StringIO) && file.path.end_with?(".json")
26
92
  content = file.read
27
- elsif file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
93
+ elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
28
94
  content = read_gha_file_content(file)
29
95
  else
30
96
  raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
@@ -101,9 +167,7 @@ class GHAProvider
101
167
  return self
102
168
  end
103
169
 
104
- def each(from = Time.gm(2015, 1, 1), to = Time.now)
105
- exceptions = []
106
-
170
+ def restore_checkpoint(from)
107
171
  if @checkpoint_name && FileTest.exist?(@checkpoint_name)
108
172
  # Note that this throws an exception if the file is not readable. This is the intended behavior.
109
173
  # As opposed to that, failing to save the checkpoint information just results in a warning on the log.
@@ -111,11 +175,37 @@ class GHAProvider
111
175
  raise "The loaded checkpoint (#{loaded_from}) occurs before the current from date (#{from})" if loaded_from < from
112
176
 
113
177
  @logger.info("Valid checkpoint loaded. Restored execution from #{loaded_from}.")
114
- from = loaded_from
178
+
179
+ return loaded_from
180
+ else
181
+ return from
115
182
  end
183
+ end
184
+
185
+ def update_checkpoint(current_time)
186
+ if @checkpoint_name
187
+ begin
188
+ File.open(@checkpoint_name, "wb") do |f|
189
+ f.write(Marshal.dump(current_time))
190
+ end
191
+ rescue
192
+ @logger.warn(
193
+ "Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
194
+ )
195
+ end
196
+ end
197
+ end
198
+
199
+ def each(from = Time.gm(2015, 1, 1), to = Time.now)
200
+ exceptions = []
201
+
202
+ from = restore_checkpoint(from)
116
203
 
117
204
  self.each_time(from, to) do |current_time|
118
205
  events = []
206
+
207
+ update_checkpoint(current_time)
208
+
119
209
  begin
120
210
  events = self.get(current_time)
121
211
  rescue GHAException => e
@@ -127,18 +217,6 @@ class GHAProvider
127
217
  next
128
218
  end
129
219
 
130
- if @checkpoint_name
131
- begin
132
- File.open(@checkpoint_name, "wb") do |f|
133
- f.write(Marshal.dump(current_time))
134
- end
135
- rescue
136
- @logger.warn(
137
- "Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
138
- )
139
- end
140
- end
141
-
142
220
  events.each do |event|
143
221
  skip = false
144
222
  @includes.each do |key, value|
@@ -163,17 +241,7 @@ class GHAProvider
163
241
  GC.start
164
242
  end
165
243
 
166
- if @checkpoint_name
167
- begin
168
- File.open(@checkpoint_name, "wb") do |f|
169
- f.write(Marshal.dump(to))
170
- end
171
- rescue
172
- @logger.warn(
173
- "Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
174
- )
175
- end
176
- end
244
+ update_checkpoint(to)
177
245
 
178
246
  return exceptions
179
247
  end
@@ -186,13 +254,25 @@ class OnlineGHAProvider < GHAProvider
186
254
  def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
187
255
  super()
188
256
 
189
- @max_retries = max_retries
190
- @proactive = proactive
191
- @proactive_pool_size = proactive_pool_size
192
- @pool = Thread.pool(proactive_pool_size)
257
+ self.max_retries(max_retries)
258
+ self.proactive(proactive_pool_size) if proactive
259
+
193
260
  @cache = Cache.new
194
261
  end
195
262
 
263
+ def max_retries(n)
264
+ @max_retries = n
265
+
266
+ return self
267
+ end
268
+
269
+ def proactive(pool_size = 10)
270
+ @proactive = true
271
+ @pool = GHArchive::ThreadPool.new(pool_size)
272
+
273
+ return self
274
+ end
275
+
196
276
  def get(current_time)
197
277
  @max_retries.times do
198
278
  begin
@@ -252,16 +332,19 @@ class OnlineGHAProvider < GHAProvider
252
332
  else
253
333
  raise e
254
334
  end
335
+ rescue Zlib::GzipFile::Error => e
336
+ @logger.warn("Could not unzip, cache and analyze the zip at #{current_time}: " + e.message)
255
337
  end
256
338
  end
257
339
  end
258
340
 
259
341
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
260
342
  if @proactive
343
+ real_from = restore_checkpoint(from)
261
344
  any_ready = Thread.promise
262
345
 
263
346
  @logger.info("Proactively scheduling download tasks...")
264
- self.each_time(from, to) do |current_time|
347
+ self.each_time(real_from, to) do |current_time|
265
348
  @pool.process(current_time) do |current_time|
266
349
  cache(current_time)
267
350
  any_ready << true
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gh-archive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.9'
4
+ version: '0.13'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Simone Scalabrino
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-08-14 00:00:00.000000000 Z
11
+ date: 2021-08-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: code-assertions
@@ -63,7 +63,7 @@ homepage: https://github.com/intersimone999/gh-archive
63
63
  licenses:
64
64
  - GPL-3.0-only
65
65
  metadata: {}
66
- post_install_message:
66
+ post_install_message:
67
67
  rdoc_options: []
68
68
  require_paths:
69
69
  - lib
@@ -78,8 +78,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
78
78
  - !ruby/object:Gem::Version
79
79
  version: '0'
80
80
  requirements: []
81
- rubygems_version: 3.2.22
82
- signing_key:
81
+ rubygems_version: 3.2.21
82
+ signing_key:
83
83
  specification_version: 4
84
84
  summary: GitHub Archive mining utility
85
85
  test_files: []