gh-archive 0.10 → 0.14

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/gh-archive.rb +91 -8
  3. metadata +6 -6
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b1a8f0a5088aa3d598466d5ef3ba3472b5e9b0c2ad3327f291c24045c01bdfba
4
- data.tar.gz: b3df61810f7634eb8e9a153a812fa2073078975ee0ca303bfb9fb87e58f08160
3
+ metadata.gz: '09dad47f83d5fe7255be12f2aaa380331de9fc34380bc9c02d6288d7216c7778'
4
+ data.tar.gz: c65569657989ad1df63e16e0ca50dfb91c97674465a767099b8bcb207aad8d16
5
5
  SHA512:
6
- metadata.gz: f5462029f6ef8e32f632bfdb7552606cf47a924e019502288ee2d841e28e76a5899d944d912a41933834e08e7a813c88cc919a7e93dba42b1c3250fcd911ae6e
7
- data.tar.gz: 1f180c6640e38423ac2cde87ee4181cf04db8bb0a5cc6f861fd39d5de0c5bc4d30df87524f594482c170d591c5cd063abfa539b5715a4850056ab5262acef965
6
+ metadata.gz: 5c4ad892837193a4665bb22241480db59bb1a3757b0cbde72cfef6e4ff242006c60591a83183b02215e6e05aa3efaa79d2ce71d6e802dbb5b5d210a1f0b462f7
7
+ data.tar.gz: 24161291f01c2fb86f367f7c3b03045e4a8853433d3a4d0075a131adf2dba65f337808b5cac9480d131c9afae4b346c5904d99f630b1aa8115bb4a285832aedf
data/lib/gh-archive.rb CHANGED
@@ -9,6 +9,71 @@ require 'thread/promise'
9
9
 
10
10
  require_relative File.expand_path('../gh-archive/events', __FILE__)
11
11
 
12
module GHArchive
  # Small bounded thread pool.
  #
  # Jobs are queued with #process; a single background "consumer" thread
  # starts one fresh worker thread per job, never keeping more than `size`
  # workers alive at once. The consumer polls instead of blocking, so there
  # can be a delay of up to POLL_INTERVAL seconds before a queued job starts.
  class ThreadPool
    # Seconds the consumer (and #wait) sleep between polls.
    POLL_INTERVAL = 0.1

    # @param size [Integer] maximum number of worker threads alive at once
    def initialize(size)
      @size = size
      @threads = []      # live workers; only touched by the consumer thread
      @queue = []        # pending [args, block] pairs; guarded by @mutex
      @mutex = Mutex.new
      @shutdown = false  # explicit so #shutdown? answers false, not nil

      @consumer_thread = Thread.start { consume }
    end

    # Enqueues a job. The block is later run in its own thread and receives
    # +args+ as its block arguments.
    #
    # @return [Integer] number of jobs waiting in the queue after the insert
    # @raise [RuntimeError] if no block is given or the pool is shutting down
    def process(*args, &block)
      raise "Block expected" unless block_given?

      # The shutdown check and the insert happen under the same lock the
      # consumer uses to decide whether it may exit, so a job is either
      # rejected or guaranteed to be seen by the consumer.
      @mutex.synchronize do
        raise "Can not add jobs while shutting down" if @shutdown

        @queue << [args, block]
        @queue.size
      end
    end

    # Stops accepting new jobs. Jobs already queued or running still complete.
    def shutdown
      @shutdown = true
    end

    # Stops accepting new jobs and discards the ones that have not started.
    # Jobs that are already running are left to finish.
    def shutdown!
      self.shutdown
      @mutex.synchronize do
        @queue.clear
      end
    end

    # @return [Integer] number of jobs queued but not yet started
    def enqueued
      @queue.size
    end

    # @return [Boolean] whether #shutdown or #shutdown! has been called
    def shutdown?
      @shutdown
    end

    # @return [Boolean] whether the consumer thread is still running
    def alive?
      @consumer_thread.alive?
    end

    # Blocks until the consumer thread has terminated, i.e. until the pool
    # has been shut down and every queued/running job has finished.
    def wait
      sleep POLL_INTERVAL while alive?
    end

    private

    # Consumer loop: reap finished workers, exit once shut down and idle,
    # otherwise start the next job or sleep when nothing can be started.
    def consume
      loop do
        @threads.delete_if { |t| !t.alive? }
        break if drained?

        args, job = next_job
        if job
          @threads << Thread.start(*args, &job)
        else
          sleep POLL_INTERVAL
        end
      end
    end

    # True once the pool is shut down with nothing queued and nothing running.
    def drained?
      @mutex.synchronize { @shutdown && @queue.empty? } && @threads.empty?
    end

    # Pops the next [args, block] pair, or returns nil when the queue is
    # empty or all worker slots are busy. Checking and shifting under one
    # lock means a concurrent #shutdown! can no longer leave us with a nil job.
    def next_job
      @mutex.synchronize do
        @queue.shift if @threads.size < @size
      end
    end
  end
end
76
+
12
77
  module GHAUtils
13
78
  def get_gha_filename(date)
14
79
  return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
@@ -22,9 +87,10 @@ module GHAUtils
22
87
  end
23
88
 
24
89
  def read_gha_file(file)
25
- if file.path.end_with?(".json")
90
+
91
+ if !file.is_a?(StringIO) && file.path.end_with?(".json")
26
92
  content = file.read
27
- elsif file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
93
+ elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
28
94
  content = read_gha_file_content(file)
29
95
  else
30
96
  raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
@@ -137,6 +203,9 @@ class GHAProvider
137
203
 
138
204
  self.each_time(from, to) do |current_time|
139
205
  events = []
206
+
207
+ update_checkpoint(current_time)
208
+
140
209
  begin
141
210
  events = self.get(current_time)
142
211
  rescue GHAException => e
@@ -148,8 +217,6 @@ class GHAProvider
148
217
  next
149
218
  end
150
219
 
151
- update_checkpoint(current_time)
152
-
153
220
  events.each do |event|
154
221
  skip = false
155
222
  @includes.each do |key, value|
@@ -187,13 +254,25 @@ class OnlineGHAProvider < GHAProvider
187
254
  def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
188
255
  super()
189
256
 
190
- @max_retries = max_retries
191
- @proactive = proactive
192
- @proactive_pool_size = proactive_pool_size
193
- @pool = Thread.pool(proactive_pool_size)
257
+ self.max_retries(max_retries)
258
+ self.proactive(proactive_pool_size) if proactive
259
+
194
260
  @cache = Cache.new
195
261
  end
196
262
 
263
# Sets the retry count stored in @max_retries.
#
# @param n [Integer] number of attempts (used as `n.times` by #get)
# @return [self] to allow call chaining
def max_retries(n)
  @max_retries = n

  self
end

# Enables proactive mode and creates the thread pool that backs it.
#
# @param pool_size [Integer] size passed to GHArchive::ThreadPool.new
# @return [self] to allow call chaining
def proactive(pool_size = 10)
  # A ThreadPool's consumer thread keeps polling until the pool is shut
  # down, so retire any pool from an earlier call before replacing it;
  # otherwise that thread would be leaked. Jobs it already holds still run.
  @pool.shutdown if @pool

  @proactive = true
  @pool = GHArchive::ThreadPool.new(pool_size)

  self
end
275
+
197
276
  def get(current_time)
198
277
  @max_retries.times do
199
278
  begin
@@ -250,9 +329,13 @@ class OnlineGHAProvider < GHAProvider
250
329
  if code.start_with?("5")
251
330
  @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
252
331
  next
332
+ elsif code == "404"
333
+ @logger.error("File for #{current_time} not found. Skipping because: " + e.message)
253
334
  else
254
335
  raise e
255
336
  end
337
+ rescue Zlib::GzipFile::Error => e
338
+ @logger.warn("Could not unzip, cache and analyze the zip at #{current_time}: " + e.message)
256
339
  end
257
340
  end
258
341
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gh-archive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.10'
4
+ version: '0.14'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Simone Scalabrino
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-08-15 00:00:00.000000000 Z
11
+ date: 2021-09-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: code-assertions
@@ -63,7 +63,7 @@ homepage: https://github.com/intersimone999/gh-archive
63
63
  licenses:
64
64
  - GPL-3.0-only
65
65
  metadata: {}
66
- post_install_message:
66
+ post_install_message:
67
67
  rdoc_options: []
68
68
  require_paths:
69
69
  - lib
@@ -78,8 +78,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
78
78
  - !ruby/object:Gem::Version
79
79
  version: '0'
80
80
  requirements: []
81
- rubygems_version: 3.2.22
82
- signing_key:
81
+ rubygems_version: 3.2.21
82
+ signing_key:
83
83
  specification_version: 4
84
84
  summary: GitHub Archive mining utility
85
85
  test_files: []