gh-archive 0.10 → 0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/gh-archive.rb +91 -8
  3. metadata +6 -6
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b1a8f0a5088aa3d598466d5ef3ba3472b5e9b0c2ad3327f291c24045c01bdfba
4
- data.tar.gz: b3df61810f7634eb8e9a153a812fa2073078975ee0ca303bfb9fb87e58f08160
3
+ metadata.gz: '09dad47f83d5fe7255be12f2aaa380331de9fc34380bc9c02d6288d7216c7778'
4
+ data.tar.gz: c65569657989ad1df63e16e0ca50dfb91c97674465a767099b8bcb207aad8d16
5
5
  SHA512:
6
- metadata.gz: f5462029f6ef8e32f632bfdb7552606cf47a924e019502288ee2d841e28e76a5899d944d912a41933834e08e7a813c88cc919a7e93dba42b1c3250fcd911ae6e
7
- data.tar.gz: 1f180c6640e38423ac2cde87ee4181cf04db8bb0a5cc6f861fd39d5de0c5bc4d30df87524f594482c170d591c5cd063abfa539b5715a4850056ab5262acef965
6
+ metadata.gz: 5c4ad892837193a4665bb22241480db59bb1a3757b0cbde72cfef6e4ff242006c60591a83183b02215e6e05aa3efaa79d2ce71d6e802dbb5b5d210a1f0b462f7
7
+ data.tar.gz: 24161291f01c2fb86f367f7c3b03045e4a8853433d3a4d0075a131adf2dba65f337808b5cac9480d131c9afae4b346c5904d99f630b1aa8115bb4a285832aedf
data/lib/gh-archive.rb CHANGED
@@ -9,6 +9,71 @@ require 'thread/promise'
9
9
 
10
10
  require_relative File.expand_path('../gh-archive/events', __FILE__)
11
11
 
12
+ module GHArchive
13
+ class ThreadPool
14
+ def initialize(size)
15
+ @size = size
16
+ @threads = []
17
+ @queue = []
18
+ @mutex = Mutex.new
19
+
20
+ @consumer_thread = Thread.start do
21
+ while !@shutdown || @threads.size > 0 || @queue.size > 0
22
+ sleep 0.1 if @queue.size == 0 || @threads.size == @size
23
+ @threads.delete_if { |t| !t.alive? }
24
+
25
+ if @threads.size < @size && @queue.size > 0
26
+ @mutex.synchronize do
27
+ args, job = @queue.shift
28
+ @threads << Thread.start(*args, &job)
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
34
+
35
+ def process(*args, &block)
36
+ raise "Block expected" unless block_given?
37
+ raise "Can not add jobs while shutting down" if @shutdown
38
+
39
+ @mutex.synchronize do
40
+ @queue << [args, block]
41
+ end
42
+
43
+ return self.enqueued
44
+ end
45
+
46
+ def shutdown
47
+ @shutdown = true
48
+ end
49
+
50
+ def shutdown!
51
+ self.shutdown
52
+ @mutex.synchronize do
53
+ @queue.clear
54
+ end
55
+ end
56
+
57
+ def enqueued
58
+ return @queue.size
59
+ end
60
+
61
+ def shutdown?
62
+ @shutdown
63
+ end
64
+
65
+ def alive?
66
+ @consumer_thread.alive?
67
+ end
68
+
69
+ def wait
70
+ while alive?
71
+ sleep 0.1
72
+ end
73
+ end
74
+ end
75
+ end
76
+
12
77
  module GHAUtils
13
78
  def get_gha_filename(date)
14
79
  return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
@@ -22,9 +87,10 @@ module GHAUtils
22
87
  end
23
88
 
24
89
  def read_gha_file(file)
25
- if file.path.end_with?(".json")
90
+
91
+ if !file.is_a?(StringIO) && file.path.end_with?(".json")
26
92
  content = file.read
27
- elsif file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
93
+ elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
28
94
  content = read_gha_file_content(file)
29
95
  else
30
96
  raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
@@ -137,6 +203,9 @@ class GHAProvider
137
203
 
138
204
  self.each_time(from, to) do |current_time|
139
205
  events = []
206
+
207
+ update_checkpoint(current_time)
208
+
140
209
  begin
141
210
  events = self.get(current_time)
142
211
  rescue GHAException => e
@@ -148,8 +217,6 @@ class GHAProvider
148
217
  next
149
218
  end
150
219
 
151
- update_checkpoint(current_time)
152
-
153
220
  events.each do |event|
154
221
  skip = false
155
222
  @includes.each do |key, value|
@@ -187,13 +254,25 @@ class OnlineGHAProvider < GHAProvider
187
254
  def initialize(max_retries = 3, proactive = false, proactive_pool_size = 10)
188
255
  super()
189
256
 
190
- @max_retries = max_retries
191
- @proactive = proactive
192
- @proactive_pool_size = proactive_pool_size
193
- @pool = Thread.pool(proactive_pool_size)
257
+ self.max_retries(max_retries)
258
+ self.proactive(proactive_pool_size) if proactive
259
+
194
260
  @cache = Cache.new
195
261
  end
196
262
 
263
+ def max_retries(n)
264
+ @max_retries = n
265
+
266
+ return self
267
+ end
268
+
269
+ def proactive(pool_size = 10)
270
+ @proactive = true
271
+ @pool = GHArchive::ThreadPool.new(pool_size)
272
+
273
+ return self
274
+ end
275
+
197
276
  def get(current_time)
198
277
  @max_retries.times do
199
278
  begin
@@ -250,9 +329,13 @@ class OnlineGHAProvider < GHAProvider
250
329
  if code.start_with?("5")
251
330
  @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
252
331
  next
332
+ elsif code == "404"
333
+ @logger.error("File for #{current_time} not found. Skipping because: " + e.message)
253
334
  else
254
335
  raise e
255
336
  end
337
+ rescue Zlib::GzipFile::Error => e
338
+ @logger.warn("Could not unzip, cache and analyze the zip at #{current_time}: " + e.message)
256
339
  end
257
340
  end
258
341
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gh-archive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.10'
4
+ version: '0.14'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Simone Scalabrino
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-08-15 00:00:00.000000000 Z
11
+ date: 2021-09-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: code-assertions
@@ -63,7 +63,7 @@ homepage: https://github.com/intersimone999/gh-archive
63
63
  licenses:
64
64
  - GPL-3.0-only
65
65
  metadata: {}
66
- post_install_message:
66
+ post_install_message:
67
67
  rdoc_options: []
68
68
  require_paths:
69
69
  - lib
@@ -78,8 +78,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
78
78
  - !ruby/object:Gem::Version
79
79
  version: '0'
80
80
  requirements: []
81
- rubygems_version: 3.2.22
82
- signing_key:
81
+ rubygems_version: 3.2.21
82
+ signing_key:
83
83
  specification_version: 4
84
84
  summary: GitHub Archive mining utility
85
85
  test_files: []