gh-archive 0.11 → 0.15

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/gh-archive.rb +81 -4
  3. metadata +6 -6
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 652cb53430d2e6a230d8f11465b81685f2e92107868345566d16b07d4c4231ed
4
- data.tar.gz: abaec255c079e5a959fd51f7b1e7ca15d72c21eaaeac98d34d13309ec91eedeb
3
+ metadata.gz: de82207e7b0f5588a694e22b9d94c8e15e6812e8ad2dd433a8c78080c3629253
4
+ data.tar.gz: 7bc6e39cb4834922e29764744a3a753d8cea34052b44b282126b520b36f69d64
5
5
  SHA512:
6
- metadata.gz: 829b1d53a7b72f4ea83712e8a95ba059e48dff7c982e811d30d27018e45e1b7010ace2a4a3f2641318a598e765a6d2c6138b319f9d67dd2604617dc2785175be
7
- data.tar.gz: fd85da1c6630e8b534fb34fc9b5a08fcda55e3d2606ca393157134b6d85da2a9a7f92bc87ae31369441da1799cfd2178f0af9703ca997d10296c3f9a5ab1e4f9
6
+ metadata.gz: 1acd0b50fdba7cf5944e50670817ad41879ea273b27c79d87d6463b0bc438220b9a806aba2db879b4dbe8394439f24a66ec9bace2a466f51fbb44755eefd6d99
7
+ data.tar.gz: a30a6de0c073840eff54b7f0678bd996a794194d2a19ae35db1559b7d8ff4ec4f29591f36a6f114878d4737d29406ba919f8907b6706afb2f7b6ab7e8bcd4c26
data/lib/gh-archive.rb CHANGED
@@ -9,6 +9,71 @@ require 'thread/promise'
9
9
 
10
10
  require_relative File.expand_path('../gh-archive/events', __FILE__)
11
11
 
12
+ module GHArchive
13
+ class ThreadPool
14
+ def initialize(size)
15
+ @size = size
16
+ @threads = []
17
+ @queue = []
18
+ @mutex = Mutex.new
19
+
20
+ @consumer_thread = Thread.start do
21
+ while !@shutdown || @threads.size > 0 || @queue.size > 0
22
+ sleep 0.1 if @queue.size == 0 || @threads.size == @size
23
+ @threads.delete_if { |t| !t.alive? }
24
+
25
+ if @threads.size < @size && @queue.size > 0
26
+ @mutex.synchronize do
27
+ args, job = @queue.shift
28
+ @threads << Thread.start(*args, &job)
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
34
+
35
+ def process(*args, &block)
36
+ raise "Block expected" unless block_given?
37
+ raise "Can not add jobs while shutting down" if @shutdown
38
+
39
+ @mutex.synchronize do
40
+ @queue << [args, block]
41
+ end
42
+
43
+ return self.enqueued
44
+ end
45
+
46
+ def shutdown
47
+ @shutdown = true
48
+ end
49
+
50
+ def shutdown!
51
+ self.shutdown
52
+ @mutex.synchronize do
53
+ @queue.clear
54
+ end
55
+ end
56
+
57
+ def enqueued
58
+ return @queue.size
59
+ end
60
+
61
+ def shutdown?
62
+ @shutdown
63
+ end
64
+
65
+ def alive?
66
+ @consumer_thread.alive?
67
+ end
68
+
69
+ def wait
70
+ while alive?
71
+ sleep 0.1
72
+ end
73
+ end
74
+ end
75
+ end
76
+
12
77
  module GHAUtils
13
78
  def get_gha_filename(date)
14
79
  return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
@@ -22,9 +87,10 @@ module GHAUtils
22
87
  end
23
88
 
24
89
  def read_gha_file(file)
25
- if file.path.end_with?(".json")
90
+
91
+ if !file.is_a?(StringIO) && file.path.end_with?(".json")
26
92
  content = file.read
27
- elsif file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
93
+ elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
28
94
  content = read_gha_file_content(file)
29
95
  else
30
96
  raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
@@ -202,7 +268,7 @@ class OnlineGHAProvider < GHAProvider
202
268
 
203
269
  def proactive(pool_size = 10)
204
270
  @proactive = true
205
- @pool = Thread.pool(pool_size)
271
+ @pool = GHArchive::ThreadPool.new(pool_size)
206
272
 
207
273
  return self
208
274
  end
@@ -219,7 +285,12 @@ class OnlineGHAProvider < GHAProvider
219
285
  sleep 1
220
286
  end
221
287
 
222
- return @cache.get(filename)
288
+ data = @cache.get(filename)
289
+ if data
290
+ return data
291
+ else
292
+ raise DownloadArchiveException, "Could not scan #{filename}: data unavailable."
293
+ end
223
294
  else
224
295
  URI.open("http://data.gharchive.org/#{filename}") do |gz|
225
296
  return self.read_gha_file(gz)
@@ -263,11 +334,17 @@ class OnlineGHAProvider < GHAProvider
263
334
  if code.start_with?("5")
264
335
  @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
265
336
  next
337
+ elsif code == "404"
338
+ @logger.error("File for #{current_time} not found. Skipping because: " + e.message)
266
339
  else
267
340
  raise e
268
341
  end
342
+ rescue Zlib::GzipFile::Error => e
343
+ @logger.warn("Could not unzip, cache and analyze the zip at #{current_time}: " + e.message)
269
344
  end
270
345
  end
346
+
347
+ @cache.put(filename, nil) unless @cache.has?(filename)
271
348
  end
272
349
 
273
350
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gh-archive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.11'
4
+ version: '0.15'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Simone Scalabrino
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-08-15 00:00:00.000000000 Z
11
+ date: 2021-10-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: code-assertions
@@ -63,7 +63,7 @@ homepage: https://github.com/intersimone999/gh-archive
63
63
  licenses:
64
64
  - GPL-3.0-only
65
65
  metadata: {}
66
- post_install_message:
66
+ post_install_message:
67
67
  rdoc_options: []
68
68
  require_paths:
69
69
  - lib
@@ -78,8 +78,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
78
78
  - !ruby/object:Gem::Version
79
79
  version: '0'
80
80
  requirements: []
81
- rubygems_version: 3.2.22
82
- signing_key:
81
+ rubygems_version: 3.2.21
82
+ signing_key:
83
83
  specification_version: 4
84
84
  summary: GitHub Archive mining utility
85
85
  test_files: []