gh-archive 0.11 → 0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/gh-archive.rb +81 -4
  3. metadata +6 -6
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 652cb53430d2e6a230d8f11465b81685f2e92107868345566d16b07d4c4231ed
4
- data.tar.gz: abaec255c079e5a959fd51f7b1e7ca15d72c21eaaeac98d34d13309ec91eedeb
3
+ metadata.gz: de82207e7b0f5588a694e22b9d94c8e15e6812e8ad2dd433a8c78080c3629253
4
+ data.tar.gz: 7bc6e39cb4834922e29764744a3a753d8cea34052b44b282126b520b36f69d64
5
5
  SHA512:
6
- metadata.gz: 829b1d53a7b72f4ea83712e8a95ba059e48dff7c982e811d30d27018e45e1b7010ace2a4a3f2641318a598e765a6d2c6138b319f9d67dd2604617dc2785175be
7
- data.tar.gz: fd85da1c6630e8b534fb34fc9b5a08fcda55e3d2606ca393157134b6d85da2a9a7f92bc87ae31369441da1799cfd2178f0af9703ca997d10296c3f9a5ab1e4f9
6
+ metadata.gz: 1acd0b50fdba7cf5944e50670817ad41879ea273b27c79d87d6463b0bc438220b9a806aba2db879b4dbe8394439f24a66ec9bace2a466f51fbb44755eefd6d99
7
+ data.tar.gz: a30a6de0c073840eff54b7f0678bd996a794194d2a19ae35db1559b7d8ff4ec4f29591f36a6f114878d4737d29406ba919f8907b6706afb2f7b6ab7e8bcd4c26
data/lib/gh-archive.rb CHANGED
@@ -9,6 +9,71 @@ require 'thread/promise'
9
9
 
10
10
  require_relative File.expand_path('../gh-archive/events', __FILE__)
11
11
 
12
+ module GHArchive
13
+ class ThreadPool
14
+ def initialize(size)
15
+ @size = size
16
+ @threads = []
17
+ @queue = []
18
+ @mutex = Mutex.new
19
+
20
+ @consumer_thread = Thread.start do
21
+ while !@shutdown || @threads.size > 0 || @queue.size > 0
22
+ sleep 0.1 if @queue.size == 0 || @threads.size == @size
23
+ @threads.delete_if { |t| !t.alive? }
24
+
25
+ if @threads.size < @size && @queue.size > 0
26
+ @mutex.synchronize do
27
+ args, job = @queue.shift
28
+ @threads << Thread.start(*args, &job)
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
34
+
35
+ def process(*args, &block)
36
+ raise "Block expected" unless block_given?
37
+ raise "Can not add jobs while shutting down" if @shutdown
38
+
39
+ @mutex.synchronize do
40
+ @queue << [args, block]
41
+ end
42
+
43
+ return self.enqueued
44
+ end
45
+
46
+ def shutdown
47
+ @shutdown = true
48
+ end
49
+
50
+ def shutdown!
51
+ self.shutdown
52
+ @mutex.synchronize do
53
+ @queue.clear
54
+ end
55
+ end
56
+
57
+ def enqueued
58
+ return @queue.size
59
+ end
60
+
61
+ def shutdown?
62
+ @shutdown
63
+ end
64
+
65
+ def alive?
66
+ @consumer_thread.alive?
67
+ end
68
+
69
+ def wait
70
+ while alive?
71
+ sleep 0.1
72
+ end
73
+ end
74
+ end
75
+ end
76
+
12
77
  module GHAUtils
13
78
  def get_gha_filename(date)
14
79
  return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
@@ -22,9 +87,10 @@ module GHAUtils
22
87
  end
23
88
 
24
89
  def read_gha_file(file)
25
- if file.path.end_with?(".json")
90
+
91
+ if !file.is_a?(StringIO) && file.path.end_with?(".json")
26
92
  content = file.read
27
- elsif file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
93
+ elsif file.is_a?(StringIO) || file.path.end_with?(".gz") || file.path.start_with?("/tmp/open-uri")
28
94
  content = read_gha_file_content(file)
29
95
  else
30
96
  raise "Invalid file extension for #{file.path}: expected `.json.gz` or `json`,"
@@ -202,7 +268,7 @@ class OnlineGHAProvider < GHAProvider
202
268
 
203
269
  def proactive(pool_size = 10)
204
270
  @proactive = true
205
- @pool = Thread.pool(pool_size)
271
+ @pool = GHArchive::ThreadPool.new(pool_size)
206
272
 
207
273
  return self
208
274
  end
@@ -219,7 +285,12 @@ class OnlineGHAProvider < GHAProvider
219
285
  sleep 1
220
286
  end
221
287
 
222
- return @cache.get(filename)
288
+ data = @cache.get(filename)
289
+ if data
290
+ return data
291
+ else
292
+ raise DownloadArchiveException, "Could not scan #{filename}: data unavailable."
293
+ end
223
294
  else
224
295
  URI.open("http://data.gharchive.org/#{filename}") do |gz|
225
296
  return self.read_gha_file(gz)
@@ -263,11 +334,17 @@ class OnlineGHAProvider < GHAProvider
263
334
  if code.start_with?("5")
264
335
  @logger.warn("A server error temporary prevented the download of #{current_time}: " + e.message)
265
336
  next
337
+ elsif code == "404"
338
+ @logger.error("File for #{current_time} not found. Skipping because: " + e.message)
266
339
  else
267
340
  raise e
268
341
  end
342
+ rescue Zlib::GzipFile::Error => e
343
+ @logger.warn("Could not unzip, cache and analyze the zip at #{current_time}: " + e.message)
269
344
  end
270
345
  end
346
+
347
+ @cache.put(filename, nil) unless @cache.has?(filename)
271
348
  end
272
349
 
273
350
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gh-archive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.11'
4
+ version: '0.15'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Simone Scalabrino
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-08-15 00:00:00.000000000 Z
11
+ date: 2021-10-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: code-assertions
@@ -63,7 +63,7 @@ homepage: https://github.com/intersimone999/gh-archive
63
63
  licenses:
64
64
  - GPL-3.0-only
65
65
  metadata: {}
66
- post_install_message:
66
+ post_install_message:
67
67
  rdoc_options: []
68
68
  require_paths:
69
69
  - lib
@@ -78,8 +78,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
78
78
  - !ruby/object:Gem::Version
79
79
  version: '0'
80
80
  requirements: []
81
- rubygems_version: 3.2.22
82
- signing_key:
81
+ rubygems_version: 3.2.21
82
+ signing_key:
83
83
  specification_version: 4
84
84
  summary: GitHub Archive mining utility
85
85
  test_files: []