gh-archive 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/gh-archive.rb +281 -0
- metadata +63 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 0c493a4b789b2fc8b8b544287614667417f668c080e086a98e142eb70e5ada40
|
4
|
+
data.tar.gz: ab043dd0e56f9fa884405de70e86d1e8ff7b0091cbd980cd4c7be8694fcc3d77
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 663226f4cd9b6dd51d679848877f1b93f64069a839c35d2149511135a960a07736589e8aa69b01d60c9ebdc295db25fb3cde6fa24f1a11885958aba9ffac0af1
|
7
|
+
data.tar.gz: f8889d87fb7853ae54871ccf192527d6021535d12c614fc4059a235e1df9c6245516d5ac9478f53c613932ca4a0655dd2e5c4a29955b8cacd843b38f99ddcba2
|
data/lib/gh-archive.rb
ADDED
@@ -0,0 +1,281 @@
|
|
1
|
+
require 'code-assertions'
require 'json'
require 'logger'
require 'open-uri'
require 'stringio'
require 'tmpdir'
require 'zlib'
|
6
|
+
|
7
|
+
# Helper methods shared by the GH Archive providers and the downloader.
module GHAUtils
  # Builds the name of the hourly archive file that corresponds to +date+.
  #
  # NOTE(review): the year/month/day/hour fields are used exactly as +date+
  # reports them (no time-zone conversion is applied here) -- confirm that
  # callers pass times in the zone the archive expects.
  #
  # @param date [#year, #month, #day, #hour] the moment identifying the archive
  # @return [String] e.g. "2015-01-01-5.json.gz" (the hour is not zero-padded)
  def get_gha_filename(date)
    format('%04d-%02d-%02d-%d.json.gz', date.year, date.month, date.day, date.hour)
  end

  # Decompresses a gzip stream and returns its whole content.
  #
  # The reader is closed even when decompression fails, so the underlying
  # stream is not leaked on corrupt input.
  #
  # @param gz [IO] a readable stream containing gzip data
  # @return [String] the decompressed content
  def read_gha_file_content(gz)
    gzip = Zlib::GzipReader.new(gz)
    begin
      gzip.read
    ensure
      gzip.close
    end
  end

  # Decompresses a gzip stream and parses every non-blank line as JSON.
  #
  # @param gz [IO] a readable stream containing gzip-compressed, line-delimited JSON
  # @return [Array] one parsed JSON value per non-blank line, in file order
  # @raise [JSON::ParserError] if a non-blank line is not valid JSON
  def read_gha_file(gz)
    read_gha_file_content(gz)
      .each_line
      .reject { |line| line.strip.empty? } # blank lines carry no event and would make JSON.parse raise
      .map { |line| JSON.parse(line) }
  end

  # Yields every hour from +from+ (inclusive) up to +to+ (exclusive).
  #
  # @param from [Time] first moment to yield
  # @param to [Time] upper bound, never yielded
  # @yieldparam current_date [Time] +from+ advanced in steps of 3600 seconds
  # @return [Enumerator, nil] an Enumerator when no block is given, nil otherwise
  def each_date(from, to)
    return enum_for(:each_date, from, to) unless block_given?

    current_date = from
    while current_date < to
      yield current_date
      current_date += 3600
    end
  end
end
|
39
|
+
|
40
|
+
# Base class for event providers: walks a time range hour by hour, asks the
# subclass for the events of each hour and yields those passing the filters.
class GHAProvider
  include GHAUtils

  # Replaces the logger used for progress and error messages
  # (a Logger writing to STDOUT by default).
  attr_writer :logger

  def initialize
    @logger = Logger.new(STDOUT)

    @includes = {}
    @excludes = {}
  end

  # Returns the events of the hour identified by +date+.
  # Subclasses must override this method.
  #
  # @param date [Time] the hour to load
  # @raise [RuntimeError] always, in this base class
  def get(date)
    raise "Not implemented"
  end

  # Registers accepted values for event fields. An event is yielded by #each
  # only if, for every registered key, its value is one of the accepted ones.
  #
  # @param args [Hash] field name => accepted value (keys are stored as strings)
  def include(**args)
    args.each do |key, value|
      (@includes[key.to_s] ||= []) << value
    end
  end

  # Registers rejected values for event fields. An event is skipped by #each
  # if, for any registered key, its value is one of the rejected ones.
  #
  # @param args [Hash] field name => rejected value (keys are stored as strings)
  def exclude(**args)
    args.each do |key, value|
      (@excludes[key.to_s] ||= []) << value
    end
  end

  # Yields every event between +from+ (inclusive) and +to+ (exclusive) that
  # passes the include/exclude filters. Hours for which #get raises a
  # StandardError are logged and skipped.
  #
  # @param from [Time] first hour to scan
  # @param to [Time] upper bound of the scan
  # @yieldparam event [Hash] a parsed event
  # @return [Enumerator, nil] an Enumerator when no block is given, nil otherwise
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
    # Without this guard the missing block was only noticed (LocalJumpError)
    # after the first archive had already been fetched.
    return enum_for(:each, from, to) unless block_given?

    each_date(from, to) do |current_date|
      begin
        events = get(current_date)
        @logger.info("Scanned #{current_date}")
      rescue StandardError => e
        @logger.error(e)
        next
      end

      events.each do |event|
        yield event unless filtered_out?(event)
      end

      # Release the events of this hour before loading the next batch.
      events.clear
      GC.start
    end
  end

  private

  # Tells whether +event+ fails an include filter or matches an exclude filter.
  def filtered_out?(event)
    @includes.any? { |key, accepted| !accepted.include?(event[key]) } ||
      @excludes.any? { |key, rejected| rejected.include?(event[key]) }
  end
end
|
102
|
+
|
103
|
+
# Provider that downloads the hourly archives from data.gharchive.org,
# optionally pre-fetching them in a background thread.
class OnlineGHAProvider < GHAProvider
  # @param max_retries [Integer] download attempts per archive before giving up
  # @param proactive [Boolean] when true, #each starts a thread that downloads
  #   archives into a local cache ahead of the consumer
  def initialize(max_retries = 3, proactive = false)
    super()

    @max_retries = max_retries
    @proactive = proactive
    @cache = Cache.new
  end

  # Returns the parsed events of the hour identified by +current_time+,
  # taking the archive from the cache when present and downloading it
  # otherwise. Every failed attempt is logged as a warning.
  #
  # @param current_time [Time] the hour to load
  # @return [Array] the parsed events
  # @raise [DownloadArchiveException] when all attempts failed
  def get(current_time)
    filename = get_gha_filename(current_time)

    @max_retries.times do
      begin
        if @cache.has?(filename)
          # The cache hands back the raw gzip bytes; wrap them so they can be
          # read like a stream.
          return read_gha_file(StringIO.new(@cache.get(filename)))
        end

        URI.open("http://data.gharchive.org/#{filename}") do |gz|
          return read_gha_file(gz)
        end
      rescue StandardError => e
        @logger.warn(e)
      end
    end

    raise DownloadArchiveException, "Exceeded maximum number of tentative downloads."
  end

  # Downloads the archive of the hour identified by +current_time+ and stores
  # it, still compressed, in the cache. Every failed attempt is logged as a
  # warning.
  #
  # @param current_time [Time] the hour to cache
  # @return [Boolean] true if the archive was stored, false if all attempts failed
  def cache(current_time)
    filename = get_gha_filename(current_time)

    @max_retries.times do
      begin
        URI.open("http://data.gharchive.org/#{filename}") do |gz|
          @cache.put(filename, gz.read)
          return true
        end
      rescue StandardError => e
        @logger.warn(e)
      end
    end

    false
  end

  # Same contract as GHAProvider#each. In proactive mode a background thread
  # is started first; it caches the archives of the range and pauses while
  # the cache is full.
  #
  # NOTE(review): an archive cached after the consumer already downloaded it
  # directly is never removed from the cache, and the thread is not stopped
  # when the iteration ends -- confirm whether that matters for long scans.
  #
  # @param from [Time] first hour to scan
  # @param to [Time] upper bound of the scan
  # @return [Enumerator, nil] an Enumerator when no block is given
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
    return enum_for(:each, from, to) unless block_given?

    if @proactive
      @logger.info("Proactive download thread started")
      Thread.start do
        each_date(from, to) do |current_date|
          if cache(current_date)
            @logger.info("Proactively cached #{current_date}. Cache size: #{@cache.size}")
          else
            @logger.warn("Could not proactively cache #{current_date}")
          end

          @logger.info("Full cache. Waiting...") if @cache.full?
          sleep 1 while @cache.full?
        end
      end
    end

    super
  end

  # File-backed store of downloaded archives. The index of stored names is
  # guarded by a mutex because it is accessed by both the download thread and
  # the consumer.
  class Cache
    # @param folder [String] directory where the archives are written
    # @param max_size [Integer] number of entries at which the cache reports full
    def initialize(folder = Dir.mktmpdir, max_size = 100)
      @cache = {}
      @max_size = max_size
      @folder = folder
      @mutex = Mutex.new
    end

    # Writes +content+ to a file in the cache folder and registers it.
    #
    # @param name [String] entry name, also used as file name
    # @param content [String] raw bytes to store
    def put(name, content)
      path = File.join(@folder, name)
      # Binary mode: the content is gzip data, not text.
      File.open(path, 'wb') { |f| f << content }

      # Register the entry only once the file is completely written.
      @mutex.synchronize { @cache[name] = path }
    end

    # Returns the stored bytes of +name+ and removes the entry from the cache.
    #
    # @param name [String] entry name
    # @return [String] the stored bytes
    # @raise [KeyError] if +name+ is not in the cache
    def get(name)
      path = @mutex.synchronize { @cache.fetch(name) }
      File.binread(path)
    ensure
      unload(name)
    end

    # Removes +name+ from the cache and deletes its file, if any.
    #
    # @param name [String] entry name
    # @return [true]
    def unload(name)
      path = @mutex.synchronize { @cache.delete(name) }
      File.unlink(path) if path && File.exist?(path)

      true
    end

    # @return [Integer] number of entries currently registered
    def size
      @mutex.synchronize { @cache.size }
    end

    # @param name [String] entry name
    # @return [Boolean] whether +name+ is currently registered
    def has?(name)
      @mutex.synchronize { @cache.key?(name) }
    end

    # @return [Boolean] whether the number of entries reached the maximum
    def full?
      size >= @max_size
    end
  end

  # Raised by #get when an archive could not be obtained within the allowed
  # number of attempts. Derives from StandardError so that a plain `rescue`
  # (as used by GHAProvider#each) can handle it.
  class DownloadArchiveException < StandardError
  end
end
|
223
|
+
|
224
|
+
# Provider that reads the hourly archives from a local folder.
class FolderGHAProvider < GHAProvider
  # @param folder [String] directory containing the archive files
  def initialize(folder)
    super()

    @folder = folder
  end

  # Opens the archive file of the hour identified by +current_time+ inside
  # the configured folder and returns what read_gha_file produces for it.
  #
  # @param current_time [Time] the hour to load
  def get(current_time)
    archive_path = File.join(@folder, get_gha_filename(current_time))

    File.open(archive_path, "rb") { |gz| read_gha_file(gz) }
  end
end
|
238
|
+
|
239
|
+
# Downloads the hourly archives of a time range into a local folder,
# optionally storing them decompressed.
class GHADownloader
  include GHAUtils

  # Replaces the logger used for progress messages
  # (a Logger writing to STDERR by default).
  attr_writer :logger

  # @param folder [String] destination directory, created if it does not exist
  # @param decompress [Boolean] when true, files are stored as plain ".json"
  # @raise [RuntimeError] if +folder+ exists but is not a directory
  def initialize(folder, decompress = false)
    @logger = Logger.new(STDERR)
    @decompress = decompress
    @folder = folder
    Dir.mkdir(@folder) unless FileTest.exist?(@folder)
    raise "A file exist with the desired folder name #{folder}" unless FileTest.directory?(@folder)
  end

  # Downloads every archive between +from+ (inclusive) and +to+ (exclusive),
  # skipping those whose target file already exists. Download errors are not
  # rescued here; they propagate to the caller.
  #
  # The target file only appears once its content is complete, so an
  # interrupted download is not mistaken for an existing file by a later run.
  #
  # @param from [Time] first hour to download
  # @param to [Time] upper bound of the range
  # @yieldparam filename [String] archive name of each file just downloaded
  #   (the block is optional)
  def download(from = Time.gm(2015, 1, 1), to = Time.now)
    each_date(from, to) do |current_date|
      filename = get_gha_filename(current_date)
      out_filename = @decompress ? filename.gsub(".json.gz", ".json") : filename

      target_file = File.join(@folder, out_filename)
      if FileTest.exist?(target_file)
        @logger.info("Skipping existing file for #{current_date}")
        next
      end
      @logger.info("Downloading file for #{current_date}")

      # Fetch first, create the file afterwards: a failed request must not
      # leave an empty file behind.
      content = URI.open("http://data.gharchive.org/#{filename}") do |gz|
        @decompress ? read_gha_file_content(gz) : gz.read
      end
      write_complete_file(target_file, content)

      yield filename if block_given?
    end
  end

  private

  # Writes +content+ to a temporary ".part" sibling and renames it to +path+
  # once fully written; the temporary file is removed if anything fails.
  def write_complete_file(path, content)
    partial = "#{path}.part"
    # Binary mode: the content may be gzip data.
    File.open(partial, 'wb') { |f| f << content }
    File.rename(partial, path)
  ensure
    File.unlink(partial) if partial && File.exist?(partial)
  end
end
|
metadata
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: gh-archive
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.1'
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Simone Scalabrino
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2021-01-26 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: code-assertions
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 1.1.2
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 1.1.2
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
requirements:
|
27
|
+
- - "~>"
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 1.1.2
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 1.1.2
|
33
|
+
description: Download and analyze the GitHub events stored at GitHub archive
|
34
|
+
email: s.scalabrino9@gmail.com
|
35
|
+
executables: []
|
36
|
+
extensions: []
|
37
|
+
extra_rdoc_files: []
|
38
|
+
files:
|
39
|
+
- lib/gh-archive.rb
|
40
|
+
homepage: https://github.com/intersimone999/gh-archive
|
41
|
+
licenses:
|
42
|
+
- GPL-3.0-only
|
43
|
+
metadata: {}
|
44
|
+
post_install_message:
|
45
|
+
rdoc_options: []
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: '0'
|
58
|
+
requirements: []
|
59
|
+
rubygems_version: 3.1.4
|
60
|
+
signing_key:
|
61
|
+
specification_version: 4
|
62
|
+
summary: GitHub Archive mining utility
|
63
|
+
test_files: []
|