gh-archive 0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/gh-archive.rb +281 -0
  3. metadata +63 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 0c493a4b789b2fc8b8b544287614667417f668c080e086a98e142eb70e5ada40
4
+ data.tar.gz: ab043dd0e56f9fa884405de70e86d1e8ff7b0091cbd980cd4c7be8694fcc3d77
5
+ SHA512:
6
+ metadata.gz: 663226f4cd9b6dd51d679848877f1b93f64069a839c35d2149511135a960a07736589e8aa69b01d60c9ebdc295db25fb3cde6fa24f1a11885958aba9ffac0af1
7
+ data.tar.gz: f8889d87fb7853ae54871ccf192527d6021535d12c614fc4059a235e1df9c6245516d5ac9478f53c613932ca4a0655dd2e5c4a29955b8cacd843b38f99ddcba2
@@ -0,0 +1,281 @@
1
require 'code-assertions'

require 'json'
require 'logger'
require 'open-uri'
require 'stringio'
require 'tmpdir'
require 'zlib'
6
+
7
# Helpers shared by the GitHub Archive providers and the downloader.
module GHAUtils
  # Builds the file name of the hourly archive that contains +date+.
  #
  # @param date [Time] any object responding to year, month, day and hour
  # @return [String] e.g. "2015-01-02-3.json.gz" (the hour is not zero-padded)
  def get_gha_filename(date)
    "%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour]
  end

  # Decompresses a gzip stream and returns its whole content.
  #
  # The reader (and, through it, the underlying +gz+ object) is closed even
  # when decompression fails half-way.
  #
  # @param gz [IO] an IO-like object positioned at the start of gzip data
  # @return [String] the decompressed content
  def read_gha_file_content(gz)
    gzip = Zlib::GzipReader.new(gz)
    begin
      gzip.read
    ensure
      gzip.close
    end
  end

  # Decompresses a gzip stream made of one JSON document per line and parses
  # every line.
  #
  # @param gz [IO] an IO-like object positioned at the start of gzip data
  # @return [Array] the parsed documents, in file order
  # @raise [JSON::ParserError] if a non-blank line is not valid JSON
  def read_gha_file(gz)
    read_gha_file_content(gz)
      .each_line
      .reject { |line| line.strip.empty? } # blank lines are not JSON documents
      .map { |line| JSON.parse(line) }
  end

  # Yields every hour from +from+ (included) up to +to+ (excluded).
  #
  # @param from [Time] first value yielded
  # @param to [Time] exclusive upper bound
  # @yieldparam current_date [Time] +from+ advanced by a multiple of 3600 seconds
  # @return [Enumerator, nil] an Enumerator when called without a block
  def each_date(from, to)
    return to_enum(:each_date, from, to) unless block_given?

    current_date = from
    while current_date < to
      yield current_date
      current_date += 3600
    end
  end
end
39
+
40
# Base class for sources of GitHub Archive events. Subclasses implement #get;
# this class adds include/exclude filtering and hour-by-hour iteration.
class GHAProvider
  include GHAUtils

  # Replaces the logger used for progress and error messages
  # (a Logger writing to STDOUT by default).
  attr_writer :logger

  def initialize
    @logger = Logger.new(STDOUT)

    # Both map an event key (String) to the list of values registered for it.
    @includes = {}
    @excludes = {}
  end

  # Returns the events of the hourly archive containing +date+.
  # This base implementation always raises; subclasses override it.
  #
  # @param date [Time]
  # @raise [RuntimeError] always
  def get(date)
    raise "Not implemented"
  end

  # Registers accepted values: for every key given here, #each only yields
  # events whose value for that key is one of the registered values.
  # Calling it again with the same key adds one more accepted value.
  #
  # @param args [Hash] event key => accepted value
  def include(**args)
    args.each do |key, value|
      (@includes[key.to_s] ||= []) << value
    end
  end

  # Registers rejected values: #each skips events whose value for any of the
  # given keys is one of the registered values.
  #
  # @param args [Hash] event key => rejected value
  def exclude(**args)
    args.each do |key, value|
      (@excludes[key.to_s] ||= []) << value
    end
  end

  # Yields, hour by hour, every event between +from+ (included) and +to+
  # (excluded) that passes the include/exclude filters.
  #
  # An hour whose archive cannot be obtained (#get raises a StandardError) is
  # logged as an error and skipped.
  #
  # @param from [Time] first hour scanned
  # @param to [Time] exclusive upper bound
  # @yieldparam event [Hash] a parsed event
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
    each_date(from, to) do |current_date|
      begin
        events = get(current_date)
        @logger.info("Scanned #{current_date}")
      rescue => e
        @logger.error(e)
        next
      end

      events.each do |event|
        yield event unless skip_event?(event)
      end

      # Release the hour's events before loading the next archive.
      events.clear
      GC.start
    end
  end

  private

  # True when +event+ misses one of the include filters or matches one of the
  # exclude filters.
  def skip_event?(event)
    included = @includes.all? { |key, values| values.include?(event[key]) }
    excluded = @excludes.any? { |key, values| values.include?(event[key]) }

    !included || excluded
  end
end
102
+
103
# Provider that downloads the hourly archives from data.gharchive.org,
# optionally prefetching them on a background thread.
class OnlineGHAProvider < GHAProvider
  # @param max_retries [Integer] attempts made for each archive before giving up
  # @param proactive [Boolean] when true, #each starts a thread that downloads
  #   archives ahead of time into an on-disk cache
  def initialize(max_retries = 3, proactive = false)
    super()

    @max_retries = max_retries
    @proactive = proactive
    @cache = Cache.new
  end

  # Returns the events of the hourly archive containing +current_time+, taken
  # from the cache when it was prefetched and downloaded otherwise.
  #
  # @param current_time [Time]
  # @return [Array] the parsed events
  # @raise [DownloadArchiveException] when every attempt failed
  def get(current_time)
    @max_retries.times do
      begin
        filename = get_gha_filename(current_time)

        # Cache#get hands the compressed bytes over as a String (nil on a
        # miss); the gzip reader needs an IO, hence the StringIO.
        cached = @cache.get(filename)
        return read_gha_file(StringIO.new(cached)) if cached

        URI.open("http://data.gharchive.org/#{filename}") do |gz|
          return read_gha_file(gz)
        end
      rescue => e
        @logger.warn(e)
      end
    end

    raise DownloadArchiveException, "Exceeded maximum number of tentative downloads."
  end

  # Downloads the archive containing +current_time+ into the cache without
  # parsing it. Failed attempts are logged as warnings.
  #
  # @param current_time [Time]
  # @return [Boolean] true once the archive is cached, false when every
  #   attempt failed
  def cache(current_time)
    @max_retries.times do
      begin
        filename = get_gha_filename(current_time)

        URI.open("http://data.gharchive.org/#{filename}") do |gz|
          @cache.put(filename, gz.read)
        end
        return true
      rescue => e
        @logger.warn(e)
      end
    end

    false
  end

  # Same contract as GHAProvider#each. In proactive mode a background thread
  # is started first; it caches the same hours and pauses while the cache is
  # full.
  #
  # NOTE(review): an archive fetched directly by #get before the thread caches
  # it is never taken out of the cache again, so the cache can fill up and
  # leave the thread waiting indefinitely. Confirm whether that is acceptable.
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
    if @proactive
      @logger.info("Proactive download thread started")
      Thread.start do
        each_date(from, to) do |current_date|
          if cache(current_date)
            @logger.info("Proactively cached #{current_date}. Cache size: #{@cache.size}")
          end

          @logger.info("Full cache. Waiting...") if @cache.full?
          sleep 1 while @cache.full?
        end
      end
    end

    super
  end

  # On-disk store for downloaded archives. An index of name => path is kept
  # in memory and guarded by a mutex; the content lives in files under the
  # cache folder.
  class Cache
    # @param folder [String] directory holding the cached files
    #   (a fresh temporary directory by default)
    # @param max_size [Integer] number of entries from which #full? is true
    def initialize(folder = Dir.mktmpdir, max_size = 100)
      @cache = {}
      @max_size = max_size
      @folder = folder
      @mutex = Mutex.new
    end

    # Stores +content+ under +name+. The file is written before the entry is
    # registered, so a registered entry always has its full content on disk.
    #
    # @param name [String] entry name, also used as file name
    # @param content [String] raw bytes to store
    def put(name, content)
      path = File.join(@folder, name)
      File.binwrite(path, content)

      @mutex.synchronize do
        @cache[name] = path
      end
    end

    # Removes the entry +name+ from the cache and returns its content.
    #
    # @param name [String]
    # @return [String, nil] the stored bytes, or nil when there is no such entry
    def get(name)
      path = @mutex.synchronize { @cache.delete(name) }
      return nil unless path

      begin
        File.binread(path)
      ensure
        File.unlink(path) if File.exist?(path)
      end
    end

    # Drops the entry +name+ and its file, if present.
    #
    # @return [true]
    def unload(name)
      path = @mutex.synchronize { @cache.delete(name) }
      File.unlink(path) if path && File.exist?(path)

      true
    end

    # @return [Integer] number of entries currently registered
    def size
      @mutex.synchronize { @cache.size }
    end

    # @return [Boolean] whether an entry named +name+ is registered
    def has?(name)
      @mutex.synchronize { @cache.key?(name) }
    end

    # @return [Boolean] whether the cache holds at least max_size entries
    def full?
      size >= @max_size
    end
  end

  # Raised by #get when an archive could not be obtained within the allowed
  # number of attempts. It derives from StandardError so that the plain
  # `rescue` in GHAProvider#each can skip the failing hour.
  class DownloadArchiveException < StandardError
  end
end
223
+
224
# Provider that reads already-downloaded hourly archives from a local folder.
class FolderGHAProvider < GHAProvider
  # @param folder [String] directory containing the ".json.gz" archives
  def initialize(folder)
    super()

    @folder = folder
  end

  # Opens the archive for the hour of +current_time+ inside the folder and
  # returns its parsed events.
  #
  # @param current_time [Time]
  # @return [Array] the parsed events
  def get(current_time)
    archive_path = File.join(@folder, get_gha_filename(current_time))

    File.open(archive_path, "rb") { |archive| read_gha_file(archive) }
  end
end
238
+
239
# Downloads hourly GitHub Archive files into a local folder, optionally
# storing them decompressed.
class GHADownloader
  include GHAUtils

  # Replaces the logger used for progress messages
  # (a Logger writing to STDERR by default).
  attr_writer :logger

  # @param folder [String] destination directory, created when missing
  # @param decompress [Boolean] when true, files are stored as plain ".json"
  # @raise [RuntimeError] if +folder+ exists but is not a directory
  def initialize(folder, decompress = false)
    @logger = Logger.new(STDERR)
    @decompress = decompress
    @folder = folder
    Dir.mkdir(@folder) unless File.exist?(@folder)
    raise "A file exist with the desired folder name #{folder}" unless File.directory?(@folder)
  end

  # Downloads every hourly archive between +from+ (included) and +to+
  # (excluded). Hours whose target file already exists are skipped.
  #
  # Each archive is first written to "<target>.part" and renamed once
  # complete, so an interrupted download never leaves a file that a later run
  # would mistake for a finished one. Download errors propagate to the caller.
  #
  # @param from [Time] first hour downloaded
  # @param to [Time] exclusive upper bound
  # @yieldparam filename [String] remote file name of each archive just downloaded
  def download(from = Time.gm(2015, 1, 1), to = Time.now)
    each_date(from, to) do |current_date|
      filename = get_gha_filename(current_date)
      out_filename = @decompress ? filename.sub(/\.json\.gz\z/, ".json") : filename

      target_file = File.join(@folder, out_filename)
      if File.exist?(target_file)
        @logger.info("Skipping existing file for #{current_date}")
        next
      end
      @logger.info("Downloading file for #{current_date}")

      partial_file = "#{target_file}.part"
      begin
        URI.open("http://data.gharchive.org/#{filename}") do |gz|
          # Binary mode: the payload is gzip data unless decompression is on.
          File.open(partial_file, 'wb') do |f|
            f << (@decompress ? read_gha_file_content(gz) : gz.read)
          end
        end
        File.rename(partial_file, target_file)
      ensure
        # Only still present when the download or the write failed.
        File.unlink(partial_file) if File.exist?(partial_file)
      end

      yield filename if block_given?
    end
  end
end
metadata ADDED
@@ -0,0 +1,63 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gh-archive
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ platform: ruby
6
+ authors:
7
+ - Simone Scalabrino
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2021-01-26 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: code-assertions
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 1.1.2
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.1.2
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: 1.1.2
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.1.2
33
+ description: Download and analyze the GitHub events stored at GitHub archive
34
+ email: s.scalabrino9@gmail.com
35
+ executables: []
36
+ extensions: []
37
+ extra_rdoc_files: []
38
+ files:
39
+ - lib/gh-archive.rb
40
+ homepage: https://github.com/intersimone999/gh-archive
41
+ licenses:
42
+ - GPL-3.0-only
43
+ metadata: {}
44
+ post_install_message:
45
+ rdoc_options: []
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ required_rubygems_version: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ version: '0'
58
+ requirements: []
59
+ rubygems_version: 3.1.4
60
+ signing_key:
61
+ specification_version: 4
62
+ summary: GitHub Archive mining utility
63
+ test_files: []