gh-archive 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/gh-archive.rb +281 -0
  3. metadata +63 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 0c493a4b789b2fc8b8b544287614667417f668c080e086a98e142eb70e5ada40
4
+ data.tar.gz: ab043dd0e56f9fa884405de70e86d1e8ff7b0091cbd980cd4c7be8694fcc3d77
5
+ SHA512:
6
+ metadata.gz: 663226f4cd9b6dd51d679848877f1b93f64069a839c35d2149511135a960a07736589e8aa69b01d60c9ebdc295db25fb3cde6fa24f1a11885958aba9ffac0af1
7
+ data.tar.gz: f8889d87fb7853ae54871ccf192527d6021535d12c614fc4059a235e1df9c6245516d5ac9478f53c613932ca4a0655dd2e5c4a29955b8cacd843b38f99ddcba2
@@ -0,0 +1,281 @@
1
require 'code-assertions'
require 'json'
require 'logger'
require 'open-uri'
require 'stringio'
require 'tmpdir'
require 'zlib'
6
+
7
# Helpers shared by the providers and the downloader: archive file naming,
# gzip/JSON decoding and hour-by-hour iteration over a time range.
module GHAUtils
  # Builds the archive file name for the hour identified by +date+.
  #
  # The name is "YYYY-MM-DD-H.json.gz", with an unpadded hour, built from
  # +date.year+, +date.month+, +date.day+ and +date.hour+ exactly as the
  # object reports them (no time-zone conversion is performed here).
  # NOTE(review): GH Archive presumably names files by UTC hour; confirm
  # that callers pass UTC times.
  def get_gha_filename(date)
    format("%04d-%02d-%02d-%d.json.gz", date.year, date.month, date.day, date.hour)
  end

  # Decompresses the gzip stream +gz+ (an IO-like object) and returns the
  # whole decompressed content as a String.
  #
  # The gzip reader is closed even when decompression fails.
  def read_gha_file_content(gz)
    gzip = Zlib::GzipReader.new(gz)
    begin
      gzip.read
    ensure
      gzip.close
    end
  end

  # Decompresses +gz+ and parses every non-blank line as a JSON document.
  #
  # Returns an Array with one parsed object per line. Blank lines are
  # skipped instead of being handed to the JSON parser.
  def read_gha_file(gz)
    read_gha_file_content(gz)
      .each_line
      .reject { |line| line.strip.empty? }
      .map { |line| JSON.parse(line) }
  end

  # Yields every point in time from +from+ (inclusive) up to +to+
  # (exclusive), advancing in steps of 3600 (one hour when the arguments
  # are Time objects).
  #
  # Returns an Enumerator when no block is given.
  def each_date(from, to)
    return enum_for(:each_date, from, to) unless block_given?

    current_date = from
    while current_date < to
      yield current_date
      current_date += 3600
    end
  end
end
39
+
40
# Base class for event providers. Subclasses implement #get to return the
# events of one hourly archive; #each walks a time range and yields the
# events that pass the configured include/exclude filters.
class GHAProvider
  include GHAUtils

  # Replaces the logger used for progress and error messages.
  attr_writer :logger

  def initialize
    @logger = Logger.new(STDOUT)

    # Both map an event key (String) to the list of values registered for it.
    @includes = {}
    @excludes = {}
  end

  # Returns the events for the archive identified by +date+.
  # Must be overridden; the base implementation always raises.
  def get(date)
    raise "Not implemented"
  end

  # Registers include filters. For every +key: value+ pair, +value+ is
  # appended to the list of accepted values for that key; an event is kept
  # only if, for each registered key, event[key] is one of those values.
  def include(**args)
    args.each do |key, value|
      (@includes[key.to_s] ||= []) << value
    end
  end

  # Registers exclude filters. An event is dropped if, for any registered
  # key, event[key] is one of the values registered for that key.
  def exclude(**args)
    args.each do |key, value|
      (@excludes[key.to_s] ||= []) << value
    end
  end

  # Yields every event between +from+ (inclusive) and +to+ (exclusive) that
  # passes the filters, one hourly archive at a time.
  #
  # An archive whose #get raises a StandardError is logged and skipped.
  # Returns an Enumerator when no block is given.
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
    return enum_for(:each, from, to) unless block_given?

    each_date(from, to) do |current_date|
      begin
        events = get(current_date)
        @logger.info("Scanned #{current_date}")
      rescue
        @logger.error($!)
        next
      end

      events.each do |event|
        yield event if selected?(event)
      end

      # Release the archive's events before moving on to the next hour.
      events.clear
      GC.start
    end
  end

  private

  # True when +event+ satisfies every include filter and no exclude filter.
  def selected?(event)
    @includes.all? { |key, accepted| accepted.include?(event[key]) } &&
      @excludes.none? { |key, rejected| rejected.include?(event[key]) }
  end
end
102
+
103
# Provider that downloads hourly archives from data.gharchive.org.
#
# When +proactive+ is true, #each starts a background thread that downloads
# upcoming archives into an on-disk cache while the caller consumes events.
class OnlineGHAProvider < GHAProvider
  # max_retries:: number of download attempts per archive.
  # proactive::   whether #each pre-downloads archives in a background thread.
  def initialize(max_retries = 3, proactive = false)
    super()

    @max_retries = max_retries
    @proactive = proactive
    @cache = Cache.new
  end

  # Returns the parsed events of the archive for +current_time+.
  #
  # The archive is taken from the cache when present (and removed from it),
  # otherwise downloaded. Failures are logged as warnings and retried up to
  # +max_retries+ times; after that DownloadArchiveException is raised.
  def get(current_time)
    filename = get_gha_filename(current_time)

    @max_retries.times do
      begin
        if @cache.has?(filename)
          # The cache hands back raw gzip bytes; wrap them so they can be
          # read like a downloaded stream.
          return read_gha_file(StringIO.new(@cache.get(filename)))
        end

        URI.open("http://data.gharchive.org/#{filename}") do |gz|
          return read_gha_file(gz)
        end
      rescue
        @logger.warn($!)
      end
    end

    raise DownloadArchiveException, "Exceeded maximum number of tentative downloads."
  end

  # Downloads the archive for +current_time+ into the cache, trying up to
  # +max_retries+ times. Failures are logged as warnings; nothing is raised
  # when every attempt fails.
  def cache(current_time)
    filename = get_gha_filename(current_time)

    @max_retries.times do
      begin
        URI.open("http://data.gharchive.org/#{filename}") do |gz|
          @cache.put(filename, gz.read)
          return
        end
      rescue
        @logger.warn($!)
      end
    end
  end

  # Same contract as GHAProvider#each. In proactive mode a background
  # thread first starts caching the archives of the requested range,
  # pausing while the cache is full.
  #
  # Returns an Enumerator when no block is given (no thread is started
  # until the enumerator is consumed).
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
    return enum_for(:each, from, to) unless block_given?

    if @proactive
      @logger.info("Proactive download thread started")
      Thread.start do
        each_date(from, to) do |current_date|
          cache(current_date)
          @logger.info("Proactively cached #{current_date}. Cache size: #{@cache.size}")

          @logger.info("Full cache. Waiting...") if @cache.full?
          sleep 1 while @cache.full?
        end
      end
    end

    super
  end

  # On-disk store for downloaded archives, indexed by file name.
  # An entry is removed (and its file deleted) as soon as it is read.
  class Cache
    # folder::   directory where cached files are written.
    # max_size:: number of entries at which the cache reports itself full.
    def initialize(folder = Dir.mktmpdir, max_size = 100)
      @cache = {}
      @max_size = max_size
      @folder = folder
      @mutex = Mutex.new
    end

    # Writes +content+ to a file named +name+ inside the cache folder and
    # registers it. The file is written in binary mode because the content
    # is gzip data.
    def put(name, content)
      path = File.join(@folder, name)
      File.open(path, 'wb') do |f|
        f << content
      end

      # Register only after the file is completely written, so a reader
      # never sees a partial entry.
      @mutex.synchronize do
        @cache[name] = path
      end
    end

    # Returns the raw bytes stored under +name+ and removes the entry.
    def get(name)
      path = @mutex.synchronize { @cache[name] }
      File.binread(path)
    ensure
      unload(name)
    end

    # Forgets the entry +name+ and deletes its file, if any. Returns true.
    def unload(name)
      path = @mutex.synchronize { @cache.delete(name) }
      File.unlink(path) if path && File.exist?(path)

      true
    end

    # Number of entries currently registered.
    def size
      @mutex.synchronize do
        @cache.size
      end
    end

    # Whether an entry named +name+ is registered.
    def has?(name)
      @mutex.synchronize do
        @cache.has_key?(name)
      end
    end

    # Whether the number of entries has reached +max_size+.
    def full?
      size >= @max_size
    end
  end

  # Raised by #get when every download attempt for an archive has failed.
  # NOTE(review): this inherits from Exception, so a bare `rescue` (such as
  # the one in GHAProvider#each) does not catch it. The superclass is kept
  # unchanged for backward compatibility; confirm whether StandardError was
  # intended.
  class DownloadArchiveException < Exception
  end
end
223
+
224
# Provider that reads hourly archives from a local folder of ".json.gz"
# files named as produced by get_gha_filename.
class FolderGHAProvider < GHAProvider
  # folder:: directory containing the archive files.
  def initialize(folder)
    super()

    @folder = folder
  end

  # Opens the archive for +current_time+ inside the folder in binary mode
  # and returns its parsed events.
  def get(current_time)
    archive_path = File.join(@folder, get_gha_filename(current_time))

    File.open(archive_path, "rb") { |gz| read_gha_file(gz) }
  end
end
238
+
239
# Downloads hourly archives from data.gharchive.org into a local folder,
# optionally storing them decompressed.
class GHADownloader
  include GHAUtils

  # Replaces the logger used for progress messages.
  attr_writer :logger

  # folder::     destination directory; created when it does not exist.
  # decompress:: when true, files are stored as plain ".json" instead of
  #              ".json.gz".
  #
  # Raises a RuntimeError when +folder+ exists but is not a directory.
  def initialize(folder, decompress = false)
    @logger = Logger.new(STDERR)
    @decompress = decompress
    @folder = folder
    Dir.mkdir(@folder) unless FileTest.exist?(@folder)
    raise "A file exist with the desired folder name #{folder}" unless FileTest.directory?(@folder)
  end

  # Downloads every hourly archive between +from+ (inclusive) and +to+
  # (exclusive), skipping those whose target file already exists.
  #
  # When a block is given, it is called with the archive file name after
  # each successful download. Download errors are not rescued here.
  def download(from = Time.gm(2015, 1, 1), to = Time.now)
    each_date(from, to) do |current_date|
      filename = get_gha_filename(current_date)
      out_filename = @decompress ? filename.sub(".json.gz", ".json") : filename
      target_file = File.join(@folder, out_filename)

      if FileTest.exist?(target_file)
        @logger.info("Skipping existing file for #{current_date}")
        next
      end

      @logger.info("Downloading file for #{current_date}")

      # Fetch first and create the target file only once the data is in
      # hand: a failed download must not leave an empty file behind, since
      # later runs would skip it as already downloaded.
      URI.open("http://data.gharchive.org/#{filename}") do |gz|
        payload = @decompress ? read_gha_file_content(gz) : gz.read
        # Binary write: the payload is gzip data unless decompressed.
        File.binwrite(target_file, payload)
      end

      yield filename if block_given?
    end
  end
end
metadata ADDED
@@ -0,0 +1,63 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gh-archive
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ platform: ruby
6
+ authors:
7
+ - Simone Scalabrino
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2021-01-26 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: code-assertions
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 1.1.2
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.1.2
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: 1.1.2
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.1.2
33
+ description: Download and analyze the GitHub events stored at GitHub archive
34
+ email: s.scalabrino9@gmail.com
35
+ executables: []
36
+ extensions: []
37
+ extra_rdoc_files: []
38
+ files:
39
+ - lib/gh-archive.rb
40
+ homepage: https://github.com/intersimone999/gh-archive
41
+ licenses:
42
+ - GPL-3.0-only
43
+ metadata: {}
44
+ post_install_message:
45
+ rdoc_options: []
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ required_rubygems_version: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ version: '0'
58
+ requirements: []
59
+ rubygems_version: 3.1.4
60
+ signing_key:
61
+ specification_version: 4
62
+ summary: GitHub Archive mining utility
63
+ test_files: []