gh-archive 0.6 → 0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 91b0e957c5176b791d4f49e382680865405e7a6b2b29b349bcbb78b92d884e02
4
- data.tar.gz: f8ddae3d80e80a24931d8632c9798c8f01520e1fe5ac8a85079c0d0e85eadcbc
3
+ metadata.gz: b1a8f0a5088aa3d598466d5ef3ba3472b5e9b0c2ad3327f291c24045c01bdfba
4
+ data.tar.gz: b3df61810f7634eb8e9a153a812fa2073078975ee0ca303bfb9fb87e58f08160
5
5
  SHA512:
6
- metadata.gz: f7b24be932f58142b36887671b4265e25631345e7b81cc36b264be4a018fc0c4a88b853ae384dc8472876bf0996e904bf499007adc3091ddb511f28c828090fc
7
- data.tar.gz: 5cbb83495b9bb397a41022cb1bf4bce0344c735d16f9f43fb181b4b109948ac7a75bd44c693338f02b1ff17eeeeb5b83a6add9deeeedbd379a8848614041a3f5
6
+ metadata.gz: f5462029f6ef8e32f632bfdb7552606cf47a924e019502288ee2d841e28e76a5899d944d912a41933834e08e7a813c88cc919a7e93dba42b1c3250fcd911ae6e
7
+ data.tar.gz: 1f180c6640e38423ac2cde87ee4181cf04db8bb0a5cc6f861fd39d5de0c5bc4d30df87524f594482c170d591c5cd063abfa539b5715a4850056ab5262acef965
data/lib/gh-archive.rb CHANGED
@@ -7,6 +7,8 @@ require 'tmpdir'
7
7
  require 'thread/pool'
8
8
  require 'thread/promise'
9
9
 
10
+ require_relative File.expand_path('../gh-archive/events', __FILE__)
11
+
10
12
  module GHAUtils
11
13
  def get_gha_filename(date)
12
14
  return ("%04d-%02d-%02d-%d.json.gz" % [date.year, date.month, date.day, date.hour])
@@ -53,11 +55,29 @@ class GHAProvider
53
55
 
54
56
  @includes = {}
55
57
  @excludes = {}
58
+
59
+ @checkpoint_name = nil
60
+ @use_json = true
61
+ end
62
+
63
+ def use_checkpoint(filename)
64
+ @checkpoint_name = filename
65
+
66
+ return self
67
+ end
68
+
69
+ def parse_events
70
+ @use_json = false
71
+
72
+ return self
56
73
  end
57
74
 
58
75
  def logger=(logger)
59
76
  @logger = logger
77
+
78
+ return self
60
79
  end
80
+ alias :use_logger :logger=
61
81
 
62
82
  def get(date)
63
83
  raise "Not implemented"
@@ -68,6 +88,8 @@ class GHAProvider
68
88
  @includes[key.to_s] = [] unless @includes[key.to_s]
69
89
  @includes[key.to_s] << value
70
90
  end
91
+
92
+ return self
71
93
  end
72
94
 
73
95
  def exclude(**args)
@@ -75,11 +97,44 @@ class GHAProvider
75
97
  @excludes[key.to_s] = [] unless @excludes[key.to_s]
76
98
  @excludes[key.to_s] << value
77
99
  end
100
+
101
+ return self
102
+ end
103
+
104
+ def restore_checkpoint(from)
105
+ if @checkpoint_name && FileTest.exist?(@checkpoint_name)
106
+ # Note that this throws an exception if the file is not readable. This is the intended behavior.
107
+ # As opposed to that, failing to save the checkpoint information just results in a warning on the log.
108
+ loaded_from = Marshal.load(File.read(@checkpoint_name))
109
+ raise "The loaded checkpoint (#{loaded_from}) occurs before the current from date (#{from})" if loaded_from < from
110
+
111
+ @logger.info("Valid checkpoint loaded. Restored execution from #{loaded_from}.")
112
+
113
+ return loaded_from
114
+ else
115
+ return from
116
+ end
117
+ end
118
+
119
+ def update_checkpoint(current_time)
120
+ if @checkpoint_name
121
+ begin
122
+ File.open(@checkpoint_name, "wb") do |f|
123
+ f.write(Marshal.dump(current_time))
124
+ end
125
+ rescue
126
+ @logger.warn(
127
+ "Unable to save the checkpoint at the specified location (#{File.expand_path(@checkpoint_name)})."
128
+ )
129
+ end
130
+ end
78
131
  end
79
132
 
80
133
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
81
134
  exceptions = []
82
135
 
136
+ from = restore_checkpoint(from)
137
+
83
138
  self.each_time(from, to) do |current_time|
84
139
  events = []
85
140
  begin
@@ -93,6 +148,8 @@ class GHAProvider
93
148
  next
94
149
  end
95
150
 
151
+ update_checkpoint(current_time)
152
+
96
153
  events.each do |event|
97
154
  skip = false
98
155
  @includes.each do |key, value|
@@ -104,7 +161,11 @@ class GHAProvider
104
161
  end
105
162
  next if skip
106
163
 
107
- yield event, current_time
164
+ if @use_json
165
+ yield event, current_time
166
+ else
167
+ yield GHArchive::Event.parse(event), current_time
168
+ end
108
169
  end
109
170
 
110
171
  @logger.info("Scanned #{current_time}")
@@ -113,6 +174,8 @@ class GHAProvider
113
174
  GC.start
114
175
  end
115
176
 
177
+ update_checkpoint(to)
178
+
116
179
  return exceptions
117
180
  end
118
181
 
@@ -196,10 +259,11 @@ class OnlineGHAProvider < GHAProvider
196
259
 
197
260
  def each(from = Time.gm(2015, 1, 1), to = Time.now)
198
261
  if @proactive
262
+ real_from = restore_checkpoint(from)
199
263
  any_ready = Thread.promise
200
264
 
201
265
  @logger.info("Proactively scheduling download tasks...")
202
- self.each_time(from, to) do |current_time|
266
+ self.each_time(real_from, to) do |current_time|
203
267
  @pool.process(current_time) do |current_time|
204
268
  cache(current_time)
205
269
  any_ready << true
@@ -0,0 +1,312 @@
1
+ require 'time'
2
+
3
+ module GHArchive
4
+ Repository = Struct.new(:id, :name, :url)
5
+ CommitAuthor = Struct.new(:email, :name)
6
+
7
+ class Entity
8
+ def initialize(payload)
9
+ @payload = payload
10
+ end
11
+ end
12
+
13
+ class Commit < Entity
14
+ def sha
15
+ @payload['sha']
16
+ end
17
+
18
+ def author
19
+ CommitAuthor.new(
20
+ @payload['author']['email'],
21
+ @payload['author']['name']
22
+ )
23
+ end
24
+
25
+ def message
26
+ @payload['message']
27
+ end
28
+
29
+ def distinct
30
+ @payload['distinct']
31
+ end
32
+
33
+ def url
34
+ @payload['url']
35
+ end
36
+ end
37
+
38
+ class User < Entity
39
+ def id
40
+ @payload['id']
41
+ end
42
+
43
+ def url
44
+ @payload['url']
45
+ end
46
+
47
+ def type
48
+ @payload['type']
49
+ end
50
+
51
+ def login
52
+ @payload['login']
53
+ end
54
+
55
+ def gravatar_id
56
+ @payload['gravatar_id']
57
+ end
58
+
59
+ def avatar_url
60
+ @payload['avatar_url']
61
+ end
62
+
63
+ def site_admin
64
+ @payload['site_admin']
65
+ end
66
+ end
67
+
68
+ class BasicIssue < Entity
69
+ def url
70
+ @payload['url']
71
+ end
72
+
73
+ def id
74
+ @payload['id']
75
+ end
76
+
77
+ def number
78
+ @payload['number']
79
+ end
80
+
81
+ def state
82
+ @payload['state']
83
+ end
84
+
85
+ def locked
86
+ @payload['locked']
87
+ end
88
+
89
+ def title
90
+ @payload['title']
91
+ end
92
+
93
+ def body
94
+ @payload['body']
95
+ end
96
+
97
+ def user
98
+ User.new(@payload['user']) rescue nil
99
+ end
100
+
101
+ def created_at
102
+ Time.parse(@payload['created_at'])
103
+ end
104
+
105
+ def updated_at
106
+ Time.parse(@payload['updated_at']) rescue nil
107
+ end
108
+
109
+ def closed_at
110
+ Time.parse(@payload['closed_at']) rescue nil
111
+ end
112
+ end
113
+
114
+ class PullRequest < BasicIssue
115
+ def merged_at
116
+ Time.parse(@payload['merged_at']) rescue nil
117
+ end
118
+
119
+ def merge_commit_sha
120
+ @payload['merge_commit_sha']
121
+ end
122
+
123
+ def merged
124
+ @payload['merged']
125
+ end
126
+
127
+ def mergeable
128
+ @payload['mergeable']
129
+ end
130
+
131
+ def mergeable_state
132
+ @payload['mergeable_state']
133
+ end
134
+
135
+ def merged_by
136
+ @payload['merged_by']
137
+ end
138
+
139
+ def comments
140
+ @payload['comments']
141
+ end
142
+
143
+ def review_comments
144
+ @payload['review_comments']
145
+ end
146
+
147
+ def commits
148
+ @payload['commits']
149
+ end
150
+
151
+ def additions
152
+ @payload['additions']
153
+ end
154
+
155
+ def deletions
156
+ @payload['deletions']
157
+ end
158
+
159
+ def changed_files
160
+ @payload['changed_files']
161
+ end
162
+
163
+ def head
164
+ @payload['head']
165
+ end
166
+
167
+ def base
168
+ @payload['base']
169
+ end
170
+ end
171
+
172
+ class Issue < BasicIssue
173
+ def labels
174
+ @payload['labels']
175
+ end
176
+ end
177
+
178
+ class BasicComment < Entity
179
+ def url
180
+ @payload['url']
181
+ end
182
+
183
+ def id
184
+ @payload['id']
185
+ end
186
+
187
+ def user
188
+ User.new(@payload['user']) rescue nil
189
+ end
190
+
191
+ def created_at
192
+ Time.parse(@payload['created_at'])
193
+ end
194
+
195
+ def updated_at
196
+ Time.parse(@payload['updated_at']) rescue nil
197
+ end
198
+
199
+ def body
200
+ @payload['body']
201
+ end
202
+ end
203
+
204
+ class PullRequestComment < BasicComment
205
+ def diff_hunk
206
+ @payload['diff_hunk']
207
+ end
208
+
209
+ def path
210
+ @payload['path']
211
+ end
212
+
213
+ def position
214
+ @payload['position']
215
+ end
216
+
217
+ def original_position
218
+ @payload['original_position']
219
+ end
220
+
221
+ def commit_id
222
+ @payload['commit_id']
223
+ end
224
+
225
+ def original_commit_id
226
+ @payload['original_commit_id']
227
+ end
228
+ end
229
+
230
+ class IssueComment < BasicComment
231
+ end
232
+
233
+ class Release < Entity
234
+ def url
235
+ @payload['url']
236
+ end
237
+
238
+ def id
239
+ @payload['id']
240
+ end
241
+
242
+ def tag_name
243
+ @payload['tag_name']
244
+ end
245
+
246
+ def target_commitish
247
+ @payload['target_commitish']
248
+ end
249
+
250
+ def name
251
+ @payload['name']
252
+ end
253
+
254
+ def draft
255
+ @payload['draft']
256
+ end
257
+
258
+ def author
259
+ User.new(@payload['author'])
260
+ end
261
+
262
+ def prerelease
263
+ @payload['prerelease']
264
+ end
265
+
266
+ def created_at
267
+ Time.parse(@payload['created_at'])
268
+ end
269
+
270
+ def published_at
271
+ Time.parse(@payload['published_at'])
272
+ end
273
+
274
+ def assets
275
+ @payload['assets']
276
+ end
277
+
278
+ def tarball_url
279
+ @payload['tarball_url']
280
+ end
281
+
282
+ def zipball_url
283
+ @payload['zipball_url']
284
+ end
285
+
286
+ def body
287
+ @payload['body']
288
+ end
289
+ end
290
+
291
+ class Page < Entity
292
+ def name
293
+ @payload['page_name']
294
+ end
295
+
296
+ def title
297
+ @payload['title']
298
+ end
299
+
300
+ def summary
301
+ @payload['summary']
302
+ end
303
+
304
+ def action
305
+ @payload['action']
306
+ end
307
+
308
+ def sha
309
+ @payload['sha']
310
+ end
311
+ end
312
+ end
@@ -0,0 +1,405 @@
1
+ require 'time'
2
+ require_relative File.expand_path('../entities', __FILE__)
3
+
4
+ module GHArchive
5
+ class Event
6
+ def self.parse(json)
7
+ IMPLEMENTATIONS.each do |event_class|
8
+ return event_class.new(json) if event_class.fits?(json)
9
+ end
10
+
11
+ return Event.new(json)
12
+ end
13
+
14
+ def initialize(json)
15
+ @json = json.freeze
16
+ @payload = json['payload']
17
+ end
18
+
19
+ def public?
20
+ @json['public']
21
+ end
22
+
23
+ def created_at
24
+ Time.parse(@json['created_at'])
25
+ end
26
+ alias :time :created_at
27
+
28
+ def actor
29
+ User.new(@json['actor'])
30
+ end
31
+
32
+ def repo
33
+ Repository.new(
34
+ @json['repo']['id'],
35
+ @json['repo']['name'],
36
+ @json['repo']['url']
37
+ )
38
+ end
39
+
40
+ def json
41
+ @json
42
+ end
43
+ end
44
+
45
+ class PushEvent < Event
46
+ def self.fits?(json)
47
+ json['type'] == "PushEvent"
48
+ end
49
+
50
+ def push_id
51
+ @payload['push_id']
52
+ end
53
+
54
+ def size
55
+ @payload['size']
56
+ end
57
+
58
+ def distinct_size
59
+ @payload['distinct_size']
60
+ end
61
+
62
+ def head
63
+ @payload['head']
64
+ end
65
+
66
+ def before
67
+ @payload['before']
68
+ end
69
+
70
+ def commits
71
+ @payload['commits'].map { |c| Commit.new(c) }
72
+ end
73
+ end
74
+
75
+ class CommitCommentEvent < Event
76
+ def self.fits?(json)
77
+ return json['type'] == "CommitCommentEvent"
78
+ end
79
+
80
+ def comment_id
81
+ @payload['comment']['id']
82
+ end
83
+
84
+ def comment_url
85
+ @payload['comment']['url']
86
+ end
87
+
88
+ def comment_user
89
+ User.new(@payload['comment']['author'])
90
+ end
91
+
92
+ def comment_position
93
+ @payload['comment']['position']
94
+ end
95
+
96
+ def comment_line
97
+ @payload['comment']['line']
98
+ end
99
+
100
+ def comment_path
101
+ @payload['comment']['path']
102
+ end
103
+
104
+ def comment_commit_id
105
+ @payload['comment']['commit_id']
106
+ end
107
+
108
+ def comment_body
109
+ @payload['comment']['body']
110
+ end
111
+
112
+ def comment_created_at
113
+ Time.parse(@payload['comment']['created_at'])
114
+ end
115
+
116
+ def comment_updated_at
117
+ Time.parse(@payload['comment']['updated_at'])
118
+ end
119
+ end
120
+
121
+ class PullRequestEvent < Event
122
+ def self.fits?(json)
123
+ return json['type'] == "PullRequestEvent"
124
+ end
125
+
126
+ def action
127
+ @payload['action']
128
+ end
129
+
130
+ def number
131
+ @payload['number']
132
+ end
133
+
134
+ def pull_request
135
+ PullRequest.new(@payload['pull_request'])
136
+ end
137
+ end
138
+
139
+ class PullRequestReviewCommentEvent < Event
140
+ def self.fits?(json)
141
+ return json['type'] == "PullRequestReviewCommentEvent"
142
+ end
143
+
144
+ def action
145
+ @payload['action']
146
+ end
147
+
148
+ def number
149
+ @payload['number']
150
+ end
151
+
152
+ def pull_request
153
+ PullRequest.new(@payload['pull_request'])
154
+ end
155
+
156
+ def comment
157
+ PullRequestComment.new(@payload['comment'])
158
+ end
159
+ end
160
+
161
+ class IssuesEvent < Event
162
+ def self.fits?(json)
163
+ return json['type'] == "IssuesEvent"
164
+ end
165
+
166
+ def action
167
+ @payload['action']
168
+ end
169
+
170
+ def issue
171
+ Issue.new(@payload['issue'])
172
+ end
173
+ end
174
+
175
+ class IssueCommentEvent < Event
176
+ def self.fits?(json)
177
+ return json['type'] == "IssueCommentEvent"
178
+ end
179
+
180
+ def action
181
+ @payload['action']
182
+ end
183
+
184
+ def issue
185
+ Issue.new(@payload['issue'])
186
+ end
187
+ end
188
+
189
+ class CreateEvent < Event
190
+ def self.fits?(json)
191
+ return json['type'] == "CreateEvent"
192
+ end
193
+
194
+ def ref
195
+ @payload['ref']
196
+ end
197
+
198
+ def ref_type
199
+ @payload['ref_type']
200
+ end
201
+
202
+ def master_branch
203
+ @payload['master_branch']
204
+ end
205
+
206
+ def description
207
+ @payload['description']
208
+ end
209
+
210
+ def pusher_type
211
+ @payload['pusher_type']
212
+ end
213
+ end
214
+
215
+ class ForkEvent < Event
216
+ def self.fits?(json)
217
+ return json['type'] == "ForkEvent"
218
+ end
219
+
220
+ def forkee_id
221
+ @payload['forkee']['id']
222
+ end
223
+
224
+ def forkee_name
225
+ @payload['forkee']['name']
226
+ end
227
+
228
+ def forkee_full_name
229
+ @payload['forkee']['full_name']
230
+ end
231
+
232
+ def forkee_owner
233
+ User.new(@payload['forkee']['owner'])
234
+ end
235
+
236
+ def forkee_private
237
+ @payload['forkee']['private']
238
+ end
239
+
240
+ def forkee_description
241
+ @payload['forkee']['description']
242
+ end
243
+
244
+ def forkee_fork
245
+ @payload['forkee']['fork']
246
+ end
247
+
248
+ def forkee_created_at
249
+ Time.parse(@payload['forkee']['created_at'])
250
+ end
251
+
252
+ def forkee_updated_at
253
+ Time.parse(@payload['forkee']['updated_at'])
254
+ end
255
+
256
+ def forkee_pushed_at
257
+ Time.parse(@payload['forkee']['pushed_at'])
258
+ end
259
+
260
+ def forkee_urls
261
+ {
262
+ 'git' => @payload['forkee']['git_url'],
263
+ 'ssh' => @payload['forkee']['ssh_url'],
264
+ 'clone' => @payload['forkee']['clone_url'],
265
+ 'svn' => @payload['forkee']['svn_url']
266
+ }
267
+ end
268
+
269
+ def forkee_homepage
270
+ Time.parse(@payload['forkee']['homepage'])
271
+ end
272
+
273
+ def forkee_size
274
+ Time.parse(@payload['forkee']['size'])
275
+ end
276
+
277
+ def forkee_stargazers_count
278
+ Time.parse(@payload['forkee']['stargazers_count'])
279
+ end
280
+
281
+ def forkee_watchers_count
282
+ Time.parse(@payload['forkee']['watchers_count'])
283
+ end
284
+
285
+ def forkee_language
286
+ Time.parse(@payload['forkee']['language'])
287
+ end
288
+
289
+ def forkee_has_issues
290
+ Time.parse(@payload['forkee']['has_issues'])
291
+ end
292
+
293
+ def forkee_has_downloads
294
+ Time.parse(@payload['forkee']['has_downloads'])
295
+ end
296
+
297
+ def forkee_has_wiki
298
+ Time.parse(@payload['forkee']['has_wiki'])
299
+ end
300
+
301
+ def forkee_has_pages
302
+ Time.parse(@payload['forkee']['has_pages'])
303
+ end
304
+
305
+ def forkee_forks_count
306
+ Time.parse(@payload['forkee']['forks_count'])
307
+ end
308
+
309
+ def forkee_mirror_url
310
+ Time.parse(@payload['forkee']['mirror_url'])
311
+ end
312
+
313
+ def forkee_open_issues_count
314
+ Time.parse(@payload['forkee']['open_issues_count'])
315
+ end
316
+
317
+ def forkee_watchers
318
+ Time.parse(@payload['forkee']['watchers'])
319
+ end
320
+
321
+ def forkee_default_branch
322
+ Time.parse(@payload['forkee']['default_branch'])
323
+ end
324
+
325
+ def forkee_public
326
+ Time.parse(@payload['forkee']['public'])
327
+ end
328
+ end
329
+
330
+ class PublicEvent < Event
331
+ def self.fits?(json)
332
+ return json['type'] == "PublicEvent"
333
+ end
334
+ end
335
+
336
+ class WatchEvent < Event
337
+ def self.fits?(json)
338
+ return json['type'] == "WatchEvent"
339
+ end
340
+
341
+ def action
342
+ @payload['action']
343
+ end
344
+ end
345
+
346
+ class DeleteEvent < Event
347
+ def self.fits?(json)
348
+ return json['type'] == "DeleteEvent"
349
+ end
350
+
351
+ def ref
352
+ @payload['ref']
353
+ end
354
+
355
+ def ref_type
356
+ @payload['ref_type']
357
+ end
358
+
359
+ def pusher_type
360
+ @payload['pusher_type']
361
+ end
362
+ end
363
+
364
+ class ReleaseEvent < Event
365
+ def self.fits?(json)
366
+ return json['type'] == "ReleaseEvent"
367
+ end
368
+
369
+ def action
370
+ @payload['action']
371
+ end
372
+
373
+ def release
374
+ Release.new(@payload['release'])
375
+ end
376
+ end
377
+
378
+ class MemberEvent < Event
379
+ def self.fits?(json)
380
+ return json['type'] == "MemberEvent"
381
+ end
382
+
383
+ def action
384
+ @payload['action']
385
+ end
386
+
387
+ def member
388
+ User.new(@payload['member'])
389
+ end
390
+ end
391
+
392
+ class GollumEvent < Event
393
+ def self.fits?(json)
394
+ return json['type'] == "GollumEvent"
395
+ end
396
+
397
+ def pages
398
+ @payload[pages].map { |p| Page.new(p) }
399
+ end
400
+ end
401
+
402
+ class Event
403
+ IMPLEMENTATIONS = ObjectSpace.each_object(Class).select { |klass| klass < self }
404
+ end
405
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gh-archive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.6'
4
+ version: '0.10'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Simone Scalabrino
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-08-13 00:00:00.000000000 Z
11
+ date: 2021-08-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: code-assertions
@@ -57,11 +57,13 @@ extensions: []
57
57
  extra_rdoc_files: []
58
58
  files:
59
59
  - lib/gh-archive.rb
60
+ - lib/gh-archive/entities.rb
61
+ - lib/gh-archive/events.rb
60
62
  homepage: https://github.com/intersimone999/gh-archive
61
63
  licenses:
62
64
  - GPL-3.0-only
63
65
  metadata: {}
64
- post_install_message:
66
+ post_install_message:
65
67
  rdoc_options: []
66
68
  require_paths:
67
69
  - lib
@@ -76,8 +78,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
76
78
  - !ruby/object:Gem::Version
77
79
  version: '0'
78
80
  requirements: []
79
- rubygems_version: 3.2.21
80
- signing_key:
81
+ rubygems_version: 3.2.22
82
+ signing_key:
81
83
  specification_version: 4
82
84
  summary: GitHub Archive mining utility
83
85
  test_files: []