ghtorrent 0.2 → 0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +12 -4
- data/bin/ght-data-retrieval +13 -3
- data/bin/ght-load +1 -1
- data/bin/ght-mirror-events +47 -17
- data/bin/ght-periodic-dump +51 -13
- data/lib/ghtorrent.rb +1 -1
- data/lib/ghtorrent/adapters/base_adapter.rb +11 -2
- data/lib/ghtorrent/adapters/mongo_persister.rb +45 -16
- data/lib/ghtorrent/api_client.rb +51 -17
- data/lib/ghtorrent/command.rb +43 -2
- data/lib/ghtorrent/ghtorrent.rb +265 -71
- data/lib/ghtorrent/migrations/001_init_schema.rb +5 -3
- data/lib/ghtorrent/migrations/{003_add_external_ref_ids.rb → 002_add_external_ref_ids.rb} +0 -0
- data/lib/ghtorrent/migrations/003_add_orgs.rb +37 -0
- data/lib/ghtorrent/migrations/004_add_commit_comments.rb +27 -0
- data/lib/ghtorrent/retriever.rb +146 -8
- data/lib/ghtorrent/settings.rb +1 -0
- data/lib/ghtorrent/utils.rb +13 -0
- data/test/callstack_test.rb +1 -1
- metadata +38 -5
- data/lib/ghtorrent/migrations/002_add_followers_created_at.rb +0 -15
@@ -12,7 +12,7 @@ Sequel.migration do
|
|
12
12
|
String :email, :null => true, :unique => true
|
13
13
|
TrueClass :hireable, :null => true
|
14
14
|
String :bio, :null => true
|
15
|
-
|
15
|
+
DateTime :created_at, :null => false, :default=>Sequel::CURRENT_TIMESTAMP
|
16
16
|
end
|
17
17
|
|
18
18
|
puts("Creating table projects")
|
@@ -23,7 +23,7 @@ Sequel.migration do
|
|
23
23
|
String :name, :null => false
|
24
24
|
String :description
|
25
25
|
String :language
|
26
|
-
|
26
|
+
DateTime :created_at, :null => false, :default=>Sequel::CURRENT_TIMESTAMP
|
27
27
|
end
|
28
28
|
|
29
29
|
puts("Creating table commits")
|
@@ -32,7 +32,8 @@ Sequel.migration do
|
|
32
32
|
String :sha, :size => 40, :unique => true
|
33
33
|
foreign_key :author_id, :users
|
34
34
|
foreign_key :committer_id, :users
|
35
|
-
|
35
|
+
foreign_key :project_id, :projects
|
36
|
+
DateTime :created_at, :null => false, :default=>Sequel::CURRENT_TIMESTAMP
|
36
37
|
end
|
37
38
|
|
38
39
|
puts("Creating table commit_parents")
|
@@ -46,6 +47,7 @@ Sequel.migration do
|
|
46
47
|
create_table :followers do
|
47
48
|
foreign_key :user_id, :users, :null => false
|
48
49
|
foreign_key :follower_id, :users, :null => false
|
50
|
+
DateTime :created_at, :null => false, :default=>Sequel::CURRENT_TIMESTAMP
|
49
51
|
primary_key [:user_id, :follower_id]
|
50
52
|
end
|
51
53
|
end
|
File without changes
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'sequel'
|
2
|
+
|
3
|
+
Sequel.migration do
|
4
|
+
up do
|
5
|
+
|
6
|
+
puts("Adding organization descriminator field to table users")
|
7
|
+
|
8
|
+
alter_table :users do
|
9
|
+
add_column :type, "enum('USR', 'ORG')", :null => false
|
10
|
+
end
|
11
|
+
|
12
|
+
puts("Updating users with default values")
|
13
|
+
DB.transaction(:rollback => :reraise, :isolation => :committed) do
|
14
|
+
DB[:users].update(:type => "USR")
|
15
|
+
end
|
16
|
+
|
17
|
+
puts("Creating table organization-members")
|
18
|
+
|
19
|
+
create_table :organization_members do
|
20
|
+
foreign_key :org_id, :users, :null => false
|
21
|
+
foreign_key :user_id, :users, :null => false
|
22
|
+
primary_key [:org_id, :user_id]
|
23
|
+
DateTime :created_at, :null => false,
|
24
|
+
:default => Sequel::CURRENT_TIMESTAMP
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
down do
|
29
|
+
puts("Droping table organization-members")
|
30
|
+
drop_table :organization_members
|
31
|
+
|
32
|
+
puts("Droping organization descriminator field to table users")
|
33
|
+
alter_table :users do
|
34
|
+
drop_column :type
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'sequel'
|
2
|
+
|
3
|
+
Sequel.migration do
|
4
|
+
up do
|
5
|
+
|
6
|
+
puts("Adding table commit comments")
|
7
|
+
|
8
|
+
create_table :commit_comments do
|
9
|
+
primary_key :id
|
10
|
+
foreign_key :commit_id, :commits, :null => false
|
11
|
+
foreign_key :user_id, :users, :null => false
|
12
|
+
String :body
|
13
|
+
Integer :line, :null => true
|
14
|
+
Integer :position, :null => true
|
15
|
+
Integer :comment_id, :null => false, :unique => true
|
16
|
+
String :ext_ref_id, :null => false, :size => 24, :default => "0"
|
17
|
+
DateTime :created_at, :null => false,
|
18
|
+
:default => Sequel::CURRENT_TIMESTAMP
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
down do
|
23
|
+
|
24
|
+
drop_table :commit_comments
|
25
|
+
|
26
|
+
end
|
27
|
+
end
|
data/lib/ghtorrent/retriever.rb
CHANGED
@@ -29,8 +29,8 @@
|
|
29
29
|
module GHTorrent
|
30
30
|
module Retriever
|
31
31
|
|
32
|
+
include GHTorrent::Utils
|
32
33
|
include GHTorrent::APIClient
|
33
|
-
include GHTorrent::Settings
|
34
34
|
|
35
35
|
def initialize(settings)
|
36
36
|
super(settings)
|
@@ -44,16 +44,18 @@ module GHTorrent
|
|
44
44
|
url = ghurl "users/#{user}"
|
45
45
|
u = api_request(url)
|
46
46
|
|
47
|
-
if u.
|
47
|
+
if u.empty?
|
48
48
|
throw GHTorrentException.new("Cannot find user #{user}")
|
49
49
|
end
|
50
50
|
|
51
51
|
unq = @persister.store(:users, u)
|
52
52
|
u[@uniq] = unq
|
53
|
-
|
53
|
+
what = user_type(u['type'])
|
54
|
+
info "Retriever: New #{what} #{user}"
|
54
55
|
u
|
55
56
|
else
|
56
|
-
|
57
|
+
what = user_type(stored_user.first['type'])
|
58
|
+
debug "Retriever: Already got #{what} #{user}"
|
57
59
|
stored_user.first
|
58
60
|
end
|
59
61
|
end
|
@@ -63,10 +65,13 @@ module GHTorrent
|
|
63
65
|
# http://develop.github.com/p/users.html
|
64
66
|
def retrieve_user_byemail(email, name)
|
65
67
|
url = ghurl_v2("user/email/#{email}")
|
66
|
-
api_request(url)
|
68
|
+
r = api_request(url)
|
69
|
+
|
70
|
+
return nil if r.empty?
|
71
|
+
r
|
67
72
|
end
|
68
73
|
|
69
|
-
def
|
74
|
+
def retrieve_user_followers(user)
|
70
75
|
stored_followers = @persister.find(:followers, {'follows' => user})
|
71
76
|
|
72
77
|
followers = paged_api_request(ghurl "users/#{user}/followers")
|
@@ -88,6 +93,7 @@ module GHTorrent
|
|
88
93
|
@persister.find(:followers, {'follows' => user})
|
89
94
|
end
|
90
95
|
|
96
|
+
# Retrieve a single commit from a repo
|
91
97
|
def retrieve_commit(repo, sha, user)
|
92
98
|
commit = @persister.find(:commits, {'sha' => "#{sha}"})
|
93
99
|
|
@@ -95,7 +101,7 @@ module GHTorrent
|
|
95
101
|
url = ghurl "repos/#{user}/#{repo}/commits/#{sha}"
|
96
102
|
c = api_request(url)
|
97
103
|
|
98
|
-
if c.
|
104
|
+
if c.empty?
|
99
105
|
throw GHTorrentException.new("Cannot find commit #{user}/#{repo}/#{sha}")
|
100
106
|
end
|
101
107
|
|
@@ -109,6 +115,31 @@ module GHTorrent
|
|
109
115
|
end
|
110
116
|
end
|
111
117
|
|
118
|
+
# Retrieve all project commits or 500 (whatever comes first),
|
119
|
+
# starting from the provided +sha+
|
120
|
+
def retrieve_commits(repo, sha, user)
|
121
|
+
last_sha = if sha.nil?
|
122
|
+
"master"
|
123
|
+
else
|
124
|
+
sha
|
125
|
+
end
|
126
|
+
|
127
|
+
url = ghurl "repos/#{user}/#{repo}/commits?last_sha=#{last_sha}"
|
128
|
+
commits = paged_api_request(url, config(:mirror_commit_pages_new_repo))
|
129
|
+
|
130
|
+
commits.reduce(Array.new) do |acc, c|
|
131
|
+
commit = @persister.find(:commits, {'sha' => "#{c['sha']}"})
|
132
|
+
|
133
|
+
if commit.empty?
|
134
|
+
acc << retrieve_commit(repo, c['sha'], user)
|
135
|
+
else
|
136
|
+
debug "Retriever: Already got commit #{repo} -> #{c['sha']}"
|
137
|
+
end
|
138
|
+
acc
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
|
112
143
|
def retrieve_repo(user, repo)
|
113
144
|
stored_repo = @persister.find(:repos, {'owner.login' => user,
|
114
145
|
'name' => repo })
|
@@ -116,7 +147,7 @@ module GHTorrent
|
|
116
147
|
url = ghurl "repos/#{user}/#{repo}"
|
117
148
|
r = api_request(url)
|
118
149
|
|
119
|
-
if r.
|
150
|
+
if r.empty?
|
120
151
|
throw GHTorrentException.new("Cannot find repo #{user}/#{repo}")
|
121
152
|
end
|
122
153
|
|
@@ -130,6 +161,94 @@ module GHTorrent
|
|
130
161
|
end
|
131
162
|
end
|
132
163
|
|
164
|
+
# Retrieve organizations the provided user participates into
|
165
|
+
def retrieve_orgs(user)
|
166
|
+
url = ghurl "users/#{user}/orgs"
|
167
|
+
orgs = paged_api_request(url)
|
168
|
+
orgs.map{|o| retrieve_org(o['login'])}
|
169
|
+
end
|
170
|
+
|
171
|
+
# Retrieve a single organization
|
172
|
+
def retrieve_org(org)
|
173
|
+
retrieve_user_byusername(org)
|
174
|
+
end
|
175
|
+
|
176
|
+
# Retrieve organization members
|
177
|
+
def retrieve_org_members(org)
|
178
|
+
url = ghurl "orgs/#{org}/members"
|
179
|
+
stored_org_members = @persister.find(:org_members, {'org' => org})
|
180
|
+
|
181
|
+
org_members = paged_api_request(ghurl "orgs/#{org}/members")
|
182
|
+
org_members.each do |x|
|
183
|
+
x['org'] = org
|
184
|
+
|
185
|
+
exists = !stored_org_members.find { |f|
|
186
|
+
f['org'] == user && f['login'] == x['login']
|
187
|
+
}.nil?
|
188
|
+
|
189
|
+
if not exists
|
190
|
+
@persister.store(:org_members, x)
|
191
|
+
info "Retriever: Added member #{org} -> #{x['login']}"
|
192
|
+
else
|
193
|
+
debug "Retriever: Member #{org} -> #{x['login']} exists"
|
194
|
+
end
|
195
|
+
end
|
196
|
+
|
197
|
+
@persister.find(:org_members, {'org' => org}).map{|o| retrieve_org(o['login'])}
|
198
|
+
end
|
199
|
+
|
200
|
+
# Retrieve all commit comments for a specific repository
|
201
|
+
def retrieve_repo_comments(repo, user)
|
202
|
+
commit_comments = paged_api_request(ghurl "repos/#{user}/#{repo}/comments")
|
203
|
+
stored_comments = @persister.find(:commit_comments,
|
204
|
+
{'repo' => repo,
|
205
|
+
'user' => user})
|
206
|
+
store_commit_comments(repo, user, commit_comments, stored_comments)
|
207
|
+
end
|
208
|
+
|
209
|
+
# Retrieve all comments for a single commit
|
210
|
+
def retrieve_commit_comments(user, repo, sha, reentrer = false)
|
211
|
+
# Optimization: if no commits comments are registered for the repo
|
212
|
+
# get them en masse
|
213
|
+
#items = @persister.count(:commit_comments, {'repo' => repo, 'user' => user})
|
214
|
+
#if items == 0 && !reentrer
|
215
|
+
# retrieve_repo_comments(repo, user)
|
216
|
+
# return retrieve_commit_comments(user, repo, sha, true)
|
217
|
+
#end
|
218
|
+
|
219
|
+
stored_comments = @persister.find(:commit_comments, {'commit_id' => sha})
|
220
|
+
retrieved_comments = paged_api_request(ghurl "repos/#{user}/#{repo}/commits/#{sha}/comments")
|
221
|
+
store_commit_comments(repo, user, stored_comments, retrieved_comments)
|
222
|
+
@persister.find(:commit_comments, {'commit_id' => sha})
|
223
|
+
end
|
224
|
+
|
225
|
+
# Retrieve a single comment
|
226
|
+
def retrieve_commit_comment(user, repo, id, reentrer = false)
|
227
|
+
# Optimization: if no commits comments are registered for the repo
|
228
|
+
# get them en masse
|
229
|
+
#items = @persister.count(:commit_comments, {'repo' => repo, 'user' => user})
|
230
|
+
#if items == 0 && !reentrer
|
231
|
+
# retrieve_repo_comments(repo, user)
|
232
|
+
# return retrieve_commit_comment(user, repo, id)
|
233
|
+
#end
|
234
|
+
|
235
|
+
comment = @persister.find(:commit_comments, {'repo' => repo,
|
236
|
+
'user' => user, 'id' => id})
|
237
|
+
if comment.empty?
|
238
|
+
r = api_request(ghurl "repos/#{user}/#{repo}/comments/#{id}")
|
239
|
+
r['repo'] = repo
|
240
|
+
r['user'] = user
|
241
|
+
@persister.store(:commit_comments, r)
|
242
|
+
info "Retriever: Added commit comment #{r['commit_id']} -> #{r['id']}"
|
243
|
+
r[@uniq] = r['_id']
|
244
|
+
r
|
245
|
+
else
|
246
|
+
debug "Retriever: Commit comment #{comment['commit_id']} -> #{comment['id']} exists"
|
247
|
+
comment[@uniq] = comment['_id']
|
248
|
+
comment
|
249
|
+
end
|
250
|
+
end
|
251
|
+
|
133
252
|
# Get current Github events
|
134
253
|
def get_events
|
135
254
|
api_request "https://api.github.com/events"
|
@@ -144,5 +263,24 @@ module GHTorrent
|
|
144
263
|
def ghurl_v2(path)
|
145
264
|
config(:mirror_urlbase_v2) + path
|
146
265
|
end
|
266
|
+
|
267
|
+
def store_commit_comments(repo, user, stored_comments, retrieved_comments)
|
268
|
+
retrieved_comments.each do |x|
|
269
|
+
|
270
|
+
exists = !stored_comments.find { |f|
|
271
|
+
f['commit_id'] == x['commit_id'] && f['id'] == x['id']
|
272
|
+
}.nil?
|
273
|
+
|
274
|
+
unless exists
|
275
|
+
x['repo'] = repo
|
276
|
+
x['user'] = user
|
277
|
+
|
278
|
+
@persister.store(:commit_comments, x)
|
279
|
+
info "Retriever: Added commit comment #{x['commit_id']} -> #{x['id']}"
|
280
|
+
else
|
281
|
+
debug "Retriever: Commit comment #{x['commit_id']} -> #{x['id']} exists"
|
282
|
+
end
|
283
|
+
end
|
284
|
+
end
|
147
285
|
end
|
148
286
|
end
|
data/lib/ghtorrent/settings.rb
CHANGED
data/lib/ghtorrent/utils.rb
CHANGED
@@ -28,6 +28,11 @@
|
|
28
28
|
|
29
29
|
module GHTorrent
|
30
30
|
module Utils
|
31
|
+
|
32
|
+
def self.included(other)
|
33
|
+
other.extend self
|
34
|
+
end
|
35
|
+
|
31
36
|
# Read a value whose format is "foo.bar.baz" from a hierarchical map
|
32
37
|
# (the result of a JSON parse or a Mongo query), where a dot represents
|
33
38
|
# one level deep in the result hierarchy.
|
@@ -54,5 +59,13 @@ module GHTorrent
|
|
54
59
|
end
|
55
60
|
end
|
56
61
|
end
|
62
|
+
|
63
|
+
def user_type(type)
|
64
|
+
if type == "User"
|
65
|
+
"USR"
|
66
|
+
else
|
67
|
+
"ORG"
|
68
|
+
end
|
69
|
+
end
|
57
70
|
end
|
58
71
|
end
|
data/test/callstack_test.rb
CHANGED
metadata
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ghtorrent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 13
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
version: "0.
|
8
|
+
- 3
|
9
|
+
version: "0.3"
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Georgios Gousios
|
@@ -107,6 +107,38 @@ dependencies:
|
|
107
107
|
version: "3.35"
|
108
108
|
type: :runtime
|
109
109
|
version_requirements: *id006
|
110
|
+
- !ruby/object:Gem::Dependency
|
111
|
+
name: sqlite3-ruby
|
112
|
+
prerelease: false
|
113
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
114
|
+
none: false
|
115
|
+
requirements:
|
116
|
+
- - ">="
|
117
|
+
- !ruby/object:Gem::Version
|
118
|
+
hash: 31
|
119
|
+
segments:
|
120
|
+
- 1
|
121
|
+
- 3
|
122
|
+
- 2
|
123
|
+
version: 1.3.2
|
124
|
+
type: :runtime
|
125
|
+
version_requirements: *id007
|
126
|
+
- !ruby/object:Gem::Dependency
|
127
|
+
name: daemons
|
128
|
+
prerelease: false
|
129
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
130
|
+
none: false
|
131
|
+
requirements:
|
132
|
+
- - ">="
|
133
|
+
- !ruby/object:Gem::Version
|
134
|
+
hash: 3
|
135
|
+
segments:
|
136
|
+
- 1
|
137
|
+
- 1
|
138
|
+
- 8
|
139
|
+
version: 1.1.8
|
140
|
+
type: :runtime
|
141
|
+
version_requirements: *id008
|
110
142
|
description: |-
|
111
143
|
A library and a collection of associated programs
|
112
144
|
to mirror and process Github data
|
@@ -128,8 +160,9 @@ files:
|
|
128
160
|
- lib/ghtorrent/ghtorrent.rb
|
129
161
|
- lib/ghtorrent/logging.rb
|
130
162
|
- lib/ghtorrent/migrations/001_init_schema.rb
|
131
|
-
- lib/ghtorrent/migrations/
|
132
|
-
- lib/ghtorrent/migrations/
|
163
|
+
- lib/ghtorrent/migrations/002_add_external_ref_ids.rb
|
164
|
+
- lib/ghtorrent/migrations/003_add_orgs.rb
|
165
|
+
- lib/ghtorrent/migrations/004_add_commit_comments.rb
|
133
166
|
- lib/ghtorrent/persister.rb
|
134
167
|
- lib/ghtorrent/retriever.rb
|
135
168
|
- lib/ghtorrent/settings.rb
|