ghtorrent 0.4 → 0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. data/CHANGELOG +24 -0
  2. data/Gemfile +17 -0
  3. data/Gemfile.lock +40 -0
  4. data/README.md +23 -22
  5. data/bin/ght-data-retrieval +66 -24
  6. data/bin/ght-load +41 -19
  7. data/bin/ght-mirror-events +13 -16
  8. data/bin/ght-rm-dupl +119 -77
  9. data/lib/ghtorrent.rb +14 -4
  10. data/lib/ghtorrent/adapters/base_adapter.rb +17 -5
  11. data/lib/ghtorrent/adapters/mongo_persister.rb +122 -56
  12. data/lib/ghtorrent/api_client.rb +151 -16
  13. data/lib/ghtorrent/bson_orderedhash.rb +23 -0
  14. data/lib/ghtorrent/cache.rb +97 -0
  15. data/lib/ghtorrent/command.rb +43 -25
  16. data/lib/ghtorrent/gh_torrent_exception.rb +6 -0
  17. data/lib/ghtorrent/ghtorrent.rb +615 -164
  18. data/lib/ghtorrent/hash.rb +11 -0
  19. data/lib/ghtorrent/logging.rb +11 -7
  20. data/lib/ghtorrent/migrations/001_init_schema.rb +3 -3
  21. data/lib/ghtorrent/migrations/002_add_external_ref_ids.rb +2 -0
  22. data/lib/ghtorrent/migrations/003_add_orgs.rb +4 -1
  23. data/lib/ghtorrent/migrations/004_add_commit_comments.rb +4 -2
  24. data/lib/ghtorrent/migrations/005_add_repo_collaborators.rb +2 -0
  25. data/lib/ghtorrent/migrations/006_add_watchers.rb +2 -0
  26. data/lib/ghtorrent/migrations/007_add_pull_requests.rb +64 -0
  27. data/lib/ghtorrent/migrations/008_add_project_unq.rb +23 -0
  28. data/lib/ghtorrent/migrations/009_add_project_commit.rb +27 -0
  29. data/lib/ghtorrent/migrations/010_add_forks.rb +28 -0
  30. data/lib/ghtorrent/migrations/mysql_defaults.rb +6 -0
  31. data/lib/ghtorrent/persister.rb +3 -0
  32. data/lib/ghtorrent/retriever.rb +298 -102
  33. data/lib/ghtorrent/settings.rb +20 -1
  34. data/lib/ghtorrent/time.rb +5 -0
  35. data/lib/ghtorrent/utils.rb +22 -4
  36. data/lib/version.rb +5 -0
  37. metadata +173 -145
  38. data/lib/ghtorrent/call_stack.rb +0 -91
@@ -0,0 +1,6 @@
1
+ # Base exception for all GHTorrent exceptions
2
+ module GHTorrent
3
+ class GHTorrentException < Exception
4
+
5
+ end
6
+ end
@@ -1,5 +1,11 @@
1
1
  require 'sequel'
2
2
 
3
+ require 'ghtorrent/time'
4
+ require 'ghtorrent/logging'
5
+ require 'ghtorrent/settings'
6
+ require 'ghtorrent/retriever'
7
+ require 'ghtorrent/persister'
8
+
3
9
  module GHTorrent
4
10
  class Mirror
5
11
 
@@ -8,32 +14,34 @@ module GHTorrent
8
14
  include GHTorrent::Retriever
9
15
  include GHTorrent::Persister
10
16
 
11
- attr_reader :settings, :persister
12
-
13
- def initialize(configuration)
17
+ attr_reader :settings, :persister, :ext_uniq, :logger
14
18
 
15
- @settings = YAML::load_file configuration
16
- super(@settings)
19
+ def initialize(settings)
20
+ @settings = settings
17
21
  @ext_uniq = config(:uniq_id)
18
22
  @logger = Logger.new(STDOUT)
19
- @persister = connect(:mongo, @settings)
20
- get_db
21
23
  end
22
24
 
23
25
  # db related functions
24
26
  def get_db
25
-
26
- @db = Sequel.connect(config(:sql_url))
27
-
27
+ Sequel.single_threaded = true
28
+ @db = Sequel.connect(config(:sql_url), :encoding => 'utf8')
29
+ #@db.loggers << @logger
28
30
  if @db.tables.empty?
29
31
  dir = File.join(File.dirname(__FILE__), 'migrations')
30
32
  puts "Database empty, running migrations from #{dir}"
31
33
  Sequel.extension :migration
32
34
  Sequel::Migrator.apply(@db, dir)
33
35
  end
36
+
34
37
  @db
35
38
  end
36
39
 
40
+ def persister
41
+ @persister ||= connect(:mongo, @settings)
42
+ @persister
43
+ end
44
+
37
45
  ##
38
46
  # Ensure that a user exists, or fetch its latest state from Github
39
47
  # ==Parameters:
@@ -46,7 +54,7 @@ module GHTorrent
46
54
  end
47
55
 
48
56
  transaction do
49
- ensure_repo(user, repo)
57
+ ensure_user(user, true, true)
50
58
  ensure_commit(repo, sha, user)
51
59
  end
52
60
  end
@@ -60,7 +68,6 @@ module GHTorrent
60
68
  # [date_added] The timestamp that the add event took place
61
69
  def get_project_member(owner, repo, new_member, date_added)
62
70
  transaction do
63
- ensure_repo(owner, repo)
64
71
  ensure_project_member(owner, repo, new_member, date_added)
65
72
  end
66
73
  end
@@ -74,7 +81,6 @@ module GHTorrent
74
81
  # [date_added] The timestamp that the add event took place
75
82
  def get_commit_comment(user, repo, comment_id, date_added)
76
83
  transaction do
77
- ensure_repo(user, repo)
78
84
  ensure_commit_comment(user, repo, comment_id, date_added)
79
85
  end
80
86
  end
@@ -88,7 +94,6 @@ module GHTorrent
88
94
  # [date_added] The timestamp that the add event took place
89
95
  def get_watcher(owner, repo, watcher, date_added)
90
96
  transaction do
91
- ensure_repo(owner, repo)
92
97
  ensure_watcher(owner, repo, watcher, date_added)
93
98
  end
94
99
  end
@@ -101,20 +106,84 @@ module GHTorrent
101
106
  # [date_added] The timestamp that the add event took place
102
107
  def get_follower(follower, followed, date_added)
103
108
  transaction do
104
- ensure_user(follower, false, false)
105
- ensure_user(followed, false, false)
106
- ensure_user_followers(followed, date_added)
109
+ ensure_user(follower, true, true)
110
+ ensure_user(followed, true, true)
111
+ ensure_user_follower(followed, follower, date_added)
112
+ end
113
+ end
114
+
115
+ ##
116
+ # Get a pull request and record the changes it affects
117
+ # ==Parameters:
118
+ # [owner] The owner of the repository to which the pullreq will be applied
119
+ # [repo] The repository to which the pullreq will be applied
120
+ # [pullreq_id] The ID of the pull request relative to the repository
121
+ def get_pull_request(owner, repo, pullreq_id, state, created_at)
122
+ transaction do
123
+ ensure_pull_request(owner, repo, pullreq_id, true, true, state, created_at)
124
+ end
125
+ end
126
+
127
+ ##
128
+ # Retrieve details about a project fork (including the forked project)
129
+ # ==Parameters:
130
+ # [owner] The login of the repository owner
131
+ # [repo] The name of the repository
132
+ # [fork_id] The fork item id
133
+ # [date_added] The timestamp that the add event took place
134
+ def get_fork(owner, repo, fork_id, date_added)
135
+ transaction do
136
+ ensure_fork(owner, repo, fork_id, date_added)
107
137
  end
108
138
  end
109
139
 
140
+ ##
141
+ # Retrieve a pull request review comment
142
+ # ==Parameters:
143
+ # [owner] The login of the repository owner
144
+ # [repo] The name of the repository
145
+ # [pullreq_id, comment_id] The ID of the pull request and of the review comment on it
146
+ # [created_at] The timestamp that the comment was created
147
+ def get_pullreq_comment(owner, repo, pullreq_id, comment_id, created_at)
148
+ transaction do
149
+ ensure_pullreq_comment(owner, repo, pullreq_id, comment_id, created_at)
150
+ end
151
+ end
152
+
153
+ ##
154
+ # Retrieve an issue comment (not implemented yet: currently always raises)
155
+ # ==Parameters:
156
+ # [owner] The login of the repository owner
157
+ # [repo] The name of the repository
158
+ # [issue_id, comment_id] The ID of the issue and of the comment on it
159
+ # [created_at] The timestamp that the comment was created
160
+ def get_issue_comment(owner, repo, issue_id, comment_id, created_at)
161
+ transaction do
162
+ raise "Not implemented"
163
+ #ensure_pullreq_comment(owner, repo, pullreq_id, comment_id, created_at)
164
+ end
165
+ end
166
+
167
+
110
168
  ##
111
169
  # Make sure a commit exists
112
170
  #
113
171
  def ensure_commit(repo, sha, user, comments = true)
172
+ ensure_repo(user, repo)
114
173
  c = retrieve_commit(repo, sha, user)
174
+
175
+ if c.nil?
176
+ warn "GHTorrent: Commit #{user}/#{repo} -> #{sha} does not exist"
177
+ return
178
+ end
179
+
115
180
  stored = store_commit(c, repo, user)
116
181
  ensure_parents(c)
117
- ensure_commit_comments(user, repo, sha) if comments
182
+ if not c['commit']['comment_count'].nil? \
183
+ and c['commit']['comment_count'] > 0
184
+ ensure_commit_comments(user, repo, sha) if comments
185
+ end
186
+ ensure_repo_commit(user, repo, sha)
118
187
  stored
119
188
  end
120
189
 
@@ -162,13 +231,41 @@ module GHTorrent
162
231
 
163
232
  parents.insert(:commit_id => this[:id],
164
233
  :parent_id => parent[:id])
165
- info "Added parent #{parent[:sha]} to commit #{this[:sha]}"
234
+ info "GHTorrent: Added parent #{parent[:sha]} to commit #{this[:sha]}"
166
235
  else
167
- info "Parent #{parent[:sha]} for commit #{this[:sha]} exists"
236
+ debug "GHTorrent: Parent #{parent[:sha]} for commit #{this[:sha]} exists"
168
237
  end
169
238
  end
170
239
  end
171
240
 
241
+ ##
242
+ # Make sure that a commit has been associated with the provided repo
243
+ # ==Parameters:
244
+ # [user] The user that owns the repo this commit has been submitted to
245
+ # [repo] The repo receiving the commit
246
+ # [sha] The commit SHA
247
+ def ensure_repo_commit(user, repo, sha)
248
+ userid = @db[:users].first(:login => user)[:id]
249
+ projectid = @db[:projects].first(:owner_id => userid,
250
+ :name => repo)[:id]
251
+ commitid = @db[:commits].first(:sha => sha)[:id]
252
+
253
+ exists = @db[:project_commits].first(:project_id => projectid,
254
+ :commit_id => commitid)
255
+ if exists.nil?
256
+ @db[:project_commits].insert(
257
+ :project_id => projectid,
258
+ :commit_id => commitid
259
+ )
260
+ info "GHTorrent: Added commit #{user}/#{repo} -> #{sha}"
261
+ @db[:project_commits].first(:project_id => projectid,
262
+ :commit_id => commitid)
263
+ else
264
+ debug "GHTorrent: Commit #{user}/#{repo} -> #{sha} exists"
265
+ exists
266
+ end
267
+ end
268
+
172
269
  ##
173
270
  # Add (or update) an entry for a commit author. This method uses information
174
271
  # in the JSON object returned by Github to add (or update) a user in the
@@ -193,15 +290,14 @@ module GHTorrent
193
290
  login = githubuser['login'] unless githubuser.nil?
194
291
 
195
292
  if login.nil?
196
- ensure_user("#{name}<#{email}>", true, false)
293
+ ensure_user("#{name}<#{email}>", false, false)
197
294
  else
198
295
  dbuser = users.first(:login => login)
199
296
  byemail = users.first(:email => email)
200
297
  if dbuser.nil?
201
298
  # We do not have the user in the database yet. Add him
202
- added = ensure_user(login, true, false)
299
+ added = ensure_user(login, false, false)
203
300
  if byemail.nil?
204
- #
205
301
  users.filter(:login => login).update(:name => name) if added[:name].nil?
206
302
  users.filter(:login => login).update(:email => email) if added[:email].nil?
207
303
  else
@@ -215,8 +311,6 @@ module GHTorrent
215
311
  :login => login,
216
312
  :company => added['company'],
217
313
  :location => added['location'],
218
- :hireable => added['hireable'],
219
- :bio => added['bio'],
220
314
  :created_at => added['created_at']
221
315
  )
222
316
  end
@@ -259,7 +353,6 @@ module GHTorrent
259
353
  return u
260
354
  end
261
355
 
262
-
263
356
  ##
264
357
  # Ensure that a user exists, or fetch its latest state from Github
265
358
  # ==Parameters:
@@ -275,6 +368,12 @@ module GHTorrent
275
368
 
276
369
  if usr.nil?
277
370
  u = retrieve_user_byusername(user)
371
+
372
+ if u.nil?
373
+ warn "GHTorrent: User #{user} does not exist"
374
+ return
375
+ end
376
+
278
377
  email = unless u['email'].nil?
279
378
  if u['email'].strip == "" then
280
379
  nil
@@ -283,49 +382,16 @@ module GHTorrent
283
382
  end
284
383
  end
285
384
 
286
- if not email.nil?
287
- # Check whether a user has been added by email before
288
- byemail = users.first(:email => email)
289
- unless byemail.nil?
290
- users.filter(:email => email).update(:login => u['login'],
291
- :name => u['name'],
292
- :company => u['company'],
293
- :hireable => boolean(u['hirable']),
294
- :bio => u['bio'],
295
- :location => u['location'],
296
- :type => user_type(u['type']),
297
- :created_at => date(u['created_at']),
298
- :ext_ref_id => u[@ext_uniq]
299
- )
300
- info "GHTorrent: Updating user #{user} (email #{email})"
301
- else
302
- users.insert(:login => u['login'],
303
- :name => u['name'],
304
- :company => u['company'],
305
- :email => email,
306
- :hireable => boolean(u['hirable']),
307
- :bio => u['bio'],
308
- :location => u['location'],
309
- :type => user_type(u['type']),
310
- :created_at => date(u['created_at']),
311
- :ext_ref_id => u[@ext_uniq])
312
-
313
- info "GHTorrent: New user #{user}"
314
- end
315
- else
316
- users.insert(:login => u['login'],
317
- :name => u['name'],
318
- :company => u['company'],
319
- :email => email,
320
- :hireable => boolean(u['hirable']),
321
- :bio => u['bio'],
322
- :location => u['location'],
323
- :type => user_type(u['type']),
324
- :created_at => date(u['created_at']),
325
- :ext_ref_id => u[@ext_uniq])
385
+ users.insert(:login => u['login'],
386
+ :name => u['name'],
387
+ :company => u['company'],
388
+ :email => email,
389
+ :location => u['location'],
390
+ :type => user_type(u['type']),
391
+ :created_at => date(u['created_at']),
392
+ :ext_ref_id => u[@ext_uniq])
326
393
 
327
- info "GHTorrent: New user #{user}"
328
- end
394
+ info "GHTorrent: New user #{user}"
329
395
  users.first(:login => user)
330
396
  else
331
397
  debug "GHTorrent: User #{user} exists"
@@ -340,37 +406,62 @@ module GHTorrent
340
406
  #
341
407
  # ==Parameters:
342
408
  # [followed] The login of the user whose followers are looked up
343
- def ensure_user_followers(user, date_added = nil)
409
+ def ensure_user_followers(followed, date_added = nil)
410
+ curuser = ensure_user(followed, false, false)
411
+ time = curuser[:created_at]
412
+ followers = @db.from(:followers, :users).\
413
+ where(:followers__follower_id => :users__id).
414
+ where(:followers__user_id => curuser[:id]).select(:login).all
415
+
416
+ retrieve_user_followers(followed).reduce([]) do |acc, x|
417
+ if followers.find {|y| y[:login] == x['login']}.nil?
418
+ acc << x
419
+ else
420
+ acc
421
+ end
422
+ end.map { |x| ensure_user_follower(followed, x['login'], time) }
423
+ end
424
+
425
+ ##
426
+ # Make sure that a user follows another one
427
+ def ensure_user_follower(followed, follower, date_added)
428
+ follower_user = ensure_user(follower, false, false)
429
+ followed_user = ensure_user(followed, false, false)
430
+
431
+ if followed_user.nil? or follower_user.nil?
432
+ warn "Could not add follower #{follower} to #{followed}"
433
+ return
434
+ end
435
+
344
436
  followers = @db[:followers]
345
- userid = @db[:users].first(:login => user)[:id]
437
+ followed_id = follower_user[:id]
438
+ follower_id = followed_user[:id]
346
439
 
347
- retrieved = retrieve_user_followers(user)
348
- retrieved.each { |f|
349
- follower = f['login']
350
- ensure_user(user, false, false)
351
- ensure_user(follower, false, false)
440
+ follower_exists = followers.first(:user_id => followed_id,
441
+ :follower_id => follower_id)
352
442
 
353
- followerid = @db[:users].first(:login => follower)[:id]
443
+ if follower_exists.nil?
444
+ added = if date_added.nil? then Time.now else date_added end
445
+ retrieved = retrieve_user_follower(followed, follower)
354
446
 
447
+ if retrieved.nil?
448
+ warn "Follower #{follower} does not exist for user #{followed}"
449
+ return
450
+ end
355
451
 
356
- if followers.first(:user_id => userid, :follower_id => followerid).nil?
357
- added = if date_added.nil? then Time.now else date_added end
358
- followers.insert(:user_id => userid,
359
- :follower_id => followerid,
360
- :created_at => added,
361
- :ext_ref_id => f[@ext_uniq]
362
- )
363
- info "GHTorrent: User #{follower} follows #{user}"
364
- else
365
- unless date_added.nil?
366
- followers.filter(:user_id => userid,
367
- :follower_id => followerid).\
368
- update(:created_at => date(date_added))
369
- info "GHTorrent: Updated follower #{follower} -> #{user}"
370
- end
371
- debug "GHTorrent: User #{follower} already follows #{user}"
452
+ followers.insert(:user_id => followed_id,
453
+ :follower_id => follower_id,
454
+ :created_at => added,
455
+ :ext_ref_id => retrieved[@ext_uniq])
456
+ info "GHTorrent: User #{follower} follows #{followed}"
457
+ else
458
+ unless date_added.nil?
459
+ followers.filter(:user_id => followed_id,
460
+ :follower_id => follower_id)\
461
+ .update(:created_at => date(date_added))
462
+ debug "GHTorrent: Updating follower #{followed} -> #{follower}"
372
463
  end
373
- }
464
+ end
374
465
  end
375
466
 
376
467
  ##
@@ -379,8 +470,7 @@ module GHTorrent
379
470
  #
380
471
  # ==Parameters:
381
472
  # [email] The email to lookup the user by
382
- # [email] The user's name
383
- # [followers] If true, the user's followers will be retrieved
473
+ # [name] The user's name
384
474
  # == Returns:
385
475
  # If the user can be retrieved, it is returned as a Hash. Otherwise,
386
476
  # the result is nil
@@ -392,27 +482,27 @@ module GHTorrent
392
482
 
393
483
  u = retrieve_user_byemail(email, name)
394
484
 
395
- if u.nil? or u['user'].nil? or u['user']['login'].nil?
485
+ if u.nil? or u['login'].nil?
396
486
  debug "GHTorrent: Cannot find #{email} through search API query"
487
+ login = (0...8).map { 65.+(rand(25)).chr }.join
397
488
  users.insert(:email => email,
398
489
  :name => name,
399
- :login => (0...8).map { 65.+(rand(25)).chr }.join,
490
+ :login => login,
400
491
  :created_at => Time.now,
401
492
  :ext_ref_id => ""
402
493
  )
403
- users.first(:email => email)
494
+ info "GHTorrent: Added fake user #{login} -> #{email}"
495
+ users.first(:login => login)
404
496
  else
405
- users.insert(:login => u['user']['login'],
406
- :name => u['user']['name'],
407
- :company => u['user']['company'],
408
- :email => u['user']['email'],
409
- :hireable => nil,
410
- :bio => nil,
411
- :location => u['user']['location'],
412
- :created_at => date(u['user']['created_at']),
497
+ users.insert(:login => u['login'],
498
+ :name => u['name'],
499
+ :company => u['company'],
500
+ :email => u['email'],
501
+ :location => u['location'],
502
+ :created_at => date(u['created_at']),
413
503
  :ext_ref_id => u[@ext_uniq])
414
- debug "GHTorrent: Found #{email} through search API query"
415
- users.first(:email => email)
504
+ info "GHTorrent: Found #{email} through search API query"
505
+ users.first(:login => u['login'])
416
506
  end
417
507
  else
418
508
  debug "GHTorrent: User with email #{email} exists"
@@ -430,15 +520,21 @@ module GHTorrent
430
520
  # == Returns:
431
521
  # If the repo can be retrieved, it is returned as a Hash. Otherwise,
432
522
  # the result is nil
433
- def ensure_repo(user, repo)
523
+ def ensure_repo(user, repo, commits = true, project_members = true, watchers = true)
434
524
 
435
- ensure_user(user, true, true)
525
+ ensure_user(user, false, false)
436
526
  repos = @db[:projects]
437
527
  curuser = @db[:users].first(:login => user)
438
528
  currepo = repos.first(:owner_id => curuser[:id], :name => repo)
439
529
 
440
530
  if currepo.nil?
441
531
  r = retrieve_repo(user, repo)
532
+
533
+ if r.nil?
534
+ warn "Repo #{user}/#{repo} does not exist"
535
+ return
536
+ end
537
+
442
538
  repos.insert(:url => r['url'],
443
539
  :owner_id => @db[:users].filter(:login => user).first[:id],
444
540
  :name => r['name'],
@@ -448,9 +544,9 @@ module GHTorrent
448
544
  :ext_ref_id => r[@ext_uniq])
449
545
 
450
546
  info "GHTorrent: New repo #{repo}"
451
- ensure_commits(user, repo)
452
- ensure_project_members(user, repo)
453
- ensure_watchers(user, repo)
547
+ ensure_commits(user, repo) if commits
548
+ ensure_project_members(user, repo) if project_members
549
+ ensure_watchers(user, repo) if watchers
454
550
  repos.first(:owner_id => curuser[:id], :name => repo)
455
551
  else
456
552
  debug "GHTorrent: Repo #{repo} exists"
@@ -461,27 +557,32 @@ module GHTorrent
461
557
  ##
462
558
  # Make sure that a project has all the registered members defined
463
559
  def ensure_project_members(user, repo)
464
- curuser = @db[:users].first(:login => user)
465
- currepo = @db[:projects].first(:owner_id => curuser[:id], :name => repo)
466
- project_members = @db[:project_members].filter(:user_id => curuser[:id],
467
- :repo_id => currepo[:id])
560
+ currepo = ensure_repo(user, repo, true, false, true)
561
+ time = currepo[:created_at]
562
+
563
+ project_members = @db.from(:project_members, :users).\
564
+ where(:project_members__user_id => :users__id).\
565
+ where(:project_members__repo_id => currepo[:id]).select(:login).all
468
566
 
469
567
  retrieve_repo_collaborators(user, repo).reduce([]) do |acc, x|
470
- if project_members.find { |y| y[:login] == x['login'] }.nil?
568
+ if project_members.find {|y| y[:login] == x['login']}.nil?
471
569
  acc << x
472
570
  else
473
571
  acc
474
572
  end
475
- end.map { |x| ensure_project_member(user, repo, x['login'], nil) }
573
+ end.map { |x| ensure_project_member(user, repo, x['login'], time) }
476
574
  end
477
575
 
478
576
  ##
479
577
  # Make sure that a project member exists in a project
480
578
  def ensure_project_member(owner, repo, new_member, date_added)
481
579
  pr_members = @db[:project_members]
580
+ project = ensure_repo(owner, repo, true, false, true)
482
581
  new_user = ensure_user(new_member, false, false)
483
- owner_id = @db[:users].first(:login => owner)[:id]
484
- project = @db[:projects].first(:owner_id => owner_id, :name => repo)
582
+
583
+ if project.nil? or new_user.nil?
584
+ return
585
+ end
485
586
 
486
587
  memb_exist = pr_members.first(:user_id => new_user[:id],
487
588
  :repo_id => project[:id])
@@ -489,6 +590,12 @@ module GHTorrent
489
590
  if memb_exist.nil?
490
591
  added = if date_added.nil? then Time.now else date_added end
491
592
  retrieved = retrieve_repo_collaborator(owner, repo, new_member)
593
+
594
+ if retrieved.nil?
595
+ warn "Project member #{new_member} does not exist in #{owner}/#{repo}"
596
+ return
597
+ end
598
+
492
599
  pr_members.insert(
493
600
  :user_id => new_user[:id],
494
601
  :repo_id => project[:id],
@@ -513,7 +620,6 @@ module GHTorrent
513
620
  # [user] The login name of the user to check the organizations for
514
621
  #
515
622
  def ensure_orgs(user)
516
- usr = @db[:users].first(:login => user)
517
623
  retrieve_orgs(user).map{|o| ensure_participation(user, o['login'])}
518
624
  end
519
625
 
@@ -525,8 +631,8 @@ module GHTorrent
525
631
  # [org] The login name of the organization to check whether the user
526
632
  # belongs in
527
633
  #
528
- def ensure_participation(user, organization)
529
- org = ensure_org(organization)
634
+ def ensure_participation(user, organization, members = true)
635
+ org = ensure_org(organization, members)
530
636
  usr = ensure_user(user, false, false)
531
637
 
532
638
  org_members = @db[:organization_members]
@@ -550,14 +656,21 @@ module GHTorrent
550
656
  # ==Parameters:
551
657
  # [organization] The login name of the organization
552
658
  #
553
- def ensure_org(organization)
554
- org = @db[:users].find(:login => organization, :type => 'org')
659
+ def ensure_org(organization, members)
660
+ org = @db[:users].first(:login => organization, :type => 'org')
555
661
 
556
662
  if org.nil?
557
- ensure_user(org, false, false)
663
+ org = ensure_user(organization, false, false)
664
+ if members
665
+ retrieve_org_members(organization).map { |x|
666
+ ensure_participation(ensure_user(x['login'], false, false)[:login],
667
+ organization, false)
668
+ }
669
+ end
670
+ org
558
671
  else
559
672
  debug "GHTorrent: Organization #{organization} exists"
560
- org.first
673
+ org
561
674
  end
562
675
  end
563
676
 
@@ -572,7 +685,6 @@ module GHTorrent
572
685
  commit_id = @db[:commits].first(:sha => sha)[:id]
573
686
  stored_comments = @db[:commit_comments].filter(:commit_id => commit_id)
574
687
  commit_comments = retrieve_commit_comments(user, repo, sha)
575
- #user_id = @db[:users].first(:login => user)[:id]
576
688
 
577
689
  not_saved = commit_comments.reduce([]) do |acc, x|
578
690
  if stored_comments.find{|y| y[:comment_id] == x['id']}.nil?
@@ -600,16 +712,16 @@ module GHTorrent
600
712
  retrieved = retrieve_commit_comment(user, repo, id)
601
713
 
602
714
  if retrieved.nil?
603
- debug "GHTorrent: Commit comment #{id} deleted"
715
+ warn "GHTorrent: Commit comment #{id} deleted"
604
716
  return
605
717
  end
606
718
 
607
- commit = ensure_commit(repo, retrieved['commit_id'], user, comments = false)
719
+ commit = ensure_commit(repo, retrieved['commit_id'], user, false)
608
720
  user = ensure_user(user, false, false)
609
721
  @db[:commit_comments].insert(
610
722
  :commit_id => commit[:id],
611
723
  :user_id => user[:id],
612
- :body => retrieved['body'],
724
+ :body => retrieved['body'][0..255],
613
725
  :line => retrieved['line'],
614
726
  :position => retrieved['position'],
615
727
  :comment_id => retrieved['id'],
@@ -617,45 +729,67 @@ module GHTorrent
617
729
  :created_at => date(retrieved['created_at'])
618
730
  )
619
731
  info "GHTorrent: Added commit comment #{commit[:sha]} -> #{user[:login]}"
620
- @db[:commit_comments].first(:comment_id => id)
621
732
  else
733
+ unless created_at.nil?
734
+ @db[:commit_comments].filter(:comment_id => id)\
735
+ .update(:created_at => date(created_at))
736
+ info "GHTorrent: Updating comment #{user}/#{repo} -> #{id}"
737
+ end
622
738
  info "GHTorrent: Commit comment #{id} exists"
623
- stored_comment
624
739
  end
740
+ @db[:commit_comments].first(:comment_id => id)
625
741
  end
626
742
 
627
743
  ##
628
744
  # Make sure that all watchers of a repository are recorded
629
745
  def ensure_watchers(owner, repo)
630
- curuser = @db[:users].first(:login => owner)
631
- currepo = @db[:projects].first(:owner_id => curuser[:id],
632
- :name => repo)
633
- watchers = @db[:watchers].filter(:user_id => curuser[:id],
634
- :repo_id => currepo[:id])
746
+ currepo = ensure_repo(owner, repo, true, true, false)
747
+ time = currepo[:created_at]
748
+
749
+ if currepo.nil?
750
+ warn "Could not retrieve watchers for #{owner}/#{repo}"
751
+ return
752
+ end
753
+
754
+ watchers = @db.from(:watchers, :users).\
755
+ where(:watchers__user_id => :users__id).\
756
+ where(:watchers__repo_id => currepo[:id]).select(:login).all
635
757
 
636
758
  retrieve_watchers(owner, repo).reduce([]) do |acc, x|
637
- if watchers.find { |y| y[:login] == x['login'] }.nil?
759
+ if watchers.find { |y|
760
+ y[:login] == x['login']
761
+ }.nil?
638
762
  acc << x
639
763
  else
640
764
  acc
641
765
  end
642
- end.map { |x| ensure_watcher(owner, repo, x['login']) }
766
+ end.map { |x| ensure_watcher(owner, repo, x['login'], time) }
643
767
  end
644
768
 
645
769
  ##
646
770
  # Make sure that a watcher exists for a project
647
771
  def ensure_watcher(owner, repo, watcher, date_added = nil)
648
- watchers = @db[:watchers]
772
+ project = ensure_repo(owner, repo, false, false, false)
649
773
  new_watcher = ensure_user(watcher, false, false)
650
- owner_id = @db[:users].first(:login => owner)[:id]
651
- project = @db[:projects].first(:owner_id => owner_id, :name => repo)
652
774
 
775
+ if new_watcher.nil? or project.nil?
776
+ warn "GHTorrent: Watcher #{watcher} does not exist"
777
+ return
778
+ end
779
+
780
+ watchers = @db[:watchers]
653
781
  memb_exist = watchers.first(:user_id => new_watcher[:id],
654
- :repo_id => project[:id])
782
+ :repo_id => project[:id])
655
783
 
656
784
  if memb_exist.nil?
657
785
  added = if date_added.nil? then Time.now else date_added end
658
786
  retrieved = retrieve_watcher(owner, repo, watcher)
787
+
788
+ if retrieved.nil?
789
+ warn "Watcher #{watcher} no longer watches #{owner}/#{repo}"
790
+ return
791
+ end
792
+
659
793
  watchers.insert(
660
794
  :user_id => new_watcher[:id],
661
795
  :repo_id => project[:id],
@@ -673,6 +807,313 @@ module GHTorrent
673
807
  end
674
808
  end
675
809
 
810
+ ##
811
+ # Process all pull requests
812
# Makes sure that every pull request of a repository is processed.
#
# ==Parameters:
# [owner] Login of the user owning the repository
# [repo] Name of the repository
#
# Returns the results of ensure_pull_request for each retrieved pull request
# that has no stored row yet, or nil when the repository cannot be resolved.
def ensure_pull_requests(owner, repo)
  currepo = ensure_repo(owner, repo, false, false, false)
  if currepo.nil?
    warn "Could not retrieve pull requests from #{owner}/#{repo}"
    return
  end

  # Load the stored pull request numbers once. Enumerating the dataset
  # itself (e.g. with find) would query the database again for every
  # retrieved pull request.
  known = @db[:pull_requests].filter(:base_repo_id => currepo[:id]).all.
            map { |row| row[:pullreq_id] }

  retrieve_pull_requests(owner, repo).
    reject { |x| known.include?(x['number']) }.
    map { |x| ensure_pull_request(owner, repo, x['number']) }
end
829
+
830
+ ##
831
+ # Process a pull request
832
# Makes sure that a pull request and its history events are stored and,
# optionally, that its commits and comments are processed as well.
#
# ==Parameters:
# [owner] Login of the user owning the repository the pull request targets
# [repo] Name of that repository
# [pullreq_id] Number of the pull request as reported by the API
# [comments] When true, ensure_pullreq_comments is run for the pull request
# [commits] When true, ensure_pull_request_commits is run for the pull request
# [state] Extra history action to record; nothing extra is recorded when nil
# [created_at] Timestamp stored together with the +state+ history entry
#
# Returns the stored pull request row, or nil when the repository, the pull
# request or one of the records it refers to cannot be resolved.
def ensure_pull_request(owner, repo, pullreq_id,
                        comments = true, commits = true,
                        state = nil, created_at = nil)
  pulls_reqs = @db[:pull_requests]
  pull_req_history = @db[:pull_request_history]

  project = ensure_repo(owner, repo, false, false, false)
  return if project.nil?

  # Adds a pull request history event, or refreshes the timestamp of an
  # event that is already stored
  add_history = lambda do |id, ts, unq, act|
    entry = pull_req_history.first(:pull_request_id => id,
                                   :ext_ref_id => unq, :action => act)
    if entry.nil?
      pull_req_history.insert(:pull_request_id => id, :created_at => ts,
                              :ext_ref_id => unq, :action => act)
      info "GHTorrent: New pull request (#{id}) history entry (#{act})"
    else
      pull_req_history.filter(:pull_request_id => id, :ext_ref_id => unq,
                              :action => act).update(:created_at => ts)
      info "GHTorrent: Updating pull request (#{id}) history entry (#{act}) timestamp #{ts}"
    end
  end

  # A pull request without a head repository is treated as one between two
  # branches of the same repository
  is_intra_branch = lambda { |req| req['head']['repo'].nil? }

  # Produces a log message
  log_msg = lambda do |req|
    base = req['base']['repo']['full_name']
    head = is_intra_branch.call(req) ? base : req['head']['repo']['full_name']
    "GHTorrent: Pull request #{pullreq_id} #{head} -> #{base}"
  end

  retrieved = retrieve_pull_request(owner, repo, pullreq_id)

  if retrieved.nil?
    warn "GHTorrent: Cannot retrieve pull request (#{owner}/#{repo} #{pullreq_id})"
    return
  end

  base_repo = ensure_repo(retrieved['base']['repo']['owner']['login'],
                          retrieved['base']['repo']['name'],
                          false, false, false)

  base_commit = ensure_commit(retrieved['base']['repo']['name'],
                              retrieved['base']['sha'],
                              retrieved['base']['repo']['owner']['login'])

  if is_intra_branch.call(retrieved)
    head_repo = base_repo
    # No head commit is resolved here. It must be nil explicitly: assigning
    # the logger's return value would break the nil test made at insert time.
    head_commit = nil
    warn "GHTorrent: Pull request is intra branch"
  else
    head_repo = ensure_repo(retrieved['head']['repo']['owner']['login'],
                            retrieved['head']['repo']['name'],
                            false, false, false)

    head_commit = ensure_commit(retrieved['head']['repo']['name'],
                                retrieved['head']['sha'],
                                retrieved['head']['repo']['owner']['login'])
  end

  pull_req_user = ensure_user(retrieved['user']['login'], false, false)

  merged = !retrieved['merged_at'].nil?
  closed = !retrieved['closed_at'].nil?

  pull_req = pulls_reqs.first(:base_repo_id => project[:id],
                              :pullreq_id => pullreq_id)
  if pull_req.nil?
    # The new row needs all three records; bail out instead of failing on
    # nil[:id] when any of them could not be resolved.
    if base_repo.nil? || base_commit.nil? || pull_req_user.nil?
      warn "GHTorrent: Cannot store pull request (#{owner}/#{repo} #{pullreq_id})"
      return
    end

    pulls_reqs.insert(
      :head_repo_id => (head_repo[:id] unless head_repo.nil?),
      :base_repo_id => base_repo[:id],
      :head_commit_id => (head_commit[:id] unless head_commit.nil?),
      :base_commit_id => base_commit[:id],
      :user_id => pull_req_user[:id],
      :pullreq_id => pullreq_id,
      :intra_branch => is_intra_branch.call(retrieved)
    )

    info log_msg.call(retrieved)
  else
    debug log_msg.call(retrieved) + " exists"
  end

  pull_req = pulls_reqs.first(:base_repo_id => project[:id],
                              :pullreq_id => pullreq_id)

  # The row is inserted under base_repo but looked up under project; when
  # the two differ the lookup finds nothing.
  if pull_req.nil?
    warn "GHTorrent: Cannot find stored pull request (#{owner}/#{repo} #{pullreq_id})"
    return
  end

  add_history.call(pull_req[:id], date(retrieved['created_at']),
                   retrieved[@ext_uniq], 'opened')
  add_history.call(pull_req[:id], date(retrieved['merged_at']),
                   retrieved[@ext_uniq], 'merged') if merged
  add_history.call(pull_req[:id], date(retrieved['closed_at']),
                   retrieved[@ext_uniq], 'closed') if closed
  add_history.call(pull_req[:id], date(created_at), retrieved[@ext_uniq],
                   state) unless state.nil?

  ensure_pull_request_commits(owner, repo, pullreq_id) if commits
  ensure_pullreq_comments(owner, repo, pullreq_id, created_at) if comments

  pulls_reqs.first(:base_repo_id => project[:id],
                   :pullreq_id => pullreq_id)
end
952
+
953
# Makes sure that all comments of a pull request are processed.
#
# ==Parameters:
# [owner] Login of the user owning the repository
# [repo] Name of the repository
# [pullreq_id] Number of the pull request
# [created_at] Only tested for nil here; see the note on +time+ below
#
# Returns the results of ensure_pullreq_comment for the comments that have
# no stored row yet, or nil when the repository or the pull request cannot
# be resolved.
def ensure_pullreq_comments(owner, repo, pullreq_id, created_at)
  currepo = ensure_repo(owner, repo, true, true, false)

  if currepo.nil?
    warn "Could not retrieve repository #{owner}/#{repo}"
    return
  end

  # Computed only after the nil check above, since it reads from currepo.
  # NOTE(review): a caller-supplied created_at is replaced by the current
  # time instead of being passed on -- confirm this is intended.
  time = if created_at.nil? then currepo[:created_at] else Time.now end

  pull_req = ensure_pull_request(owner, repo, pullreq_id, false, true)

  if pull_req.nil?
    warn "Could not retrieve pull req #{owner}/#{repo} -> #{pullreq_id}"
    return
  end

  stored = @db[:pull_request_comments]

  # Comments are keyed by :pull_request_id, the same column that
  # ensure_pullreq_comment uses for its lookup and insert.
  retrieve_pull_req_comments(owner, repo, pullreq_id).select { |x|
    stored.first(:pull_request_id => pull_req[:id],
                 :comment_id => x['id']).nil?
  }.map { |x|
    ensure_pullreq_comment(owner, repo, pullreq_id, x['id'], time)
  }
end
981
+
982
# Makes sure that a single pull request comment is stored.
#
# ==Parameters:
# [owner] Login of the user owning the repository
# [repo] Name of the repository
# [pullreq_id] Number of the pull request the comment belongs to
# [comment_id] Identifier of the comment
# [created_at] Not used by this method; the stored timestamp is the one
#              reported in the retrieved comment
#
# Returns nil when the pull request, the comment or its author cannot be
# resolved; otherwise the value of the final debug call.
def ensure_pullreq_comment(owner, repo, pullreq_id, comment_id, created_at)
  pull_req = ensure_pull_request(owner, repo, pullreq_id, false, true)

  if pull_req.nil?
    warn "Could not retrieve pull req #{owner}/#{repo} -> #{pullreq_id}"
    return
  end

  comments = @db[:pull_request_comments]
  exists = comments.first(:pull_request_id => pull_req[:id],
                          :comment_id => comment_id)

  if exists.nil?
    retrieved = retrieve_pull_req_comment(owner, repo, pullreq_id, comment_id)

    if retrieved.nil?
      warn "Could not retrieve comment #{comment_id} for pullreq #{owner}/#{repo} -> #{pullreq_id}"
      return
    end

    commenter = ensure_user(retrieved['user']['login'], false, false)

    if commenter.nil?
      warn "Could not retrieve commenter #{retrieved['user']['login']} " +
           "for pullreq comment #{owner}/#{repo} -> #{pullreq_id}(#{comment_id})"
      # Without an author row there is no user id to store
      return
    end

    commit = ensure_commit(repo, retrieved['original_commit_id'], owner)

    comments.insert(
      :pull_request_id => pull_req[:id],
      :user_id => commenter[:id],
      :comment_id => comment_id,
      :position => retrieved['original_position'],
      # Only the first 255 characters of the body are stored
      :body => retrieved['body'][0..254],
      :commit_id => (commit[:id] unless commit.nil?),
      :created_at => retrieved['created_at'],
      :ext_ref_id => retrieved[@ext_uniq]
    )
    debug "GHTorrent: Adding comment #{comment_id} for pullreq #{owner}/#{repo} -> #{pullreq_id}"
  else
    # The stored row is left untouched; only the log line is emitted
    debug "GHTorrent: Updating comment #{comment_id} for pullreq #{owner}/#{repo} -> #{pullreq_id}"
  end
end
1025
+
1026
# Makes sure that the commits of a pull request are stored and linked to it.
#
# ==Parameters:
# [owner] Login of the user owning the repository
# [repo] Name of the repository
# [pullreq_id] Number of the pull request
#
# Returns one entry per stored commit: the existing link row, or the value
# of the info call for a newly added link. Returns nil when the pull request
# cannot be resolved.
def ensure_pull_request_commits(owner, repo, pullreq_id)
  # Commits that could not be stored come back as nil and are skipped
  commits = retrieve_pull_req_commits(owner, repo, pullreq_id).map { |c|
    ensure_commit(repo, c['sha'], owner, true)
  }.compact

  return [] if commits.empty?

  # The pull request is the same for every commit, so resolve it once
  # rather than once per commit.
  pullreq = ensure_pull_request(owner, repo, pullreq_id, false, false)

  if pullreq.nil?
    warn "Could not retrieve pull req #{owner}/#{repo} -> #{pullreq_id}"
    return
  end

  links = @db[:pull_request_commits]

  commits.map { |c|
    exists = links.first(:pull_request_id => pullreq[:id],
                         :commit_id => c[:id])
    if exists.nil?
      links.insert(:pull_request_id => pullreq[:id],
                   :commit_id => c[:id])

      info "GHTorrent: Added commit #{c[:sha]} to pullreq #{owner}/#{repo} -> #{pullreq_id}"
    else
      debug "GHTorrent: Commit #{c[:sha]} exists in pullreq #{owner}/#{repo} -> #{pullreq_id}"
      exists
    end
  }
end
1044
+
1045
+ ##
1046
+ # Get all forks for a project.
1047
+ #
1048
+ # ==Parameters:
1049
+ # [owner] The user to which the project belongs
1050
+ # [repo] The repository/project to find forks for
1051
# Makes sure that all forks of a project are processed.
#
# Returns the results of ensure_fork for each retrieved fork that is not
# stored yet, or nil when the project cannot be resolved.
def ensure_forks(owner, repo)
  currepo = ensure_repo(owner, repo, false, false, false)

  if currepo.nil?
    warn "Could not retrieve forks for #{owner}/#{repo}"
    return
  end

  # Read only after the nil check above. The project's creation time is
  # handed to ensure_fork as the date the fork was added.
  time = currepo[:created_at]

  # NOTE(review): :login is selected from a forks/projects join; confirm
  # that one of the two tables really provides that column.
  existing_forks = @db.from(:forks, :projects).
    where(:forks__forked_project_id => :projects__id).
    where(:forks__forked_from_id => currepo[:id]).
    select(:name, :login).all

  retrieve_forks(owner, repo).reject { |x|
    existing_forks.any? { |y|
      y[:login] == x['owner']['login'] && y[:name] == x['name']
    }
  }.map { |x| ensure_fork(owner, repo, x['id'], time) }
end
1074
+
1075
+ ##
1076
+ # Make sure that a fork is retrieved for a project
1077
# Makes sure that one fork of a project is stored.
#
# ==Parameters:
# [owner] Login of the user owning the project that was forked
# [repo] Name of the project that was forked
# [fork_id] Identifier of the fork
# [date_added] Timestamp to store for the fork; the current time is used
#              for a new fork when nil
#
# A fork that is already stored only has its timestamp refreshed, and only
# when date_added is given.
def ensure_fork(owner, repo, fork_id, date_added = nil)
  fork_table = @db[:forks]
  parent = ensure_repo(owner, repo, false, false, false)

  unless fork_table.first(:fork_id => fork_id).nil?
    return if date_added.nil?

    fork_table.filter(:fork_id => fork_id).
      update(:created_at => date(date_added))
    return debug "GHTorrent: Updating fork #{owner}/#{repo} (#{fork_id})"
  end

  added = date_added.nil? ? Time.now : date_added
  retrieved = retrieve_fork(owner, repo, fork_id)

  if retrieved.nil?
    warn "GHTorrent: Fork #{fork_id} does not exist for #{owner}/#{repo}"
    return
  end

  # full_name is split on "/" into the fork's owner and project name
  name_parts = retrieved['full_name'].split(/\//)
  forked_repo_owner = name_parts[0]
  forked_repo_name = name_parts[1]

  child = ensure_repo(forked_repo_owner, forked_repo_name)

  if parent.nil? or child.nil?
    warn "Could not add fork #{fork_id}"
    return
  end

  fork_table.insert(:forked_project_id => child[:id],
                    :forked_from_id => parent[:id],
                    :fork_id => fork_id,
                    :created_at => added,
                    :ext_ref_id => retrieved[@ext_uniq])
  info "GHTorrent: Added #{forked_repo_owner}/#{forked_repo_name} as fork of #{owner}/#{repo}"
end
1116
+
676
1117
  private
677
1118
 
678
1119
  # Store a commit contained in a hash. First check whether the commit exists.
@@ -684,21 +1125,24 @@ module GHTorrent
684
1125
  author = commit_user(c['author'], c['commit']['author'])
685
1126
  commiter = commit_user(c['committer'], c['commit']['committer'])
686
1127
 
687
- userid = @db[:users].filter(:login => user).first[:id]
688
- repoid = @db[:projects].filter(:owner_id => userid,
689
- :name => repo).first[:id]
1128
+ repository = ensure_repo(user, repo, false, false, false)
1129
+
1130
+ if repository.nil?
1131
+ warn "Could not store commit #{user}/#{repo} #{c['sha']}"
1132
+ return
1133
+ end
690
1134
 
691
1135
  commits.insert(:sha => c['sha'],
692
1136
  :author_id => author[:id],
693
1137
  :committer_id => commiter[:id],
694
- :project_id => repoid,
1138
+ :project_id => repository[:id],
695
1139
  :created_at => date(c['commit']['author']['date']),
696
1140
  :ext_ref_id => c[@ext_uniq]
697
1141
  )
1142
+ debug "GHTorrent: New commit #{user}/#{repo} -> #{c['sha']} "
698
1143
  commits.first(:sha => c['sha'])
699
- debug "GHTorrent: New commit #{repo} -> #{c['sha']} "
700
1144
  else
701
- debug "GHTorrent: Commit #{repo} -> #{c['sha']} exists"
1145
+ debug "GHTorrent: Commit #{user}/#{repo} -> #{c['sha']} exists"
702
1146
  commit
703
1147
  end
704
1148
  end
@@ -706,12 +1150,28 @@ module GHTorrent
706
1150
  # Run a block in a DB transaction. Exceptions trigger transaction rollback
707
1151
  # and are rethrown.
708
1152
  def transaction(&block)
1153
+ @db ||= get_db
1154
+ @persister ||= persister
1155
+
709
1156
  start_time = Time.now
710
- @db.transaction(:rollback => :reraise, :isolation => :committed) do
711
- yield block
1157
+ begin
1158
+ @db.transaction(:rollback => :reraise, :isolation => :committed) do
1159
+ yield block
1160
+ end
1161
+ total = Time.now.to_ms - start_time.to_ms
1162
+ debug "GHTorrent: Transaction committed (#{total} ms)"
1163
+ rescue Exception => e
1164
+ total = Time.now.to_ms - start_time.to_ms
1165
+ warn "GHTorrent: Transaction failed (#{total} ms)"
1166
+ raise e
1167
+ ensure
1168
+ @db.disconnect
1169
+ @persister.close
1170
+
1171
+ @db = nil
1172
+ @persister = nil
1173
+ GC.start
712
1174
  end
713
- total = Time.now.to_ms - start_time.to_ms
714
- debug "GHTorrent: Transaction committed (#{total} ms)"
715
1175
  end
716
1176
 
717
1177
  ##
@@ -742,15 +1202,6 @@ module GHTorrent
742
1202
  email =~ /^[a-zA-Z][\w\.-]*[a-zA-Z0-9]@[a-zA-Z0-9][\w\.-]*[a-zA-Z0-9]\.[a-zA-Z][a-zA-Z\.]*[a-zA-Z]$/
743
1203
  end
744
1204
  end
745
- # Base exception for all GHTorrent exceptions
746
- class GHTorrentException < Exception
747
- end
748
- end
749
-
750
- class Time
751
- def to_ms
752
- (self.to_f * 1000.0).to_i
753
- end
754
1205
  end
755
1206
 
756
1207
  # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :