ghtorrent 0.4 → 0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. data/CHANGELOG +24 -0
  2. data/Gemfile +17 -0
  3. data/Gemfile.lock +40 -0
  4. data/README.md +23 -22
  5. data/bin/ght-data-retrieval +66 -24
  6. data/bin/ght-load +41 -19
  7. data/bin/ght-mirror-events +13 -16
  8. data/bin/ght-rm-dupl +119 -77
  9. data/lib/ghtorrent.rb +14 -4
  10. data/lib/ghtorrent/adapters/base_adapter.rb +17 -5
  11. data/lib/ghtorrent/adapters/mongo_persister.rb +122 -56
  12. data/lib/ghtorrent/api_client.rb +151 -16
  13. data/lib/ghtorrent/bson_orderedhash.rb +23 -0
  14. data/lib/ghtorrent/cache.rb +97 -0
  15. data/lib/ghtorrent/command.rb +43 -25
  16. data/lib/ghtorrent/gh_torrent_exception.rb +6 -0
  17. data/lib/ghtorrent/ghtorrent.rb +615 -164
  18. data/lib/ghtorrent/hash.rb +11 -0
  19. data/lib/ghtorrent/logging.rb +11 -7
  20. data/lib/ghtorrent/migrations/001_init_schema.rb +3 -3
  21. data/lib/ghtorrent/migrations/002_add_external_ref_ids.rb +2 -0
  22. data/lib/ghtorrent/migrations/003_add_orgs.rb +4 -1
  23. data/lib/ghtorrent/migrations/004_add_commit_comments.rb +4 -2
  24. data/lib/ghtorrent/migrations/005_add_repo_collaborators.rb +2 -0
  25. data/lib/ghtorrent/migrations/006_add_watchers.rb +2 -0
  26. data/lib/ghtorrent/migrations/007_add_pull_requests.rb +64 -0
  27. data/lib/ghtorrent/migrations/008_add_project_unq.rb +23 -0
  28. data/lib/ghtorrent/migrations/009_add_project_commit.rb +27 -0
  29. data/lib/ghtorrent/migrations/010_add_forks.rb +28 -0
  30. data/lib/ghtorrent/migrations/mysql_defaults.rb +6 -0
  31. data/lib/ghtorrent/persister.rb +3 -0
  32. data/lib/ghtorrent/retriever.rb +298 -102
  33. data/lib/ghtorrent/settings.rb +20 -1
  34. data/lib/ghtorrent/time.rb +5 -0
  35. data/lib/ghtorrent/utils.rb +22 -4
  36. data/lib/version.rb +5 -0
  37. metadata +173 -145
  38. data/lib/ghtorrent/call_stack.rb +0 -91
@@ -0,0 +1,6 @@
1
+ # Base exception for all GHTorrent exceptions
2
+ module GHTorrent
3
+ class GHTorrentException < Exception
4
+
5
+ end
6
+ end
@@ -1,5 +1,11 @@
1
1
  require 'sequel'
2
2
 
3
+ require 'ghtorrent/time'
4
+ require 'ghtorrent/logging'
5
+ require 'ghtorrent/settings'
6
+ require 'ghtorrent/retriever'
7
+ require 'ghtorrent/persister'
8
+
3
9
  module GHTorrent
4
10
  class Mirror
5
11
 
@@ -8,32 +14,34 @@ module GHTorrent
8
14
  include GHTorrent::Retriever
9
15
  include GHTorrent::Persister
10
16
 
11
- attr_reader :settings, :persister
12
-
13
- def initialize(configuration)
17
+ attr_reader :settings, :persister, :ext_uniq, :logger
14
18
 
15
- @settings = YAML::load_file configuration
16
- super(@settings)
19
+ def initialize(settings)
20
+ @settings = settings
17
21
  @ext_uniq = config(:uniq_id)
18
22
  @logger = Logger.new(STDOUT)
19
- @persister = connect(:mongo, @settings)
20
- get_db
21
23
  end
22
24
 
23
25
  # db related functions
24
26
  def get_db
25
-
26
- @db = Sequel.connect(config(:sql_url))
27
-
27
+ Sequel.single_threaded = true
28
+ @db = Sequel.connect(config(:sql_url), :encoding => 'utf8')
29
+ #@db.loggers << @logger
28
30
  if @db.tables.empty?
29
31
  dir = File.join(File.dirname(__FILE__), 'migrations')
30
32
  puts "Database empty, running migrations from #{dir}"
31
33
  Sequel.extension :migration
32
34
  Sequel::Migrator.apply(@db, dir)
33
35
  end
36
+
34
37
  @db
35
38
  end
36
39
 
40
+ def persister
41
+ @persister ||= connect(:mongo, @settings)
42
+ @persister
43
+ end
44
+
37
45
  ##
38
46
  # Ensure that a user exists, or fetch its latest state from Github
39
47
  # ==Parameters:
@@ -46,7 +54,7 @@ module GHTorrent
46
54
  end
47
55
 
48
56
  transaction do
49
- ensure_repo(user, repo)
57
+ ensure_user(user, true, true)
50
58
  ensure_commit(repo, sha, user)
51
59
  end
52
60
  end
@@ -60,7 +68,6 @@ module GHTorrent
60
68
  # [date_added] The timestamp that the add event took place
61
69
  def get_project_member(owner, repo, new_member, date_added)
62
70
  transaction do
63
- ensure_repo(owner, repo)
64
71
  ensure_project_member(owner, repo, new_member, date_added)
65
72
  end
66
73
  end
@@ -74,7 +81,6 @@ module GHTorrent
74
81
  # [date_added] The timestamp that the add event took place
75
82
  def get_commit_comment(user, repo, comment_id, date_added)
76
83
  transaction do
77
- ensure_repo(user, repo)
78
84
  ensure_commit_comment(user, repo, comment_id, date_added)
79
85
  end
80
86
  end
@@ -88,7 +94,6 @@ module GHTorrent
88
94
  # [date_added] The timestamp that the add event took place
89
95
  def get_watcher(owner, repo, watcher, date_added)
90
96
  transaction do
91
- ensure_repo(owner, repo)
92
97
  ensure_watcher(owner, repo, watcher, date_added)
93
98
  end
94
99
  end
@@ -101,20 +106,84 @@ module GHTorrent
101
106
  # [date_added] The timestamp that the add event took place
102
107
  def get_follower(follower, followed, date_added)
103
108
  transaction do
104
- ensure_user(follower, false, false)
105
- ensure_user(followed, false, false)
106
- ensure_user_followers(followed, date_added)
109
+ ensure_user(follower, true, true)
110
+ ensure_user(followed, true, true)
111
+ ensure_user_follower(followed, follower, date_added)
112
+ end
113
+ end
114
+
115
+ ##
116
+ # Get a pull request and record the changes it affects
117
+ # ==Parameters:
118
+ # [owner] The owner of the repository to which the pullreq will be applied
119
+ # [repo] The repository to which the pullreq will be applied
120
+ # [pullreq_id] The ID of the pull request relative to the repository
121
+ def get_pull_request(owner, repo, pullreq_id, state, created_at)
122
+ transaction do
123
+ ensure_pull_request(owner, repo, pullreq_id, true, true, state, created_at)
124
+ end
125
+ end
126
+
127
+ ##
128
+ # Retrieve details about a project fork (including the forked project)
129
+ # ==Parameters:
130
+ # [owner] The login of the repository owner
131
+ # [repo] The name of the repository
132
+ # [fork_id] The fork item id
133
+ # [date_added] The timestamp that the add event took place
134
+ def get_fork(owner, repo, fork_id, date_added)
135
+ transaction do
136
+ ensure_fork(owner, repo, fork_id, date_added)
107
137
  end
108
138
  end
109
139
 
140
+ ##
141
+ # Retrieve a pull request review comment
142
+ # ==Parameters:
143
+ # [owner] The login of the repository owner
144
+ # [repo] The name of the repository
145
+ # [pullreq_id] The ID of the pull request relative to the repository
146
+ # [comment_id] The ID of the review comment; [created_at] the comment's creation timestamp
147
+ def get_pullreq_comment(owner, repo, pullreq_id, comment_id, created_at)
148
+ transaction do
149
+ ensure_pullreq_comment(owner, repo, pullreq_id, comment_id, created_at)
150
+ end
151
+ end
152
+
153
+ ##
154
+ # Retrieve an issue comment (not implemented yet; currently always raises)
155
+ # ==Parameters:
156
+ # [owner] The login of the repository owner
157
+ # [repo] The name of the repository
158
+ # [issue_id] The ID of the issue the comment belongs to
159
+ # [comment_id] The ID of the issue comment; [created_at] the comment's creation timestamp
160
+ def get_issue_comment(owner, repo, issue_id, comment_id, created_at)
161
+ transaction do
162
+ raise "Not implemented"
163
+ #ensure_pullreq_comment(owner, repo, pullreq_id, comment_id, created_at)
164
+ end
165
+ end
166
+
167
+
110
168
  ##
111
169
  # Make sure a commit exists
112
170
  #
113
171
  def ensure_commit(repo, sha, user, comments = true)
172
+ ensure_repo(user, repo)
114
173
  c = retrieve_commit(repo, sha, user)
174
+
175
+ if c.nil?
176
+ warn "GHTorrent: Commit #{user}/#{repo} -> #{sha} does not exist"
177
+ return
178
+ end
179
+
115
180
  stored = store_commit(c, repo, user)
116
181
  ensure_parents(c)
117
- ensure_commit_comments(user, repo, sha) if comments
182
+ if not c['commit']['comment_count'].nil? \
183
+ and c['commit']['comment_count'] > 0
184
+ ensure_commit_comments(user, repo, sha) if comments
185
+ end
186
+ ensure_repo_commit(user, repo, sha)
118
187
  stored
119
188
  end
120
189
 
@@ -162,13 +231,41 @@ module GHTorrent
162
231
 
163
232
  parents.insert(:commit_id => this[:id],
164
233
  :parent_id => parent[:id])
165
- info "Added parent #{parent[:sha]} to commit #{this[:sha]}"
234
+ info "GHTorrent: Added parent #{parent[:sha]} to commit #{this[:sha]}"
166
235
  else
167
- info "Parent #{parent[:sha]} for commit #{this[:sha]} exists"
236
+ debug "GHTorrent: Parent #{parent[:sha]} for commit #{this[:sha]} exists"
168
237
  end
169
238
  end
170
239
  end
171
240
 
241
+ ##
242
+ # Make sure that a commit has been associated with the provided repo
243
+ # ==Parameters:
244
+ # [user] The user that owns the repo this commit has been submitted to
245
+ # [repo] The repo receiving the commit
246
+ # [sha] The commit SHA
247
+ def ensure_repo_commit(user, repo, sha)
248
+ userid = @db[:users].first(:login => user)[:id]
249
+ projectid = @db[:projects].first(:owner_id => userid,
250
+ :name => repo)[:id]
251
+ commitid = @db[:commits].first(:sha => sha)[:id]
252
+
253
+ exists = @db[:project_commits].first(:project_id => projectid,
254
+ :commit_id => commitid)
255
+ if exists.nil?
256
+ @db[:project_commits].insert(
257
+ :project_id => projectid,
258
+ :commit_id => commitid
259
+ )
260
+ info "GHTorrent: Added commit #{user}/#{repo} -> #{sha}"
261
+ @db[:project_commits].first(:project_id => projectid,
262
+ :commit_id => commitid)
263
+ else
264
+ debug "GHTorrent: Commit #{user}/#{repo} -> #{sha} exists"
265
+ exists
266
+ end
267
+ end
268
+
172
269
  ##
173
270
  # Add (or update) an entry for a commit author. This method uses information
174
271
  # in the JSON object returned by Github to add (or update) a user in the
@@ -193,15 +290,14 @@ module GHTorrent
193
290
  login = githubuser['login'] unless githubuser.nil?
194
291
 
195
292
  if login.nil?
196
- ensure_user("#{name}<#{email}>", true, false)
293
+ ensure_user("#{name}<#{email}>", false, false)
197
294
  else
198
295
  dbuser = users.first(:login => login)
199
296
  byemail = users.first(:email => email)
200
297
  if dbuser.nil?
201
298
  # We do not have the user in the database yet. Add him
202
- added = ensure_user(login, true, false)
299
+ added = ensure_user(login, false, false)
203
300
  if byemail.nil?
204
- #
205
301
  users.filter(:login => login).update(:name => name) if added[:name].nil?
206
302
  users.filter(:login => login).update(:email => email) if added[:email].nil?
207
303
  else
@@ -215,8 +311,6 @@ module GHTorrent
215
311
  :login => login,
216
312
  :company => added['company'],
217
313
  :location => added['location'],
218
- :hireable => added['hireable'],
219
- :bio => added['bio'],
220
314
  :created_at => added['created_at']
221
315
  )
222
316
  end
@@ -259,7 +353,6 @@ module GHTorrent
259
353
  return u
260
354
  end
261
355
 
262
-
263
356
  ##
264
357
  # Ensure that a user exists, or fetch its latest state from Github
265
358
  # ==Parameters:
@@ -275,6 +368,12 @@ module GHTorrent
275
368
 
276
369
  if usr.nil?
277
370
  u = retrieve_user_byusername(user)
371
+
372
+ if u.nil?
373
+ warn "GHTorrent: User #{user} does not exist"
374
+ return
375
+ end
376
+
278
377
  email = unless u['email'].nil?
279
378
  if u['email'].strip == "" then
280
379
  nil
@@ -283,49 +382,16 @@ module GHTorrent
283
382
  end
284
383
  end
285
384
 
286
- if not email.nil?
287
- # Check whether a user has been added by email before
288
- byemail = users.first(:email => email)
289
- unless byemail.nil?
290
- users.filter(:email => email).update(:login => u['login'],
291
- :name => u['name'],
292
- :company => u['company'],
293
- :hireable => boolean(u['hirable']),
294
- :bio => u['bio'],
295
- :location => u['location'],
296
- :type => user_type(u['type']),
297
- :created_at => date(u['created_at']),
298
- :ext_ref_id => u[@ext_uniq]
299
- )
300
- info "GHTorrent: Updating user #{user} (email #{email})"
301
- else
302
- users.insert(:login => u['login'],
303
- :name => u['name'],
304
- :company => u['company'],
305
- :email => email,
306
- :hireable => boolean(u['hirable']),
307
- :bio => u['bio'],
308
- :location => u['location'],
309
- :type => user_type(u['type']),
310
- :created_at => date(u['created_at']),
311
- :ext_ref_id => u[@ext_uniq])
312
-
313
- info "GHTorrent: New user #{user}"
314
- end
315
- else
316
- users.insert(:login => u['login'],
317
- :name => u['name'],
318
- :company => u['company'],
319
- :email => email,
320
- :hireable => boolean(u['hirable']),
321
- :bio => u['bio'],
322
- :location => u['location'],
323
- :type => user_type(u['type']),
324
- :created_at => date(u['created_at']),
325
- :ext_ref_id => u[@ext_uniq])
385
+ users.insert(:login => u['login'],
386
+ :name => u['name'],
387
+ :company => u['company'],
388
+ :email => email,
389
+ :location => u['location'],
390
+ :type => user_type(u['type']),
391
+ :created_at => date(u['created_at']),
392
+ :ext_ref_id => u[@ext_uniq])
326
393
 
327
- info "GHTorrent: New user #{user}"
328
- end
394
+ info "GHTorrent: New user #{user}"
329
395
  users.first(:login => user)
330
396
  else
331
397
  debug "GHTorrent: User #{user} exists"
@@ -340,37 +406,62 @@ module GHTorrent
340
406
  #
341
407
  # ==Parameters:
342
408
  # [user] The user login to find followers by
343
- def ensure_user_followers(user, date_added = nil)
409
+ def ensure_user_followers(followed, date_added = nil)
410
+ curuser = ensure_user(followed, false, false)
411
+ time = curuser[:created_at]
412
+ followers = @db.from(:followers, :users).\
413
+ where(:followers__follower_id => :users__id).
414
+ where(:followers__user_id => curuser[:id]).select(:login).all
415
+
416
+ retrieve_user_followers(followed).reduce([]) do |acc, x|
417
+ if followers.find {|y| y[:login] == x['login']}.nil?
418
+ acc << x
419
+ else
420
+ acc
421
+ end
422
+ end.map { |x| ensure_user_follower(followed, x['login'], time) }
423
+ end
424
+
425
+ ##
426
+ # Make sure that a user follows another one
427
+ def ensure_user_follower(followed, follower, date_added)
428
+ follower_user = ensure_user(follower, false, false)
429
+ followed_user = ensure_user(followed, false, false)
430
+
431
+ if followed_user.nil? or follower_user.nil?
432
+ warn "Could not add follower #{follower} to #{followed}"
433
+ return
434
+ end
435
+
344
436
  followers = @db[:followers]
345
- userid = @db[:users].first(:login => user)[:id]
437
+ followed_id = follower_user[:id]
438
+ follower_id = followed_user[:id]
346
439
 
347
- retrieved = retrieve_user_followers(user)
348
- retrieved.each { |f|
349
- follower = f['login']
350
- ensure_user(user, false, false)
351
- ensure_user(follower, false, false)
440
+ follower_exists = followers.first(:user_id => followed_id,
441
+ :follower_id => follower_id)
352
442
 
353
- followerid = @db[:users].first(:login => follower)[:id]
443
+ if follower_exists.nil?
444
+ added = if date_added.nil? then Time.now else date_added end
445
+ retrieved = retrieve_user_follower(followed, follower)
354
446
 
447
+ if retrieved.nil?
448
+ warn "Follower #{follower} does not exist for user #{followed}"
449
+ return
450
+ end
355
451
 
356
- if followers.first(:user_id => userid, :follower_id => followerid).nil?
357
- added = if date_added.nil? then Time.now else date_added end
358
- followers.insert(:user_id => userid,
359
- :follower_id => followerid,
360
- :created_at => added,
361
- :ext_ref_id => f[@ext_uniq]
362
- )
363
- info "GHTorrent: User #{follower} follows #{user}"
364
- else
365
- unless date_added.nil?
366
- followers.filter(:user_id => userid,
367
- :follower_id => followerid).\
368
- update(:created_at => date(date_added))
369
- info "GHTorrent: Updated follower #{follower} -> #{user}"
370
- end
371
- debug "GHTorrent: User #{follower} already follows #{user}"
452
+ followers.insert(:user_id => followed_id,
453
+ :follower_id => follower_id,
454
+ :created_at => added,
455
+ :ext_ref_id => retrieved[@ext_uniq])
456
+ info "GHTorrent: User #{follower} follows #{followed}"
457
+ else
458
+ unless date_added.nil?
459
+ followers.filter(:user_id => followed_id,
460
+ :follower_id => follower_id)\
461
+ .update(:created_at => date(date_added))
462
+ debug "GHTorrent: Updating follower #{followed} -> #{follower}"
372
463
  end
373
- }
464
+ end
374
465
  end
375
466
 
376
467
  ##
@@ -379,8 +470,7 @@ module GHTorrent
379
470
  #
380
471
  # ==Parameters:
381
472
  # [email] The email to lookup the user by
382
- # [email] The user's name
383
- # [followers] If true, the user's followers will be retrieved
473
+ # [name] The user's name
384
474
  # == Returns:
385
475
  # If the user can be retrieved, it is returned as a Hash. Otherwise,
386
476
  # the result is nil
@@ -392,27 +482,27 @@ module GHTorrent
392
482
 
393
483
  u = retrieve_user_byemail(email, name)
394
484
 
395
- if u.nil? or u['user'].nil? or u['user']['login'].nil?
485
+ if u.nil? or u['login'].nil?
396
486
  debug "GHTorrent: Cannot find #{email} through search API query"
487
+ login = (0...8).map { 65.+(rand(25)).chr }.join
397
488
  users.insert(:email => email,
398
489
  :name => name,
399
- :login => (0...8).map { 65.+(rand(25)).chr }.join,
490
+ :login => login,
400
491
  :created_at => Time.now,
401
492
  :ext_ref_id => ""
402
493
  )
403
- users.first(:email => email)
494
+ info "GHTorrent: Added fake user #{login} -> #{email}"
495
+ users.first(:login => login)
404
496
  else
405
- users.insert(:login => u['user']['login'],
406
- :name => u['user']['name'],
407
- :company => u['user']['company'],
408
- :email => u['user']['email'],
409
- :hireable => nil,
410
- :bio => nil,
411
- :location => u['user']['location'],
412
- :created_at => date(u['user']['created_at']),
497
+ users.insert(:login => u['login'],
498
+ :name => u['name'],
499
+ :company => u['company'],
500
+ :email => u['email'],
501
+ :location => u['location'],
502
+ :created_at => date(u['created_at']),
413
503
  :ext_ref_id => u[@ext_uniq])
414
- debug "GHTorrent: Found #{email} through search API query"
415
- users.first(:email => email)
504
+ info "GHTorrent: Found #{email} through search API query"
505
+ users.first(:login => u['login'])
416
506
  end
417
507
  else
418
508
  debug "GHTorrent: User with email #{email} exists"
@@ -430,15 +520,21 @@ module GHTorrent
430
520
  # == Returns:
431
521
  # If the repo can be retrieved, it is returned as a Hash. Otherwise,
432
522
  # the result is nil
433
- def ensure_repo(user, repo)
523
+ def ensure_repo(user, repo, commits = true, project_members = true, watchers = true)
434
524
 
435
- ensure_user(user, true, true)
525
+ ensure_user(user, false, false)
436
526
  repos = @db[:projects]
437
527
  curuser = @db[:users].first(:login => user)
438
528
  currepo = repos.first(:owner_id => curuser[:id], :name => repo)
439
529
 
440
530
  if currepo.nil?
441
531
  r = retrieve_repo(user, repo)
532
+
533
+ if r.nil?
534
+ warn "Repo #{user}/#{repo} does not exist"
535
+ return
536
+ end
537
+
442
538
  repos.insert(:url => r['url'],
443
539
  :owner_id => @db[:users].filter(:login => user).first[:id],
444
540
  :name => r['name'],
@@ -448,9 +544,9 @@ module GHTorrent
448
544
  :ext_ref_id => r[@ext_uniq])
449
545
 
450
546
  info "GHTorrent: New repo #{repo}"
451
- ensure_commits(user, repo)
452
- ensure_project_members(user, repo)
453
- ensure_watchers(user, repo)
547
+ ensure_commits(user, repo) if commits
548
+ ensure_project_members(user, repo) if project_members
549
+ ensure_watchers(user, repo) if watchers
454
550
  repos.first(:owner_id => curuser[:id], :name => repo)
455
551
  else
456
552
  debug "GHTorrent: Repo #{repo} exists"
@@ -461,27 +557,32 @@ module GHTorrent
461
557
  ##
462
558
  # Make sure that a project has all the registered members defined
463
559
  def ensure_project_members(user, repo)
464
- curuser = @db[:users].first(:login => user)
465
- currepo = @db[:projects].first(:owner_id => curuser[:id], :name => repo)
466
- project_members = @db[:project_members].filter(:user_id => curuser[:id],
467
- :repo_id => currepo[:id])
560
+ currepo = ensure_repo(user, repo, true, false, true)
561
+ time = currepo[:created_at]
562
+
563
+ project_members = @db.from(:project_members, :users).\
564
+ where(:project_members__user_id => :users__id).\
565
+ where(:project_members__repo_id => currepo[:id]).select(:login).all
468
566
 
469
567
  retrieve_repo_collaborators(user, repo).reduce([]) do |acc, x|
470
- if project_members.find { |y| y[:login] == x['login'] }.nil?
568
+ if project_members.find {|y| y[:login] == x['login']}.nil?
471
569
  acc << x
472
570
  else
473
571
  acc
474
572
  end
475
- end.map { |x| ensure_project_member(user, repo, x['login'], nil) }
573
+ end.map { |x| ensure_project_member(user, repo, x['login'], time) }
476
574
  end
477
575
 
478
576
  ##
479
577
  # Make sure that a project member exists in a project
480
578
  def ensure_project_member(owner, repo, new_member, date_added)
481
579
  pr_members = @db[:project_members]
580
+ project = ensure_repo(owner, repo, true, false, true)
482
581
  new_user = ensure_user(new_member, false, false)
483
- owner_id = @db[:users].first(:login => owner)[:id]
484
- project = @db[:projects].first(:owner_id => owner_id, :name => repo)
582
+
583
+ if project.nil? or new_user.nil?
584
+ return
585
+ end
485
586
 
486
587
  memb_exist = pr_members.first(:user_id => new_user[:id],
487
588
  :repo_id => project[:id])
@@ -489,6 +590,12 @@ module GHTorrent
489
590
  if memb_exist.nil?
490
591
  added = if date_added.nil? then Time.now else date_added end
491
592
  retrieved = retrieve_repo_collaborator(owner, repo, new_member)
593
+
594
+ if retrieved.nil?
595
+ warn "Project member #{new_member} does not exist in #{owner}/#{repo}"
596
+ return
597
+ end
598
+
492
599
  pr_members.insert(
493
600
  :user_id => new_user[:id],
494
601
  :repo_id => project[:id],
@@ -513,7 +620,6 @@ module GHTorrent
513
620
  # [user] The login name of the user to check the organizations for
514
621
  #
515
622
  def ensure_orgs(user)
516
- usr = @db[:users].first(:login => user)
517
623
  retrieve_orgs(user).map{|o| ensure_participation(user, o['login'])}
518
624
  end
519
625
 
@@ -525,8 +631,8 @@ module GHTorrent
525
631
  # [org] The login name of the organization to check whether the user
526
632
  # belongs in
527
633
  #
528
- def ensure_participation(user, organization)
529
- org = ensure_org(organization)
634
+ def ensure_participation(user, organization, members = true)
635
+ org = ensure_org(organization, members)
530
636
  usr = ensure_user(user, false, false)
531
637
 
532
638
  org_members = @db[:organization_members]
@@ -550,14 +656,21 @@ module GHTorrent
550
656
  # ==Parameters:
551
657
  # [organization] The login name of the organization
552
658
  #
553
- def ensure_org(organization)
554
- org = @db[:users].find(:login => organization, :type => 'org')
659
+ def ensure_org(organization, members)
660
+ org = @db[:users].first(:login => organization, :type => 'org')
555
661
 
556
662
  if org.nil?
557
- ensure_user(org, false, false)
663
+ org = ensure_user(organization, false, false)
664
+ if members
665
+ retrieve_org_members(organization).map { |x|
666
+ ensure_participation(ensure_user(x['login'], false, false)[:login],
667
+ organization, false)
668
+ }
669
+ end
670
+ org
558
671
  else
559
672
  debug "GHTorrent: Organization #{organization} exists"
560
- org.first
673
+ org
561
674
  end
562
675
  end
563
676
 
@@ -572,7 +685,6 @@ module GHTorrent
572
685
  commit_id = @db[:commits].first(:sha => sha)[:id]
573
686
  stored_comments = @db[:commit_comments].filter(:commit_id => commit_id)
574
687
  commit_comments = retrieve_commit_comments(user, repo, sha)
575
- #user_id = @db[:users].first(:login => user)[:id]
576
688
 
577
689
  not_saved = commit_comments.reduce([]) do |acc, x|
578
690
  if stored_comments.find{|y| y[:comment_id] == x['id']}.nil?
@@ -600,16 +712,16 @@ module GHTorrent
600
712
  retrieved = retrieve_commit_comment(user, repo, id)
601
713
 
602
714
  if retrieved.nil?
603
- debug "GHTorrent: Commit comment #{id} deleted"
715
+ warn "GHTorrent: Commit comment #{id} deleted"
604
716
  return
605
717
  end
606
718
 
607
- commit = ensure_commit(repo, retrieved['commit_id'], user, comments = false)
719
+ commit = ensure_commit(repo, retrieved['commit_id'], user, false)
608
720
  user = ensure_user(user, false, false)
609
721
  @db[:commit_comments].insert(
610
722
  :commit_id => commit[:id],
611
723
  :user_id => user[:id],
612
- :body => retrieved['body'],
724
+ :body => retrieved['body'][0..255],
613
725
  :line => retrieved['line'],
614
726
  :position => retrieved['position'],
615
727
  :comment_id => retrieved['id'],
@@ -617,45 +729,67 @@ module GHTorrent
617
729
  :created_at => date(retrieved['created_at'])
618
730
  )
619
731
  info "GHTorrent: Added commit comment #{commit[:sha]} -> #{user[:login]}"
620
- @db[:commit_comments].first(:comment_id => id)
621
732
  else
733
+ unless created_at.nil?
734
+ @db[:commit_comments].filter(:comment_id => id)\
735
+ .update(:created_at => date(created_at))
736
+ info "GHTorrent: Updating comment #{user}/#{repo} -> #{id}"
737
+ end
622
738
  info "GHTorrent: Commit comment #{id} exists"
623
- stored_comment
624
739
  end
740
+ @db[:commit_comments].first(:comment_id => id)
625
741
  end
626
742
 
627
743
  ##
628
744
  # Make sure that all watchers of a repository have been recorded
629
745
  def ensure_watchers(owner, repo)
630
- curuser = @db[:users].first(:login => owner)
631
- currepo = @db[:projects].first(:owner_id => curuser[:id],
632
- :name => repo)
633
- watchers = @db[:watchers].filter(:user_id => curuser[:id],
634
- :repo_id => currepo[:id])
746
+ currepo = ensure_repo(owner, repo, true, true, false)
747
+ time = currepo[:created_at]
748
+
749
+ if currepo.nil?
750
+ warn "Could not retrieve watchers for #{owner}/#{repo}"
751
+ return
752
+ end
753
+
754
+ watchers = @db.from(:watchers, :users).\
755
+ where(:watchers__user_id => :users__id).\
756
+ where(:watchers__repo_id => currepo[:id]).select(:login).all
635
757
 
636
758
  retrieve_watchers(owner, repo).reduce([]) do |acc, x|
637
- if watchers.find { |y| y[:login] == x['login'] }.nil?
759
+ if watchers.find { |y|
760
+ y[:login] == x['login']
761
+ }.nil?
638
762
  acc << x
639
763
  else
640
764
  acc
641
765
  end
642
- end.map { |x| ensure_watcher(owner, repo, x['login']) }
766
+ end.map { |x| ensure_watcher(owner, repo, x['login'], time) }
643
767
  end
644
768
 
645
769
  ##
646
770
  # Make sure that a watcher exists for a project
647
771
  def ensure_watcher(owner, repo, watcher, date_added = nil)
648
- watchers = @db[:watchers]
772
+ project = ensure_repo(owner, repo, false, false, false)
649
773
  new_watcher = ensure_user(watcher, false, false)
650
- owner_id = @db[:users].first(:login => owner)[:id]
651
- project = @db[:projects].first(:owner_id => owner_id, :name => repo)
652
774
 
775
+ if new_watcher.nil? or project.nil?
776
+ warn "GHTorrent: Watcher #{watcher} does not exist"
777
+ return
778
+ end
779
+
780
+ watchers = @db[:watchers]
653
781
  memb_exist = watchers.first(:user_id => new_watcher[:id],
654
- :repo_id => project[:id])
782
+ :repo_id => project[:id])
655
783
 
656
784
  if memb_exist.nil?
657
785
  added = if date_added.nil? then Time.now else date_added end
658
786
  retrieved = retrieve_watcher(owner, repo, watcher)
787
+
788
+ if retrieved.nil?
789
+ warn "Watcher #{watcher} no longer watches #{owner}/#{repo}"
790
+ return
791
+ end
792
+
659
793
  watchers.insert(
660
794
  :user_id => new_watcher[:id],
661
795
  :repo_id => project[:id],
@@ -673,6 +807,313 @@ module GHTorrent
673
807
  end
674
808
  end
675
809
 
810
##
# Make sure that all pull requests of a repository are processed.
#
# ==Parameters:
#  [owner]  The user to which the repository belongs
#  [repo]  The repository to process pull requests for
#
# Returns the results of ensure_pull_request for every retrieved pull
# request whose number is not yet stored for the repository, or nil when
# the repository cannot be resolved.
def ensure_pull_requests(owner, repo)
  currepo = ensure_repo(owner, repo, false, false, false)
  if currepo.nil?
    warn "Could not retrieve pull requests from #{owner}/#{repo}"
    return
  end

  # Read the stored pull request numbers once; calling find on the stored
  # rows for every retrieved item would re-scan them each time.
  known = @db[:pull_requests].filter(:base_repo_id => currepo[:id]).
    map { |y| y[:pullreq_id] }

  retrieve_pull_requests(owner, repo).
    reject { |x| known.include?(x['number']) }.
    map { |x| ensure_pull_request(owner, repo, x['number']) }
end
829
+
830
##
# Process a pull request
#
# ==Parameters:
#  [owner]  The user to which the base repository belongs
#  [repo]  The base repository of the pull request
#  [pullreq_id]  The pull request number within the repository
#  [comments]  When true, also process the pull request's comments
#  [commits]  When true, also process the pull request's commits
#  [state]  Optional additional history action to record
#  [created_at]  Timestamp for the additional history action; also
#                forwarded to ensure_pullreq_comments
#
# Returns the stored pull request row, or nil when the repository or the
# pull request cannot be retrieved or stored.
def ensure_pull_request(owner, repo, pullreq_id,
                        comments = true, commits = true,
                        state = nil, created_at = nil)
  pulls_reqs = @db[:pull_requests]
  pull_req_history = @db[:pull_request_history]

  project = ensure_repo(owner, repo, false, false, false)
  return if project.nil?

  # Adds a pull request history event, or refreshes the timestamp of an
  # event that has already been recorded
  add_history = lambda do |id, ts, unq, act|
    event = {:pull_request_id => id, :ext_ref_id => unq, :action => act}

    if pull_req_history.first(event).nil?
      pull_req_history.insert(event.merge(:created_at => ts))
      info "GHTorrent: New pull request (#{id}) history entry (#{act})"
    else
      pull_req_history.filter(event).update(:created_at => ts)
      info "GHTorrent: Updating pull request (#{id}) history entry (#{act}) timestamp #{ts}"
    end
  end

  retrieved = retrieve_pull_request(owner, repo, pullreq_id)

  if retrieved.nil?
    warn "GHTorrent: Cannot retrieve pull request (#{owner}/#{repo} #{pullreq_id})"
    return
  end

  base = retrieved['base']
  head = retrieved['head']

  # A pull request without a head repository concerns two branches of the
  # same repository
  intra_branch = head['repo'].nil?

  base_repo = ensure_repo(base['repo']['owner']['login'],
                          base['repo']['name'],
                          false, false, false)

  base_commit = ensure_commit(base['repo']['name'],
                              base['sha'],
                              base['repo']['owner']['login'])

  if intra_branch
    warn "GHTorrent: Pull request is intra branch"
    head_repo = base_repo
    # No head commit is resolved for intra branch pull requests
    head_commit = nil
  else
    head_repo = ensure_repo(head['repo']['owner']['login'],
                            head['repo']['name'],
                            false, false, false)

    head_commit = ensure_commit(head['repo']['name'],
                                head['sha'],
                                head['repo']['owner']['login'])
  end

  pull_req_user = ensure_user(retrieved['user']['login'], false, false)

  merged = !retrieved['merged_at'].nil?
  closed = !retrieved['closed_at'].nil?

  head_name = (intra_branch ? base : head)['repo']['full_name']
  log_msg = "GHTorrent: Pull request #{pullreq_id} " +
            "#{head_name} -> #{base['repo']['full_name']}"

  lookup = {:base_repo_id => project[:id], :pullreq_id => pullreq_id}

  if pulls_reqs.first(lookup).nil?
    # The row cannot be written without its mandatory references
    if base_repo.nil? || base_commit.nil? || pull_req_user.nil?
      warn "GHTorrent: Cannot store pull request (#{owner}/#{repo} #{pullreq_id})"
      return
    end

    pulls_reqs.insert(
      :head_repo_id => (head_repo[:id] unless head_repo.nil?),
      :base_repo_id => base_repo[:id],
      :head_commit_id => (head_commit[:id] unless head_commit.nil?),
      :base_commit_id => base_commit[:id],
      :user_id => pull_req_user[:id],
      :pullreq_id => pullreq_id,
      :intra_branch => intra_branch
    )

    info log_msg
  else
    debug log_msg + " exists"
  end

  # Re-read the row so that a freshly inserted pull request carries its id
  pull_req = pulls_reqs.first(lookup)

  if pull_req.nil?
    warn "GHTorrent: Cannot find stored pull request (#{owner}/#{repo} #{pullreq_id})"
    return
  end

  add_history.call(pull_req[:id], date(retrieved['created_at']),
                   retrieved[@ext_uniq], 'opened')
  add_history.call(pull_req[:id], date(retrieved['merged_at']),
                   retrieved[@ext_uniq], 'merged') if merged
  add_history.call(pull_req[:id], date(retrieved['closed_at']),
                   retrieved[@ext_uniq], 'closed') if closed
  add_history.call(pull_req[:id], date(created_at), retrieved[@ext_uniq],
                   state) unless state.nil?

  ensure_pull_request_commits(owner, repo, pullreq_id) if commits
  ensure_pullreq_comments(owner, repo, pullreq_id, created_at) if comments

  pull_req
end
952
+
953
##
# Make sure that all comments of a pull request are processed.
#
# ==Parameters:
#  [owner]  The user to which the repository belongs
#  [repo]  The repository the pull request was opened against
#  [pullreq_id]  The pull request number within the repository
#  [created_at]  Selects the timestamp handed to ensure_pullreq_comment
#
# Returns the results of ensure_pullreq_comment for every retrieved comment
# that is not stored yet, or nil when the repository or the pull request
# cannot be resolved.
def ensure_pullreq_comments(owner, repo, pullreq_id, created_at)
  currepo = ensure_repo(owner, repo, true, true, false)

  # The repository must be checked before any of its fields are read
  if currepo.nil?
    warn "Could not retrieve repository #{owner}/#{repo}"
    return
  end

  # NOTE(review): a caller-supplied created_at is replaced by the current
  # time rather than being used itself -- confirm this is intended
  time = if created_at.nil? then currepo[:created_at] else Time.now end

  pull_req = ensure_pull_request(owner, repo, pullreq_id, false, true)

  if pull_req.nil?
    warn "Could not retrieve pull req #{owner}/#{repo} -> #{pullreq_id}"
    return
  end

  stored = @db[:pull_request_comments]

  # Stored comments are keyed by the pull request row id, the same key
  # ensure_pullreq_comment uses for its lookup and insert
  retrieve_pull_req_comments(owner, repo, pullreq_id).select { |x|
    stored.first(:pull_request_id => pull_req[:id],
                 :comment_id => x['id']).nil?
  }.map { |x|
    ensure_pullreq_comment(owner, repo, pullreq_id, x['id'], time)
  }
end
981
+
982
##
# Make sure that a single pull request comment is stored.
#
# ==Parameters:
#  [owner]  The user to which the repository belongs
#  [repo]  The repository the pull request was opened against
#  [pullreq_id]  The pull request number within the repository
#  [comment_id]  The id of the comment to store
#  [created_at]  Accepted for interface compatibility; the stored
#                timestamp is taken from the retrieved comment
#
# Returns nil when the pull request, the comment or the commenter cannot
# be resolved.
def ensure_pullreq_comment(owner, repo, pullreq_id, comment_id, created_at)
  pull_req = ensure_pull_request(owner, repo, pullreq_id, false, true)

  if pull_req.nil?
    warn "Could not retrieve pull req #{owner}/#{repo} -> #{pullreq_id}"
    return
  end

  comments = @db[:pull_request_comments]
  exists = comments.first(:pull_request_id => pull_req[:id],
                          :comment_id => comment_id)

  if exists.nil?
    retrieved = retrieve_pull_req_comment(owner, repo, pullreq_id, comment_id)

    if retrieved.nil?
      warn "Could not retrieve comment #{comment_id} for pullreq #{owner}/#{repo} -> #{pullreq_id}"
      return
    end

    commenter = ensure_user(retrieved['user']['login'], false, false)

    # Without a commenter the row cannot be written, so stop here instead
    # of failing on commenter[:id] below
    if commenter.nil?
      warn "Could not retrieve commenter #{retrieved['user']['login']} " +
           "for pullreq comment #{owner}/#{repo} -> #{pullreq_id}(#{comment_id})"
      return
    end

    commit = ensure_commit(repo, retrieved['original_commit_id'], owner)

    comments.insert(
      :pull_request_id => pull_req[:id],
      :user_id => commenter[:id],
      :comment_id => comment_id,
      :position => retrieved['original_position'],
      # Only the first 255 characters of the body are stored
      :body => retrieved['body'][0..254],
      :commit_id => (commit[:id] unless commit.nil?),
      # Converted with date like the other retrieved timestamps
      :created_at => date(retrieved['created_at']),
      :ext_ref_id => retrieved[@ext_uniq]
    )
    debug "GHTorrent: Adding comment #{comment_id} for pullreq #{owner}/#{repo} -> #{pullreq_id}"
  else
    debug "GHTorrent: Updating comment #{comment_id} for pullreq #{owner}/#{repo} -> #{pullreq_id}"
  end
end
1025
+
1026
##
# Make sure that the commits of a pull request are stored and linked to it.
#
# ==Parameters:
#  [owner]  The user to which the repository belongs
#  [repo]  The repository the pull request was opened against
#  [pullreq_id]  The pull request number within the repository
#
# Returns one entry per resolved commit, or nil when the pull request
# itself cannot be resolved.
def ensure_pull_request_commits(owner, repo, pullreq_id)
  # Commits that cannot be resolved are skipped instead of failing on c[:id]
  commits = retrieve_pull_req_commits(owner, repo, pullreq_id).map { |c|
    ensure_commit(repo, c['sha'], owner, true)
  }.compact

  return [] if commits.empty?

  # The pull request is the same for every commit, so resolve it only once
  pullreq = ensure_pull_request(owner, repo, pullreq_id, false, false)

  if pullreq.nil?
    warn "Could not retrieve pull req #{owner}/#{repo} -> #{pullreq_id}"
    return
  end

  links = @db[:pull_request_commits]

  commits.map { |c|
    link = {:pull_request_id => pullreq[:id], :commit_id => c[:id]}
    exists = links.first(link)

    if exists.nil?
      links.insert(link)

      info "GHTorrent: Added commit #{c[:sha]} to pullreq #{owner}/#{repo} -> #{pullreq_id}"
    else
      debug "GHTorrent: Commit #{c[:sha]} exists in pullreq #{owner}/#{repo} -> #{pullreq_id}"
      exists
    end
  }
end
1044
+
1045
##
# Get all forks for a project.
#
# ==Parameters:
#  [owner]  The user to which the project belongs
#  [repo]  The repository/project to find forks for
#
# Returns the results of ensure_fork for every retrieved fork that is not
# stored yet, or nil when the project cannot be resolved.
def ensure_forks(owner, repo)
  currepo = ensure_repo(owner, repo, false, false, false)

  # The project must be checked before its creation date is read
  if currepo.nil?
    warn "Could not retrieve forks for #{owner}/#{repo}"
    return
  end

  time = currepo[:created_at]

  # NOTE(review): :login is selected although only forks and projects are
  # queried here -- confirm that one of them provides a login column
  existing_forks = @db.from(:forks, :projects).
    where(:forks__forked_project_id => :projects__id).
    where(:forks__forked_from_id => currepo[:id]).select(:name, :login).all

  retrieve_forks(owner, repo).reject { |x|
    existing_forks.any? { |y|
      y[:login] == x['owner']['login'] && y[:name] == x['name']
    }
  }.map { |x| ensure_fork(owner, repo, x['id'], time) }
end
1074
+
1075
##
# Make sure that a fork is retrieved for a project
#
# ==Parameters:
#  [owner]  The user to which the forked project belongs
#  [repo]  The forked repository/project
#  [fork_id]  The id of the fork
#  [date_added]  Optional timestamp to store for the fork; the current
#                time is used for new forks when it is omitted
#
# Returns nil when the fork or either project cannot be resolved.
def ensure_fork(owner, repo, fork_id, date_added = nil)
  forks = @db[:forks]
  forked = ensure_repo(owner, repo, false, false, false)

  if forks.first(:fork_id => fork_id).nil?
    added = date_added.nil? ? Time.now : date_added
    retrieved = retrieve_fork(owner, repo, fork_id)

    if retrieved.nil?
      warn "GHTorrent: Fork #{fork_id} does not exist for #{owner}/#{repo}"
      return
    end

    # full_name has the form "owner/name"
    forked_repo_owner, forked_repo_name = retrieved['full_name'].split('/')

    # Named fork_repo so that the local does not shadow Kernel#fork
    fork_repo = ensure_repo(forked_repo_owner, forked_repo_name)

    if forked.nil? || fork_repo.nil?
      warn "Could not add fork #{fork_id}"
      return
    end

    forks.insert(:forked_project_id => fork_repo[:id],
                 :forked_from_id => forked[:id],
                 :fork_id => fork_id,
                 :created_at => added,
                 :ext_ref_id => retrieved[@ext_uniq])
    info "GHTorrent: Added #{forked_repo_owner}/#{forked_repo_name} as fork of #{owner}/#{repo}"
  else
    # A known fork only has its timestamp refreshed, and only on request
    unless date_added.nil?
      forks.filter(:fork_id => fork_id).
        update(:created_at => date(date_added))
      debug "GHTorrent: Updating fork #{owner}/#{repo} (#{fork_id})"
    end
  end
end
1116
+
676
1117
  private
677
1118
 
678
1119
  # Store a commit contained in a hash. First check whether the commit exists.
@@ -684,21 +1125,24 @@ module GHTorrent
684
1125
  author = commit_user(c['author'], c['commit']['author'])
685
1126
  commiter = commit_user(c['committer'], c['commit']['committer'])
686
1127
 
687
- userid = @db[:users].filter(:login => user).first[:id]
688
- repoid = @db[:projects].filter(:owner_id => userid,
689
- :name => repo).first[:id]
1128
+ repository = ensure_repo(user, repo, false, false, false)
1129
+
1130
+ if repository.nil?
1131
+ warn "Could not store commit #{user}/#{repo} #{c['sha']}"
1132
+ return
1133
+ end
690
1134
 
691
1135
  commits.insert(:sha => c['sha'],
692
1136
  :author_id => author[:id],
693
1137
  :committer_id => commiter[:id],
694
- :project_id => repoid,
1138
+ :project_id => repository[:id],
695
1139
  :created_at => date(c['commit']['author']['date']),
696
1140
  :ext_ref_id => c[@ext_uniq]
697
1141
  )
1142
+ debug "GHTorrent: New commit #{user}/#{repo} -> #{c['sha']} "
698
1143
  commits.first(:sha => c['sha'])
699
- debug "GHTorrent: New commit #{repo} -> #{c['sha']} "
700
1144
  else
701
- debug "GHTorrent: Commit #{repo} -> #{c['sha']} exists"
1145
+ debug "GHTorrent: Commit #{user}/#{repo} -> #{c['sha']} exists"
702
1146
  commit
703
1147
  end
704
1148
  end
@@ -706,12 +1150,28 @@ module GHTorrent
706
1150
# Run a block in a DB transaction. Exceptions trigger transaction rollback
# and are rethrown.
#
# The database handle and the persister are obtained through get_db and
# persister when they are not set yet. Both are released and reset once the
# block has run, whether it succeeded or failed.
def transaction(&block)
  @db ||= get_db
  @persister ||= persister

  start_time = Time.now
  begin
    @db.transaction(:rollback => :reraise, :isolation => :committed) do
      # The block itself is passed as the yielded argument; blocks that
      # declare no parameters ignore it
      yield block
    end
    total = Time.now.to_ms - start_time.to_ms
    debug "GHTorrent: Transaction committed (#{total} ms)"
  rescue Exception
    # Every exception is caught only to log the failure; it is re-raised
    # unchanged
    total = Time.now.to_ms - start_time.to_ms
    warn "GHTorrent: Transaction failed (#{total} ms)"
    raise
  ensure
    # Nested so that a failure while disconnecting can neither keep the
    # persister open nor leave stale handles behind
    begin
      @db.disconnect
    ensure
      begin
        @persister.close
      ensure
        @db = nil
        @persister = nil
        GC.start
      end
    end
  end
end
716
1176
 
717
1177
  ##
@@ -742,15 +1202,6 @@ module GHTorrent
742
1202
  email =~ /^[a-zA-Z][\w\.-]*[a-zA-Z0-9]@[a-zA-Z0-9][\w\.-]*[a-zA-Z0-9]\.[a-zA-Z][a-zA-Z\.]*[a-zA-Z]$/
743
1203
  end
744
1204
  end
745
- # Base exception for all GHTorrent exceptions
746
- class GHTorrentException < Exception
747
- end
748
- end
749
-
750
- class Time
751
- def to_ms
752
- (self.to_f * 1000.0).to_i
753
- end
754
1205
  end
755
1206
 
756
1207
  # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :