ghtorrent 0.9 → 0.10

Sign up to get free protection for your applications and to get access to all the features.
@@ -5,6 +5,7 @@ require 'ghtorrent/settings'
5
5
  require 'ghtorrent/logging'
6
6
  require 'ghtorrent/command'
7
7
  require 'ghtorrent/retriever'
8
+ require 'ghtorrent/transacted_ghtorrent'
8
9
  require 'ghtorrent/commands/ght_retrieve_repo'
9
10
 
10
11
  class GHTRetrieveUser < GHTRetrieveRepo
@@ -20,7 +21,7 @@ An efficient way to get all data for a single user
20
21
 
21
22
  def validate
22
23
  super
23
- Trollop::die "One argument are required" unless args[0] && !args[0].empty?
24
+ Trollop::die "One argument is required" unless args[0] && !args[0].empty?
24
25
  end
25
26
 
26
27
  def go
@@ -44,19 +45,19 @@ An efficient way to get all data for a single user
44
45
 
45
46
  functions = %w(ensure_user_followers ensure_orgs ensure_org)
46
47
 
47
- if ARGV[2].nil?
48
+ if ARGV[1].nil?
48
49
  functions.each do |x|
49
50
  send_message(x, user)
50
51
  end
51
52
  else
52
- Trollop::die("Not a valid function: #{ARGV[2]}") unless functions.include? ARGV[2]
53
- send_message(ARGV[2], user)
53
+ Trollop::die("Not a valid function: #{ARGV[1]}") unless functions.include? ARGV[1]
54
+ send_message(ARGV[1], user)
54
55
  end
55
56
 
56
57
  end
57
58
  end
58
59
 
59
- class TransactedGHTorrent < GHTorrent::Mirror
60
+ class TransactedGhtorrent
60
61
 
61
62
  def ensure_user_followers(user)
62
63
  check_transaction do
@@ -23,6 +23,11 @@ module GHTorrent
23
23
  debug "GHTorrent: Using cache dir #{config(:cache_dir)}"
24
24
  end
25
25
 
26
+ def dispose
27
+ @db.disconnect unless @db.nil?
28
+ @persister.close unless @persister.nil?
29
+ end
30
+
26
31
  # Get a connection to the database
27
32
  def get_db
28
33
  return @db unless @db.nil?
@@ -199,35 +204,38 @@ module GHTorrent
199
204
 
200
205
  ##
201
206
  # Retrieve commits for a repository, starting from +sha+
202
- # and going back to 30 * +num_pages+ commit log entries.
203
207
  # ==Parameters:
204
208
  # [user] The user to whom the repo belongs.
205
209
  # [repo] The repo to look for commits into.
206
- # [sha] The first commit to start retrieving from. If nil, then the
207
- # earliest stored commit will be used instead.
208
- # [num_pages] The number of commit pages to retrieve
209
- def ensure_commits(user, repo, refresh = false, sha = nil,
210
- num_pages = config(:mirror_commit_pages_new_repo)
211
- )
212
- userid = @db[:users].filter(:login => user).first[:id]
213
- repoid = @db[:projects].filter(:owner_id => userid,
214
- :name => repo).first[:id]
215
-
216
- latest = if sha.nil?
217
- @db[:commits].filter(:project_id => repoid).order(:created_at).last
218
- else
219
- sha
220
- end
210
+ # [sha] The first commit to start retrieving from. If nil, then retrieval
211
+ # starts from what the project considers its master branch.
212
+ # [return_retrieved] Whether the retrieved commits should be returned. When
213
+ # false, they are not accumulated, which saves memory while processing them.
214
+ def ensure_commits(user, repo, sha = nil, return_retrieved = false)
215
+
216
+ commits = ['foo'] # Dummy entry for simplifying the loop below
217
+ commit_acc = []
218
+ until commits.empty?
219
+ commits = retrieve_commits(repo, sha, user, 1)
220
+
221
+ # This means that we retrieved the last commit page again
222
+ if commits.size == 1 and commits[0]['sha'] == sha
223
+ commits = []
224
+ end
221
225
 
222
- commits = if latest.nil?
223
- retrieve_commits(repo, "head", user, num_pages)
224
- else
225
- retrieve_commits(repo, latest[:sha], user, num_pages)
226
- end
226
+ retrieved = commits.map do |c|
227
+ sha = c['sha']
228
+ save{ensure_commit(repo, c['sha'], user)}
229
+ end
227
230
 
228
- commits.map do |c|
229
- save{ensure_commit(repo, c['sha'], user)}
230
- end.select{|x| !x.nil?}
231
+ # Store retrieved commits to return, if client requested so
232
+ if return_retrieved
233
+ commit_acc = commit_acc << retrieved
234
+ end
235
+
236
+ end
237
+
238
+ commit_acc.select{|x| !x.nil?}
231
239
  end
232
240
 
233
241
  ##
@@ -293,7 +301,7 @@ module GHTorrent
293
301
  :project_id => project[:id],
294
302
  :commit_id => commitid
295
303
  )
296
- info "GHTorrent: Associating commit #{sha} with #{user}/#{repo}"
304
+ debug "GHTorrent: Associating commit #{sha} with #{user}/#{repo}"
297
305
  @db[:project_commits].first(:project_id => project[:id],
298
306
  :commit_id => commitid)
299
307
  else
@@ -450,11 +458,18 @@ module GHTorrent
450
458
  :company => u['company'],
451
459
  :email => email,
452
460
  :location => u['location'],
461
+ :fake => false,
453
462
  :type => user_type(u['type']),
454
463
  :created_at => date(u['created_at']),
455
464
  :ext_ref_id => u[@ext_uniq])
456
465
 
457
466
  info "GHTorrent: New user #{user}"
467
+
468
+ if user_type(u['type']) == 'ORG'
469
+ info "GHTorrent: User #{user} is an organization. Retrieving members"
470
+ ensure_org(u['login'], true)
471
+ end
472
+
458
473
  users.first(:login => user)
459
474
  else
460
475
  debug "GHTorrent: User #{user} exists"
@@ -557,6 +572,7 @@ module GHTorrent
557
572
  users.insert(:email => email,
558
573
  :name => name,
559
574
  :login => login,
575
+ :fake => true,
560
576
  :created_at => Time.now,
561
577
  :ext_ref_id => "")
562
578
  info "GHTorrent: Added fake user #{login} -> #{email}"
@@ -569,6 +585,7 @@ module GHTorrent
569
585
  :company => u['company'],
570
586
  :email => u['email'],
571
587
  :location => u['location'],
588
+ :fake => false,
572
589
  :created_at => date(u['created_at']),
573
590
  :ext_ref_id => u[@ext_uniq])
574
591
  info "GHTorrent: Found #{email} through search API query"
@@ -577,6 +594,7 @@ module GHTorrent
577
594
  :company => u['company'],
578
595
  :email => u['email'],
579
596
  :location => u['location'],
597
+ :fake => false,
580
598
  :created_at => date(u['created_at']),
581
599
  :ext_ref_id => u[@ext_uniq])
582
600
  info "GHTorrent: User with email #{email} exists with username #{u['login']}"
@@ -599,8 +617,8 @@ module GHTorrent
599
617
  # == Returns:
600
618
  # If the repo can be retrieved, it is returned as a Hash. Otherwise,
601
619
  # the result is nil
602
- def ensure_repo(user, repo, commits = false, project_members = false,
603
- watchers = false, forks = false, labels = false)
620
+ def ensure_repo(user, repo, commits = true, project_members = true,
621
+ watchers = true, forks = true, labels = true)
604
622
 
605
623
  repos = @db[:projects]
606
624
  curuser = ensure_user(user, false, false)
@@ -640,11 +658,96 @@ module GHTorrent
640
658
  end
641
659
 
642
660
  info "GHTorrent: New repo #{user}/#{repo}"
643
- ensure_commits(user, repo) if commits
644
- ensure_project_members(user, repo) if project_members
645
- ensure_watchers(user, repo) if watchers
646
- ensure_forks(user, repo) if forks
647
- ensure_labels(user, repo) if labels
661
+
662
+ begin
663
+ watchdog = nil
664
+ unless parent.nil?
665
+ watchdog = Thread.new do
666
+ slept = 0
667
+ while true do
668
+ debug "GHTorrent: In ensure_repo_fork for #{slept} seconds"
669
+ sleep 1
670
+ slept += 1
671
+ end
672
+ end
673
+ # Fast path to project forking. Retrieve all commits page by page
674
+ # until we reach a commit that has been registered with the parent
675
+ # repository. Then, copy all remaining parent commits to this repo.
676
+ debug "GHTorrent: Retrieving commits for #{user}/#{repo} until we reach a commit shared with the parent"
677
+
678
+ sha = nil
679
+ # Refresh the latest commits for the parent.
680
+ retrieve_commits(parent_repo, sha, parent_owner, 1).each do |c|
681
+ sha = c['sha']
682
+ ensure_commit(parent_repo, sha, parent_owner, true)
683
+ end
684
+
685
+ sha = nil
686
+ found = false
687
+ while not found
688
+ processed = 0
689
+ commits = retrieve_commits(repo, sha, user, 1)
690
+
691
+ # If only one commit has been retrieved (and this is the same as
692
+ # the commit from which we query commits) this means that
693
+ # there are no more commits.
694
+ if commits.size == 1 and commits[0]['sha'] == sha
695
+ debug "GHTorrent: No shared commit found and no more commits for #{user}/#{repo}"
696
+ break
697
+ end
698
+
699
+ for c in commits
700
+ processed += 1
701
+ exists_in_parent =
702
+ !@db.from(:project_commits, :commits).\
703
+ where(:project_commits__commit_id => :commits__id).\
704
+ where(:project_commits__project_id => parent[:id]).\
705
+ where(:commits__sha => c['sha']).first.nil?
706
+
707
+ sha = c['sha']
708
+ if not exists_in_parent
709
+ ensure_commit(repo, sha, user, true)
710
+ else
711
+ found = true
712
+ debug "GHTorrent: Found commit #{sha} shared with parent, switching to copying commits"
713
+ break
714
+ end
715
+ end
716
+ if processed == 0
717
+ warn "No commits found for #{user}/#{repo}, repo deleted?"
718
+ found = true
719
+ end
720
+ end
721
+
722
+ if found
723
+ shared_commit = @db[:commits].first(:sha => sha)
724
+ forked_repo = repos.first(:owner_id => curuser[:id], :name => repo)
725
+
726
+ @db.from(:project_commits, :commits).\
727
+ where(:project_commits__commit_id => :commits__id).\
728
+ where(:project_commits__project_id => parent[:id]).\
729
+ where('commits.created_at < ?', shared_commit[:created_at]).\
730
+ select(:commits__id, :commits__sha).\
731
+ each do |c|
732
+ @db[:project_commits].insert(
733
+ :project_id => forked_repo[:id],
734
+ :commit_id => c[:id]
735
+ )
736
+ debug "GHTorrent: Copied commit #{c[:sha]} from #{parent_owner}/#{parent_repo} -> #{user}/#{repo}"
737
+ end
738
+ end
739
+ else
740
+ ensure_commits(user, repo) if commits
741
+ end
742
+ ensure_project_members(user, repo) if project_members
743
+ ensure_watchers(user, repo) if watchers
744
+ ensure_forks(user, repo) if forks
745
+ ensure_labels(user, repo) if labels
746
+ ensure
747
+ unless watchdog.nil?
748
+ watchdog.exit
749
+ end
750
+ end
648
751
  repos.first(:owner_id => curuser[:id], :name => repo)
649
752
  else
650
753
  debug "GHTorrent: Repo #{user}/#{repo} exists"
@@ -777,12 +880,11 @@ module GHTorrent
777
880
  warn "GHTorrent: Account #{organization} is not an organization"
778
881
  return nil
779
882
  end
780
-
781
- if members
782
- retrieve_org_members(organization).map do |x|
783
- ensure_participation(ensure_user(x['login'], false, false)[:login],
784
- organization, false)
785
- end
883
+ end
884
+ if members
885
+ retrieve_org_members(organization).map do |x|
886
+ ensure_participation(ensure_user(x['login'], false, false)[:login],
887
+ organization, false)
786
888
  end
787
889
  end
788
890
  org
@@ -819,7 +921,7 @@ module GHTorrent
819
921
  retrieved = retrieve_commit_comment(owner, repo, sha, comment_id)
820
922
 
821
923
  if retrieved.nil?
822
- warn "GHTorrent: Commit comment #{sha}->#{id} deleted"
924
+ warn "GHTorrent: Commit comment #{sha}->#{comment_id} deleted"
823
925
  return
824
926
  end
825
927
 
@@ -837,7 +939,7 @@ module GHTorrent
837
939
  )
838
940
  info "GHTorrent: Added commit comment #{sha} -> #{retrieved['id']} by #{user[:login]}"
839
941
  else
840
- info "GHTorrent: Commit comment #{sha} -> #{id} exists"
942
+ info "GHTorrent: Commit comment #{sha} -> #{comment_id} exists"
841
943
  end
842
944
  @db[:commit_comments].first(:comment_id => comment_id)
843
945
  end
@@ -942,6 +1044,42 @@ module GHTorrent
942
1044
  raw_pull_reqs.map { |x| save { ensure_pull_request(owner, repo, x['number']) } }.select { |x| !x.nil? }
943
1045
  end
944
1046
 
1047
+ # Adds a pull request history event
1048
+ def ensure_pull_request_history(id, ts, unq, act, actor)
1049
+ user = unless actor.nil?
1050
+ ensure_user(actor, false, false)
1051
+ end
1052
+ pull_req_history = @db[:pull_request_history]
1053
+
1054
+ entry = if ['opened', 'merged'].include? act
1055
+ pull_req_history.first(:pull_request_id => id,
1056
+ :action => act)
1057
+ else
1058
+ pull_req_history.first(:pull_request_id => id,
1059
+ :created_at => (ts - 3)..(ts + 3),
1060
+ :action => act)
1061
+ end
1062
+
1063
+ if entry.nil?
1064
+ pull_req_history.insert(:pull_request_id => id,
1065
+ :created_at => ts,
1066
+ :ext_ref_id => unq,
1067
+ :action => act,
1068
+ :actor_id => unless user.nil? then user[:id] end)
1069
+ info "GHTorrent: New pull request (#{id}) event (#{act}) by (#{actor}) timestamp #{ts}"
1070
+ else
1071
+ info "GHTorrent: Pull request (#{id}) event (#{act}) by (#{actor}) timestamp #{ts} exists"
1072
+ if entry[:actor_id].nil? and not user.nil?
1073
+ pull_req_history.where(:pull_request_id => id,
1074
+ :created_at => (ts - 3)..(ts + 3),
1075
+ :action => act)\
1076
+ .update(:actor_id => user[:id])
1077
+ debug "Pull request (#{id}) event (#{act}) timestamp #{ts} set actor -> #{user[:login]}"
1078
+ end
1079
+ end
1080
+ end
1081
+
1082
+
945
1083
  ##
946
1084
  # Process a pull request
947
1085
  def ensure_pull_request(owner, repo, pullreq_id,
@@ -955,26 +1093,6 @@ module GHTorrent
955
1093
  return
956
1094
  end
957
1095
 
958
- # Adds a pull request history event
959
- def add_history(id, ts, unq, act, actor)
960
- user = ensure_user(actor, false, false)
961
- pull_req_history = @db[:pull_request_history]
962
- entry = pull_req_history.first(:pull_request_id => id,
963
- :created_at => (ts - 4)..(ts + 4),
964
- :action => act)
965
- if entry.nil?
966
- pull_req_history.insert(:pull_request_id => id,
967
- :created_at => ts,
968
- :ext_ref_id => unq,
969
- :action => act,
970
- :actor_id => unless user.nil? then user[:id] end)
971
- info "GHTorrent: New pull request (#{id}) event (#{act}) by (#{actor}) timestamp #{ts}"
972
- else
973
- entry.update(:actor_id => user[:id])
974
- info "GHTorrent: Pull request (#{id}) history entry (#{act}) by (#{actor}) timestamp #{ts} exists"
975
- end
976
- end
977
-
978
1096
  # Checks whether a pull request concerns two branches of the same
979
1097
  # repository
980
1098
  def is_intra_branch(req)
@@ -1054,10 +1172,8 @@ module GHTorrent
1054
1172
  :base_repo_id => base_repo[:id],
1055
1173
  :head_commit_id => if not head_commit.nil? then head_commit[:id] end,
1056
1174
  :base_commit_id => base_commit[:id],
1057
- :user_id => pull_req_user[:id],
1058
1175
  :pullreq_id => pullreq_id,
1059
- :intra_branch => is_intra_branch(retrieved),
1060
- :merged => merged
1176
+ :intra_branch => is_intra_branch(retrieved)
1061
1177
  )
1062
1178
  info log_msg(retrieved) + ' was added'
1063
1179
  else
@@ -1088,18 +1204,18 @@ module GHTorrent
1088
1204
 
1089
1205
  if history
1090
1206
  # Actions on pull requests
1091
- actor = if actor.nil? then pull_req_user[:login] else actor end
1092
1207
  opener = pull_req_user[:login]
1093
- add_history(pull_req[:id], date(retrieved['created_at']),
1208
+ ensure_pull_request_history(pull_req[:id], date(retrieved['created_at']),
1094
1209
  retrieved[@ext_uniq], 'opened', opener)
1095
- # There is an additional merged_by field for merged pull requests
1210
+
1096
1211
  merger = if retrieved['merged_by'].nil? then actor else retrieved['merged_by']['login'] end
1097
- add_history(pull_req[:id], date(retrieved['merged_at']),
1212
+ ensure_pull_request_history(pull_req[:id], date(retrieved['merged_at']),
1098
1213
  retrieved[@ext_uniq], 'merged', merger) if (merged && state != 'merged')
1214
+
1099
1215
  closer = if merged then merger else actor end
1100
- add_history(pull_req[:id], date(retrieved['closed_at']),
1216
+ ensure_pull_request_history(pull_req[:id], date(retrieved['closed_at']),
1101
1217
  retrieved[@ext_uniq], 'closed', closer) if (closed && state != 'closed')
1102
- add_history(pull_req[:id], date(created_at), retrieved[@ext_uniq],
1218
+ ensure_pull_request_history(pull_req[:id], date(created_at), retrieved[@ext_uniq],
1103
1219
  state, actor) unless state.nil?
1104
1220
  end
1105
1221
  ensure_pull_request_commits(owner, repo, pullreq_id) if commits
@@ -1239,11 +1355,11 @@ module GHTorrent
1239
1355
  where(:projects__forked_from => currepo[:id]).select(:projects__name, :login).all
1240
1356
 
1241
1357
  retrieve_forks(owner, repo).reduce([]) do |acc, x|
1242
- if existing_forks.find {|y|
1358
+ if existing_forks.find do |y|
1243
1359
  forked_repo_owner = x['full_name'].split(/\//)[0]
1244
1360
  forked_repo_name = x['full_name'].split(/\//)[1]
1245
1361
  y[:login] == forked_repo_owner && y[:name] == forked_repo_name
1246
- }.nil?
1362
+ end.nil?
1247
1363
  acc << x
1248
1364
  else
1249
1365
  acc
@@ -1669,7 +1785,7 @@ module GHTorrent
1669
1785
  result = nil
1670
1786
  start_time = Time.now
1671
1787
  begin
1672
- @db.transaction(:rollback => :reraise, :isolation => :committed) do
1788
+ @db.transaction(:rollback => :reraise, :isolation => :uncommitted) do
1673
1789
  result = yield block
1674
1790
  end
1675
1791
  total = Time.now.to_ms - start_time.to_ms
@@ -1698,8 +1814,6 @@ module GHTorrent
1698
1814
  end
1699
1815
  end
1700
1816
 
1701
- private
1702
-
1703
1817
  # Store a commit contained in a hash. First check whether the commit exists.
1704
1818
  def store_commit(c, repo, user)
1705
1819
  commits = @db[:commits]
@@ -10,7 +10,7 @@ Sequel.migration do
10
10
  add_foreign_key :actor_id, :users
11
11
  end
12
12
 
13
- puts 'Remember to run the fixes/update_pull_request_history_actor.rb
13
+ puts 'Remember to run the fixes/update_pullreq_entries_from_events.rb
14
14
  script to mark deleted projects'
15
15
  end
16
16