ghtorrent 0.9 → 0.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -13
- data/CHANGELOG +17 -2
- data/Gemfile.lock +14 -14
- data/bin/ght-log-analyzer +133 -0
- data/bin/ght-retrieve-one +6 -0
- data/lib/ghtorrent.rb +4 -0
- data/lib/ghtorrent/adapters/mongo_persister.rb +3 -2
- data/lib/ghtorrent/api_client.rb +107 -34
- data/lib/ghtorrent/command.rb +12 -4
- data/lib/ghtorrent/commands/ght_data_retrieval.rb +26 -34
- data/lib/ghtorrent/commands/ght_get_more_commits.rb +4 -3
- data/lib/ghtorrent/commands/ght_load.rb +3 -2
- data/lib/ghtorrent/commands/ght_retrieve_one.rb +80 -0
- data/lib/ghtorrent/commands/ght_retrieve_repo.rb +3 -3
- data/lib/ghtorrent/commands/ght_retrieve_repos.rb +57 -100
- data/lib/ghtorrent/commands/ght_retrieve_user.rb +6 -5
- data/lib/ghtorrent/ghtorrent.rb +188 -74
- data/lib/ghtorrent/migrations/016_add_actor_pull_request_history.rb +1 -1
- data/lib/ghtorrent/migrations/017_drop_forks_table.rb +24 -0
- data/lib/ghtorrent/migrations/018_drop_merged_user_from_pull_requests.rb +23 -0
- data/lib/ghtorrent/migrations/019_add_fake_to_users.rb +33 -0
- data/lib/ghtorrent/multiprocess_queue_client.rb +105 -0
- data/lib/ghtorrent/persister.rb +4 -1
- data/lib/ghtorrent/retriever.rb +115 -105
- data/lib/ghtorrent/settings.rb +6 -2
- data/lib/ghtorrent/transacted_ghtorrent.rb +9 -2
- data/lib/version.rb +1 -1
- metadata +55 -10
@@ -5,6 +5,7 @@ require 'ghtorrent/settings'
|
|
5
5
|
require 'ghtorrent/logging'
|
6
6
|
require 'ghtorrent/command'
|
7
7
|
require 'ghtorrent/retriever'
|
8
|
+
require 'ghtorrent/transacted_ghtorrent'
|
8
9
|
require 'ghtorrent/commands/ght_retrieve_repo'
|
9
10
|
|
10
11
|
class GHTRetrieveUser < GHTRetrieveRepo
|
@@ -20,7 +21,7 @@ An efficient way to get all data for a single user
|
|
20
21
|
|
21
22
|
def validate
|
22
23
|
super
|
23
|
-
Trollop::die "One argument
|
24
|
+
Trollop::die "One argument is required" unless args[0] && !args[0].empty?
|
24
25
|
end
|
25
26
|
|
26
27
|
def go
|
@@ -44,19 +45,19 @@ An efficient way to get all data for a single user
|
|
44
45
|
|
45
46
|
functions = %w(ensure_user_followers ensure_orgs ensure_org)
|
46
47
|
|
47
|
-
if ARGV[
|
48
|
+
if ARGV[1].nil?
|
48
49
|
functions.each do |x|
|
49
50
|
send_message(x, user)
|
50
51
|
end
|
51
52
|
else
|
52
|
-
Trollop::die("Not a valid function: #{ARGV[
|
53
|
-
send_message(ARGV[
|
53
|
+
Trollop::die("Not a valid function: #{ARGV[1]}") unless functions.include? ARGV[1]
|
54
|
+
send_message(ARGV[1], user)
|
54
55
|
end
|
55
56
|
|
56
57
|
end
|
57
58
|
end
|
58
59
|
|
59
|
-
class
|
60
|
+
class TransactedGhtorrent
|
60
61
|
|
61
62
|
def ensure_user_followers(user)
|
62
63
|
check_transaction do
|
data/lib/ghtorrent/ghtorrent.rb
CHANGED
@@ -23,6 +23,11 @@ module GHTorrent
|
|
23
23
|
debug "GHTorrent: Using cache dir #{config(:cache_dir)}"
|
24
24
|
end
|
25
25
|
|
26
|
+
def dispose
|
27
|
+
@db.disconnect unless @db.nil?
|
28
|
+
@persister.close unless @persister.nil?
|
29
|
+
end
|
30
|
+
|
26
31
|
# Get a connection to the database
|
27
32
|
def get_db
|
28
33
|
return @db unless @db.nil?
|
@@ -199,35 +204,38 @@ module GHTorrent
|
|
199
204
|
|
200
205
|
##
|
201
206
|
# Retrieve commits for a repository, starting from +sha+
|
202
|
-
# and going back to 30 * +num_pages+ commit log entries.
|
203
207
|
# ==Parameters:
|
204
208
|
# [user] The user to whom the repo belongs.
|
205
209
|
# [repo] The repo to look for commits into.
|
206
|
-
# [sha] The first commit to start retrieving from. If nil, then
|
207
|
-
#
|
208
|
-
# [
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
210
|
+
# [sha] The first commit to start retrieving from. If nil, then retrieval
|
211
|
+
# starts from what the project considers as master branch.
|
212
|
+
# [return_retrieved] Should retrieved commits be returned? If not, memory is
|
213
|
+
# saved while processing them if this is false
|
214
|
+
def ensure_commits(user, repo, sha = nil, return_retrieved = false)
|
215
|
+
|
216
|
+
commits = ['foo'] # Dummy entry for simplifying the loop below
|
217
|
+
commit_acc = []
|
218
|
+
until commits.empty?
|
219
|
+
commits = retrieve_commits(repo, sha, user, 1)
|
220
|
+
|
221
|
+
# This means that we retrieved the last commit page again
|
222
|
+
if commits.size == 1 and commits[0]['sha'] == sha
|
223
|
+
commits = []
|
224
|
+
end
|
221
225
|
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
end
|
226
|
+
retrieved = commits.map do |c|
|
227
|
+
sha = c['sha']
|
228
|
+
save{ensure_commit(repo, c['sha'], user)}
|
229
|
+
end
|
227
230
|
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
+
# Store retrieved commits to return, if client requested so
|
232
|
+
if return_retrieved
|
233
|
+
commit_acc = commit_acc << retrieved
|
234
|
+
end
|
235
|
+
|
236
|
+
end
|
237
|
+
|
238
|
+
commit_acc.select{|x| !x.nil?}
|
231
239
|
end
|
232
240
|
|
233
241
|
##
|
@@ -293,7 +301,7 @@ module GHTorrent
|
|
293
301
|
:project_id => project[:id],
|
294
302
|
:commit_id => commitid
|
295
303
|
)
|
296
|
-
|
304
|
+
debug "GHTorrent: Associating commit #{sha} with #{user}/#{repo}"
|
297
305
|
@db[:project_commits].first(:project_id => project[:id],
|
298
306
|
:commit_id => commitid)
|
299
307
|
else
|
@@ -450,11 +458,18 @@ module GHTorrent
|
|
450
458
|
:company => u['company'],
|
451
459
|
:email => email,
|
452
460
|
:location => u['location'],
|
461
|
+
:fake => false,
|
453
462
|
:type => user_type(u['type']),
|
454
463
|
:created_at => date(u['created_at']),
|
455
464
|
:ext_ref_id => u[@ext_uniq])
|
456
465
|
|
457
466
|
info "GHTorrent: New user #{user}"
|
467
|
+
|
468
|
+
if user_type(u['type']) == 'ORG'
|
469
|
+
info "GHTorrent: User #{user} is an organization. Retrieving members"
|
470
|
+
ensure_org(u['login'], true)
|
471
|
+
end
|
472
|
+
|
458
473
|
users.first(:login => user)
|
459
474
|
else
|
460
475
|
debug "GHTorrent: User #{user} exists"
|
@@ -557,6 +572,7 @@ module GHTorrent
|
|
557
572
|
users.insert(:email => email,
|
558
573
|
:name => name,
|
559
574
|
:login => login,
|
575
|
+
:fake => true,
|
560
576
|
:created_at => Time.now,
|
561
577
|
:ext_ref_id => "")
|
562
578
|
info "GHTorrent: Added fake user #{login} -> #{email}"
|
@@ -569,6 +585,7 @@ module GHTorrent
|
|
569
585
|
:company => u['company'],
|
570
586
|
:email => u['email'],
|
571
587
|
:location => u['location'],
|
588
|
+
:fake => false,
|
572
589
|
:created_at => date(u['created_at']),
|
573
590
|
:ext_ref_id => u[@ext_uniq])
|
574
591
|
info "GHTorrent: Found #{email} through search API query"
|
@@ -577,6 +594,7 @@ module GHTorrent
|
|
577
594
|
:company => u['company'],
|
578
595
|
:email => u['email'],
|
579
596
|
:location => u['location'],
|
597
|
+
:fake => false,
|
580
598
|
:created_at => date(u['created_at']),
|
581
599
|
:ext_ref_id => u[@ext_uniq])
|
582
600
|
info "GHTorrent: User with email #{email} exists with username #{u['login']}"
|
@@ -599,8 +617,8 @@ module GHTorrent
|
|
599
617
|
# == Returns:
|
600
618
|
# If the repo can be retrieved, it is returned as a Hash. Otherwise,
|
601
619
|
# the result is nil
|
602
|
-
def ensure_repo(user, repo, commits =
|
603
|
-
watchers =
|
620
|
+
def ensure_repo(user, repo, commits = true, project_members = true,
|
621
|
+
watchers = true, forks = true, labels = true)
|
604
622
|
|
605
623
|
repos = @db[:projects]
|
606
624
|
curuser = ensure_user(user, false, false)
|
@@ -640,11 +658,96 @@ module GHTorrent
|
|
640
658
|
end
|
641
659
|
|
642
660
|
info "GHTorrent: New repo #{user}/#{repo}"
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
661
|
+
|
662
|
+
begin
|
663
|
+
watchdog = nil
|
664
|
+
unless parent.nil?
|
665
|
+
watchdog = Thread.new do
|
666
|
+
slept = 0
|
667
|
+
while true do
|
668
|
+
debug "GHTorrent: In ensure_repo_fork for #{slept} seconds"
|
669
|
+
sleep 1
|
670
|
+
slept += 1
|
671
|
+
end
|
672
|
+
end
|
673
|
+
# Fast path to project forking. Retrieve all commits page by page
|
674
|
+
# until we reach a commit that has been registered with the parent
|
675
|
+
# repository. Then, copy all remaining parent commits to this repo.
|
676
|
+
debug "GHTorrent: Retrieving commits for #{user}/#{repo} until we reach a commit shared with the parent"
|
677
|
+
|
678
|
+
sha = nil
|
679
|
+
# Refresh the latest commits for the parent.
|
680
|
+
retrieve_commits(parent_repo, sha, parent_owner, 1).each do |c|
|
681
|
+
sha = c['sha']
|
682
|
+
ensure_commit(parent_repo, sha, parent_owner, true)
|
683
|
+
end
|
684
|
+
|
685
|
+
sha = nil
|
686
|
+
found = false
|
687
|
+
while not found
|
688
|
+
processed = 0
|
689
|
+
commits = retrieve_commits(repo, sha, user, 1)
|
690
|
+
|
691
|
+
# If only one commit has been retrieved (and this is the same as
|
692
|
+
# the commit since which we query commits from) this mean that
|
693
|
+
# there are no more commits.
|
694
|
+
if commits.size == 1 and commits[0]['sha'] == sha
|
695
|
+
debug "GHTorrent: No shared commit found and no more commits for #{user}/#{repo}"
|
696
|
+
break
|
697
|
+
end
|
698
|
+
|
699
|
+
for c in commits
|
700
|
+
processed += 1
|
701
|
+
exists_in_parent =
|
702
|
+
!@db.from(:project_commits, :commits).\
|
703
|
+
where(:project_commits__commit_id => :commits__id).\
|
704
|
+
where(:project_commits__project_id => parent[:id]).\
|
705
|
+
where(:commits__sha => c['sha']).first.nil?
|
706
|
+
|
707
|
+
sha = c['sha']
|
708
|
+
if not exists_in_parent
|
709
|
+
ensure_commit(repo, sha, user, true)
|
710
|
+
else
|
711
|
+
found = true
|
712
|
+
debug "GHTorrent: Found commit #{sha} shared with parent, switching to copying commits"
|
713
|
+
break
|
714
|
+
end
|
715
|
+
end
|
716
|
+
if processed == 0
|
717
|
+
warn "No commits found for #{user}/#{repo}, repo deleted?"
|
718
|
+
found = true
|
719
|
+
end
|
720
|
+
end
|
721
|
+
|
722
|
+
if found
|
723
|
+
shared_commit = @db[:commits].first(:sha => sha)
|
724
|
+
forked_repo = repos.first(:owner_id => curuser[:id], :name => repo)
|
725
|
+
|
726
|
+
@db.from(:project_commits, :commits).\
|
727
|
+
where(:project_commits__commit_id => :commits__id).\
|
728
|
+
where(:project_commits__project_id => parent[:id]).\
|
729
|
+
where('commits.created_at < ?', shared_commit[:created_at]).\
|
730
|
+
select(:commits__id, :commits__sha).\
|
731
|
+
each do |c|
|
732
|
+
@db[:project_commits].insert(
|
733
|
+
:project_id => forked_repo[:id],
|
734
|
+
:commit_id => c[:id]
|
735
|
+
)
|
736
|
+
debug "GHTorrent: Copied commit #{c[:sha]} from #{parent_owner}/#{parent_repo} -> #{user}/#{repo}"
|
737
|
+
end
|
738
|
+
end
|
739
|
+
else
|
740
|
+
ensure_commits(user, repo) if commits
|
741
|
+
end
|
742
|
+
ensure_project_members(user, repo) if project_members
|
743
|
+
ensure_watchers(user, repo) if watchers
|
744
|
+
ensure_forks(user, repo) if forks
|
745
|
+
ensure_labels(user, repo) if labels
|
746
|
+
ensure
|
747
|
+
unless watchdog.nil?
|
748
|
+
watchdog.exit
|
749
|
+
end
|
750
|
+
end
|
648
751
|
repos.first(:owner_id => curuser[:id], :name => repo)
|
649
752
|
else
|
650
753
|
debug "GHTorrent: Repo #{user}/#{repo} exists"
|
@@ -777,12 +880,11 @@ module GHTorrent
|
|
777
880
|
warn "GHTorrent: Account #{organization} is not an organization"
|
778
881
|
return nil
|
779
882
|
end
|
780
|
-
|
781
|
-
|
782
|
-
|
783
|
-
|
784
|
-
|
785
|
-
end
|
883
|
+
end
|
884
|
+
if members
|
885
|
+
retrieve_org_members(organization).map do |x|
|
886
|
+
ensure_participation(ensure_user(x['login'], false, false)[:login],
|
887
|
+
organization, false)
|
786
888
|
end
|
787
889
|
end
|
788
890
|
org
|
@@ -819,7 +921,7 @@ module GHTorrent
|
|
819
921
|
retrieved = retrieve_commit_comment(owner, repo, sha, comment_id)
|
820
922
|
|
821
923
|
if retrieved.nil?
|
822
|
-
warn "GHTorrent: Commit comment #{sha}->#{
|
924
|
+
warn "GHTorrent: Commit comment #{sha}->#{comment_id} deleted"
|
823
925
|
return
|
824
926
|
end
|
825
927
|
|
@@ -837,7 +939,7 @@ module GHTorrent
|
|
837
939
|
)
|
838
940
|
info "GHTorrent: Added commit comment #{sha} -> #{retrieved['id']} by #{user[:login]}"
|
839
941
|
else
|
840
|
-
info "GHTorrent: Commit comment #{sha} -> #{
|
942
|
+
info "GHTorrent: Commit comment #{sha} -> #{comment_id} exists"
|
841
943
|
end
|
842
944
|
@db[:commit_comments].first(:comment_id => comment_id)
|
843
945
|
end
|
@@ -942,6 +1044,42 @@ module GHTorrent
|
|
942
1044
|
raw_pull_reqs.map { |x| save { ensure_pull_request(owner, repo, x['number']) } }.select { |x| !x.nil? }
|
943
1045
|
end
|
944
1046
|
|
1047
|
+
# Adds a pull request history event
|
1048
|
+
def ensure_pull_request_history(id, ts, unq, act, actor)
|
1049
|
+
user = unless actor.nil?
|
1050
|
+
ensure_user(actor, false, false)
|
1051
|
+
end
|
1052
|
+
pull_req_history = @db[:pull_request_history]
|
1053
|
+
|
1054
|
+
entry = if ['opened', 'merged'].include? act
|
1055
|
+
pull_req_history.first(:pull_request_id => id,
|
1056
|
+
:action => act)
|
1057
|
+
else
|
1058
|
+
pull_req_history.first(:pull_request_id => id,
|
1059
|
+
:created_at => (ts - 3)..(ts + 3),
|
1060
|
+
:action => act)
|
1061
|
+
end
|
1062
|
+
|
1063
|
+
if entry.nil?
|
1064
|
+
pull_req_history.insert(:pull_request_id => id,
|
1065
|
+
:created_at => ts,
|
1066
|
+
:ext_ref_id => unq,
|
1067
|
+
:action => act,
|
1068
|
+
:actor_id => unless user.nil? then user[:id] end)
|
1069
|
+
info "GHTorrent: New pull request (#{id}) event (#{act}) by (#{actor}) timestamp #{ts}"
|
1070
|
+
else
|
1071
|
+
info "GHTorrent: Pull request (#{id}) event (#{act}) by (#{actor}) timestamp #{ts} exists"
|
1072
|
+
if entry[:actor_id].nil? and not user.nil?
|
1073
|
+
pull_req_history.where(:pull_request_id => id,
|
1074
|
+
:created_at => (ts - 3)..(ts + 3),
|
1075
|
+
:action => act)\
|
1076
|
+
.update(:actor_id => user[:id])
|
1077
|
+
debug "Pull request (#{id}) event (#{act}) timestamp #{ts} set actor -> #{user[:login]}"
|
1078
|
+
end
|
1079
|
+
end
|
1080
|
+
end
|
1081
|
+
|
1082
|
+
|
945
1083
|
##
|
946
1084
|
# Process a pull request
|
947
1085
|
def ensure_pull_request(owner, repo, pullreq_id,
|
@@ -955,26 +1093,6 @@ module GHTorrent
|
|
955
1093
|
return
|
956
1094
|
end
|
957
1095
|
|
958
|
-
# Adds a pull request history event
|
959
|
-
def add_history(id, ts, unq, act, actor)
|
960
|
-
user = ensure_user(actor, false, false)
|
961
|
-
pull_req_history = @db[:pull_request_history]
|
962
|
-
entry = pull_req_history.first(:pull_request_id => id,
|
963
|
-
:created_at => (ts - 4)..(ts + 4),
|
964
|
-
:action => act)
|
965
|
-
if entry.nil?
|
966
|
-
pull_req_history.insert(:pull_request_id => id,
|
967
|
-
:created_at => ts,
|
968
|
-
:ext_ref_id => unq,
|
969
|
-
:action => act,
|
970
|
-
:actor_id => unless user.nil? then user[:id] end)
|
971
|
-
info "GHTorrent: New pull request (#{id}) event (#{act}) by (#{actor}) timestamp #{ts}"
|
972
|
-
else
|
973
|
-
entry.update(:actor_id => user[:id])
|
974
|
-
info "GHTorrent: Pull request (#{id}) history entry (#{act}) by (#{actor}) timestamp #{ts} exists"
|
975
|
-
end
|
976
|
-
end
|
977
|
-
|
978
1096
|
# Checks whether a pull request concerns two branches of the same
|
979
1097
|
# repository
|
980
1098
|
def is_intra_branch(req)
|
@@ -1054,10 +1172,8 @@ module GHTorrent
|
|
1054
1172
|
:base_repo_id => base_repo[:id],
|
1055
1173
|
:head_commit_id => if not head_commit.nil? then head_commit[:id] end,
|
1056
1174
|
:base_commit_id => base_commit[:id],
|
1057
|
-
:user_id => pull_req_user[:id],
|
1058
1175
|
:pullreq_id => pullreq_id,
|
1059
|
-
:intra_branch => is_intra_branch(retrieved)
|
1060
|
-
:merged => merged
|
1176
|
+
:intra_branch => is_intra_branch(retrieved)
|
1061
1177
|
)
|
1062
1178
|
info log_msg(retrieved) + ' was added'
|
1063
1179
|
else
|
@@ -1088,18 +1204,18 @@ module GHTorrent
|
|
1088
1204
|
|
1089
1205
|
if history
|
1090
1206
|
# Actions on pull requests
|
1091
|
-
actor = if actor.nil? then pull_req_user[:login] else actor end
|
1092
1207
|
opener = pull_req_user[:login]
|
1093
|
-
|
1208
|
+
ensure_pull_request_history(pull_req[:id], date(retrieved['created_at']),
|
1094
1209
|
retrieved[@ext_uniq], 'opened', opener)
|
1095
|
-
|
1210
|
+
|
1096
1211
|
merger = if retrieved['merged_by'].nil? then actor else retrieved['merged_by']['login'] end
|
1097
|
-
|
1212
|
+
ensure_pull_request_history(pull_req[:id], date(retrieved['merged_at']),
|
1098
1213
|
retrieved[@ext_uniq], 'merged', merger) if (merged && state != 'merged')
|
1214
|
+
|
1099
1215
|
closer = if merged then merger else actor end
|
1100
|
-
|
1216
|
+
ensure_pull_request_history(pull_req[:id], date(retrieved['closed_at']),
|
1101
1217
|
retrieved[@ext_uniq], 'closed', closer) if (closed && state != 'closed')
|
1102
|
-
|
1218
|
+
ensure_pull_request_history(pull_req[:id], date(created_at), retrieved[@ext_uniq],
|
1103
1219
|
state, actor) unless state.nil?
|
1104
1220
|
end
|
1105
1221
|
ensure_pull_request_commits(owner, repo, pullreq_id) if commits
|
@@ -1239,11 +1355,11 @@ module GHTorrent
|
|
1239
1355
|
where(:projects__forked_from => currepo[:id]).select(:projects__name, :login).all
|
1240
1356
|
|
1241
1357
|
retrieve_forks(owner, repo).reduce([]) do |acc, x|
|
1242
|
-
if existing_forks.find
|
1358
|
+
if existing_forks.find do |y|
|
1243
1359
|
forked_repo_owner = x['full_name'].split(/\//)[0]
|
1244
1360
|
forked_repo_name = x['full_name'].split(/\//)[1]
|
1245
1361
|
y[:login] == forked_repo_owner && y[:name] == forked_repo_name
|
1246
|
-
|
1362
|
+
end.nil?
|
1247
1363
|
acc << x
|
1248
1364
|
else
|
1249
1365
|
acc
|
@@ -1669,7 +1785,7 @@ module GHTorrent
|
|
1669
1785
|
result = nil
|
1670
1786
|
start_time = Time.now
|
1671
1787
|
begin
|
1672
|
-
@db.transaction(:rollback => :reraise, :isolation => :
|
1788
|
+
@db.transaction(:rollback => :reraise, :isolation => :uncommitted) do
|
1673
1789
|
result = yield block
|
1674
1790
|
end
|
1675
1791
|
total = Time.now.to_ms - start_time.to_ms
|
@@ -1698,8 +1814,6 @@ module GHTorrent
|
|
1698
1814
|
end
|
1699
1815
|
end
|
1700
1816
|
|
1701
|
-
private
|
1702
|
-
|
1703
1817
|
# Store a commit contained in a hash. First check whether the commit exists.
|
1704
1818
|
def store_commit(c, repo, user)
|
1705
1819
|
commits = @db[:commits]
|