ghtorrent 0.5 → 0.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,118 @@
1
+ require 'rubygems'
2
+
3
+ require 'ghtorrent/ghtorrent'
4
+ require 'ghtorrent/settings'
5
+ require 'ghtorrent/logging'
6
+ require 'ghtorrent/command'
7
+ require 'ghtorrent/retriever'
8
+
9
# Command-line tool that retrieves all data for a single repository.
#
# Usage: <command> [options] owner repo
#
# The owner and the repository are ensured first (each in its own
# transaction). Then every retrieval stage is run in turn; a failure in
# one stage is reported on stderr and does not stop the remaining stages.
class GHTRetrieveRepo < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Retriever
  include GHTorrent::Persister

  # Mirror methods invoked (in this order) with the owner login and the
  # repository name.
  RETRIEVAL_STAGES = %w(ensure_commits ensure_forks ensure_pull_requests
                        ensure_issues ensure_project_members ensure_watchers)

  # Sets the usage banner shown by the option parser.
  def prepare_options(options)
    options.banner <<-BANNER
An efficient way to get all data for a single repo

#{command_name} [options] owner repo

    BANNER
  end

  # Aborts unless both positional arguments (owner and repo) are present
  # and non-empty.
  def validate
    super
    both_given = [args[0], args[1]].all? { |arg| arg && !arg.empty? }
    Trollop::die "Two arguments are required" unless both_given
  end

  # Logging is delegated to the mirror object so that all output goes
  # through one logger.
  def logger
    ght.logger
  end

  # Lazily created connection to the Mongo persister.
  def persister
    @persister ||= connect(:mongo, settings)
  end

  # Lazily read name of the unique-id field, taken from the configuration.
  def ext_uniq
    @ext_uniq ||= config(:uniq_id)
  end

  # Lazily created mirror that wraps every processed item in a transaction.
  def ght
    @ght ||= TransactedGHTorrent.new(settings)
  end

  # Entry point: ensures owner and repository exist, then runs all stages.
  def go
    owner = ARGV[0]
    repo_name = ARGV[1]

    user_entry = ght.transaction { ght.ensure_user(owner, false, false) }
    Trollop::die "Cannot find user #{owner}" if user_entry.nil?

    user = user_entry[:login]

    repo_entry = ght.transaction do
      ght.ensure_repo(owner, repo_name, false, false, false)
    end
    Trollop::die "Cannot find repository #{owner}/#{repo_name}" if repo_entry.nil?

    repo = repo_entry[:name]

    RETRIEVAL_STAGES.each do |stage|
      begin
        ght.send(stage, user, repo)
      rescue StandardError => e
        # Best effort: report the failure and carry on with the next stage.
        # StandardError (not Exception) so that Ctrl-C and exit still work.
        STDERR.puts e.message
        STDERR.puts e.backtrace
      end
    end
  end
end
76
+
77
# Variant of the mirror in which each per-item +ensure_*+ call runs inside
# its own transaction. Every override forwards its arguments, unchanged,
# to the inherited implementation.
class TransactedGHTorrent < GHTorrent::Mirror

  def ensure_commit(repo, sha, user, comments = true)
    transaction { super }
  end

  def ensure_fork(owner, repo, fork_id, date_added = nil)
    transaction { super }
  end

  def ensure_pull_request(owner, repo, pullreq_id,
                          comments = true, commits = true,
                          state = nil, created_at = nil)
    transaction { super }
  end

  def ensure_issue(owner, repo, issue_id, events = true, comments = true)
    transaction { super }
  end

  def ensure_project_member(owner, repo, new_member, date_added)
    transaction { super }
  end

  def ensure_watcher(owner, repo, watcher, date_added = nil)
    transaction { super }
  end
end
@@ -0,0 +1,132 @@
1
+ require 'rubygems'
2
+ require 'mongo'
3
+
4
+ require 'ghtorrent/settings'
5
+ require 'ghtorrent/logging'
6
+ require 'ghtorrent/command'
7
+ require 'ghtorrent/persister'
8
+
9
# Command-line tool that removes duplicate documents from a MongoDB
# collection (currently +commits+ and +events+).
#
# Documents are grouped by the collection's unique field; for each group
# every document except the last one seen is deleted by its +_id+.
class GHRMDupl < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Persister

  # Per-collection metadata: +:unq+ is the field that should be unique,
  # +:col+ is the underlying driver collection handle.
  def col_info
    {
      :commits => {
        :unq => "sha",
        :col => persister.get_underlying_connection.collection(:commits.to_s),
      },
      :events => {
        :unq => "id",
        :col => persister.get_underlying_connection.collection(:events.to_s),
      }
    }
  end

  # Lazily created connection to the Mongo persister.
  def persister
    @persister ||= connect(:mongo, @settings)
  end

  # Declares the banner and the +earliest+ / +snapshot+ options.
  def prepare_options(options)
    options.banner <<-BANNER
Removes duplicate entries from collections (currently, commits and events)

#{command_name} [options] collection

#{command_name} options:
    BANNER

    options.opt :earliest, 'Seconds since epoch of earliest item to load',
                :short => 'e', :default => 0, :type => :int
    options.opt :snapshot, 'Perform clean up every x records',
                :short => 's', :default => -1, :type => :int
  end

  # Aborts unless a collection name was given.
  def validate
    super
    Trollop::die "no collection specified" unless args[0] && !args[0].empty?
  end

  # Deletes all but the last id of every group in +data+ from +col+.
  #
  # [data] Hash mapping a unique-field value to the array of +_id+s seen
  # [col]  The collection to delete from
  #
  # Returns the number of documents actually removed.
  def remove_duplicates(data, col)
    data.values.reduce(0) do |removed, ids|
      # ids[0...-1] is empty for single-element groups, so those are kept.
      removed + ids[0...-1].count { |id| delete_by_id(col, id) }
    end
  end

  # Removes the document with the given +_id+. Returns true on success,
  # false (after printing a message) when the driver reports a failure.
  def delete_by_id(col, id)
    col.remove({'_id' => id})
    true
  rescue Mongo::OperationFailure
    puts "Cannot remove record with id #{id} from #{col.name}"
    false
  end

  # Entry point: scans the selected collection and removes duplicates.
  def go
    collection = case ARGV[0]
                 when "commits" then :commits
                 when "events" then :events
                 else
                   # Abort here; continuing would look up col_info[nil].
                   Trollop::die "Not a known collection name: #{ARGV[0]}"
                 end

    # Resolve the collection metadata once instead of once per record.
    meta = col_info[collection]
    col = meta[:col]
    unq = meta[:unq]

    from = {'_id' => {'$gte' => BSON::ObjectId.from_time(Time.at(options[:earliest]))}}

    snapshot = options[:snapshot]

    puts "Deleting duplicates from collection #{collection}"
    puts "Deleting duplicates after #{Time.at(options[:earliest])}"
    puts "Perform clean up every #{snapshot} records"

    # Various counters to report stats
    processed = total_processed = removed = 0

    data = Hash.new { |h, k| h[k] = [] }

    # Intermediate results are flushed every +snapshot+ records to cope
    # with large datasets. Duplicates that straddle two snapshots are not
    # detected.
    col.find(from, :fields => unq).each do |r|
      _id = r["_id"]
      key = read_value(r, unq)

      # If entries cannot be parsed, remove them
      if missing_key?(key)
        puts "Deleting unknown entry #{_id}"
        removed += 1 if delete_by_id(col, _id)
      else
        data[key] << _id
      end

      processed += 1
      total_processed += 1

      print "\rProcessed #{processed} records"

      # Calculate duplicates, save intermediate result
      if snapshot > 0 && processed >= snapshot
        puts "\nLoaded #{data.size} values, cleaning"
        removed += remove_duplicates(data, col)
        data = Hash.new { |h, k| h[k] = [] }
        processed = 0
      end
    end

    removed += remove_duplicates(data, col)

    puts "\nProcessed #{total_processed}, deleted #{removed} duplicates"
  end

  private

  # True when the unique-field value is absent or empty. Values that do not
  # respond to +empty?+ (e.g. numeric ids) count as present.
  def missing_key?(value)
    value.nil? || (value.respond_to?(:empty?) && value.empty?)
  end
end
131
+
132
+ # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
@@ -22,7 +22,7 @@ module GHTorrent
22
22
  @logger = Logger.new(STDOUT)
23
23
  end
24
24
 
25
- # db related functions
25
+ # Get a connection to the database
26
26
  def get_db
27
27
  Sequel.single_threaded = true
28
28
  @db = Sequel.connect(config(:sql_url), :encoding => 'utf8')
@@ -151,19 +151,31 @@ module GHTorrent
151
151
  end
152
152
 
153
153
  ##
154
- # Retrieve a pull request review comment
154
# Retrieve an issue
# ==Parameters:
#  [owner] The login of the repository owner
#  [repo] The name of the repository
#  [issue_id] The issue number within the repository
#  [created_at] Accepted for interface compatibility; currently unused.
#               ensure_issue's 4th and 5th positional parameters are the
#               +events+ and +comments+ flags, so it must not be forwarded.
def get_issue(owner, repo, issue_id, created_at)
  transaction do
    ensure_issue(owner, repo, issue_id)
  end
end
166
166
 
167
##
# Retrieve an issue comment
# ==Parameters:
#  [owner] The login of the repository owner
#  [repo] The name of the repository
#  [issue_id] The issue the comment belongs to
#  [comment_id] The issue comment unique identifier
def get_issue_comment(owner, repo, issue_id, comment_id)
  transaction { ensure_issue_comment(owner, repo, issue_id, comment_id) }
end
167
179
 
168
180
  ##
169
181
  # Make sure a commit exists
@@ -188,21 +200,30 @@ module GHTorrent
188
200
  end
189
201
 
190
202
  ##
191
- # Get as many commits for a repository as allowed by Github
192
- #
203
+ # Retrieve commits for a repository, starting from +sha+
204
+ # and going back to 30 * +num_pages+ commit log entries.
193
205
  # ==Parameters:
194
206
  # [user] The user to whom the repo belongs.
195
207
  # [repo] The repo to look for commits into.
196
- def ensure_commits(user, repo)
208
+ # [sha] The first commit to start retrieving from. If nil, then the
209
+ # earliest stored commit will be used instead.
210
+ # [num_pages] The number of commit pages to retrieve
211
+ def ensure_commits(user, repo, sha = nil,
212
+ num_pages = config(:mirror_commit_pages_new_repo))
197
213
  userid = @db[:users].filter(:login => user).first[:id]
198
214
  repoid = @db[:projects].filter(:owner_id => userid,
199
215
  :name => repo).first[:id]
200
216
 
201
- latest = @db[:commits].filter(:project_id => repoid).order(:created_at).last
217
+ latest = if sha.nil?
218
+ @db[:commits].filter(:project_id => repoid).order(:created_at).last
219
+ else
220
+ sha
221
+ end
222
+
202
223
  commits = if latest.nil?
203
- retrieve_commits(repo, nil, user)
224
+ retrieve_commits(repo, "head", user, num_pages)
204
225
  else
205
- retrieve_commits(repo, latest[:sha], user)
226
+ retrieve_commits(repo, latest[:sha], user, num_pages)
206
227
  end
207
228
 
208
229
  commits.map do |c|
@@ -226,6 +247,11 @@ module GHTorrent
226
247
  parent = commits.first(:sha => url[7])
227
248
  end
228
249
 
250
+ if parent.nil?
251
+ warn "GHTorrent: Could not retrieve #{url[4]}/#{url[5]} -> #{url[7]}, parent to commit #{this[:sha]}"
252
+ return
253
+ end
254
+
229
255
  if parents.first(:commit_id => this[:id],
230
256
  :parent_id => parent[:id]).nil?
231
257
 
@@ -297,6 +323,16 @@ module GHTorrent
297
323
  if dbuser.nil?
298
324
  # We do not have the user in the database yet. Add him
299
325
  added = ensure_user(login, false, false)
326
+
327
+ # A commit user can be found by email but not
328
+ # by the user name he used to commit. This probably means that the
329
+ # user has probably changed his user name. Treat the user's by-email
330
+ # description as valid.
331
+ if added.nil? and not byemail.nil?
332
+ warn "GHTorrent: Found user #{byemail[:login]} with same email #{email} as non existing user #{login}. Assigning user #{login} to #{byemail[:login]}"
333
+ return users.first(:login => byemail[:login])
334
+ end
335
+
300
336
  if byemail.nil?
301
337
  users.filter(:login => login).update(:name => name) if added[:name].nil?
302
338
  users.filter(:login => login).update(:email => email) if added[:email].nil?
@@ -342,7 +378,11 @@ module GHTorrent
342
378
  name, email = user.split("<")
343
379
  email = email.split(">")[0]
344
380
  rescue Exception
345
- raise new GHTorrentException("Not a valid email address: #{user}")
381
+ raise new GHTorrentException.new("Not a valid email address: #{user}")
382
+ end
383
+
384
+ unless is_valid_email(email)
385
+ warn("GHTorrent: Extracted email(#{email}) not valid for user #{user}")
346
386
  end
347
387
  u = ensure_user_byemail(email.strip, name.strip)
348
388
  else
@@ -522,9 +562,8 @@ module GHTorrent
522
562
  # the result is nil
523
563
  def ensure_repo(user, repo, commits = true, project_members = true, watchers = true)
524
564
 
525
- ensure_user(user, false, false)
526
565
  repos = @db[:projects]
527
- curuser = @db[:users].first(:login => user)
566
+ curuser = ensure_user(user, false, false)
528
567
  currepo = repos.first(:owner_id => curuser[:id], :name => repo)
529
568
 
530
569
  if currepo.nil?
@@ -549,7 +588,7 @@ module GHTorrent
549
588
  ensure_watchers(user, repo) if watchers
550
589
  repos.first(:owner_id => curuser[:id], :name => repo)
551
590
  else
552
- debug "GHTorrent: Repo #{repo} exists"
591
+ debug "GHTorrent: Repo #{user}/#{repo} exists"
553
592
  currepo
554
593
  end
555
594
  end
@@ -604,11 +643,12 @@ module GHTorrent
604
643
  )
605
644
  info "GHTorrent: Added project member #{repo} -> #{new_member}"
606
645
  else
646
+ debug "GHTorrent: Project member #{repo} -> #{new_member} exists"
607
647
  unless date_added.nil?
608
648
  pr_members.filter(:user_id => new_user[:id],
609
649
  :repo_id => project[:id])\
610
650
  .update(:created_at => date(date_added))
611
- info "GHTorrent: Updating #{repo} -> #{new_member}"
651
+ info "GHTorrent: Updating project member #{repo} -> #{new_member}"
612
652
  end
613
653
  end
614
654
  end
@@ -796,13 +836,14 @@ module GHTorrent
796
836
  :created_at => date(added),
797
837
  :ext_ref_id => retrieved[@ext_uniq]
798
838
  )
799
- info "GHTorrent: Added watcher #{repo} -> #{watcher}"
839
+ info "GHTorrent: Added watcher #{owner}/#{repo} -> #{watcher}"
800
840
  else
841
+ debug "GHTorrent: Watcher #{owner}/#{repo} -> #{watcher} exists"
801
842
  unless date_added.nil?
802
843
  watchers.filter(:user_id => new_watcher[:id],
803
844
  :repo_id => project[:id])\
804
845
  .update(:created_at => date(date_added))
805
- info "GHTorrent: Updating #{repo} -> #{watcher}"
846
+ info "GHTorrent: Updating watcher #{owner}/#{repo} -> #{watcher}"
806
847
  end
807
848
  end
808
849
  end
@@ -816,7 +857,7 @@ module GHTorrent
816
857
  return
817
858
  end
818
859
 
819
- pull_reqs = @db[:pull_requests].filter(:base_repo_id => currepo[:id])
860
+ pull_reqs = @db[:pull_requests].filter(:base_repo_id => currepo[:id]).all
820
861
 
821
862
  retrieve_pull_requests(owner, repo).reduce([]) do |acc, x|
822
863
  if pull_reqs.find { |y| y[:pullreq_id] == x['number'] }.nil?
@@ -833,7 +874,6 @@ module GHTorrent
833
874
  comments = true, commits = true,
834
875
  state = nil, created_at = nil)
835
876
  pulls_reqs = @db[:pull_requests]
836
- pull_req_history = @db[:pull_request_history]
837
877
 
838
878
  project = ensure_repo(owner, repo, false, false, false)
839
879
 
@@ -842,8 +882,8 @@ module GHTorrent
842
882
  end
843
883
 
844
884
  # Adds a pull request history event
845
- add_history = Proc.new do |id, ts, unq, act|
846
-
885
+ def add_history(id, ts, unq, act)
886
+ pull_req_history = @db[:pull_request_history]
847
887
  entry = pull_req_history.first(:pull_request_id => id,
848
888
  :ext_ref_id => unq, :action => act)
849
889
  if entry.nil?
@@ -859,20 +899,33 @@ module GHTorrent
859
899
 
860
900
  # Checks whether a pull request concerns two branches of the same
861
901
  # repository
862
- is_intra_branch = Proc.new do |req|
863
- req['head']['repo'].nil?
902
# True when the pull request's head and base are the same repository
# (same owner login and same full name); false when they differ or when
# the head repository is missing.
def is_intra_branch(req)
  head_repo = req['head']['repo']
  return false if head_repo.nil?

  base_repo = req['base']['repo']
  head_repo['owner']['login'] == base_repo['owner']['login'] &&
    head_repo['full_name'] == base_repo['full_name']
end
913
+
914
# Tells whether the pull request payload carries a head repository
# (i.e. req['head']['repo'] is not nil).
def has_head_repo(req)
  !req['head']['repo'].nil?
end
865
918
 
866
919
  # Produces a log message
867
- log_msg = Proc.new do |req|
868
- head = if is_intra_branch.call(req)
869
- req['base']['repo']['full_name']
870
- else
920
# Builds the one-line log message for a pull request:
# "GHTorrent: Pull request <number> <head full name> -> <base full name>".
# When the head repository is missing, "(head deleted)" stands in for its
# name. Runs of whitespace are collapsed to single spaces.
def log_msg(req)
  head_repo = req['head']['repo']
  head = head_repo.nil? ? "(head deleted)" : head_repo['full_name']

  message = "GHTorrent: Pull request #{req['number']} " \
            "#{head} -> #{req['base']['repo']['full_name']}"
  message.gsub(/\s+/, " ").strip
end
@@ -890,22 +943,26 @@ module GHTorrent
890
943
 
891
944
  base_commit = ensure_commit(retrieved['base']['repo']['name'],
892
945
  retrieved['base']['sha'],
893
- retrieved['base']['repo']['owner']['login']
894
- )
946
+ retrieved['base']['repo']['owner']['login'])
895
947
 
896
- if is_intra_branch.call(retrieved)
948
+ if is_intra_branch(retrieved)
897
949
  head_repo = base_repo
898
- head_commit =
899
- warn "GHTorrent: Pull request is intra branch"
900
- else
901
-
902
- head_repo = ensure_repo(retrieved['head']['repo']['owner']['login'],
903
- retrieved['head']['repo']['name'],
904
- false, false, false)
905
-
906
- head_commit = ensure_commit(retrieved['head']['repo']['name'],
950
+ head_commit = ensure_commit(retrieved['base']['repo']['name'],
907
951
  retrieved['head']['sha'],
908
- retrieved['head']['repo']['owner']['login'])
952
+ retrieved['base']['repo']['owner']['login'])
953
+ info log_msg(retrieved) + " is intra branch"
954
+ else
955
+ head_repo = if has_head_repo(retrieved)
956
+ ensure_repo(retrieved['head']['repo']['owner']['login'],
957
+ retrieved['head']['repo']['name'],
958
+ false, false, false)
959
+ end
960
+
961
+ head_commit = if not head_repo.nil?
962
+ ensure_commit(retrieved['head']['repo']['name'],
963
+ retrieved['head']['sha'],
964
+ retrieved['head']['repo']['owner']['login'])
965
+ end
909
966
  end
910
967
 
911
968
  pull_req_user = ensure_user(retrieved['user']['login'], false, false)
@@ -923,24 +980,24 @@ module GHTorrent
923
980
  :base_commit_id => base_commit[:id],
924
981
  :user_id => pull_req_user[:id],
925
982
  :pullreq_id => pullreq_id,
926
- :intra_branch => is_intra_branch.call(retrieved)
983
+ :intra_branch => is_intra_branch(retrieved)
927
984
  )
928
985
 
929
- info log_msg.call(retrieved)
986
+ info log_msg(retrieved)
930
987
  else
931
- debug log_msg.call(retrieved) + " exists"
988
+ debug log_msg(retrieved) + " exists"
932
989
  end
933
990
 
934
991
  pull_req = pulls_reqs.first(:base_repo_id => project[:id],
935
992
  :pullreq_id => pullreq_id)
936
993
 
937
- add_history.call(pull_req[:id], date(retrieved['created_at']),
994
+ add_history(pull_req[:id], date(retrieved['created_at']),
938
995
  retrieved[@ext_uniq], 'opened')
939
- add_history.call(pull_req[:id], date(retrieved['merged_at']),
996
+ add_history(pull_req[:id], date(retrieved['merged_at']),
940
997
  retrieved[@ext_uniq], 'merged') if merged
941
- add_history.call(pull_req[:id], date(retrieved['closed_at']),
998
+ add_history(pull_req[:id], date(retrieved['closed_at']),
942
999
  retrieved[@ext_uniq], 'closed') if closed
943
- add_history.call(pull_req[:id], date(created_at), retrieved[@ext_uniq],
1000
+ add_history(pull_req[:id], date(created_at), retrieved[@ext_uniq],
944
1001
  state) unless state.nil?
945
1002
 
946
1003
  ensure_pull_request_commits(owner, repo, pullreq_id) if commits
@@ -955,7 +1012,7 @@ module GHTorrent
955
1012
  time = if created_at.nil? then currepo[:created_at] else Time.now() end
956
1013
 
957
1014
  if currepo.nil?
958
- warn "Could not repository #{owner}/#{repo}"
1015
+ warn "GHTorrent: Could not find repository #{owner}/#{repo}"
959
1016
  return
960
1017
  end
961
1018
 
@@ -983,7 +1040,7 @@ module GHTorrent
983
1040
  pull_req = ensure_pull_request(owner, repo, pullreq_id, false, true)
984
1041
 
985
1042
  if pull_req.nil?
986
- warn "Could not retrieve pull req #{owner}/#{repo} -> #{pullreq_id}"
1043
+ warn "GHTorrent: Could not retrieve pull req #{owner}/#{repo} -> #{pullreq_id}"
987
1044
  return
988
1045
  end
989
1046
 
@@ -994,7 +1051,7 @@ module GHTorrent
994
1051
  retrieved = retrieve_pull_req_comment(owner, repo, pullreq_id, comment_id)
995
1052
 
996
1053
  if retrieved.nil?
997
- warn "Could not retrieve comment #{comment_id} for pullreq #{owner}/#{repo} -> #{pullreq_id}"
1054
+ warn "GHTorrent: Could not retrieve comment #{comment_id} for pullreq #{owner}/#{repo} -> #{pullreq_id}"
998
1055
  return
999
1056
  end
1000
1057
 
@@ -1018,14 +1075,19 @@ module GHTorrent
1018
1075
  :ext_ref_id => retrieved[@ext_uniq]
1019
1076
  )
1020
1077
  debug "GHTorrent: Adding comment #{comment_id} for pullreq #{owner}/#{repo} -> #{pullreq_id}"
1078
+ @db[:pull_request_comments].first(:pull_request_id => pull_req[:id],
1079
+ :comment_id => comment_id)
1021
1080
  else
1022
- debug "GHTorrent: Updating comment #{comment_id} for pullreq #{owner}/#{repo} -> #{pullreq_id}"
1081
+ debug "GHTorrent: Comment #{comment_id} for pullreq #{owner}/#{repo} -> #{pullreq_id} exists"
1082
+ exists
1023
1083
  end
1024
1084
  end
1025
1085
 
1026
1086
  def ensure_pull_request_commits(owner, repo, pullreq_id)
1027
- retrieve_pull_req_commits(owner, repo, pullreq_id).map {|c|
1028
- ensure_commit(repo, c['sha'], owner, true)
1087
+ retrieve_pull_req_commits(owner, repo, pullreq_id).reduce([]){|acc, c|
1088
+ x = ensure_commit(repo, c['sha'], owner, true)
1089
+ acc << x if not x.nil?
1090
+ acc
1029
1091
  }.map { |c|
1030
1092
  pullreq = ensure_pull_request(owner, repo, pullreq_id, false, false)
1031
1093
  exists = @db[:pull_request_commits].first(:pull_request_id => pullreq[:id],
@@ -1050,26 +1112,28 @@ module GHTorrent
1050
1112
  # [repo] The repository/project to find forks for
1051
1113
  def ensure_forks(owner, repo)
1052
1114
  currepo = ensure_repo(owner, repo, false, false, false)
1053
- time = currepo[:created_at]
1054
1115
 
1055
1116
  if currepo.nil?
1056
1117
  warn "Could not retrieve forks for #{owner}/#{repo}"
1057
1118
  return
1058
1119
  end
1059
1120
 
1060
- existing_forks = @db.from(:forks, :projects).\
1121
+ existing_forks = @db.from(:forks, :projects, :users).\
1061
1122
  where(:forks__forked_project_id => :projects__id). \
1062
- where(:forks__forked_from_id => currepo[:id]).select(:name, :login).all
1123
+ where(:users__id => :projects__owner_id). \
1124
+ where(:forks__forked_from_id => currepo[:id]).select(:projects__name, :login).all
1063
1125
 
1064
1126
  retrieve_forks(owner, repo).reduce([]) do |acc, x|
1065
1127
  if existing_forks.find {|y|
1066
- y[:login] == x['owner']['login'] && y[:name] == x['name']
1128
+ forked_repo_owner = x['full_name'].split(/\//)[0]
1129
+ forked_repo_name = x['full_name'].split(/\//)[1]
1130
+ y[:login] == forked_repo_owner && y[:name] == forked_repo_name
1067
1131
  }.nil?
1068
1132
  acc << x
1069
1133
  else
1070
1134
  acc
1071
1135
  end
1072
- end.map { |x| ensure_fork(owner, repo, x['id'], time) }
1136
+ end.map { |x| ensure_fork(owner, repo, x['id']) }
1073
1137
  end
1074
1138
 
1075
1139
  ##
@@ -1081,8 +1145,8 @@ module GHTorrent
1081
1145
  fork_exists = forks.first(:fork_id => fork_id)
1082
1146
 
1083
1147
  if fork_exists.nil?
1084
- added = if date_added.nil? then Time.now else date_added end
1085
1148
  retrieved = retrieve_fork(owner, repo, fork_id)
1149
+ added = if date_added.nil? then retrieved['created_at'] else date_added end
1086
1150
 
1087
1151
  if retrieved.nil?
1088
1152
  warn "GHTorrent: Fork #{fork_id} does not exist for #{owner}/#{repo}"
@@ -1106,6 +1170,7 @@ module GHTorrent
1106
1170
  :ext_ref_id => retrieved[@ext_uniq])
1107
1171
  info "GHTorrent: Added #{forked_repo_owner}/#{forked_repo_name} as fork of #{owner}/#{repo}"
1108
1172
  else
1173
+ debug "GHTorrent: Fork #{fork_id} exists as fork of #{owner}/#{repo}"
1109
1174
  unless date_added.nil?
1110
1175
  forks.filter(:fork_id => fork_id)\
1111
1176
  .update(:created_at => date(date_added))
@@ -1114,36 +1179,253 @@ module GHTorrent
1114
1179
  end
1115
1180
  end
1116
1181
 
1117
- private
1182
##
# Make sure all issues exist for a project. Issues returned by
# retrieve_issues whose number is not yet stored for the repository are
# passed to ensure_issue; the results of those calls are returned.
def ensure_issues(owner, repo)
  project = ensure_repo(owner, repo, false, false, false)
  if project.nil?
    warn "GHTorrent: Could not retrieve issues for #{owner}/#{repo}"
    return
  end

  stored = @db[:issues].filter(:repo_id => project[:id]).all

  missing = retrieve_issues(owner, repo).select do |remote|
    stored.find { |local| local[:issue_id] == remote['number'] }.nil?
  end

  missing.map { |remote| ensure_issue(owner, repo, remote['number']) }
end
1127
1201
 
1128
- repository = ensure_repo(user, repo, false, false, false)
1202
##
# Make sure that the issue exists
# ==Parameters:
#  [owner] The login of the repository owner
#  [repo] The name of the repository
#  [issue_id] The issue number within the repository
#  [events] When true, also process the issue's events after insertion
#  [comments] When true, also process the issue's comments after insertion
#             (only if the retrieved issue reports more than zero comments)
# ==Returns:
#  The stored issue row, or nil when the repository or the issue cannot be
#  found.
def ensure_issue(owner, repo, issue_id, events = true, comments = true)

  issues = @db[:issues]
  repository = ensure_repo(owner, repo, false, false, false)

  # Check the looked-up row, not the +repo+ name argument: a missing
  # repository would otherwise fail on repository[:id] below.
  if repository.nil?
    warn "GHTorrent: Cannot find repo #{owner}/#{repo}"
    return
  end

  cur_issue = issues.first(:issue_id => issue_id,
                           :repo_id => repository[:id])

  unless cur_issue.nil?
    debug "GHTorrent: Issue #{owner}/#{repo}->#{issue_id} exists"
    return cur_issue
  end

  retrieved = retrieve_issue(owner, repo, issue_id)

  if retrieved.nil?
    warn "GHTorrent: Issue #{issue_id} does not exist for #{owner}/#{repo}"
    return
  end

  reporter = ensure_user(retrieved['user']['login'], false, false)
  assignee = unless retrieved['assignee'].nil?
               ensure_user(retrieved['assignee']['login'], false, false)
             end

  # Pull requests and issues share the same issue_id
  pull_req = unless retrieved['pull_request'].nil? or
                    retrieved['pull_request']['patch_url'].nil?
               ensure_pull_request(owner, repo, issue_id)
             end

  issues.insert(:repo_id => repository[:id],
                :assignee_id => (assignee.nil? ? nil : assignee[:id]),
                :reporter_id => reporter[:id],
                :issue_id => issue_id,
                :pull_request => !pull_req.nil?,
                :pull_request_id => (pull_req.nil? ? nil : pull_req[:id]),
                :created_at => date(retrieved['created_at']),
                :ext_ref_id => retrieved[@ext_uniq])

  # The issue row is inserted first; the two calls below look it up again.
  ensure_issue_events(owner, repo, issue_id) if events
  ensure_issue_comments(owner, repo, issue_id) if comments and retrieved['comments'] > 0

  info "GHTorrent: Added issue #{owner}/#{repo} -> #{issue_id}"
  issues.first(:issue_id => issue_id,
               :repo_id => repository[:id])
end
1255
+
1256
##
# Retrieve and process all events for an issue. Events already present in
# the issue_events table are skipped; the rest are passed to
# ensure_issue_event.
def ensure_issue_events(owner, repo, issue_id)
  project = ensure_repo(owner, repo, true, true, false)

  if project.nil?
    warn "GHTorrent: Could not find repository #{owner}/#{repo}"
    return
  end

  issue = ensure_issue(owner, repo, issue_id, false, false)
  if issue.nil?
    warn "Could not retrieve issue #{owner}/#{repo} -> #{issue_id}"
    return
  end

  unseen = retrieve_issue_events(owner, repo, issue_id).select do |event|
    @db[:issue_events].first(:issue_id => issue[:id],
                             :event_id => event['id']).nil?
  end

  unseen.map { |event| ensure_issue_event(owner, repo, issue_id, event['id']) }
end
1285
+
1286
##
# Retrieve and process +event_id+ for an +issue_id+
# ==Returns:
#  The stored issue event row, or nil when the issue or the event cannot
#  be retrieved or when the event carries no actor.
def ensure_issue_event(owner, repo, issue_id, event_id)
  issue = ensure_issue(owner, repo, issue_id, false, false)

  if issue.nil?
    warn "GHTorrent: Could not retrieve issue #{owner}/#{repo} -> #{issue_id}"
    return
  end

  issue_event_str = "#{owner}/#{repo} -> #{issue_id}/#{event_id}"

  curevent = @db[:issue_events].first(:issue_id => issue[:id],
                                      :event_id => event_id)
  unless curevent.nil?
    debug "GHTorrent: Issue event #{issue_event_str} exists"
    return curevent
  end

  retrieved = retrieve_issue_event(owner, repo, issue_id, event_id)

  if retrieved.nil?
    warn "GHTorrent: Could not retrieve issue event #{issue_event_str}"
    return
  elsif retrieved['actor'].nil?
    warn "GHTorrent: Issue event #{issue_event_str} does not contain an actor"
    return
  end

  actor = ensure_user(retrieved['actor']['login'], false, false)

  # Only these event types carry a commit reference.
  action_specific = case retrieved['event']
                    when "referenced", "merged", "closed"
                      retrieved['commit_id']
                    end

  # An "assigned" event changes the issue's assignee when the issue has
  # none yet, or when this event is newer than the latest stored
  # "assigned" event. Skipped when the actor could not be resolved, in
  # line with the nil guard on :actor_id in the insert below.
  if retrieved['event'] == "assigned" && !actor.nil?
    reassign = issue[:assignee_id].nil?

    unless reassign
      latest = @db[:issue_events].
        filter(:issue_id => issue[:id], :action => "assigned").
        order(Sequel.desc(:created_at)).first
      reassign = latest.nil? ||
                 date(latest[:created_at]) < date(retrieved['created_at'])
    end

    if reassign
      @db[:issues].filter(:id => issue[:id]).update(:assignee_id => actor[:id])
      info "Updating #{owner}/#{repo} -> #{issue[:id]} assignee to #{actor[:id]}"
    end
  end

  @db[:issue_events].insert(
    :event_id => event_id,
    :issue_id => issue[:id],
    :actor_id => (actor.nil? ? nil : actor[:id]),
    :action => retrieved['event'],
    :action_specific => action_specific,
    :created_at => date(retrieved['created_at']),
    :ext_ref_id => retrieved[@ext_uniq]
  )

  info "GHTorrent: Added issue event #{issue_event_str}"
  @db[:issue_events].first(:issue_id => issue[:id],
                           :event_id => event_id)
end
1360
+
1361
##
# Retrieve and process all comments for an issue. Comments already present
# in the issue_comments table are skipped; the rest are passed to
# ensure_issue_comment.
def ensure_issue_comments(owner, repo, issue_id)
  project = ensure_repo(owner, repo, true, true, false)

  if project.nil?
    warn "GHTorrent: Could not find repository #{owner}/#{repo}"
    return
  end

  issue = ensure_issue(owner, repo, issue_id, false, false)
  if issue.nil?
    warn "Could not retrieve issue #{owner}/#{repo} -> #{issue_id}"
    return
  end

  unseen = retrieve_issue_comments(owner, repo, issue_id).select do |comment|
    @db[:issue_comments].first(:issue_id => issue[:id],
                               :comment_id => comment['id']).nil?
  end

  unseen.map { |comment| ensure_issue_comment(owner, repo, issue_id, comment['id']) }
end
1389
+
1390
##
# Retrieve and process +comment_id+ for an +issue_id+. Returns the stored
# comment row, or nil when the issue or the comment cannot be retrieved.
def ensure_issue_comment(owner, repo, issue_id, comment_id)
  issue = ensure_issue(owner, repo, issue_id)

  if issue.nil?
    warn "GHTorrent: Could not retrieve issue #{owner}/#{repo} -> #{issue_id}"
    return
  end

  issue_comment_str = "#{owner}/#{repo} -> #{issue_id}/#{comment_id}"
  comments = @db[:issue_comments]

  stored = comments.first(:issue_id => issue[:id],
                          :comment_id => comment_id)
  unless stored.nil?
    debug "GHTorrent: Issue comment #{issue_comment_str} exists"
    return stored
  end

  retrieved = retrieve_issue_comment(owner, repo, issue_id, comment_id)

  if retrieved.nil?
    warn "GHTorrent: Could not retrieve issue comment #{issue_comment_str}"
    return
  end

  user = ensure_user(retrieved['user']['login'], false, false)

  comments.insert(
    :comment_id => comment_id,
    :issue_id => issue[:id],
    :user_id => (user.nil? ? nil : user[:id]),
    :created_at => date(retrieved['created_at']),
    :ext_ref_id => retrieved[@ext_uniq]
  )

  info "GHTorrent: Added issue comment #{issue_comment_str}"
  comments.first(:issue_id => issue[:id],
                 :comment_id => comment_id)
end
1149
1431
 
@@ -1153,27 +1435,57 @@ module GHTorrent
1153
1435
  @db ||= get_db
1154
1436
  @persister ||= persister
1155
1437
 
1438
+ result = nil
1156
1439
  start_time = Time.now
1157
1440
  begin
1158
1441
  @db.transaction(:rollback => :reraise, :isolation => :committed) do
1159
- yield block
1442
+ result = yield block
1160
1443
  end
1161
1444
  total = Time.now.to_ms - start_time.to_ms
1162
1445
  debug "GHTorrent: Transaction committed (#{total} ms)"
1446
+ result
1163
1447
  rescue Exception => e
1164
1448
  total = Time.now.to_ms - start_time.to_ms
1165
1449
  warn "GHTorrent: Transaction failed (#{total} ms)"
1166
1450
  raise e
1167
1451
  ensure
1168
- @db.disconnect
1169
- @persister.close
1170
-
1171
- @db = nil
1172
- @persister = nil
1173
1452
  GC.start
1174
1453
  end
1175
1454
  end
1176
1455
 
1456
+ private
1457
+
1458
# Store a commit contained in a hash, unless a commit with the same sha is
# already stored. Returns the stored commit row, or nil when the repository
# cannot be ensured.
def store_commit(c, repo, user)
  commits = @db[:commits]
  sha = c['sha']

  stored = commits.first(:sha => sha)
  unless stored.nil?
    debug "GHTorrent: Commit #{user}/#{repo} -> #{sha} exists"
    return stored
  end

  # Author and committer are resolved before the repository lookup.
  author = commit_user(c['author'], c['commit']['author'])
  commiter = commit_user(c['committer'], c['commit']['committer'])

  repository = ensure_repo(user, repo, false, false, false)

  if repository.nil?
    warn "Could not store commit #{user}/#{repo} #{sha}"
    return
  end

  commits.insert(:sha => sha,
                 :author_id => author[:id],
                 :committer_id => commiter[:id],
                 :project_id => repository[:id],
                 :created_at => date(c['commit']['author']['date']),
                 :ext_ref_id => c[@ext_uniq]
  )
  debug "GHTorrent: New commit #{user}/#{repo} -> #{sha} "
  commits.first(:sha => sha)
end
1488
+
1177
1489
  ##
1178
1490
  # Convert a string value to boolean, the SQL way
1179
1491
  def boolean(arg)