ghtorrent 0.5 → 0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,118 @@
1
+ require 'rubygems'
2
+
3
+ require 'ghtorrent/ghtorrent'
4
+ require 'ghtorrent/settings'
5
+ require 'ghtorrent/logging'
6
+ require 'ghtorrent/command'
7
+ require 'ghtorrent/retriever'
8
+
9
# Command that retrieves the data of a single repository. It resolves the
# owner and the repository through a TransactedGHTorrent instance and then
# runs each bulk +ensure_*+ retrieval step for that repository.
#
# Invocation: <command> [options] owner repo
class GHTRetrieveRepo < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Retriever
  include GHTorrent::Persister

  # Sets the usage banner on the +options+ object handed in by the command
  # framework.
  def prepare_options(options)
    options.banner <<-BANNER
An efficient way to get all data for a single repo

#{command_name} [options] owner repo

    BANNER
  end

  # Runs the inherited validation, then aborts through Trollop::die unless
  # both positional arguments (owner and repo) are present and non-empty.
  def validate
    super
    owner = args[0]
    repo = args[1]
    if owner.nil? || owner.empty? || repo.nil? || repo.empty?
      Trollop::die "Two arguments are required"
    end
  end

  # The logger of the underlying mirror object.
  def logger
    ght.logger
  end

  # Memoized MongoDB persister built from the command settings.
  def persister
    @persister ||= connect(:mongo, settings)
  end

  # Memoized value of the +:uniq_id+ configuration key.
  def ext_uniq
    @ext_uniq ||= config(:uniq_id)
  end

  # Memoized mirror instance; every per-item operation it performs runs in
  # its own transaction (see TransactedGHTorrent).
  def ght
    @ght ||= TransactedGHTorrent.new(settings)
  end

  # Entry point: makes sure the owner and the repository exist, then runs
  # every bulk retrieval step for the repository.
  #
  # Aborts through Trollop::die when the owner or the repository cannot be
  # resolved. A failure in one retrieval step is reported on STDERR and does
  # not prevent the remaining steps from running.
  def go
    owner = ARGV[0]
    repo_name = ARGV[1]

    user_entry = ght.transaction { ght.ensure_user(owner, false, false) }
    Trollop::die "Cannot find user #{owner}" if user_entry.nil?

    # Use the names as stored by the mirror rather than the raw arguments.
    user = user_entry[:login]

    repo_entry = ght.transaction do
      ght.ensure_repo(owner, repo_name, false, false, false)
    end
    Trollop::die "Cannot find repository #{owner}/#{repo_name}" if repo_entry.nil?

    repo = repo_entry[:name]

    %w(ensure_commits ensure_forks ensure_pull_requests
       ensure_issues ensure_project_members ensure_watchers).each do |step|
      begin
        ght.send(step, user, repo)
      rescue StandardError => e
        # Best effort: report and continue with the next step. Interrupts
        # and exit requests are not StandardErrors and still propagate.
        STDERR.puts e.message
        STDERR.puts e.backtrace
      end
    end
  end
end
76
+
77
# Variant of the GHTorrent mirror in which every single-item +ensure_*+
# operation is executed inside its own call to +transaction+. Each override
# only wraps the inherited implementation and forwards its arguments
# unchanged.
class TransactedGHTorrent < GHTorrent::Mirror

  # Transaction-wrapped version of the inherited +ensure_commit+.
  def ensure_commit(repo, sha, user, comments = true)
    transaction { super(repo, sha, user, comments) }
  end

  # Transaction-wrapped version of the inherited +ensure_fork+.
  def ensure_fork(owner, repo, fork_id, date_added = nil)
    transaction { super(owner, repo, fork_id, date_added) }
  end

  # Transaction-wrapped version of the inherited +ensure_pull_request+.
  def ensure_pull_request(owner, repo, pullreq_id,
                          comments = true, commits = true,
                          state = nil, created_at = nil)
    transaction { super(owner, repo, pullreq_id, comments, commits, state, created_at) }
  end

  # Transaction-wrapped version of the inherited +ensure_issue+.
  def ensure_issue(owner, repo, issue_id, events = true, comments = true)
    transaction { super(owner, repo, issue_id, events, comments) }
  end

  # Transaction-wrapped version of the inherited +ensure_project_member+.
  def ensure_project_member(owner, repo, new_member, date_added)
    transaction { super(owner, repo, new_member, date_added) }
  end

  # Transaction-wrapped version of the inherited +ensure_watcher+.
  def ensure_watcher(owner, repo, watcher, date_added = nil)
    transaction { super(owner, repo, watcher, date_added) }
  end
end
@@ -0,0 +1,132 @@
1
+ require 'rubygems'
2
+ require 'mongo'
3
+
4
+ require 'ghtorrent/settings'
5
+ require 'ghtorrent/logging'
6
+ require 'ghtorrent/command'
7
+ require 'ghtorrent/persister'
8
+
9
# Command that removes duplicate documents from a MongoDB collection
# (currently +commits+ and +events+). Documents are grouped by the
# collection's unique field; for every value seen more than once, all but
# the last encountered document are deleted.
#
# Invocation: <command> [options] collection
class GHRMDupl < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Persister

  # Describes the collections this command can clean, keyed by collection
  # symbol. +:unq+ names the field whose value identifies an entry and +:col+
  # is the collection handle taken from the persister's underlying
  # connection. The hash is rebuilt on every call, so callers should keep the
  # entry they need in a local variable.
  def col_info()
    {
      :commits => {
        :unq => "sha",
        :col => persister.get_underlying_connection.collection(:commits.to_s),
      },
      :events => {
        :unq => "id",
        :col => persister.get_underlying_connection.collection(:events.to_s),
      }
    }
  end

  # Memoized MongoDB persister.
  def persister
    @persister ||= connect(:mongo, @settings)
  end

  # Declares the usage banner and the +earliest+ / +snapshot+ options.
  def prepare_options(options)
    options.banner <<-BANNER
Removes duplicate entries from collections (currently, commits and events)

#{command_name} [options] collection

#{command_name} options:
    BANNER

    options.opt :earliest, 'Seconds since epoch of earliest item to load',
                :short => 'e', :default => 0, :type => :int
    options.opt :snapshot, 'Perform clean up every x records',
                :short => 's', :default => -1, :type => :int
  end

  # Runs the inherited validation, then aborts unless a collection name was
  # given as the first positional argument.
  def validate
    super
    Trollop::die "no collection specified" unless args[0] && !args[0].empty?
  end

  # Deletes, from +col+, all but the last recorded document of every key
  # that has more than one document.
  #
  # [data] Hash mapping a unique-field value to the list of +_id+ values
  #        seen for it, in the order they were read
  # [col]  The collection handle to delete from
  #
  # Returns the number of documents that were actually deleted.
  def remove_duplicates(data, col)
    removed = 0
    data.each do |_key, ids|
      next unless ids.size > 1
      # Keep the last id, delete everything before it.
      ids[0..-2].each do |id|
        removed += 1 if delete_by_id col, id
      end
    end
    removed
  end

  # Deletes the document with the given +_id+ from +col+.
  #
  # Returns true on success, false (after printing a message) when the
  # driver raises Mongo::OperationFailure.
  def delete_by_id(col, id)
    col.remove({'_id' => id})
    true
  rescue Mongo::OperationFailure
    puts "Cannot remove record with id #{id} from #{col.name}"
    false
  end

  # Entry point: scans the selected collection from the +earliest+ timestamp
  # on, deletes entries whose unique value is empty, and removes duplicates,
  # optionally flushing the in-memory index every +snapshot+ records.
  #
  # Aborts through Trollop::die when the collection name is not known.
  def go
    collection = case ARGV[0]
                 when "commits" then :commits
                 when "events" then :events
                 end

    # Stop here: continuing with a nil collection would only fail later with
    # an unrelated NoMethodError.
    Trollop::die "Not a known collection name: #{ARGV[0]}" if collection.nil?

    # Resolve the collection handle and unique field once instead of
    # rebuilding col_info for every record.
    info = col_info[collection]
    col = info[:col]
    unq = info[:unq]

    from = {'_id' => {'$gte' => BSON::ObjectId.from_time(Time.at(options[:earliest]))}}

    snapshot = options[:snapshot]

    puts "Deleting duplicates from collection #{collection}"
    puts "Deleting duplicates after #{Time.at(options[:earliest])}"
    puts "Perform clean up every #{snapshot} records"

    # Various counters to report stats
    processed = total_processed = removed = 0

    data = Hash.new

    # The following code needs to save intermediate results to cope
    # with large datasets
    col.find(from, :fields => unq).each do |r|
      _id = r["_id"]
      # read_value is not defined in this class; presumably it extracts the
      # field named by +unq+ from the document -- TODO confirm.
      key = read_value(r, unq)

      # If entries cannot be parsed, remove them
      if key.empty?
        puts "Deleting unknown entry #{_id}"
        removed += 1 if delete_by_id col, _id
      else
        (data[key] ||= []) << _id
      end

      processed += 1
      total_processed += 1

      print "\rProcessed #{processed} records"

      # Calculate duplicates, save intermediate result. Duplicates that
      # straddle two snapshots are not detected, because the index is reset.
      if snapshot > 0 && processed >= snapshot
        puts "\nLoaded #{data.size} values, cleaning"
        removed += remove_duplicates data, col
        data = Hash.new
        processed = 0
      end
    end

    removed += remove_duplicates data, col

    puts "\nProcessed #{total_processed}, deleted #{removed} duplicates"
  end
end
131
+
132
+ # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
@@ -22,7 +22,7 @@ module GHTorrent
22
22
  @logger = Logger.new(STDOUT)
23
23
  end
24
24
 
25
- # db related functions
25
+ # Get a connection to the database
26
26
  def get_db
27
27
  Sequel.single_threaded = true
28
28
  @db = Sequel.connect(config(:sql_url), :encoding => 'utf8')
@@ -151,19 +151,31 @@ module GHTorrent
151
151
  end
152
152
 
153
153
  ##
154
- # Retrieve a pull request review comment
154
+ # Retrieve an issue
155
155
  # ==Parameters:
156
156
  # [owner] The login of the repository owner
157
157
  # [repo] The name of the repository
158
- # [fork_id] The fork item id
158
+ # [issue_id] The fork item id
159
+ # [action] The action that took place for the issue
159
160
  # [date_added] The timestamp that the add event took place
160
- def get_issue_comment(owner, repo, issue_id, comment_id, created_at)
161
+ def get_issue(owner, repo, issue_id, created_at)
161
162
  transaction do
162
- raise "Not implemented"
163
- #ensure_pullreq_comment(owner, repo, pullreq_id, comment_id, created_at)
163
+ ensure_issue(owner, repo, issue_id, created_at)
164
164
  end
165
165
  end
166
166
 
167
##
# Retrieve an issue comment, inside a transaction.
# ==Parameters:
#  [owner]  The login of the repository owner
#  [repo]  The name of the repository
#  [issue_id]  The number of the issue the comment belongs to
#  [comment_id]  The issue comment unique identifier
def get_issue_comment(owner, repo, issue_id, comment_id)
  transaction { ensure_issue_comment(owner, repo, issue_id, comment_id) }
end
167
179
 
168
180
  ##
169
181
  # Make sure a commit exists
@@ -188,21 +200,30 @@ module GHTorrent
188
200
  end
189
201
 
190
202
  ##
191
- # Get as many commits for a repository as allowed by Github
192
- #
203
+ # Retrieve commits for a repository, starting from +sha+
204
+ # and going back to 30 * +num_pages+ commit log entries.
193
205
  # ==Parameters:
194
206
  # [user] The user to whom the repo belongs.
195
207
  # [repo] The repo to look for commits into.
196
- def ensure_commits(user, repo)
208
+ # [sha] The first commit to start retrieving from. If nil, then the
209
+ # most recently created stored commit will be used instead.
210
+ # [num_pages] The number of commit pages to retrieve
211
+ def ensure_commits(user, repo, sha = nil,
212
+ num_pages = config(:mirror_commit_pages_new_repo))
197
213
  userid = @db[:users].filter(:login => user).first[:id]
198
214
  repoid = @db[:projects].filter(:owner_id => userid,
199
215
  :name => repo).first[:id]
200
216
 
201
- latest = @db[:commits].filter(:project_id => repoid).order(:created_at).last
217
+ latest = if sha.nil?
218
+ @db[:commits].filter(:project_id => repoid).order(:created_at).last
219
+ else
220
+ sha
221
+ end
222
+
202
223
  commits = if latest.nil?
203
- retrieve_commits(repo, nil, user)
224
+ retrieve_commits(repo, "head", user, num_pages)
204
225
  else
205
- retrieve_commits(repo, latest[:sha], user)
226
+ retrieve_commits(repo, latest[:sha], user, num_pages)
206
227
  end
207
228
 
208
229
  commits.map do |c|
@@ -226,6 +247,11 @@ module GHTorrent
226
247
  parent = commits.first(:sha => url[7])
227
248
  end
228
249
 
250
+ if parent.nil?
251
+ warn "GHTorrent: Could not retrieve #{url[4]}/#{url[5]} -> #{url[7]}, parent to commit #{this[:sha]}"
252
+ return
253
+ end
254
+
229
255
  if parents.first(:commit_id => this[:id],
230
256
  :parent_id => parent[:id]).nil?
231
257
 
@@ -297,6 +323,16 @@ module GHTorrent
297
323
  if dbuser.nil?
298
324
  # We do not have the user in the database yet. Add him
299
325
  added = ensure_user(login, false, false)
326
+
327
+ # A commit user can be found by email but not
328
+ # by the user name he used to commit. This probably means that the
329
+ # user has changed his user name. Treat the user's by-email
330
+ # description as valid.
331
+ if added.nil? and not byemail.nil?
332
+ warn "GHTorrent: Found user #{byemail[:login]} with same email #{email} as non existing user #{login}. Assigning user #{login} to #{byemail[:login]}"
333
+ return users.first(:login => byemail[:login])
334
+ end
335
+
300
336
  if byemail.nil?
301
337
  users.filter(:login => login).update(:name => name) if added[:name].nil?
302
338
  users.filter(:login => login).update(:email => email) if added[:email].nil?
@@ -342,7 +378,11 @@ module GHTorrent
342
378
  name, email = user.split("<")
343
379
  email = email.split(">")[0]
344
380
  rescue Exception
345
- raise new GHTorrentException("Not a valid email address: #{user}")
381
+ raise new GHTorrentException.new("Not a valid email address: #{user}")
382
+ end
383
+
384
+ unless is_valid_email(email)
385
+ warn("GHTorrent: Extracted email(#{email}) not valid for user #{user}")
346
386
  end
347
387
  u = ensure_user_byemail(email.strip, name.strip)
348
388
  else
@@ -522,9 +562,8 @@ module GHTorrent
522
562
  # the result is nil
523
563
  def ensure_repo(user, repo, commits = true, project_members = true, watchers = true)
524
564
 
525
- ensure_user(user, false, false)
526
565
  repos = @db[:projects]
527
- curuser = @db[:users].first(:login => user)
566
+ curuser = ensure_user(user, false, false)
528
567
  currepo = repos.first(:owner_id => curuser[:id], :name => repo)
529
568
 
530
569
  if currepo.nil?
@@ -549,7 +588,7 @@ module GHTorrent
549
588
  ensure_watchers(user, repo) if watchers
550
589
  repos.first(:owner_id => curuser[:id], :name => repo)
551
590
  else
552
- debug "GHTorrent: Repo #{repo} exists"
591
+ debug "GHTorrent: Repo #{user}/#{repo} exists"
553
592
  currepo
554
593
  end
555
594
  end
@@ -604,11 +643,12 @@ module GHTorrent
604
643
  )
605
644
  info "GHTorrent: Added project member #{repo} -> #{new_member}"
606
645
  else
646
+ debug "GHTorrent: Project member #{repo} -> #{new_member} exists"
607
647
  unless date_added.nil?
608
648
  pr_members.filter(:user_id => new_user[:id],
609
649
  :repo_id => project[:id])\
610
650
  .update(:created_at => date(date_added))
611
- info "GHTorrent: Updating #{repo} -> #{new_member}"
651
+ info "GHTorrent: Updating project member #{repo} -> #{new_member}"
612
652
  end
613
653
  end
614
654
  end
@@ -796,13 +836,14 @@ module GHTorrent
796
836
  :created_at => date(added),
797
837
  :ext_ref_id => retrieved[@ext_uniq]
798
838
  )
799
- info "GHTorrent: Added watcher #{repo} -> #{watcher}"
839
+ info "GHTorrent: Added watcher #{owner}/#{repo} -> #{watcher}"
800
840
  else
841
+ debug "GHTorrent: Watcher #{owner}/#{repo} -> #{watcher} exists"
801
842
  unless date_added.nil?
802
843
  watchers.filter(:user_id => new_watcher[:id],
803
844
  :repo_id => project[:id])\
804
845
  .update(:created_at => date(date_added))
805
- info "GHTorrent: Updating #{repo} -> #{watcher}"
846
+ info "GHTorrent: Updating watcher #{owner}/#{repo} -> #{watcher}"
806
847
  end
807
848
  end
808
849
  end
@@ -816,7 +857,7 @@ module GHTorrent
816
857
  return
817
858
  end
818
859
 
819
- pull_reqs = @db[:pull_requests].filter(:base_repo_id => currepo[:id])
860
+ pull_reqs = @db[:pull_requests].filter(:base_repo_id => currepo[:id]).all
820
861
 
821
862
  retrieve_pull_requests(owner, repo).reduce([]) do |acc, x|
822
863
  if pull_reqs.find { |y| y[:pullreq_id] == x['number'] }.nil?
@@ -833,7 +874,6 @@ module GHTorrent
833
874
  comments = true, commits = true,
834
875
  state = nil, created_at = nil)
835
876
  pulls_reqs = @db[:pull_requests]
836
- pull_req_history = @db[:pull_request_history]
837
877
 
838
878
  project = ensure_repo(owner, repo, false, false, false)
839
879
 
@@ -842,8 +882,8 @@ module GHTorrent
842
882
  end
843
883
 
844
884
  # Adds a pull request history event
845
- add_history = Proc.new do |id, ts, unq, act|
846
-
885
+ def add_history(id, ts, unq, act)
886
+ pull_req_history = @db[:pull_request_history]
847
887
  entry = pull_req_history.first(:pull_request_id => id,
848
888
  :ext_ref_id => unq, :action => act)
849
889
  if entry.nil?
@@ -859,20 +899,33 @@ module GHTorrent
859
899
 
860
900
  # Checks whether a pull request concerns two branches of the same
861
901
  # repository
862
- is_intra_branch = Proc.new do |req|
863
- req['head']['repo'].nil?
902
+ def is_intra_branch(req)
903
+ return false unless has_head_repo(req)
904
+
905
+ if req['head']['repo']['owner']['login'] ==
906
+ req['base']['repo']['owner']['login'] and
907
+ req['head']['repo']['full_name'] == req['base']['repo']['full_name']
908
+ true
909
+ else
910
+ false
911
+ end
912
+ end
913
+
914
+ # Checks if the pull request has a head repo specified
915
+ def has_head_repo(req)
916
+ not req['head']['repo'].nil?
864
917
  end
865
918
 
866
919
  # Produces a log message
867
- log_msg = Proc.new do |req|
868
- head = if is_intra_branch.call(req)
869
- req['base']['repo']['full_name']
870
- else
920
+ def log_msg(req)
921
+ head = if has_head_repo(req)
871
922
  req['head']['repo']['full_name']
923
+ else
924
+ "(head deleted)"
872
925
  end
873
926
 
874
927
  <<-eos.gsub(/\s+/, " ").strip
875
- GHTorrent: Pull request #{pullreq_id}
928
+ GHTorrent: Pull request #{req['number']}
876
929
  #{head} -> #{req['base']['repo']['full_name']}
877
930
  eos
878
931
  end
@@ -890,22 +943,26 @@ module GHTorrent
890
943
 
891
944
  base_commit = ensure_commit(retrieved['base']['repo']['name'],
892
945
  retrieved['base']['sha'],
893
- retrieved['base']['repo']['owner']['login']
894
- )
946
+ retrieved['base']['repo']['owner']['login'])
895
947
 
896
- if is_intra_branch.call(retrieved)
948
+ if is_intra_branch(retrieved)
897
949
  head_repo = base_repo
898
- head_commit =
899
- warn "GHTorrent: Pull request is intra branch"
900
- else
901
-
902
- head_repo = ensure_repo(retrieved['head']['repo']['owner']['login'],
903
- retrieved['head']['repo']['name'],
904
- false, false, false)
905
-
906
- head_commit = ensure_commit(retrieved['head']['repo']['name'],
950
+ head_commit = ensure_commit(retrieved['base']['repo']['name'],
907
951
  retrieved['head']['sha'],
908
- retrieved['head']['repo']['owner']['login'])
952
+ retrieved['base']['repo']['owner']['login'])
953
+ info log_msg(retrieved) + " is intra branch"
954
+ else
955
+ head_repo = if has_head_repo(retrieved)
956
+ ensure_repo(retrieved['head']['repo']['owner']['login'],
957
+ retrieved['head']['repo']['name'],
958
+ false, false, false)
959
+ end
960
+
961
+ head_commit = if not head_repo.nil?
962
+ ensure_commit(retrieved['head']['repo']['name'],
963
+ retrieved['head']['sha'],
964
+ retrieved['head']['repo']['owner']['login'])
965
+ end
909
966
  end
910
967
 
911
968
  pull_req_user = ensure_user(retrieved['user']['login'], false, false)
@@ -923,24 +980,24 @@ module GHTorrent
923
980
  :base_commit_id => base_commit[:id],
924
981
  :user_id => pull_req_user[:id],
925
982
  :pullreq_id => pullreq_id,
926
- :intra_branch => is_intra_branch.call(retrieved)
983
+ :intra_branch => is_intra_branch(retrieved)
927
984
  )
928
985
 
929
- info log_msg.call(retrieved)
986
+ info log_msg(retrieved)
930
987
  else
931
- debug log_msg.call(retrieved) + " exists"
988
+ debug log_msg(retrieved) + " exists"
932
989
  end
933
990
 
934
991
  pull_req = pulls_reqs.first(:base_repo_id => project[:id],
935
992
  :pullreq_id => pullreq_id)
936
993
 
937
- add_history.call(pull_req[:id], date(retrieved['created_at']),
994
+ add_history(pull_req[:id], date(retrieved['created_at']),
938
995
  retrieved[@ext_uniq], 'opened')
939
- add_history.call(pull_req[:id], date(retrieved['merged_at']),
996
+ add_history(pull_req[:id], date(retrieved['merged_at']),
940
997
  retrieved[@ext_uniq], 'merged') if merged
941
- add_history.call(pull_req[:id], date(retrieved['closed_at']),
998
+ add_history(pull_req[:id], date(retrieved['closed_at']),
942
999
  retrieved[@ext_uniq], 'closed') if closed
943
- add_history.call(pull_req[:id], date(created_at), retrieved[@ext_uniq],
1000
+ add_history(pull_req[:id], date(created_at), retrieved[@ext_uniq],
944
1001
  state) unless state.nil?
945
1002
 
946
1003
  ensure_pull_request_commits(owner, repo, pullreq_id) if commits
@@ -955,7 +1012,7 @@ module GHTorrent
955
1012
  time = if created_at.nil? then currepo[:created_at] else Time.now() end
956
1013
 
957
1014
  if currepo.nil?
958
- warn "Could not repository #{owner}/#{repo}"
1015
+ warn "GHTorrent: Could not find repository #{owner}/#{repo}"
959
1016
  return
960
1017
  end
961
1018
 
@@ -983,7 +1040,7 @@ module GHTorrent
983
1040
  pull_req = ensure_pull_request(owner, repo, pullreq_id, false, true)
984
1041
 
985
1042
  if pull_req.nil?
986
- warn "Could not retrieve pull req #{owner}/#{repo} -> #{pullreq_id}"
1043
+ warn "GHTorrent: Could not retrieve pull req #{owner}/#{repo} -> #{pullreq_id}"
987
1044
  return
988
1045
  end
989
1046
 
@@ -994,7 +1051,7 @@ module GHTorrent
994
1051
  retrieved = retrieve_pull_req_comment(owner, repo, pullreq_id, comment_id)
995
1052
 
996
1053
  if retrieved.nil?
997
- warn "Could not retrieve comment #{comment_id} for pullreq #{owner}/#{repo} -> #{pullreq_id}"
1054
+ warn "GHTorrent: Could not retrieve comment #{comment_id} for pullreq #{owner}/#{repo} -> #{pullreq_id}"
998
1055
  return
999
1056
  end
1000
1057
 
@@ -1018,14 +1075,19 @@ module GHTorrent
1018
1075
  :ext_ref_id => retrieved[@ext_uniq]
1019
1076
  )
1020
1077
  debug "GHTorrent: Adding comment #{comment_id} for pullreq #{owner}/#{repo} -> #{pullreq_id}"
1078
+ @db[:pull_request_comments].first(:pull_request_id => pull_req[:id],
1079
+ :comment_id => comment_id)
1021
1080
  else
1022
- debug "GHTorrent: Updating comment #{comment_id} for pullreq #{owner}/#{repo} -> #{pullreq_id}"
1081
+ debug "GHTorrent: Comment #{comment_id} for pullreq #{owner}/#{repo} -> #{pullreq_id} exists"
1082
+ exists
1023
1083
  end
1024
1084
  end
1025
1085
 
1026
1086
  def ensure_pull_request_commits(owner, repo, pullreq_id)
1027
- retrieve_pull_req_commits(owner, repo, pullreq_id).map {|c|
1028
- ensure_commit(repo, c['sha'], owner, true)
1087
+ retrieve_pull_req_commits(owner, repo, pullreq_id).reduce([]){|acc, c|
1088
+ x = ensure_commit(repo, c['sha'], owner, true)
1089
+ acc << x if not x.nil?
1090
+ acc
1029
1091
  }.map { |c|
1030
1092
  pullreq = ensure_pull_request(owner, repo, pullreq_id, false, false)
1031
1093
  exists = @db[:pull_request_commits].first(:pull_request_id => pullreq[:id],
@@ -1050,26 +1112,28 @@ module GHTorrent
1050
1112
  # [repo] The repository/project to find forks for
1051
1113
  def ensure_forks(owner, repo)
1052
1114
  currepo = ensure_repo(owner, repo, false, false, false)
1053
- time = currepo[:created_at]
1054
1115
 
1055
1116
  if currepo.nil?
1056
1117
  warn "Could not retrieve forks for #{owner}/#{repo}"
1057
1118
  return
1058
1119
  end
1059
1120
 
1060
- existing_forks = @db.from(:forks, :projects).\
1121
+ existing_forks = @db.from(:forks, :projects, :users).\
1061
1122
  where(:forks__forked_project_id => :projects__id). \
1062
- where(:forks__forked_from_id => currepo[:id]).select(:name, :login).all
1123
+ where(:users__id => :projects__owner_id). \
1124
+ where(:forks__forked_from_id => currepo[:id]).select(:projects__name, :login).all
1063
1125
 
1064
1126
  retrieve_forks(owner, repo).reduce([]) do |acc, x|
1065
1127
  if existing_forks.find {|y|
1066
- y[:login] == x['owner']['login'] && y[:name] == x['name']
1128
+ forked_repo_owner = x['full_name'].split(/\//)[0]
1129
+ forked_repo_name = x['full_name'].split(/\//)[1]
1130
+ y[:login] == forked_repo_owner && y[:name] == forked_repo_name
1067
1131
  }.nil?
1068
1132
  acc << x
1069
1133
  else
1070
1134
  acc
1071
1135
  end
1072
- end.map { |x| ensure_fork(owner, repo, x['id'], time) }
1136
+ end.map { |x| ensure_fork(owner, repo, x['id']) }
1073
1137
  end
1074
1138
 
1075
1139
  ##
@@ -1081,8 +1145,8 @@ module GHTorrent
1081
1145
  fork_exists = forks.first(:fork_id => fork_id)
1082
1146
 
1083
1147
  if fork_exists.nil?
1084
- added = if date_added.nil? then Time.now else date_added end
1085
1148
  retrieved = retrieve_fork(owner, repo, fork_id)
1149
+ added = if date_added.nil? then retrieved['created_at'] else date_added end
1086
1150
 
1087
1151
  if retrieved.nil?
1088
1152
  warn "GHTorrent: Fork #{fork_id} does not exist for #{owner}/#{repo}"
@@ -1106,6 +1170,7 @@ module GHTorrent
1106
1170
  :ext_ref_id => retrieved[@ext_uniq])
1107
1171
  info "GHTorrent: Added #{forked_repo_owner}/#{forked_repo_name} as fork of #{owner}/#{repo}"
1108
1172
  else
1173
+ debug "GHTorrent: Fork #{fork_id} exists as fork of #{owner}/#{repo}"
1109
1174
  unless date_added.nil?
1110
1175
  forks.filter(:fork_id => fork_id)\
1111
1176
  .update(:created_at => date(date_added))
@@ -1114,36 +1179,253 @@ module GHTorrent
1114
1179
  end
1115
1180
  end
1116
1181
 
1117
- private
1182
##
# Make sure all issues exist for a project.
# Returns the results of +ensure_issue+ for the issues that were not stored
# yet, or nil when the repository cannot be resolved.
def ensure_issues(owner, repo)
  currepo = ensure_repo(owner, repo, false, false, false)

  if currepo.nil?
    warn "GHTorrent: Could not retrieve issues for #{owner}/#{repo}"
    return
  end

  stored = @db[:issues].filter(:repo_id => currepo[:id]).all

  # Drop every retrieved issue whose number is already stored.
  missing = retrieve_issues(owner, repo).reject do |remote|
    stored.any? { |row| row[:issue_id] == remote['number'] }
  end

  missing.map { |remote| ensure_issue(owner, repo, remote['number']) }
end
1127
1201
 
1128
- repository = ensure_repo(user, repo, false, false, false)
1202
##
# Make sure that the issue exists.
# ==Parameters:
#  [owner]  The login of the repository owner
#  [repo]  The name of the repository
#  [issue_id]  The issue number within the repository
#  [events]  Also retrieve the issue's events when the issue is newly added
#  [comments]  Also retrieve the issue's comments when the issue is newly
#              added and the retrieved issue reports at least one comment
# ==Returns:
# The stored issue row, or nil when the repository or the issue cannot be
# retrieved.
def ensure_issue(owner, repo, issue_id, events = true, comments = true)

  issues = @db[:issues]
  repository = ensure_repo(owner, repo, false, false, false)

  # Guard on the looked-up row: +repo+ is the name argument and is never nil
  # here, so testing it would let a missing repository through.
  if repository.nil?
    warn "Cannot find repo #{owner}/#{repo}"
    return
  end

  cur_issue = issues.first(:issue_id => issue_id,
                           :repo_id => repository[:id])

  unless cur_issue.nil?
    # Logged at debug level, like the "exists" messages of the sibling
    # ensure_* methods.
    debug "GHTorrent: Issue #{owner}/#{repo}->#{issue_id} exists"
    return cur_issue
  end

  retrieved = retrieve_issue(owner, repo, issue_id)

  if retrieved.nil?
    warn "GHTorrent: Issue #{issue_id} does not exist for #{owner}/#{repo}"
    return
  end

  reporter = ensure_user(retrieved['user']['login'], false, false)
  assignee = unless retrieved['assignee'].nil?
               ensure_user(retrieved['assignee']['login'], false, false)
             end

  # Pull requests and issues share the same issue_id
  pull_req = unless retrieved['pull_request'].nil? ||
                    retrieved['pull_request']['patch_url'].nil?
               ensure_pull_request(owner, repo, issue_id)
             end

  issues.insert(:repo_id => repository[:id],
                :assignee_id => (assignee.nil? ? nil : assignee[:id]),
                :reporter_id => reporter[:id],
                :issue_id => issue_id,
                :pull_request => !pull_req.nil?,
                :pull_request_id => (pull_req.nil? ? nil : pull_req[:id]),
                :created_at => date(retrieved['created_at']),
                :ext_ref_id => retrieved[@ext_uniq])

  # The issue row is inserted first, so the calls below find it when they
  # look the issue up again.
  ensure_issue_events(owner, repo, issue_id) if events
  ensure_issue_comments(owner, repo, issue_id) if comments && retrieved['comments'] > 0

  info "GHTorrent: Added issue #{owner}/#{repo} -> #{issue_id}"
  issues.first(:issue_id => issue_id,
               :repo_id => repository[:id])
end
1255
+
1256
##
# Retrieve and process all events for an issue.
# Returns the results of +ensure_issue_event+ for the events that were not
# stored yet, or nil when the repository or the issue cannot be resolved.
def ensure_issue_events(owner, repo, issue_id)
  currepo = ensure_repo(owner, repo, true, true, false)

  if currepo.nil?
    warn "GHTorrent: Could not find repository #{owner}/#{repo}"
    return
  end

  issue = ensure_issue(owner, repo, issue_id, false, false)

  if issue.nil?
    warn "Could not retrieve issue #{owner}/#{repo} -> #{issue_id}"
    return
  end

  # Keep only the events that have no row for this issue yet.
  unseen = retrieve_issue_events(owner, repo, issue_id).select do |event|
    @db[:issue_events].first(:issue_id => issue[:id],
                             :event_id => event['id']).nil?
  end

  unseen.map do |event|
    ensure_issue_event(owner, repo, issue_id, event['id'])
  end
end
1285
+
1286
##
# Retrieve and process +event_id+ for an +issue_id+.
# ==Parameters:
#  [owner]  The login of the repository owner
#  [repo]  The name of the repository
#  [issue_id]  The issue number within the repository
#  [event_id]  The identifier of the issue event
# ==Returns:
# The stored issue event row, or nil when the issue or the event cannot be
# retrieved or the retrieved event carries no actor.
def ensure_issue_event(owner, repo, issue_id, event_id)
  issue = ensure_issue(owner, repo, issue_id, false, false)

  if issue.nil?
    warn "GHTorrent: Could not retrieve issue #{owner}/#{repo} -> #{issue_id}"
    return
  end

  issue_event_str = "#{owner}/#{repo} -> #{issue_id}/#{event_id}"

  curevent = @db[:issue_events].first(:issue_id => issue[:id],
                                      :event_id => event_id)
  unless curevent.nil?
    debug "GHTorrent: Issue event #{issue_event_str} exists"
    return curevent
  end

  retrieved = retrieve_issue_event(owner, repo, issue_id, event_id)

  if retrieved.nil?
    warn "GHTorrent: Could not retrieve issue event #{issue_event_str}"
    return
  elsif retrieved['actor'].nil?
    warn "GHTorrent: Issue event #{issue_event_str} does not contain an actor"
    return
  end

  actor = ensure_user(retrieved['actor']['login'], false, false)

  # These event types carry the related commit in +commit_id+.
  action_specific = case retrieved['event']
                    when "referenced", "merged", "closed"
                      retrieved['commit_id']
                    end

  if retrieved['event'] == "assigned"
    if actor.nil?
      # The event row below is still stored (with a nil actor), but there is
      # no user to record as assignee.
      warn "GHTorrent: Cannot update assignee for #{issue_event_str}, actor not found"
    else
      # Only look for an earlier assignment when the issue already has an
      # assignee; otherwise this event decides.
      previous = unless issue[:assignee_id].nil?
                   @db[:issue_events].
                     filter(:issue_id => issue[:id], :action => "assigned").
                     order(Sequel.desc(:created_at)).first
                 end

      if previous.nil? ||
         date(previous[:created_at]) < date(retrieved['created_at'])
        update_assignee(owner, repo, issue, actor)
      end
    end
  end

  @db[:issue_events].insert(
    :event_id => event_id,
    :issue_id => issue[:id],
    :actor_id => (actor.nil? ? nil : actor[:id]),
    :action => retrieved['event'],
    :action_specific => action_specific,
    :created_at => date(retrieved['created_at']),
    :ext_ref_id => retrieved[@ext_uniq]
  )

  info "GHTorrent: Added issue event #{issue_event_str}"
  @db[:issue_events].first(:issue_id => issue[:id],
                           :event_id => event_id)
end

##
# Record +actor+ as the assignee of the stored +issue+ row.
def update_assignee(owner, repo, issue, actor)
  @db[:issues].filter(:id => issue[:id]).update(:assignee_id => actor[:id])
  info "Updating #{owner}/#{repo} -> #{issue[:id]} assignee to #{actor[:id]}"
end
1360
+
1361
##
# Retrieve and process all comments for an issue.
# Returns the results of +ensure_issue_comment+ for the comments that were
# not stored yet, or nil when the repository or the issue cannot be resolved.
def ensure_issue_comments(owner, repo, issue_id)
  currepo = ensure_repo(owner, repo, true, true, false)

  if currepo.nil?
    warn "GHTorrent: Could not find repository #{owner}/#{repo}"
    return
  end

  issue = ensure_issue(owner, repo, issue_id, false, false)

  if issue.nil?
    warn "Could not retrieve issue #{owner}/#{repo} -> #{issue_id}"
    return
  end

  # Keep only the comments that have no row for this issue yet.
  unseen = retrieve_issue_comments(owner, repo, issue_id).select do |comment|
    @db[:issue_comments].first(:issue_id => issue[:id],
                               :comment_id => comment['id']).nil?
  end

  unseen.map do |comment|
    ensure_issue_comment(owner, repo, issue_id, comment['id'])
  end
end
1389
+
1390
##
# Retrieve and process +comment_id+ for an +issue_id+.
# Returns the stored comment row, or nil when the issue or the comment
# cannot be retrieved.
def ensure_issue_comment(owner, repo, issue_id, comment_id)
  issue = ensure_issue(owner, repo, issue_id)

  if issue.nil?
    warn "GHTorrent: Could not retrieve issue #{owner}/#{repo} -> #{issue_id}"
    return
  end

  issue_comment_str = "#{owner}/#{repo} -> #{issue_id}/#{comment_id}"

  stored = @db[:issue_comments].first(:issue_id => issue[:id],
                                      :comment_id => comment_id)

  # Nothing to do when the comment is already stored.
  unless stored.nil?
    debug "GHTorrent: Issue comment #{issue_comment_str} exists"
    return stored
  end

  retrieved = retrieve_issue_comment(owner, repo, issue_id, comment_id)

  if retrieved.nil?
    warn "GHTorrent: Could not retrieve issue comment #{issue_comment_str}"
    return
  end

  user = ensure_user(retrieved['user']['login'], false, false)
  user_id = user.nil? ? nil : user[:id]

  @db[:issue_comments].insert(
    :comment_id => comment_id,
    :issue_id => issue[:id],
    :user_id => user_id,
    :created_at => date(retrieved['created_at']),
    :ext_ref_id => retrieved[@ext_uniq]
  )

  info "GHTorrent: Added issue comment #{issue_comment_str}"
  @db[:issue_comments].first(:issue_id => issue[:id],
                             :comment_id => comment_id)
end
1149
1431
 
@@ -1153,27 +1435,57 @@ module GHTorrent
1153
1435
  @db ||= get_db
1154
1436
  @persister ||= persister
1155
1437
 
1438
+ result = nil
1156
1439
  start_time = Time.now
1157
1440
  begin
1158
1441
  @db.transaction(:rollback => :reraise, :isolation => :committed) do
1159
- yield block
1442
+ result = yield block
1160
1443
  end
1161
1444
  total = Time.now.to_ms - start_time.to_ms
1162
1445
  debug "GHTorrent: Transaction committed (#{total} ms)"
1446
+ result
1163
1447
  rescue Exception => e
1164
1448
  total = Time.now.to_ms - start_time.to_ms
1165
1449
  warn "GHTorrent: Transaction failed (#{total} ms)"
1166
1450
  raise e
1167
1451
  ensure
1168
- @db.disconnect
1169
- @persister.close
1170
-
1171
- @db = nil
1172
- @persister = nil
1173
1452
  GC.start
1174
1453
  end
1175
1454
  end
1176
1455
 
1456
+ private
1457
+
1458
# Store the commit described by the hash +c+ unless a commit with the same
# sha is already stored. Returns the stored commit row, or nil when the
# repository cannot be resolved.
def store_commit(c, repo, user)
  commits = @db[:commits]
  sha = c['sha']
  existing = commits.first(:sha => sha)

  unless existing.nil?
    debug "GHTorrent: Commit #{user}/#{repo} -> #{c['sha']} exists"
    return existing
  end

  # Author and committer are resolved before the repository, as before.
  author = commit_user(c['author'], c['commit']['author'])
  commiter = commit_user(c['committer'], c['commit']['committer'])

  repository = ensure_repo(user, repo, false, false, false)

  if repository.nil?
    warn "Could not store commit #{user}/#{repo} #{c['sha']}"
    return
  end

  commits.insert(
    :sha => sha,
    :author_id => author[:id],
    :committer_id => commiter[:id],
    :project_id => repository[:id],
    :created_at => date(c['commit']['author']['date']),
    :ext_ref_id => c[@ext_uniq]
  )

  debug "GHTorrent: New commit #{user}/#{repo} -> #{c['sha']} "
  commits.first(:sha => sha)
end
1488
+
1177
1489
  ##
1178
1490
  # Convert a string value to boolean, the SQL way
1179
1491
  def boolean(arg)