ghtorrent 0.9 → 0.10

Sign up to get free protection for your applications and to get access to all the features.
@@ -54,6 +54,12 @@ module GHTorrent
54
54
  command.options[:password])
55
55
  end
56
56
 
57
+ unless command.options[:token].nil?
58
+ command.settings = command.override_config(command.settings,
59
+ :github_token,
60
+ command.options[:token])
61
+ end
62
+
57
63
  begin
58
64
  command.go
59
65
  rescue => e
@@ -86,6 +92,8 @@ Standard options:
86
92
  :type => String
87
93
  opt :username, 'Username at Github', :short => 's', :type => String
88
94
  opt :password, 'Password at Github', :type => String
95
+ opt :token, 'OAuth Github token (use instead of username/password)',
96
+ :type => String, :short => 't'
89
97
  end
90
98
  end
91
99
 
@@ -166,13 +174,13 @@ Standard options:
166
174
  :ack => true) do |delivery_info, properties, msg|
167
175
 
168
176
  if ack == :before
169
- ch.acknowledge(delivery_info.delivery_tag, false)
177
+ ch.acknowledge(delivery_info.delivery_tag)
170
178
  end
171
179
 
172
180
  begin
173
181
  block.call(msg)
174
182
  ensure
175
- ch.acknowledge(delivery_info.delivery_tag, false)
183
+ ch.acknowledge(delivery_info.delivery_tag)
176
184
  end
177
185
  end
178
186
 
@@ -197,8 +205,8 @@ Standard options:
197
205
  end
198
206
 
199
207
  def override_config(config_file, setting, new_value)
200
- puts "Overriding configuration #{setting}=#{config(setting)} with cmd line #{new_value}"
201
- merge_config_values(config_file, {setting => new_value})
208
+ puts "Overriding configuration #{setting}=#{config(setting)} with new value #{new_value}"
209
+ super(config_file, setting, new_value)
202
210
  end
203
211
 
204
212
  private
@@ -81,7 +81,7 @@ class GHTDataRetrieval < GHTorrent::Command
81
81
  repo = data['repo']['name'].split(/\//)[1]
82
82
  fork_id = data['payload']['forkee']['id']
83
83
 
84
- ghtorrent.get_fork(owner, repo, fork_id)
84
+ #ghtorrent.get_fork(owner, repo, fork_id)
85
85
  end
86
86
 
87
87
  def PullRequestReviewCommentEvent(data)
@@ -119,20 +119,15 @@ class GHTDataRetrieval < GHTorrent::Command
119
119
 
120
120
  def prepare_options(options)
121
121
  options.banner <<-BANNER
122
- Retrieves events from queues and processes them through GHTorrent
123
- #{command_name} [options]
124
-
125
- #{command_name} options:
122
+ Retrieves events from queues and processes them through GHTorrent.
123
+ If event_id is provided, only this event is processed.
124
+ #{command_name} [event_id]
126
125
  BANNER
127
126
 
128
- options.opt :filter,
129
- 'Only process messages for repos in the provided file',
130
- :short => 'f', :type => String
131
127
  end
132
128
 
133
129
  def validate
134
130
  super
135
- Trollop::die "Filter file does not exist" if options[:filter] and not File.exist?(options[:filter])
136
131
  end
137
132
 
138
133
  def logger
@@ -144,19 +139,26 @@ Retrieves events from queues and processes them through GHTorrent
144
139
  @gh
145
140
  end
146
141
 
142
# Looks up a stored event by id and returns it in parsed form.
#
# The event is fetched from the :events collection of the persister's
# underlying connection, its '_id' field is removed, and the remaining
# document is round-tripped through JSON via #parse.
#
# evt_id - value matched against the 'id' field of the stored events.
#
# Returns the parsed event, or nil when no stored event has that id.
def retrieve_event(evt_id)
  event = persister.get_underlying_connection[:events].find_one('id' => evt_id)

  # An unknown id yields no document (presumably a Mongo collection, whose
  # find_one returns nil in that case). Hand nil back to the caller instead
  # of failing on nil.delete below.
  return nil if event.nil?

  event.delete '_id'
  data = parse(event.to_json)
  info "GHTDataRetrieval: Processing event: #{data['type']}-#{data['id']}"
  data
end
149
+
147
150
  def go
148
- filter = Array.new
149
-
150
- if options[:filter]
151
- File.open(options[:filter]).each { |l|
152
- next if l.match(/^ *#/)
153
- parts = l.split(/ /)
154
- next if parts.size < 2
155
- debug "GHTDataRetrieval: Filtering events by #{parts[0] + "/" + parts[1]}"
156
- filter << parts[0] + "/" + parts[1]
157
- }
158
- end
159
151
 
152
+ unless ARGV[0].nil?
153
+ event = retrieve_event(ARGV[0])
154
+
155
+ if event.nil?
156
+ warn "GHTDataRetrieval: No event with id: #{ARGV[0]}"
157
+ else
158
+ send(event['type'], event)
159
+ end
160
+ return
161
+ end
160
162
 
161
163
  conn = Bunny.new(:host => config(:amqp_host),
162
164
  :port => config(:amqp_port),
@@ -181,20 +183,9 @@ Retrieves events from queues and processes them through GHTorrent
181
183
  queue.subscribe(:ack => true) do |headers, properties, msg|
182
184
  begin
183
185
 
184
- event = persister.get_underlying_connection[:events].find_one('id' => msg)
185
- event.delete '_id'
186
- data = parse(event.to_json)
187
- info "GHTDataRetrieval: Processing event: #{data['type']}-#{data['id']}"
188
-
189
- unless options[:filter].nil?
190
- if filter.include?(data['repo']['name'])
191
- send(h, data)
192
- else
193
- info "GHTDataRetrieval: Repo #{data['repo']['name']} not in process list. Ignoring event #{data['type']}-#{data['id']}"
194
- end
195
- else
196
- send(h, data)
197
- end
186
+ data = retrieve_event(msg)
187
+ send(h, data)
188
+
198
189
  channel.acknowledge(headers.delivery_tag, false)
199
190
  info "GHTDataRetrieval: Processed event: #{data['type']}-#{data['id']}"
200
191
  rescue Exception => e
@@ -227,6 +218,7 @@ Retrieves events from queues and processes them through GHTorrent
227
218
  conn.close unless conn.nil?
228
219
 
229
220
  end
221
+
230
222
  end
231
223
 
232
224
  # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
@@ -61,7 +61,7 @@ Retrieves more commits for the provided repository
61
61
 
62
62
  user = user_entry[:login]
63
63
 
64
- repo_entry = @ght.transaction{@ght.ensure_repo(ARGV[0], ARGV[1], false, false, false)}
64
+ repo_entry = @ght.transaction{@ght.ensure_repo(ARGV[0], ARGV[1])}
65
65
 
66
66
  if repo_entry.nil?
67
67
  Trollop::die "Cannot find repository #{owner}/#{ARGV[1]}"
@@ -75,7 +75,7 @@ Retrieves more commits for the provided repository
75
75
  order(:created_at).\
76
76
  first[:sha]
77
77
  else
78
- "master"
78
+ nil
79
79
  end
80
80
 
81
81
  total_commits = 0
@@ -84,7 +84,8 @@ Retrieves more commits for the provided repository
84
84
  begin
85
85
  logger.debug("Retrieving more commits for #{user}/#{repo} from head: #{head}")
86
86
 
87
- commits = retrieve_commits(repo, head, user, 1)
87
+ @settings = override_config(@settings, :mirror_history_pages_back, 1)
88
+ commits = retrieve_commits(repo, head, user)
88
89
 
89
90
  if commits.nil? or commits.empty? or commits.size == 1
90
91
  break
@@ -107,13 +107,14 @@ Loads object ids from a collection to a queue for further processing.
107
107
  :routing_key => "evt.#{e['type']}"
108
108
 
109
109
  total_read += 1
110
- puts "Publish id = #{e['id']} (#{total_read} read)" if options.verbose
110
+ puts "Publish id = #{e['id']} #{e['created_at']} (#{total_read} read)" if options.verbose
111
111
 
112
112
  if total_read >= options[:number]
113
113
  puts 'Finished reading, exiting'
114
- break
114
+ return
115
115
  end
116
116
  end
117
+ stopped = true
117
118
  rescue Interrupt
118
119
  puts 'Interrupted'
119
120
  stopped = true
@@ -0,0 +1,80 @@
1
+ require 'rubygems'
2
+
3
+ require 'ghtorrent/ghtorrent'
4
+ require 'ghtorrent/settings'
5
+ require 'ghtorrent/logging'
6
+ require 'ghtorrent/command'
7
+ require 'ghtorrent/retriever'
8
+
9
# Command that retrieves a single item. The first command-line argument
# selects what to retrieve (a pull request or an issue); the remaining
# arguments are passed on as owner, repository and GitHub id.
class GHTRetrieveOne < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Retriever
  include GHTorrent::Persister

  # Sets the usage banner on the given option parser.
  def prepare_options(options)
    options.banner <<-BANNER
Retrieve just one item

#{command_name} [options] <what> options...
  what can have the following values and arguments
     * pullreq <owner> <repo> <github_id>
     * issue <owner> <repo> <github_id>
    BANNER
  end

  # Runs the inherited validation, then requires a non-empty first argument.
  def validate
    super
    Trollop::die 'One argument required' unless args[0] && !args[0].empty?
  end

  # The logger of the underlying ght instance.
  def logger
    ght.logger
  end

  # Memoized :mongo persister built from the command settings.
  def persister
    @persister ||= connect(:mongo, settings)
  end

  # Memoized value of the :uniq_id configuration entry.
  def ext_uniq
    @ext_uniq ||= config(:uniq_id)
  end

  # Memoized TransactedGhtorrent instance built from the command settings.
  def ght
    @ght ||= TransactedGhtorrent.new(settings)
  end

  # Entry point: dispatches on ARGV[0] to the matching retrieve_* method,
  # passing the remaining arguments; dies on an unrecognised item kind.
  def go
    ght.get_db

    case ARGV[0]
    when /pullreq/
      retrieve_pullreq(ARGV[1..-1])
    when /issue/
      retrieve_issue(ARGV[1..-1])
    else
      Trollop::die "Don't know how to retrieve #{ARGV[0]}"
    end
  end

  # args - [owner, repo, pull request id]; forwarded to
  #        ght.ensure_pull_request.
  def retrieve_pullreq(args)
    owner, repo, pull_req_id = args[0], args[1], args[2]

    ght.ensure_pull_request(owner, repo, pull_req_id)
  end

  # args - [owner, repo, issue id]; forwarded to ght.ensure_issue.
  def retrieve_issue(args)
    owner, repo, issue_id = args[0], args[1], args[2]

    # The owner local is passed here; the previous code referenced an
    # undefined name (`wner`), which raised NameError on every call.
    ght.ensure_issue(owner, repo, issue_id)
  end

end
@@ -41,12 +41,12 @@ An efficient way to get all data for a single repo
41
41
  end
42
42
 
43
43
  def ght
44
- @ght ||= TransactedGHTorrent.new(settings)
44
+ @ght ||= TransactedGhtorrent.new(settings)
45
45
  @ght
46
46
  end
47
47
 
48
48
  def go
49
- self.settings = override_config(settings, :mirror_history_pages_back, -1)
49
+ self.settings = override_config(settings, :mirror_history_pages_back, 1000)
50
50
  user_entry = ght.transaction{ght.ensure_user(ARGV[0], false, false)}
51
51
 
52
52
  if user_entry.nil?
@@ -64,7 +64,7 @@ An efficient way to get all data for a single repo
64
64
  repo = repo_entry[:name]
65
65
 
66
66
  def send_message(function, user, repo)
67
- ght.send(function, user, repo, refresh = true)
67
+ ght.send(function, user, repo)
68
68
  end
69
69
 
70
70
  functions = %w(ensure_commits ensure_forks ensure_pull_requests
@@ -3,98 +3,21 @@ require 'ghtorrent/settings'
3
3
  require 'ghtorrent/logging'
4
4
  require 'ghtorrent/command'
5
5
  require 'ghtorrent/retriever'
6
+ require 'ghtorrent/multiprocess_queue_client'
6
7
  require "bunny"
7
8
 
9
+ class GHTRetrieveRepos < MultiprocessQueueClient
8
10
 
9
- class GHTRetrieveRepos < GHTorrent::Command
10
-
11
- include GHTorrent::Settings
12
- include GHTorrent::Logging
13
-
14
- def logger
15
- @logger ||= Logger.new(STDOUT)
16
- @logger
17
- end
18
-
19
- def prepare_options(options)
20
- options.banner <<-BANNER
21
- Retrieve data for multiple repos in parallel. To work, it requires
22
- a mapping file formatted as follows:
23
-
24
- IP UNAME PASSWD NUM_PROCS where
25
-
26
- IP = address to use for outgoing requests (use 0.0.0.0 on non-multihomed hosts)
27
- UNAME = Github user name to use for outgoing requests
28
- PASSWD = Github password to use for outgoing requests
29
- NUM_PROCS = Number of processes to spawn for this IP/UNAME combination
30
-
31
- Values in the config.yaml file set with the -c command are overriden.
32
-
33
- #{command_name} [options] mapping-file
34
-
35
- BANNER
36
- options.opt :queue, 'Queue to retrieve project names from',
37
- :short => 'q', :default => 'retrieve-repo', :type => :string
38
-
11
+ def clazz
12
+ GHTRepoRetriever
39
13
  end
40
14
 
41
- def validate
42
- super
43
- Trollop::die 'Argument mapping-file is required' unless not args[0].nil?
44
- end
45
-
46
- def go
47
-
48
- configs = File.open(ARGV[0]).readlines.map do |line|
49
- next if line =~ /^#/
50
- ip,name,passwd,instances = line.strip.split(/ /)
51
- (1..instances.to_i).map do |i|
52
- newcfg = self.settings.clone
53
- newcfg = override_config(newcfg, :attach_ip, ip)
54
- newcfg = override_config(newcfg, :github_username, name)
55
- newcfg = override_config(newcfg, :github_passwd, passwd)
56
- newcfg = override_config(newcfg, :mirror_history_pages_back, 1000)
57
- newcfg = override_config(newcfg, :mirror_commit_pages_new_repo, 1000)
58
- newcfg
59
- end
60
- end.flatten.select{|x| !x.nil?}
61
-
62
- children = configs.map do |config|
63
- pid = Process::fork
64
-
65
- if pid.nil?
66
- retriever = GHTRepoRetriever.new(config, options[:queue])
67
-
68
- Signal.trap('TERM') {
69
- retriever.stop
70
- }
71
-
72
- retriever.run(self)
73
- exit
74
- else
75
- debug "Parent #{Process.pid} forked child #{pid}"
76
- pid
77
- end
78
- end
79
-
80
- debug 'Waiting for children'
81
- begin
82
- children.each do |pid|
83
- debug "Waiting for child #{pid}"
84
- Process.waitpid(pid, 0)
85
- debug "Child #{pid} exited"
86
- end
87
- rescue Interrupt
88
- debug 'Stopping'
89
- end
90
- end
91
15
  end
92
16
 
93
17
  class GHTRepoRetriever
94
18
 
95
19
  include GHTorrent::Settings
96
20
  include GHTorrent::Retriever
97
- include GHTorrent::Persister
98
21
 
99
22
  def initialize(config, queue)
100
23
  @config = config
@@ -105,11 +28,6 @@ class GHTRepoRetriever
105
28
  ght.logger
106
29
  end
107
30
 
108
- def persister
109
- @persister ||= connect(:mongo, settings)
110
- @persister
111
- end
112
-
113
31
  def ext_uniq
114
32
  @ext_uniq ||= config(:uniq_id)
115
33
  @ext_uniq
@@ -128,14 +46,34 @@ class GHTRepoRetriever
128
46
 
129
47
  processor = Proc.new do |msg|
130
48
  owner, repo = msg.split(/ /)
131
- user_entry = ght.transaction { ght.ensure_user(owner, false, false) }
49
+
50
+ # On rare occasions, 2 instances might try to add the same user
51
+ # at the same time, which might lead to transaction conflicts
52
+ # Give the script one more opportunity before bailing out
53
+ user_entry = nil
54
+ i = 0
55
+
56
+ while user_entry.nil? and i < 10 do
57
+ i += 1
58
+ warn("Trying to get user #{owner}, attempt #{i}")
59
+ begin
60
+ user_entry = ght.transaction { ght.ensure_user(owner, false, false) }
61
+ rescue Exception => e
62
+ warn e.message
63
+ end
64
+ end
132
65
 
133
66
  if user_entry.nil?
134
67
  warn("Cannot find user #{owner}")
135
68
  next
136
69
  end
137
70
 
138
- repo_entry = ght.transaction { ght.ensure_repo(owner, repo) }
71
+ repo_entry = ght.transaction { ght.ensure_repo(owner, repo,
72
+ commits = false,
73
+ project_members = false,
74
+ watchers = false,
75
+ forks = false,
76
+ labels = false) }
139
77
 
140
78
  if repo_entry.nil?
141
79
  warn("Cannot find repository #{owner}/#{repo}")
@@ -144,30 +82,49 @@ class GHTRepoRetriever
144
82
 
145
83
  debug("Retrieving repo #{owner}/#{repo}")
146
84
 
147
- def send_message(function, user, repo)
148
- ght.send(function, user, repo, refresh = false)
149
- end
85
+ retrieval_stages = %w(ensure_commits ensure_forks ensure_pull_requests
86
+ ensure_issues ensure_project_members
87
+ ensure_watchers ensure_labels)
150
88
 
151
- functions = %w(ensure_commits ensure_forks ensure_pull_requests
152
- ensure_issues ensure_project_members ensure_watchers ensure_labels)
89
+ retrieval_stages.each do |x|
90
+ run_retrieval_stage(ght, owner, repo, x)
91
+ end
153
92
 
154
- functions.each do |x|
93
+ # Repository owner bound data retrieval
94
+ run_retrieval_stage(ght, owner, repo, 'ensure_user_followers',
95
+ onlyuser = true)
155
96
 
156
- begin
157
- send_message(x, owner, repo)
158
- rescue Exception
159
- warn("Error processing #{x} for #{owner}/#{repo}")
160
- next
161
- end
97
+ if user_entry[:type] == 'ORG'
98
+ run_retrieval_stage(ght, owner, repo, 'ensure_org', onlyuser = true)
162
99
  end
100
+
101
+ # Cleanup
102
+ ght.dispose
103
+ ght = nil
104
+ GC.start
163
105
  end
164
106
 
165
107
  command.queue_client(@queue, :before, processor)
166
108
  end
167
109
 
110
# Runs one retrieval stage by dynamically invoking +function+ on +ght+.
#
# ght       - object responding to the method named by +function+.
# owner     - first argument handed to the stage.
# repo      - second argument handed to the stage, unless only_user is set.
# function  - name of the method to invoke on ght.
# only_user - when truthy, the stage is invoked with owner only.
#
# Ordinary errors raised by the stage are logged with warn and not
# propagated, so one failing stage does not abort the ones that follow.
# Only StandardError is rescued: Interrupt, SystemExit and similar
# non-standard exceptions still propagate so the process can be stopped.
def run_retrieval_stage(ght, owner, repo, function, only_user = false)
  stage_args = only_user ? [owner] : [owner, repo]
  ght.send(function, *stage_args)
rescue StandardError => e
  warn("Error processing #{function} for #{owner}/#{repo}")
  warn("Exception message #{e}")
  warn("Exception trace #{e.backtrace.join("\n")}")
end
123
+
168
124
  def stop
169
125
  warn('Stop flag set, waiting for operations to finish')
170
126
  @stop = true
171
127
  end
172
128
  end
173
129
 
130
+ # vim: ft=ruby: