ghtorrent 0.9 → 0.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -13
- data/CHANGELOG +17 -2
- data/Gemfile.lock +14 -14
- data/bin/ght-log-analyzer +133 -0
- data/bin/ght-retrieve-one +6 -0
- data/lib/ghtorrent.rb +4 -0
- data/lib/ghtorrent/adapters/mongo_persister.rb +3 -2
- data/lib/ghtorrent/api_client.rb +107 -34
- data/lib/ghtorrent/command.rb +12 -4
- data/lib/ghtorrent/commands/ght_data_retrieval.rb +26 -34
- data/lib/ghtorrent/commands/ght_get_more_commits.rb +4 -3
- data/lib/ghtorrent/commands/ght_load.rb +3 -2
- data/lib/ghtorrent/commands/ght_retrieve_one.rb +80 -0
- data/lib/ghtorrent/commands/ght_retrieve_repo.rb +3 -3
- data/lib/ghtorrent/commands/ght_retrieve_repos.rb +57 -100
- data/lib/ghtorrent/commands/ght_retrieve_user.rb +6 -5
- data/lib/ghtorrent/ghtorrent.rb +188 -74
- data/lib/ghtorrent/migrations/016_add_actor_pull_request_history.rb +1 -1
- data/lib/ghtorrent/migrations/017_drop_forks_table.rb +24 -0
- data/lib/ghtorrent/migrations/018_drop_merged_user_from_pull_requests.rb +23 -0
- data/lib/ghtorrent/migrations/019_add_fake_to_users.rb +33 -0
- data/lib/ghtorrent/multiprocess_queue_client.rb +105 -0
- data/lib/ghtorrent/persister.rb +4 -1
- data/lib/ghtorrent/retriever.rb +115 -105
- data/lib/ghtorrent/settings.rb +6 -2
- data/lib/ghtorrent/transacted_ghtorrent.rb +9 -2
- data/lib/version.rb +1 -1
- metadata +55 -10
data/lib/ghtorrent/command.rb
CHANGED
@@ -54,6 +54,12 @@ module GHTorrent
|
|
54
54
|
command.options[:password])
|
55
55
|
end
|
56
56
|
|
57
|
+
unless command.options[:token].nil?
|
58
|
+
command.settings = command.override_config(command.settings,
|
59
|
+
:github_token,
|
60
|
+
command.options[:token])
|
61
|
+
end
|
62
|
+
|
57
63
|
begin
|
58
64
|
command.go
|
59
65
|
rescue => e
|
@@ -86,6 +92,8 @@ Standard options:
|
|
86
92
|
:type => String
|
87
93
|
opt :username, 'Username at Github', :short => 's', :type => String
|
88
94
|
opt :password, 'Password at Github', :type => String
|
95
|
+
opt :token, 'OAuth Github token (use instead of username/password)',
|
96
|
+
:type => String, :short => 't'
|
89
97
|
end
|
90
98
|
end
|
91
99
|
|
@@ -166,13 +174,13 @@ Standard options:
|
|
166
174
|
:ack => true) do |delivery_info, properties, msg|
|
167
175
|
|
168
176
|
if ack == :before
|
169
|
-
ch.acknowledge(delivery_info.delivery_tag
|
177
|
+
ch.acknowledge(delivery_info.delivery_tag)
|
170
178
|
end
|
171
179
|
|
172
180
|
begin
|
173
181
|
block.call(msg)
|
174
182
|
ensure
|
175
|
-
ch.acknowledge(delivery_info.delivery_tag
|
183
|
+
ch.acknowledge(delivery_info.delivery_tag)
|
176
184
|
end
|
177
185
|
end
|
178
186
|
|
@@ -197,8 +205,8 @@ Standard options:
|
|
197
205
|
end
|
198
206
|
|
199
207
|
def override_config(config_file, setting, new_value)
|
200
|
-
puts "Overriding configuration #{setting}=#{config(setting)} with
|
201
|
-
|
208
|
+
puts "Overriding configuration #{setting}=#{config(setting)} with new value #{new_value}"
|
209
|
+
super(config_file, setting, new_value)
|
202
210
|
end
|
203
211
|
|
204
212
|
private
|
@@ -81,7 +81,7 @@ class GHTDataRetrieval < GHTorrent::Command
|
|
81
81
|
repo = data['repo']['name'].split(/\//)[1]
|
82
82
|
fork_id = data['payload']['forkee']['id']
|
83
83
|
|
84
|
-
ghtorrent.get_fork(owner, repo, fork_id)
|
84
|
+
#ghtorrent.get_fork(owner, repo, fork_id)
|
85
85
|
end
|
86
86
|
|
87
87
|
def PullRequestReviewCommentEvent(data)
|
@@ -119,20 +119,15 @@ class GHTDataRetrieval < GHTorrent::Command
|
|
119
119
|
|
120
120
|
def prepare_options(options)
|
121
121
|
options.banner <<-BANNER
|
122
|
-
Retrieves events from queues and processes them through GHTorrent
|
123
|
-
|
124
|
-
|
125
|
-
#{command_name} options:
|
122
|
+
Retrieves events from queues and processes them through GHTorrent.
|
123
|
+
If event_id is provided, only this event is processed.
|
124
|
+
#{command_name} [event_id]
|
126
125
|
BANNER
|
127
126
|
|
128
|
-
options.opt :filter,
|
129
|
-
'Only process messages for repos in the provided file',
|
130
|
-
:short => 'f', :type => String
|
131
127
|
end
|
132
128
|
|
133
129
|
def validate
|
134
130
|
super
|
135
|
-
Trollop::die "Filter file does not exist" if options[:filter] and not File.exist?(options[:filter])
|
136
131
|
end
|
137
132
|
|
138
133
|
def logger
|
@@ -144,19 +139,26 @@ Retrieves events from queues and processes them through GHTorrent
|
|
144
139
|
@gh
|
145
140
|
end
|
146
141
|
|
142
|
+
def retrieve_event(evt_id)
|
143
|
+
event = persister.get_underlying_connection[:events].find_one('id' => evt_id)
|
144
|
+
event.delete '_id'
|
145
|
+
data = parse(event.to_json)
|
146
|
+
info "GHTDataRetrieval: Processing event: #{data['type']}-#{data['id']}"
|
147
|
+
data
|
148
|
+
end
|
149
|
+
|
147
150
|
def go
|
148
|
-
filter = Array.new
|
149
|
-
|
150
|
-
if options[:filter]
|
151
|
-
File.open(options[:filter]).each { |l|
|
152
|
-
next if l.match(/^ *#/)
|
153
|
-
parts = l.split(/ /)
|
154
|
-
next if parts.size < 2
|
155
|
-
debug "GHTDataRetrieval: Filtering events by #{parts[0] + "/" + parts[1]}"
|
156
|
-
filter << parts[0] + "/" + parts[1]
|
157
|
-
}
|
158
|
-
end
|
159
151
|
|
152
|
+
unless ARGV[0].nil?
|
153
|
+
event = retrieve_event(ARGV[0])
|
154
|
+
|
155
|
+
if event.nil?
|
156
|
+
warn "GHTDataRetrieval: No event with id: #{ARGV[0]}"
|
157
|
+
else
|
158
|
+
send(event['type'], event)
|
159
|
+
end
|
160
|
+
return
|
161
|
+
end
|
160
162
|
|
161
163
|
conn = Bunny.new(:host => config(:amqp_host),
|
162
164
|
:port => config(:amqp_port),
|
@@ -181,20 +183,9 @@ Retrieves events from queues and processes them through GHTorrent
|
|
181
183
|
queue.subscribe(:ack => true) do |headers, properties, msg|
|
182
184
|
begin
|
183
185
|
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
info "GHTDataRetrieval: Processing event: #{data['type']}-#{data['id']}"
|
188
|
-
|
189
|
-
unless options[:filter].nil?
|
190
|
-
if filter.include?(data['repo']['name'])
|
191
|
-
send(h, data)
|
192
|
-
else
|
193
|
-
info "GHTDataRetrieval: Repo #{data['repo']['name']} not in process list. Ignoring event #{data['type']}-#{data['id']}"
|
194
|
-
end
|
195
|
-
else
|
196
|
-
send(h, data)
|
197
|
-
end
|
186
|
+
data = retrieve_event(msg)
|
187
|
+
send(h, data)
|
188
|
+
|
198
189
|
channel.acknowledge(headers.delivery_tag, false)
|
199
190
|
info "GHTDataRetrieval: Processed event: #{data['type']}-#{data['id']}"
|
200
191
|
rescue Exception => e
|
@@ -227,6 +218,7 @@ Retrieves events from queues and processes them through GHTorrent
|
|
227
218
|
conn.close unless conn.nil?
|
228
219
|
|
229
220
|
end
|
221
|
+
|
230
222
|
end
|
231
223
|
|
232
224
|
# vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
|
@@ -61,7 +61,7 @@ Retrieves more commits for the provided repository
|
|
61
61
|
|
62
62
|
user = user_entry[:login]
|
63
63
|
|
64
|
-
repo_entry = @ght.transaction{@ght.ensure_repo(ARGV[0], ARGV[1]
|
64
|
+
repo_entry = @ght.transaction{@ght.ensure_repo(ARGV[0], ARGV[1])}
|
65
65
|
|
66
66
|
if repo_entry.nil?
|
67
67
|
Trollop::die "Cannot find repository #{owner}/#{ARGV[1]}"
|
@@ -75,7 +75,7 @@ Retrieves more commits for the provided repository
|
|
75
75
|
order(:created_at).\
|
76
76
|
first[:sha]
|
77
77
|
else
|
78
|
-
|
78
|
+
nil
|
79
79
|
end
|
80
80
|
|
81
81
|
total_commits = 0
|
@@ -84,7 +84,8 @@ Retrieves more commits for the provided repository
|
|
84
84
|
begin
|
85
85
|
logger.debug("Retrieving more commits for #{user}/#{repo} from head: #{head}")
|
86
86
|
|
87
|
-
|
87
|
+
@settings = override_config(@settings, :mirror_history_pages_back, 1)
|
88
|
+
commits = retrieve_commits(repo, head, user)
|
88
89
|
|
89
90
|
if commits.nil? or commits.empty? or commits.size == 1
|
90
91
|
break
|
@@ -107,13 +107,14 @@ Loads object ids from a collection to a queue for further processing.
|
|
107
107
|
:routing_key => "evt.#{e['type']}"
|
108
108
|
|
109
109
|
total_read += 1
|
110
|
-
puts "Publish id = #{e['id']} (#{total_read} read)" if options.verbose
|
110
|
+
puts "Publish id = #{e['id']} #{e['created_at']} (#{total_read} read)" if options.verbose
|
111
111
|
|
112
112
|
if total_read >= options[:number]
|
113
113
|
puts 'Finished reading, exiting'
|
114
|
-
|
114
|
+
return
|
115
115
|
end
|
116
116
|
end
|
117
|
+
stopped = true
|
117
118
|
rescue Interrupt
|
118
119
|
puts 'Interrupted'
|
119
120
|
stopped = true
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
|
3
|
+
require 'ghtorrent/ghtorrent'
|
4
|
+
require 'ghtorrent/settings'
|
5
|
+
require 'ghtorrent/logging'
|
6
|
+
require 'ghtorrent/command'
|
7
|
+
require 'ghtorrent/retriever'
|
8
|
+
|
9
|
+
class GHTRetrieveOne < GHTorrent::Command
|
10
|
+
|
11
|
+
include GHTorrent::Settings
|
12
|
+
include GHTorrent::Retriever
|
13
|
+
include GHTorrent::Persister
|
14
|
+
|
15
|
+
def prepare_options(options)
|
16
|
+
options.banner <<-BANNER
|
17
|
+
Retrieve just one item
|
18
|
+
|
19
|
+
#{command_name} [options] <what> options...
|
20
|
+
what can have the following values and arguments
|
21
|
+
* pullreq <owner> <repo> <github_id>
|
22
|
+
* issue <owner> <repo> <github_id>
|
23
|
+
BANNER
|
24
|
+
end
|
25
|
+
|
26
|
+
|
27
|
+
def validate
|
28
|
+
super
|
29
|
+
Trollop::die 'One argument required' unless args[0] && !args[0].empty?
|
30
|
+
end
|
31
|
+
|
32
|
+
def logger
|
33
|
+
ght.logger
|
34
|
+
end
|
35
|
+
|
36
|
+
def persister
|
37
|
+
@persister ||= connect(:mongo, settings)
|
38
|
+
@persister
|
39
|
+
end
|
40
|
+
|
41
|
+
def ext_uniq
|
42
|
+
@ext_uniq ||= config(:uniq_id)
|
43
|
+
@ext_uniq
|
44
|
+
end
|
45
|
+
|
46
|
+
def ght
|
47
|
+
@ght ||= TransactedGhtorrent.new(settings)
|
48
|
+
@ght
|
49
|
+
end
|
50
|
+
|
51
|
+
def go
|
52
|
+
|
53
|
+
ght.get_db
|
54
|
+
case ARGV[0]
|
55
|
+
when /pullreq/
|
56
|
+
retrieve_pullreq(ARGV[1..-1])
|
57
|
+
when /issue/
|
58
|
+
retrieve_issue(ARGV[1..-1])
|
59
|
+
else
|
60
|
+
Trollop::die "Don't know how to retrieve #{ARGV[0]}"
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
def retrieve_pullreq(args)
|
65
|
+
owner = args[0]
|
66
|
+
repo = args[1]
|
67
|
+
pull_req_id = args[2]
|
68
|
+
|
69
|
+
ght.ensure_pull_request(owner, repo, pull_req_id)
|
70
|
+
end
|
71
|
+
|
72
|
+
def retrieve_issue(args)
|
73
|
+
owner = args[0]
|
74
|
+
repo = args[1]
|
75
|
+
issue_id = args[2]
|
76
|
+
|
77
|
+
ght.ensure_issue(owner, repo, issue_id)
|
78
|
+
end
|
79
|
+
|
80
|
+
end
|
@@ -41,12 +41,12 @@ An efficient way to get all data for a single repo
|
|
41
41
|
end
|
42
42
|
|
43
43
|
def ght
|
44
|
-
@ght ||=
|
44
|
+
@ght ||= TransactedGhtorrent.new(settings)
|
45
45
|
@ght
|
46
46
|
end
|
47
47
|
|
48
48
|
def go
|
49
|
-
self.settings = override_config(settings, :mirror_history_pages_back,
|
49
|
+
self.settings = override_config(settings, :mirror_history_pages_back, 1000)
|
50
50
|
user_entry = ght.transaction{ght.ensure_user(ARGV[0], false, false)}
|
51
51
|
|
52
52
|
if user_entry.nil?
|
@@ -64,7 +64,7 @@ An efficient way to get all data for a single repo
|
|
64
64
|
repo = repo_entry[:name]
|
65
65
|
|
66
66
|
def send_message(function, user, repo)
|
67
|
-
ght.send(function, user, repo
|
67
|
+
ght.send(function, user, repo)
|
68
68
|
end
|
69
69
|
|
70
70
|
functions = %w(ensure_commits ensure_forks ensure_pull_requests
|
@@ -3,98 +3,21 @@ require 'ghtorrent/settings'
|
|
3
3
|
require 'ghtorrent/logging'
|
4
4
|
require 'ghtorrent/command'
|
5
5
|
require 'ghtorrent/retriever'
|
6
|
+
require 'ghtorrent/multiprocess_queue_client'
|
6
7
|
require "bunny"
|
7
8
|
|
9
|
+
class GHTRetrieveRepos < MultiprocessQueueClient
|
8
10
|
|
9
|
-
|
10
|
-
|
11
|
-
include GHTorrent::Settings
|
12
|
-
include GHTorrent::Logging
|
13
|
-
|
14
|
-
def logger
|
15
|
-
@logger ||= Logger.new(STDOUT)
|
16
|
-
@logger
|
17
|
-
end
|
18
|
-
|
19
|
-
def prepare_options(options)
|
20
|
-
options.banner <<-BANNER
|
21
|
-
Retrieve data for multiple repos in parallel. To work, it requires
|
22
|
-
a mapping file formatted as follows:
|
23
|
-
|
24
|
-
IP UNAME PASSWD NUM_PROCS where
|
25
|
-
|
26
|
-
IP = address to use for outgoing requests (use 0.0.0.0 on non-multihomed hosts)
|
27
|
-
UNAME = Github user name to use for outgoing requests
|
28
|
-
PASSWD = Github password to use for outgoing requests
|
29
|
-
NUM_PROCS = Number of processes to spawn for this IP/UNAME combination
|
30
|
-
|
31
|
-
Values in the config.yaml file set with the -c command are overriden.
|
32
|
-
|
33
|
-
#{command_name} [options] mapping-file
|
34
|
-
|
35
|
-
BANNER
|
36
|
-
options.opt :queue, 'Queue to retrieve project names from',
|
37
|
-
:short => 'q', :default => 'retrieve-repo', :type => :string
|
38
|
-
|
11
|
+
def clazz
|
12
|
+
GHTRepoRetriever
|
39
13
|
end
|
40
14
|
|
41
|
-
def validate
|
42
|
-
super
|
43
|
-
Trollop::die 'Argument mapping-file is required' unless not args[0].nil?
|
44
|
-
end
|
45
|
-
|
46
|
-
def go
|
47
|
-
|
48
|
-
configs = File.open(ARGV[0]).readlines.map do |line|
|
49
|
-
next if line =~ /^#/
|
50
|
-
ip,name,passwd,instances = line.strip.split(/ /)
|
51
|
-
(1..instances.to_i).map do |i|
|
52
|
-
newcfg = self.settings.clone
|
53
|
-
newcfg = override_config(newcfg, :attach_ip, ip)
|
54
|
-
newcfg = override_config(newcfg, :github_username, name)
|
55
|
-
newcfg = override_config(newcfg, :github_passwd, passwd)
|
56
|
-
newcfg = override_config(newcfg, :mirror_history_pages_back, 1000)
|
57
|
-
newcfg = override_config(newcfg, :mirror_commit_pages_new_repo, 1000)
|
58
|
-
newcfg
|
59
|
-
end
|
60
|
-
end.flatten.select{|x| !x.nil?}
|
61
|
-
|
62
|
-
children = configs.map do |config|
|
63
|
-
pid = Process::fork
|
64
|
-
|
65
|
-
if pid.nil?
|
66
|
-
retriever = GHTRepoRetriever.new(config, options[:queue])
|
67
|
-
|
68
|
-
Signal.trap('TERM') {
|
69
|
-
retriever.stop
|
70
|
-
}
|
71
|
-
|
72
|
-
retriever.run(self)
|
73
|
-
exit
|
74
|
-
else
|
75
|
-
debug "Parent #{Process.pid} forked child #{pid}"
|
76
|
-
pid
|
77
|
-
end
|
78
|
-
end
|
79
|
-
|
80
|
-
debug 'Waiting for children'
|
81
|
-
begin
|
82
|
-
children.each do |pid|
|
83
|
-
debug "Waiting for child #{pid}"
|
84
|
-
Process.waitpid(pid, 0)
|
85
|
-
debug "Child #{pid} exited"
|
86
|
-
end
|
87
|
-
rescue Interrupt
|
88
|
-
debug 'Stopping'
|
89
|
-
end
|
90
|
-
end
|
91
15
|
end
|
92
16
|
|
93
17
|
class GHTRepoRetriever
|
94
18
|
|
95
19
|
include GHTorrent::Settings
|
96
20
|
include GHTorrent::Retriever
|
97
|
-
include GHTorrent::Persister
|
98
21
|
|
99
22
|
def initialize(config, queue)
|
100
23
|
@config = config
|
@@ -105,11 +28,6 @@ class GHTRepoRetriever
|
|
105
28
|
ght.logger
|
106
29
|
end
|
107
30
|
|
108
|
-
def persister
|
109
|
-
@persister ||= connect(:mongo, settings)
|
110
|
-
@persister
|
111
|
-
end
|
112
|
-
|
113
31
|
def ext_uniq
|
114
32
|
@ext_uniq ||= config(:uniq_id)
|
115
33
|
@ext_uniq
|
@@ -128,14 +46,34 @@ class GHTRepoRetriever
|
|
128
46
|
|
129
47
|
processor = Proc.new do |msg|
|
130
48
|
owner, repo = msg.split(/ /)
|
131
|
-
|
49
|
+
|
50
|
+
# On rare occasions, 2 instances might try to add the same user
|
51
|
+
# at the same time, which might lead to transaction conflicts
|
52
|
+
# Retry the lookup (up to 10 attempts in total) before bailing out
|
53
|
+
user_entry = nil
|
54
|
+
i = 0
|
55
|
+
|
56
|
+
while user_entry.nil? and i < 10 do
|
57
|
+
i += 1
|
58
|
+
warn("Trying to get user #{owner}, attempt #{i}")
|
59
|
+
begin
|
60
|
+
user_entry = ght.transaction { ght.ensure_user(owner, false, false) }
|
61
|
+
rescue Exception => e
|
62
|
+
warn e.message
|
63
|
+
end
|
64
|
+
end
|
132
65
|
|
133
66
|
if user_entry.nil?
|
134
67
|
warn("Cannot find user #{owner}")
|
135
68
|
next
|
136
69
|
end
|
137
70
|
|
138
|
-
repo_entry = ght.transaction { ght.ensure_repo(owner, repo
|
71
|
+
repo_entry = ght.transaction { ght.ensure_repo(owner, repo,
|
72
|
+
commits = false,
|
73
|
+
project_members = false,
|
74
|
+
watchers = false,
|
75
|
+
forks = false,
|
76
|
+
labels = false) }
|
139
77
|
|
140
78
|
if repo_entry.nil?
|
141
79
|
warn("Cannot find repository #{owner}/#{repo}")
|
@@ -144,30 +82,49 @@ class GHTRepoRetriever
|
|
144
82
|
|
145
83
|
debug("Retrieving repo #{owner}/#{repo}")
|
146
84
|
|
147
|
-
|
148
|
-
|
149
|
-
|
85
|
+
retrieval_stages = %w(ensure_commits ensure_forks ensure_pull_requests
|
86
|
+
ensure_issues ensure_project_members
|
87
|
+
ensure_watchers ensure_labels)
|
150
88
|
|
151
|
-
|
152
|
-
|
89
|
+
retrieval_stages.each do |x|
|
90
|
+
run_retrieval_stage(ght, owner, repo, x)
|
91
|
+
end
|
153
92
|
|
154
|
-
|
93
|
+
# Repository owner bound data retrieval
|
94
|
+
run_retrieval_stage(ght, owner, repo, 'ensure_user_followers',
|
95
|
+
onlyuser = true)
|
155
96
|
|
156
|
-
|
157
|
-
|
158
|
-
rescue Exception
|
159
|
-
warn("Error processing #{x} for #{owner}/#{repo}")
|
160
|
-
next
|
161
|
-
end
|
97
|
+
if user_entry[:type] == 'ORG'
|
98
|
+
run_retrieval_stage(ght, owner, repo, 'ensure_org', onlyuser = true)
|
162
99
|
end
|
100
|
+
|
101
|
+
# Cleanup
|
102
|
+
ght.dispose
|
103
|
+
ght = nil
|
104
|
+
GC.start
|
163
105
|
end
|
164
106
|
|
165
107
|
command.queue_client(@queue, :before, processor)
|
166
108
|
end
|
167
109
|
|
110
|
+
def run_retrieval_stage(ght, owner, repo, function, only_user = false)
|
111
|
+
begin
|
112
|
+
if only_user
|
113
|
+
ght.send(function, owner)
|
114
|
+
else
|
115
|
+
ght.send(function, owner, repo)
|
116
|
+
end
|
117
|
+
rescue Exception => e
|
118
|
+
warn("Error processing #{function} for #{owner}/#{repo}")
|
119
|
+
warn("Exception message #{$!}")
|
120
|
+
warn("Exception trace #{e.backtrace.join("\n")}")
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
168
124
|
def stop
|
169
125
|
warn('Stop flag set, waiting for operations to finish')
|
170
126
|
@stop = true
|
171
127
|
end
|
172
128
|
end
|
173
129
|
|
130
|
+
# vim: ft=ruby:
|