ghtorrent 0.5 → 0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +16 -1
- data/README.md +6 -1
- data/bin/ght-data-retrieval +2 -162
- data/bin/ght-get-more-commits +6 -0
- data/bin/ght-load +1 -224
- data/bin/ght-mirror-events +2 -147
- data/bin/ght-process-event +35 -0
- data/bin/ght-retrieve-repo +6 -0
- data/bin/ght-rm-dupl +2 -130
- data/lib/ghtorrent.rb +10 -0
- data/lib/ghtorrent/adapters/base_adapter.rb +1 -1
- data/lib/ghtorrent/adapters/mongo_persister.rb +12 -1
- data/lib/ghtorrent/api_client.rb +47 -13
- data/lib/ghtorrent/bson_orderedhash.rb +2 -1
- data/lib/ghtorrent/command.rb +18 -0
- data/lib/ghtorrent/commands/ght_data_retrieval.rb +218 -0
- data/lib/ghtorrent/commands/ght_get_more_commits.rb +116 -0
- data/lib/ghtorrent/commands/ght_load.rb +227 -0
- data/lib/ghtorrent/commands/ght_mirror_events.rb +147 -0
- data/lib/ghtorrent/commands/ght_retrieve_repo.rb +118 -0
- data/lib/ghtorrent/commands/ght_rm_dupl.rb +132 -0
- data/lib/ghtorrent/ghtorrent.rb +401 -89
- data/lib/ghtorrent/hash.rb +1 -1
- data/lib/ghtorrent/migrations/011_add_issues.rb +74 -0
- data/lib/ghtorrent/retriever.rb +88 -16
- data/lib/ghtorrent/settings.rb +6 -1
- data/lib/version.rb +1 -1
- metadata +36 -26
@@ -0,0 +1,116 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'time'
|
3
|
+
|
4
|
+
require 'ghtorrent/ghtorrent'
|
5
|
+
require 'ghtorrent/settings'
|
6
|
+
require 'ghtorrent/logging'
|
7
|
+
require 'ghtorrent/command'
|
8
|
+
require 'ghtorrent/retriever'
|
9
|
+
|
10
|
+
# Command that retrieves additional commits for a GitHub repository and
# stores them through the GHTorrent mirror.
class GHTMoreCommitsRetriever < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Retriever
  include GHTorrent::Persister

  def prepare_options(options)
    options.banner <<-BANNER
Retrieves more commits for the provided repository

#{command_name} [options] owner repo

#{command_name} options:
    BANNER

    options.opt :num, 'Number of commits to retrieve',
                :short => 'n', :default => -1, :type => :int
    options.opt :full, 'Retrieve all commits, filling in potential holes',
                :short => 'f', :default => -1, :type => :int
  end

  def validate
    super
    # Both owner and repo are required (the original only checked args[0]).
    Trollop::die "Two arguments are required" \
      unless args[0] && !args[0].empty? && args[1] && !args[1].empty?

    # Bug fix: the original tested options[:all] / options[:foo], neither of
    # which is a declared option, so this check could never fire. The
    # declared options are :full (-f) and :num (-n), defaulting to -1.
    Trollop::die "-f and -n cannot be defined at the same time" \
      if options[:full] != -1 and options[:num] != -1
  end

  # Delegate logging to the mirror's logger (only valid after go has
  # initialized @ght).
  def logger
    @ght.logger
  end

  # Lazily-connected Mongo persister.
  def persister
    @persister ||= connect(:mongo, settings)
    @persister
  end

  # Cached unique-id configuration value.
  def ext_uniq
    @ext_uniq ||= config(:uniq_id)
    @ext_uniq
  end

  def go
    @ght ||= GHTorrent::Mirror.new(settings)

    # Bug fix: use the parsed positional args instead of raw ARGV (which
    # still contains any switches), and reference args[0] in the error
    # message -- the original interpolated an undefined local `owner`.
    owner = args[0]
    repo_name = args[1]

    user_entry = @ght.transaction { @ght.ensure_user(owner, false, false) }

    if user_entry.nil?
      Trollop::die "Cannot find user #{owner}"
    end

    user = user_entry[:login]

    repo_entry = @ght.transaction { @ght.ensure_repo(owner, repo_name, false, false, false) }

    if repo_entry.nil?
      Trollop::die "Cannot find repository #{owner}/#{repo_name}"
    end

    repo = repo_entry[:name]

    # -1 (the default) means "no limit"; otherwise commits come in pages
    # of 30. Bug fix: the original divided options[:n], which is not a
    # declared option -- the declared option is :num.
    num_pages = if options[:num] == -1 then 1024 * 1024 else options[:num] / 30 end
    num_pages = if options[:full] == -1 then num_pages else 1024 * 1024 end
    page = 0

    # With -f, walk backwards from the oldest commit already mirrored to
    # fill potential holes; otherwise start from the default branch tip.
    # Bug fix: the original called .select(:sha) AFTER .first, i.e. on the
    # returned row hash instead of on the Sequel dataset.
    head = if options[:full] == -1
             "master"
           else
             oldest = @ght.get_db.from(:commits).
                 where(:commits__project_id => repo_entry[:id]).
                 order(:created_at).
                 select(:sha).
                 first
             oldest.nil? ? "master" : oldest[:sha]
           end

    total_commits = 0
    while page < num_pages
      begin
        logger.debug("Retrieving more commits for #{user}/#{repo} from head: #{head}")

        commits = retrieve_commits(repo, head, user, 1)
        page += 1
        # A single result means only the head itself came back: no progress.
        if commits.nil? or commits.empty? or commits.size == 1
          break
        end

        total_commits += commits.size
        head = commits.last['sha']

        commits.map do |c|
          @ght.transaction do
            @ght.ensure_commit(repo, c['sha'], user)
          end
        end
      rescue StandardError => e
        # Bug fix: rescue StandardError, not Exception -- rescuing
        # Exception also swallows SignalException/SystemExit.
        logger.warn("Error processing: #{e}")
        logger.warn(e.backtrace.join("\n"))
      end
    end
    logger.debug("Processed #{total_commits} commits for #{user}/#{repo}")
  end
end
|
114
|
+
|
115
|
+
|
116
|
+
#vim: set filetype=ruby expandtab tabstop=2 shiftwidth=2 autoindent smartindent:
|
@@ -0,0 +1,227 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'mongo'
|
3
|
+
require 'amqp'
|
4
|
+
require 'set'
|
5
|
+
require 'eventmachine'
|
6
|
+
require 'pp'
|
7
|
+
require "amqp/extensions/rabbitmq"
|
8
|
+
|
9
|
+
require 'ghtorrent/settings'
|
10
|
+
require 'ghtorrent/logging'
|
11
|
+
require 'ghtorrent/persister'
|
12
|
+
require 'ghtorrent/command'
|
13
|
+
require 'ghtorrent/bson_orderedhash'
|
14
|
+
|
15
|
+
# Command that reads object ids from a MongoDB collection and publishes
# them to an AMQP topic exchange for further processing, using publisher
# confirms to throttle reads.
class GHTLoad < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Persister

  # Per-collection metadata: the Mongo collection handle, which JSON field
  # provides the payload and the unique value, and the routing-key template.
  def col_info()
    {
        :commits => {
            :name => "commits",
            :payload => "commit.id",
            :unq => "commit.id",
            :col => persister.get_underlying_connection.collection(:commits.to_s),
            :routekey => "commit.%s"
        },
        :events => {
            :name => "events",
            :payload => "",
            :unq => "type",
            :col => persister.get_underlying_connection.collection(:events.to_s),
            :routekey => "evt.%s"
        }
    }
  end

  # Lazily-connected Mongo persister.
  def persister
    @persister ||= connect(:mongo, @settings)
    @persister
  end

  def prepare_options(options)
    options.banner <<-BANNER
Loads object ids from a collection to a queue for further processing.

#{command_name} [options] collection

#{command_name} options:
    BANNER

    options.opt :earliest, 'Seconds since epoch of earliest item to load',
                :short => 'e', :default => 0, :type => :int
    options.opt :number, 'Number of items to load (-1 means all)',
                :short => 'n', :type => :int, :default => -1
    options.opt :filter,
                'Filter items by regexp on item attributes: item.attr=regexp',
                :short => 'f', :type => String, :multi => true
  end

  def validate
    super
    Trollop::die "no collection specified" unless args[0] && !args[0].empty?

    # Trollop's :multi always yields an Array (empty when -f is absent), so
    # the original's extra `when filter == []` branch was unreachable.
    filter = options[:filter]
    if filter.is_a?(Array)
      filter.each do |x|
        Trollop::die "not a valid filter #{x}" unless is_filter_valid?(x)
      end
    else
      Trollop::die "A filter can only be a string"
    end
  end

  def go
    # Delivery tags of published messages still awaiting publisher ack.
    awaiting_ack = SortedSet.new

    # Total number of items read so far (doubles as the Mongo skip offset).
    num_read = 0

    collection = case args[0]
                   when "events"
                     :events
                   when "commits"
                     :commits
                 end

    puts "Loading from collection #{collection}"
    puts "Loading items after #{Time.at(options[:earliest])}" if options[:verbose]
    puts "Loading #{options[:number]} items" if options[:verbose] && options[:number] != -1

    # Build the Mongo query from the -f filters (attr=regexp pairs).
    # Bug fix: the original's second `when` referenced an undefined local
    # `filter` (a latent NameError); options[:filter] is always an Array
    # here, and reducing an empty Array yields the empty query {}.
    what = options[:filter].reduce({}) { |acc, x|
      (k, r) = x.split(/=/)
      acc[k] = Regexp.new(r)
      acc
    }

    # Only load items created at or after the requested epoch.
    from = {'_id' => {'$gte' => BSON::ObjectId.from_time(Time.at(options[:earliest]))}}

    (puts "Mongo filter:"; pp what.merge(from)) if options[:verbose]

    AMQP.start(:host => config(:amqp_host),
               :port => config(:amqp_port),
               :username => config(:amqp_username),
               :password => config(:amqp_password)) do |connection|

      channel = AMQP::Channel.new(connection)
      exchange = channel.topic(config(:amqp_exchange),
                               :durable => true, :auto_delete => false)

      # What to do when the user hits Ctrl+c
      show_stopper = Proc.new {
        connection.close { EventMachine.stop }
      }

      # Read the next batch (up to 1000 items) and publish each one.
      read_and_publish = Proc.new {

        # -1 signals "nothing left to read" when a -n limit was given.
        to_read = if options.number == -1
                    1000
                  else
                    if options.number - num_read - 1 <= 0
                      -1
                    else
                      options.number - num_read - 1
                    end
                  end

        read = 0
        col_info[collection][:col].find(what.merge(from),
                                        :skip => num_read,
                                        :limit => to_read).each do |e|

          payload = read_value(e, col_info[collection][:payload])
          payload = if payload.class == BSON::OrderedHash
                      payload.delete "_id" # Inserted by MongoDB on event insert
                      payload.to_json
                    end
          read += 1
          unq = read_value(e, col_info[collection][:unq])
          # Bug fix: raise (not throw -- throw is for catch/throw control
          # flow); the nil check was redundant since nil is not a String.
          unless unq.is_a?(String)
            raise Exception.new("Unique value can only be a String")
          end

          key = col_info[collection][:routekey] % unq

          exchange.publish payload, :persistent => true, :routing_key => key

          num_read += 1
          # Bug fix: print the unique value itself; the original indexed the
          # JSON payload string with it (String#[] substring search).
          puts("Publish id = #{unq} (#{num_read} total)") if options.verbose
          awaiting_ack << num_read
        end

        # Nothing new in the DB and no msgs waiting ack
        if (read == 0 and awaiting_ack.size == 0) or to_read == -1
          puts("Finished reading, exiting")
          show_stopper.call
        end
      }

      # Remove acknowledged or failed msg tags from the queue.
      # Trigger more messages to be read when ack msg queue size drops to zero.
      publisher_event = Proc.new { |ack|
        if ack.multiple then
          awaiting_ack.delete_if { |x| x <= ack.delivery_tag }
        else
          awaiting_ack.delete ack.delivery_tag
        end

        if awaiting_ack.size == 0
          puts("ACKS.size= #{awaiting_ack.size}") if options.verbose
          EventMachine.next_tick do
            read_and_publish.call
          end
        end
      }

      # Await publisher confirms
      channel.confirm_select

      # Callback when confirms have arrived
      channel.on_ack do |ack|
        puts "ACK: tag=#{ack.delivery_tag}, mul=#{ack.multiple}" if options.verbose
        publisher_event.call(ack)
      end

      # Callback when confirms failed.
      channel.on_nack do |nack|
        puts "NACK: tag=#{nack.delivery_tag}, mul=#{nack.multiple}" if options.verbose
        publisher_event.call(nack)
      end

      # Signal handlers
      Signal.trap('INT', show_stopper)
      Signal.trap('TERM', show_stopper)

      # Trigger start processing
      EventMachine.add_timer(0.1) do
        read_and_publish.call
      end
    end
  end

  private

  # A filter is valid when it has the form attr=regexp and the regexp part
  # compiles.
  def is_filter_valid?(filter)
    (k, r) = filter.split(/=/)
    return false if r.nil?
    begin
      Regexp.new(r)
      true
    rescue
      false
    end
  end
end
|
226
|
+
|
227
|
+
#vim: set filetype=ruby expandtab tabstop=2 shiftwidth=2 autoindent smartindent:
|
@@ -0,0 +1,147 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'yaml'
|
3
|
+
require 'amqp'
|
4
|
+
require 'eventmachine'
|
5
|
+
require 'json'
|
6
|
+
require 'logger'
|
7
|
+
|
8
|
+
require 'ghtorrent/api_client'
|
9
|
+
require 'ghtorrent/settings'
|
10
|
+
require 'ghtorrent/logging'
|
11
|
+
require 'ghtorrent/persister'
|
12
|
+
require 'ghtorrent/command'
|
13
|
+
|
14
|
+
# Command that polls the GitHub public event stream, stores new events in
# MongoDB and publishes them to an AMQP exchange, self-tuning the polling
# interval based on the observed duplicate ratio.
class GHTMirrorEvents < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Logging
  include GHTorrent::Persister
  include GHTorrent::APIClient

  def logger
    @logger
  end

  # Store every event not already present in the events collection.
  # Returns [num_new, num_duplicate, stored_events].
  def store_count(events)
    stored = Array.new
    new = dupl = 0
    events.each do |e|
      if @persister.find(:events, {'id' => e['id']}).empty?
        stored << e
        new += 1
        @persister.store(:events, e)
        info "Added #{e['id']}"
      else
        info "Already got #{e['id']}"
        dupl += 1
      end
    end
    return new, dupl, stored
  end

  # Retrieve events from Github, store them in the DB and publish the new
  # ones to the exchange. Returns [num_new, num_duplicate].
  def retrieve(exchange)
    begin
      events = api_request "https://api.github.com/events", false
      (new, dupl, stored) = store_count events

      # If the first page contained no duplicates it cannot hold all new
      # events; walk the paged feed to pick up the rest.
      if dupl == 0
        events = paged_api_request "https://api.github.com/events"
        (new1, dupl1, stored1) = store_count events
        stored = stored | stored1
        new = new + new1
      end

      stored.each do |e|
        msg = JSON.dump(e)
        key = "evt.%s" % e['type']
        exchange.publish msg, :persistent => true, :routing_key => key
      end
      return new, dupl
    rescue StandardError => e
      # Bug fix: the original rescued Exception and fell through returning
      # nil, which made the caller's `dupl_msgs += dupl` crash with a
      # NoMethodError on nil. Report zero progress instead.
      STDERR.puts e.message
      STDERR.puts e.backtrace
      return 0, 0
    end
  end

  def go
    @persister = connect(:mongo, @settings)
    @logger = Logger.new(STDOUT)

    # Graceful exit
    Signal.trap('INT') {
      info "Received SIGINT, exiting"
      AMQP.stop { EM.stop }
    }
    Signal.trap('TERM') {
      info "Received SIGTERM, exiting"
      AMQP.stop { EM.stop }
    }

    # The event loop
    AMQP.start(:host => config(:amqp_host),
               :port => config(:amqp_port),
               :username => config(:amqp_username),
               :password => config(:amqp_password)) do |connection|

      # Statistics used to recalibrate event delays (start at 1 to avoid
      # division by zero before the first recalibration).
      dupl_msgs = new_msgs = 1

      debug "connected to rabbit"

      channel = AMQP::Channel.new(connection)
      exchange = channel.topic(config(:amqp_exchange), :durable => true,
                               :auto_delete => false)

      # Initial delay for the retrieve event loop
      retrieval_delay = config(:mirror_pollevery)

      # Retrieve events
      retriever = EventMachine.add_periodic_timer(retrieval_delay) do
        (new, dupl) = retrieve exchange
        dupl_msgs += dupl
        new_msgs += new
      end

      # Adjust event retrieval delay time to reduce load to Github
      EventMachine.add_periodic_timer(120) do
        ratio = (dupl_msgs.to_f / (dupl_msgs + new_msgs).to_f)

        info("Stats: #{new_msgs} new, #{dupl_msgs} duplicate, ratio: #{ratio}")

        # Few duplicates -> poll faster; many duplicates -> back off.
        # Bug fix: the original's elsif chain left new_delay nil when the
        # ratio was exactly 1.0 (only duplicates seen), crashing the
        # `retrieval_delay + new_delay` addition below.
        new_delay = if ratio >= 0 and ratio < 0.3 then
                      -1
                    elsif ratio >= 0.3 and ratio <= 0.5 then
                      0
                    else
                      +1
                    end

        # Reset counters for new loop
        dupl_msgs = new_msgs = 0

        # Update the retrieval delay and restart the event retriever
        if new_delay != 0

          # Stop the retriever task and adjust retrieval delay
          retriever.cancel
          retrieval_delay = retrieval_delay + new_delay
          info("Setting event retrieval delay to #{retrieval_delay} secs")

          # Restart the retriever
          retriever = EventMachine.add_periodic_timer(retrieval_delay) do
            (new, dupl) = retrieve exchange
            dupl_msgs += dupl
            new_msgs += new
          end
        end
      end
    end
  end
end
|
146
|
+
|
147
|
+
# vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
|