ghtorrent 0.5 → 0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +16 -1
- data/README.md +6 -1
- data/bin/ght-data-retrieval +2 -162
- data/bin/ght-get-more-commits +6 -0
- data/bin/ght-load +1 -224
- data/bin/ght-mirror-events +2 -147
- data/bin/ght-process-event +35 -0
- data/bin/ght-retrieve-repo +6 -0
- data/bin/ght-rm-dupl +2 -130
- data/lib/ghtorrent.rb +10 -0
- data/lib/ghtorrent/adapters/base_adapter.rb +1 -1
- data/lib/ghtorrent/adapters/mongo_persister.rb +12 -1
- data/lib/ghtorrent/api_client.rb +47 -13
- data/lib/ghtorrent/bson_orderedhash.rb +2 -1
- data/lib/ghtorrent/command.rb +18 -0
- data/lib/ghtorrent/commands/ght_data_retrieval.rb +218 -0
- data/lib/ghtorrent/commands/ght_get_more_commits.rb +116 -0
- data/lib/ghtorrent/commands/ght_load.rb +227 -0
- data/lib/ghtorrent/commands/ght_mirror_events.rb +147 -0
- data/lib/ghtorrent/commands/ght_retrieve_repo.rb +118 -0
- data/lib/ghtorrent/commands/ght_rm_dupl.rb +132 -0
- data/lib/ghtorrent/ghtorrent.rb +401 -89
- data/lib/ghtorrent/hash.rb +1 -1
- data/lib/ghtorrent/migrations/011_add_issues.rb +74 -0
- data/lib/ghtorrent/retriever.rb +88 -16
- data/lib/ghtorrent/settings.rb +6 -1
- data/lib/version.rb +1 -1
- metadata +36 -26
@@ -0,0 +1,116 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'time'
|
3
|
+
|
4
|
+
require 'ghtorrent/ghtorrent'
|
5
|
+
require 'ghtorrent/settings'
|
6
|
+
require 'ghtorrent/logging'
|
7
|
+
require 'ghtorrent/command'
|
8
|
+
require 'ghtorrent/retriever'
|
9
|
+
|
10
|
+
# Command that retrieves additional (older) commits for a repository,
# walking the commit log backwards page by page from a head commit.
class GHTMoreCommitsRetriever < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Retriever
  include GHTorrent::Persister

  def prepare_options(options)
    options.banner <<-BANNER
Retrieves more commits for the provided repository

#{command_name} [options] owner repo

#{command_name} options:
    BANNER

    options.opt :num, 'Number of commits to retrieve',
                :short => 'n', :default => -1, :type => :int
    options.opt :full, 'Retrieve all commits, filling in potential holes',
                :short => 'f', :default => -1, :type => :int
  end

  def validate
    super
    # Both the owner and the repository name are mandatory (see banner).
    # BUGFIX: the original only validated args[0].
    Trollop::die "Two arguments are required" \
      unless args[0] && !args[0].empty? && args[1] && !args[1].empty?

    # BUGFIX: the original tested the non-existent options :all and :foo,
    # so this mutual-exclusion rule was never enforced. Trollop records
    # explicitly-given options under the "<name>_given" key.
    Trollop::die "-f and -n cannot be defined at the same time" \
      if options[:full_given] and options[:num_given]
  end

  # Delegate logging to the mirror's configured logger.
  def logger
    @ght.logger
  end

  # Lazily-connected Mongo persister.
  def persister
    @persister ||= connect(:mongo, settings)
    @persister
  end

  # Unique id attribute name, read once from configuration.
  def ext_uniq
    @ext_uniq ||= config(:uniq_id)
    @ext_uniq
  end

  def go
    @ght ||= GHTorrent::Mirror.new(settings)

    # BUGFIX: use the parsed command arguments, not raw ARGV, and bind
    # `owner` so the error message below does not hit an undefined local.
    owner = args[0]
    user_entry = @ght.transaction { @ght.ensure_user(owner, false, false) }

    if user_entry.nil?
      Trollop::die "Cannot find user #{owner}"
    end

    user = user_entry[:login]

    repo_entry = @ght.transaction { @ght.ensure_repo(owner, args[1], false, false, false) }

    if repo_entry.nil?
      Trollop::die "Cannot find repository #{owner}/#{args[1]}"
    end

    repo = repo_entry[:name]

    # Commit listings come in pages of 30; 1024 * 1024 pages means "no limit".
    # BUGFIX: the original divided options[:n], which is never set.
    num_pages = if options[:num] == -1 then 1024 * 1024 else options[:num] / 30 end
    num_pages = if options[:full] == -1 then num_pages else 1024 * 1024 end
    page = 0

    # On a full retrieval start from the oldest commit we already have, so
    # holes get filled in; otherwise walk back from the branch head.
    head = unless options[:full] == -1
             # BUGFIX: the original called `.first.select(:sha)`, i.e.
             # Hash#select on the fetched row, which raises. Project the
             # column first, fetch the row, then extract the sha.
             oldest = @ght.get_db.from(:commits).
                      where(:commits__project_id => repo_entry[:id]).
                      order(:created_at).
                      select(:sha).
                      first
             oldest.nil? ? "master" : oldest[:sha]
           else
             "master"
           end

    total_commits = 0
    while page < num_pages
      begin
        logger.debug("Retrieving more commits for #{user}/#{repo} from head: #{head}")

        commits = retrieve_commits(repo, head, user, 1)
        page += 1
        # A page containing only the head commit means nothing older exists.
        if commits.nil? or commits.empty? or commits.size == 1
          break
        end

        total_commits += commits.size
        head = commits.last['sha']

        commits.each do |c|
          @ght.transaction do
            @ght.ensure_commit(repo, c['sha'], user)
          end
        end
      rescue StandardError => e
        # BUGFIX: rescue StandardError, not Exception, so signals and
        # interpreter-level errors are not swallowed.
        logger.warn("Error processing: #{e}")
        logger.warn(e.backtrace.join("\n"))
      end
    end
    logger.debug("Processed #{total_commits} commits for #{user}/#{repo}")
  end
end
|
114
|
+
|
115
|
+
|
116
|
+
#vim: set filetype=ruby expandtab tabstop=2 shiftwidth=2 autoindent smartindent:
|
@@ -0,0 +1,227 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'mongo'
|
3
|
+
require 'amqp'
|
4
|
+
require 'set'
|
5
|
+
require 'eventmachine'
|
6
|
+
require 'pp'
|
7
|
+
require "amqp/extensions/rabbitmq"
|
8
|
+
|
9
|
+
require 'ghtorrent/settings'
|
10
|
+
require 'ghtorrent/logging'
|
11
|
+
require 'ghtorrent/persister'
|
12
|
+
require 'ghtorrent/command'
|
13
|
+
require 'ghtorrent/bson_orderedhash'
|
14
|
+
|
15
|
+
# Command that reads item ids from a MongoDB collection and publishes
# them to an AMQP topic exchange, throttled by publisher confirms.
class GHTLoad < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Persister

  # Per-collection metadata: where the payload and unique key live inside
  # a stored document, the backing Mongo collection, and the routing-key
  # template used when publishing.
  def col_info()
    {
      :commits => {
        :name => "commits",
        :payload => "commit.id",
        :unq => "commit.id",
        :col => persister.get_underlying_connection.collection(:commits.to_s),
        :routekey => "commit.%s"
      },
      :events => {
        :name => "events",
        :payload => "",
        :unq => "type",
        :col => persister.get_underlying_connection.collection(:events.to_s),
        :routekey => "evt.%s"
      }
    }
  end

  # Lazily-connected Mongo persister.
  def persister
    @persister ||= connect(:mongo, @settings)
    @persister
  end

  def prepare_options(options)
    options.banner <<-BANNER
Loads object ids from a collection to a queue for further processing.

#{command_name} [options] collection

#{command_name} options:
    BANNER

    options.opt :earliest, 'Seconds since epoch of earliest item to load',
                :short => 'e', :default => 0, :type => :int
    options.opt :number, 'Number of items to load (-1 means all)',
                :short => 'n', :type => :int, :default => -1
    options.opt :filter,
                'Filter items by regexp on item attributes: item.attr=regexp',
                :short => 'f', :type => String, :multi => true
  end

  def validate
    super
    Trollop::die "no collection specified" unless args[0] && !args[0].empty?
    filter = options[:filter]
    case
    when filter.is_a?(Array)
      filter.each { |x|
        Trollop::die "not a valid filter #{x}" unless is_filter_valid?(x)
      }
    when filter == []
      # Noop
    else
      Trollop::die "A filter can only be a string"
    end
  end

  def go
    # Delivery tags of published messages awaiting publisher ack.
    awaiting_ack = SortedSet.new

    # Number of items read from Mongo so far.
    num_read = 0

    collection = case args[0]
                 when "events"
                   :events
                 when "commits"
                   :commits
                 end

    puts "Loading from collection #{collection}"
    puts "Loading items after #{Time.at(options[:earliest])}" if options[:verbose]
    puts "Loading #{options[:number]} items" if options[:verbose] && options[:number] != -1

    # Build the Mongo query from the attr=regexp filters.
    what = case
           when options[:filter].is_a?(Array)
             options[:filter].reduce({}) { |acc, x|
               (k, r) = x.split(/=/)
               acc[k] = Regexp.new(r)
               acc
             }
           # BUGFIX: the original referenced an undefined local `filter`
           # here, a latent NameError had this branch ever been evaluated.
           when options[:filter] == []
             {}
           end

    from = {'_id' => {'$gte' => BSON::ObjectId.from_time(Time.at(options[:earliest]))}}

    (puts "Mongo filter:"; pp what.merge(from)) if options[:verbose]

    AMQP.start(:host => config(:amqp_host),
               :port => config(:amqp_port),
               :username => config(:amqp_username),
               :password => config(:amqp_password)) do |connection|

      channel = AMQP::Channel.new(connection)
      exchange = channel.topic(config(:amqp_exchange),
                               :durable => true, :auto_delete => false)

      # What to do when the user hits Ctrl+c
      show_stopper = Proc.new {
        connection.close { EventMachine.stop }
      }

      # Read the next batch of items and publish them to the exchange.
      read_and_publish = Proc.new {

        # BUGFIX (consistency): Trollop options is a Hash; use [:number]
        # and [:verbose] as elsewhere in this class, not method access.
        to_read = if options[:number] == -1
                    1000
                  else
                    if options[:number] - num_read - 1 <= 0
                      -1
                    else
                      options[:number] - num_read - 1
                    end
                  end

        read = 0
        col_info[collection][:col].find(what.merge(from),
                                        :skip => num_read,
                                        :limit => to_read).each do |e|

          payload = read_value(e, col_info[collection][:payload])
          payload = if payload.class == BSON::OrderedHash
                      payload.delete "_id" # Inserted by MongoDB on event insert
                      payload.to_json
                    end
          read += 1
          unq = read_value(e, col_info[collection][:unq])
          # BUGFIX: `throw Exception.new` expects a catch tag and fails;
          # raise instead. The nil check was redundant (nil is not String).
          unless unq.is_a?(String)
            raise Exception.new("Unique value can only be a String")
          end

          key = col_info[collection][:routekey] % unq

          exchange.publish payload, :persistent => true, :routing_key => key

          num_read += 1
          puts("Publish id = #{payload[unq]} (#{num_read} total)") if options[:verbose]
          awaiting_ack << num_read
        end

        # Nothing new in the DB and no msgs waiting ack
        if (read == 0 and awaiting_ack.size == 0) or to_read == -1
          puts("Finished reading, exiting")
          show_stopper.call
        end
      }

      # Remove acknowledged or failed msg tags from the queue.
      # Trigger more messages to be read when ack msg queue size drops to zero.
      publisher_event = Proc.new { |ack|
        if ack.multiple then
          awaiting_ack.delete_if { |x| x <= ack.delivery_tag }
        else
          awaiting_ack.delete ack.delivery_tag
        end

        if awaiting_ack.size == 0
          puts("ACKS.size= #{awaiting_ack.size}") if options[:verbose]
          EventMachine.next_tick do
            read_and_publish.call
          end
        end
      }

      # Await publisher confirms
      channel.confirm_select

      # Callback when confirms have arrived
      channel.on_ack do |ack|
        puts "ACK: tag=#{ack.delivery_tag}, mul=#{ack.multiple}" if options[:verbose]
        publisher_event.call(ack)
      end

      # Callback when confirms failed.
      channel.on_nack do |nack|
        puts "NACK: tag=#{nack.delivery_tag}, mul=#{nack.multiple}" if options[:verbose]
        publisher_event.call(nack)
      end

      # Signal handlers
      Signal.trap('INT', show_stopper)
      Signal.trap('TERM', show_stopper)

      # Trigger start processing
      EventMachine.add_timer(0.1) do
        read_and_publish.call
      end
    end
  end

  private

  # A filter is valid when it looks like attr=regexp and the regexp compiles.
  def is_filter_valid?(filter)
    (k, r) = filter.split(/=/)
    return false if r.nil?
    begin
      Regexp.new(r)
      true
    rescue
      false
    end
  end
end
|
226
|
+
|
227
|
+
#vim: set filetype=ruby expandtab tabstop=2 shiftwidth=2 autoindent smartindent:
|
@@ -0,0 +1,147 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'yaml'
|
3
|
+
require 'amqp'
|
4
|
+
require 'eventmachine'
|
5
|
+
require 'json'
|
6
|
+
require 'logger'
|
7
|
+
|
8
|
+
require 'ghtorrent/api_client'
|
9
|
+
require 'ghtorrent/settings'
|
10
|
+
require 'ghtorrent/logging'
|
11
|
+
require 'ghtorrent/persister'
|
12
|
+
require 'ghtorrent/command'
|
13
|
+
|
14
|
+
# Command that periodically polls the GitHub public event stream, stores
# new events in Mongo and republishes them to an AMQP topic exchange,
# adapting the polling interval to the observed duplicate ratio.
class GHTMirrorEvents < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Logging
  include GHTorrent::Persister
  include GHTorrent::APIClient

  def logger
    @logger
  end

  # Store events that are not already persisted.
  # Returns [new_count, duplicate_count, newly_stored_events].
  def store_count(events)
    stored = Array.new
    new = dupl = 0
    events.each do |e|
      if @persister.find(:events, {'id' => e['id']}).empty?
        stored << e
        new += 1
        @persister.store(:events, e)
        info "Added #{e['id']}"
      else
        info "Already got #{e['id']}"
        dupl += 1
      end
    end
    return new, dupl, stored
  end

  # Retrieve events from Github, store them in the DB and publish the new
  # ones to the exchange. Returns [new_count, duplicate_count].
  def retrieve(exchange)
    begin
      new = dupl = 0
      events = api_request "https://api.github.com/events", false
      (new, dupl, stored) = store_count events

      # This means that first page cannot contain all new events. Go
      # up to 10 pages back to find all new events not contained in first page.
      if dupl == 0
        events = paged_api_request "https://api.github.com/events"
        (new1, dupl1, stored1) = store_count events
        stored = stored | stored1
        new = new + new1
        # BUGFIX: the second pass's duplicate count was discarded (and a
        # dead bare `new` expression removed), skewing the delay stats.
        dupl = dupl + dupl1
      end

      stored.each do |e|
        msg = JSON.dump(e)
        key = "evt.%s" % e['type']
        exchange.publish msg, :persistent => true, :routing_key => key
      end
      return new, dupl
    rescue StandardError => e
      # BUGFIX: rescue StandardError (not Exception) and return zero
      # counts, so the periodic timer's `dupl_msgs += dupl` does not
      # blow up on nil after a failed poll.
      STDERR.puts e.message
      STDERR.puts e.backtrace
      return 0, 0
    end
  end

  def go
    @persister = connect(:mongo, @settings)
    @logger = Logger.new(STDOUT)

    # Graceful exit
    Signal.trap('INT') {
      info "Received SIGINT, exiting"
      AMQP.stop { EM.stop }
    }
    Signal.trap('TERM') {
      info "Received SIGTERM, exiting"
      AMQP.stop { EM.stop }
    }

    # The event loop
    AMQP.start(:host => config(:amqp_host),
               :port => config(:amqp_port),
               :username => config(:amqp_username),
               :password => config(:amqp_password)) do |connection|

      # Statistics used to recalibrate event delays
      dupl_msgs = new_msgs = 1

      debug "connected to rabbit"

      channel = AMQP::Channel.new(connection)
      exchange = channel.topic(config(:amqp_exchange), :durable => true,
                               :auto_delete => false)

      # Initial delay for the retrieve event loop
      retrieval_delay = config(:mirror_pollevery)

      # Retrieve events
      retriever = EventMachine.add_periodic_timer(retrieval_delay) do
        (new, dupl) = retrieve exchange
        dupl_msgs += dupl
        new_msgs += new
      end

      # Adjust event retrieval delay time to reduce load to Github
      EventMachine.add_periodic_timer(120) do
        ratio = (dupl_msgs.to_f / (dupl_msgs + new_msgs).to_f)

        info("Stats: #{new_msgs} new, #{dupl_msgs} duplicate, ratio: #{ratio}")

        # Mostly-new events: poll faster. Mostly duplicates: poll slower.
        new_delay = if ratio >= 0 and ratio < 0.3 then
                      -1
                    elsif ratio >= 0.3 and ratio <= 0.5 then
                      0
                    elsif ratio > 0.5 and ratio < 1 then
                      +1
                    end

        # Reset counters for new loop
        dupl_msgs = new_msgs = 0

        # Update the retrieval delay and restart the event retriever
        if new_delay != 0

          # Stop the retriever task and adjust retrieval delay
          retriever.cancel
          retrieval_delay = retrieval_delay + new_delay
          info("Setting event retrieval delay to #{retrieval_delay} secs")

          # Restart the retriever
          retriever = EventMachine.add_periodic_timer(retrieval_delay) do
            (new, dupl) = retrieve exchange
            dupl_msgs += dupl
            new_msgs += new
          end
        end
      end
    end
  end
end
|
146
|
+
|
147
|
+
# vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
|