ghtorrent 0.5 → 0.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,116 @@
1
+ require 'rubygems'
2
+ require 'time'
3
+
4
+ require 'ghtorrent/ghtorrent'
5
+ require 'ghtorrent/settings'
6
+ require 'ghtorrent/logging'
7
+ require 'ghtorrent/command'
8
+ require 'ghtorrent/retriever'
9
+
10
class GHTMoreCommitsRetriever < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Retriever
  include GHTorrent::Persister

  # Declare the command-line options this command accepts.
  def prepare_options(options)
    options.banner <<-BANNER
Retrieves more commits for the provided repository

#{command_name} [options] owner repo

#{command_name} options:
    BANNER

    options.opt :num, 'Number of commits to retrieve',
                :short => 'n', :default => -1, :type => :int
    options.opt :full, 'Retrieve all commits, filling in potential holes',
                :short => 'f', :default => -1, :type => :int
  end

  # Validate command-line arguments: an owner argument is required and
  # -f / -n are mutually exclusive.
  def validate
    super
    Trollop::die "Two arguments are required" unless args[0] && !args[0].empty?

    # BUG FIX: the original tested options[:all] and options[:foo], neither
    # of which is declared above; the declared flags are :full (-f) and
    # :num (-n). Both default to -1, so -1 means "not given by the user"
    # (the .nil? checks could never fire either).
    Trollop::die "-f and -n cannot be defined at the same time" \
      if options[:full] != -1 and options[:num] != -1
  end

  # Delegate logging to the mirror's logger.
  def logger
    @ght.logger
  end

  # Lazily connect to the MongoDB persister.
  def persister
    @persister ||= connect(:mongo, settings)
    @persister
  end

  # Lazily read the unique-id generator setting.
  def ext_uniq
    @ext_uniq ||= config(:uniq_id)
    @ext_uniq
  end

  # Entry point: page backwards through the commit history of
  # ARGV[0]/ARGV[1], storing every retrieved commit through the mirror.
  def go

    @ght ||= GHTorrent::Mirror.new(settings)
    user_entry = @ght.transaction{@ght.ensure_user(ARGV[0], false, false)}

    if user_entry.nil?
      # BUG FIX: the original interpolated the undefined local `owner` here.
      Trollop::die "Cannot find user #{ARGV[0]}"
    end

    user = user_entry[:login]

    repo_entry = @ght.transaction{@ght.ensure_repo(ARGV[0], ARGV[1], false, false, false)}

    if repo_entry.nil?
      # BUG FIX: `owner` was undefined here as well.
      Trollop::die "Cannot find repository #{ARGV[0]}/#{ARGV[1]}"
    end

    repo = repo_entry[:name]
    # GitHub returns 30 commits per page; -1 means "no explicit limit".
    # BUG FIX: the original divided the undeclared options[:n] instead of
    # options[:num].
    num_pages = if options[:num] == -1 then 1024 * 1024 else options[:num] / 30 end
    num_pages = if options[:full] == -1 then num_pages else 1024 * 1024 end
    page = 0

    # With -f, resume paging from the oldest commit already stored for this
    # repo; otherwise start from the branch head.
    # BUG FIX: the original chained .first.select(:sha) -- .first returns a
    # Hash, and Hash#select does not accept a symbol argument. The column
    # selection must be applied to the dataset before fetching the row.
    head = unless options[:full] == -1
             oldest = @ght.get_db.from(:commits).\
               where(:commits__project_id => repo_entry[:id]).\
               order(:created_at).\
               select(:sha).\
               first
             oldest.nil? ? "master" : oldest[:sha]
           else
             "master"
           end

    total_commits = 0
    while (page < num_pages)
      begin
        logger.debug("Retrieving more commits for #{user}/#{repo} from head: #{head}")

        commits = retrieve_commits(repo, head, user, 1)
        page += 1
        # A single result means the page contained only the head commit we
        # asked from, i.e. there is nothing new to fetch.
        if commits.nil? or commits.empty? or commits.size == 1
          page = num_pages # To break the loop
          break
        end

        total_commits += commits.size
        head = commits.last['sha']

        # each, not map: the results are discarded, only the side effect
        # of ensure_commit matters.
        commits.each do |c|
          @ght.transaction do
            @ght.ensure_commit(repo, c['sha'], user)
          end
        end
      rescue StandardError => e
        # BUG FIX: the original rescued Exception, which also swallows
        # SignalException/SystemExit; StandardError covers runtime failures.
        logger.warn("Error processing: #{e}")
        logger.warn(e.backtrace.join("\n"))
      end
    end
    logger.debug("Processed #{total_commits} commits for #{user}/#{repo}")
  end
end
114
+
115
+
116
+ #vim: set filetype=ruby expandtab tabstop=2 shiftwidth=2 autoindent smartindent:
@@ -0,0 +1,227 @@
1
+ require 'rubygems'
2
+ require 'mongo'
3
+ require 'amqp'
4
+ require 'set'
5
+ require 'eventmachine'
6
+ require 'pp'
7
+ require "amqp/extensions/rabbitmq"
8
+
9
+ require 'ghtorrent/settings'
10
+ require 'ghtorrent/logging'
11
+ require 'ghtorrent/persister'
12
+ require 'ghtorrent/command'
13
+ require 'ghtorrent/bson_orderedhash'
14
+
15
class GHTLoad < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Persister

  # Per-collection metadata: which document attribute to publish as the
  # payload, which attribute uniquely identifies an item, the backing Mongo
  # collection, and the AMQP routing-key template.
  def col_info()
    {
      :commits => {
        :name => "commits",
        :payload => "commit.id",
        :unq => "commit.id",
        :col => persister.get_underlying_connection.collection(:commits.to_s),
        :routekey => "commit.%s"
      },
      :events => {
        :name => "events",
        :payload => "",
        :unq => "type",
        :col => persister.get_underlying_connection.collection(:events.to_s),
        :routekey => "evt.%s"
      }
    }
  end

  # Lazily connect to the MongoDB persister.
  def persister
    @persister ||= connect(:mongo, @settings)
    @persister
  end

  # Declare the command-line options this command accepts.
  def prepare_options(options)
    options.banner <<-BANNER
Loads object ids from a collection to a queue for further processing.

#{command_name} [options] collection

#{command_name} options:
    BANNER

    options.opt :earliest, 'Seconds since epoch of earliest item to load',
                :short => 'e', :default => 0, :type => :int
    options.opt :number, 'Number of items to load (-1 means all)',
                :short => 'n', :type => :int, :default => -1
    options.opt :filter,
                'Filter items by regexp on item attributes: item.attr=regexp',
                :short => 'f', :type => String, :multi => true
  end

  # Validate command-line arguments: a collection name is required and each
  # -f filter must be of the form attr=regexp with a compilable regexp.
  def validate
    super
    Trollop::die "no collection specified" unless args[0] && !args[0].empty?
    # Trollop's :multi always yields an Array ([] when -f was not given),
    # so validating each element covers every case.
    filter = options[:filter]
    if filter.is_a?(Array)
      filter.each { |x|
        Trollop::die "not a valid filter #{x}" unless is_filter_valid?(x)
      }
    else
      Trollop::die "A filter can only be a string"
    end
  end

  # Entry point: stream item ids from the chosen Mongo collection into the
  # configured AMQP topic exchange, using publisher confirms for flow control.
  def go
    # Message tags awaiting publisher ack
    awaiting_ack = SortedSet.new

    # Num events read
    num_read = 0

    collection = case args[0]
                 when "events"
                   :events
                 when "commits"
                   :commits
                 end

    puts "Loading from collection #{collection}"
    puts "Loading items after #{Time.at(options[:earliest])}" if options[:verbose]
    puts "Loading #{options[:number]} items" if options[:verbose] && options[:number] != -1

    # Build the Mongo query from the -f filters (attr=regexp pairs).
    # BUG FIX: the original's `when filter == []` branch referenced a local
    # `filter` that was never defined in this method; since :multi filters
    # are always an Array, a single reduce over them is sufficient ([]
    # reduces to {}).
    what = options[:filter].reduce({}) { |acc, x|
      (k, r) = x.split(/=/)
      acc[k] = Regexp.new(r)
      acc
    }

    from = {'_id' => {'$gte' => BSON::ObjectId.from_time(Time.at(options[:earliest]))}}

    (puts "Mongo filter:"; pp what.merge(from)) if options[:verbose]

    AMQP.start(:host => config(:amqp_host),
               :port => config(:amqp_port),
               :username => config(:amqp_username),
               :password => config(:amqp_password)) do |connection|

      channel = AMQP::Channel.new(connection)
      exchange = channel.topic(config(:amqp_exchange),
                               :durable => true, :auto_delete => false)

      # What to do when the user hits Ctrl+c
      show_stopper = Proc.new {
        connection.close { EventMachine.stop }
      }

      # Read next batch of (up to 1000) items and queue them
      read_and_publish = Proc.new {

        # -1 signals "limit reached, stop after this round".
        # CONSISTENCY FIX: use hash access (options[:number]) throughout --
        # Trollop options are a plain Hash, not method-accessible.
        to_read = if options[:number] == -1
                    1000
                  else
                    if options[:number] - num_read - 1 <= 0
                      -1
                    else
                      options[:number] - num_read - 1
                    end
                  end

        read = 0
        col_info[collection][:col].find(what.merge(from),
                                        :skip => num_read,
                                        :limit => to_read).each do |e|

          payload = read_value(e, col_info[collection][:payload])
          payload = if payload.class == BSON::OrderedHash
                      payload.delete "_id" # Inserted by MongoDB on event insert
                      payload.to_json
                    end
          read += 1
          unq = read_value(e, col_info[collection][:unq])
          # BUG FIX: the original used `throw Exception.new(...)` (throw is
          # for catch/throw symbols, not exceptions) and a redundant nil
          # check after the class check; raise with a String test covers both.
          unless unq.is_a?(String)
            raise "Unique value can only be a String"
          end

          key = col_info[collection][:routekey] % unq

          exchange.publish payload, :persistent => true, :routing_key => key

          num_read += 1
          # NOTE(review): payload is a JSON String at this point, so
          # payload[unq] is a substring lookup, not a field access -- kept
          # as-is to preserve log output; verify intent upstream.
          puts("Publish id = #{payload[unq]} (#{num_read} total)") if options[:verbose]
          awaiting_ack << num_read
        end

        # Nothing new in the DB and no msgs waiting ack
        if (read == 0 and awaiting_ack.size == 0) or to_read == -1
          puts("Finished reading, exiting")
          show_stopper.call
        end
      }

      # Remove acknowledged or failed msg tags from the queue
      # Trigger more messages to be read when ack msg queue size drops to zero
      publisher_event = Proc.new { |ack|
        if ack.multiple then
          awaiting_ack.delete_if { |x| x <= ack.delivery_tag }
        else
          awaiting_ack.delete ack.delivery_tag
        end

        if awaiting_ack.size == 0
          puts("ACKS.size= #{awaiting_ack.size}") if options[:verbose]
          EventMachine.next_tick do
            read_and_publish.call
          end
        end
      }

      # Await publisher confirms
      channel.confirm_select

      # Callback when confirms have arrived
      channel.on_ack do |ack|
        puts "ACK: tag=#{ack.delivery_tag}, mul=#{ack.multiple}" if options[:verbose]
        publisher_event.call(ack)
      end

      # Callback when confirms failed.
      channel.on_nack do |nack|
        puts "NACK: tag=#{nack.delivery_tag}, mul=#{nack.multiple}" if options[:verbose]
        publisher_event.call(nack)
      end

      # Signal handlers
      Signal.trap('INT', show_stopper)
      Signal.trap('TERM', show_stopper)

      # Trigger start processing
      EventMachine.add_timer(0.1) do
        read_and_publish.call
      end
    end
  end

  private

  # True when `filter` looks like attr=regexp and the regexp compiles.
  def is_filter_valid?(filter)
    (k, r) = filter.split(/=/)
    return false if r.nil?
    begin
      Regexp.new(r)
      true
    rescue
      false
    end
  end
end
226
+
227
+ #vim: set filetype=ruby expandtab tabstop=2 shiftwidth=2 autoindent smartindent:
@@ -0,0 +1,147 @@
1
+ require 'rubygems'
2
+ require 'yaml'
3
+ require 'amqp'
4
+ require 'eventmachine'
5
+ require 'json'
6
+ require 'logger'
7
+
8
+ require 'ghtorrent/api_client'
9
+ require 'ghtorrent/settings'
10
+ require 'ghtorrent/logging'
11
+ require 'ghtorrent/persister'
12
+ require 'ghtorrent/command'
13
+
14
class GHTMirrorEvents < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Logging
  include GHTorrent::Persister
  include GHTorrent::APIClient

  # Expose the logger built in #go to the GHTorrent::Logging mixin.
  def logger
    @logger
  end

  # Store events that are not already persisted.
  # Returns [new_count, duplicate_count, newly_stored_events].
  def store_count(events)
    stored = Array.new
    new = dupl = 0
    events.each do |e|
      if @persister.find(:events, {'id' => e['id']}).empty?
        stored << e
        new += 1
        @persister.store(:events, e)
        info "Added #{e['id']}"
      else
        info "Already got #{e['id']}"
        dupl += 1
      end
    end
    return new, dupl, stored
  end

  # Retrieve events from Github, store new ones in the DB and publish them
  # on the exchange. Returns [new_count, duplicate_count]; on error returns
  # [0, 0] so the caller's counters stay valid.
  def retrieve(exchange)
    begin
      new = dupl = 0
      events = api_request "https://api.github.com/events", false
      (new, dupl, stored) = store_count events

      # This means that first page cannot contain all new events. Go
      # up to 10 pages back to find all new events not contained in first page.
      if dupl == 0
        events = paged_api_request "https://api.github.com/events"
        (new1, dupl1, stored1) = store_count events
        stored = stored | stored1
        new = new + new1
      end

      stored.each do |e|
        msg = JSON.dump(e)
        key = "evt.%s" % e['type']
        exchange.publish msg, :persistent => true, :routing_key => key
      end
      return new, dupl
    rescue StandardError => e
      # BUG FIX: the original rescued Exception (swallowing SignalException/
      # SystemExit) and fell through returning nil, which made the caller's
      # `dupl_msgs += dupl` raise inside the periodic timer. Log and report
      # zero progress instead.
      STDERR.puts e.message
      STDERR.puts e.backtrace
      return 0, 0
    end
  end

  # Entry point: poll the Github events feed on an adaptive interval and
  # fan new events out to the AMQP topic exchange.
  def go
    @persister = connect(:mongo, @settings)
    @logger = Logger.new(STDOUT)

    # Graceful exit
    Signal.trap('INT') {
      info "Received SIGINT, exiting"
      AMQP.stop { EM.stop }
    }
    Signal.trap('TERM') {
      info "Received SIGTERM, exiting"
      AMQP.stop { EM.stop }
    }

    # The event loop
    AMQP.start(:host => config(:amqp_host),
               :port => config(:amqp_port),
               :username => config(:amqp_username),
               :password => config(:amqp_password)) do |connection|

      # Statistics used to recalibrate event delays
      dupl_msgs = new_msgs = 1

      debug "connected to rabbit"

      channel = AMQP::Channel.new(connection)
      exchange = channel.topic(config(:amqp_exchange), :durable => true,
                               :auto_delete => false)

      # Initial delay for the retrieve event loop
      retrieval_delay = config(:mirror_pollevery)

      # Retrieve events
      retriever = EventMachine.add_periodic_timer(retrieval_delay) do
        (new, dupl) = retrieve exchange
        dupl_msgs += dupl
        new_msgs += new
      end

      # Adjust event retrieval delay time to reduce load to Github
      EventMachine.add_periodic_timer(120) do
        ratio = (dupl_msgs.to_f / (dupl_msgs + new_msgs).to_f)

        info("Stats: #{new_msgs} new, #{dupl_msgs} duplicate, ratio: #{ratio}")

        # Mostly-new events: poll faster (-1s). Mostly duplicates: poll
        # slower (+1s). In between: leave the delay alone.
        new_delay = if ratio >= 0 and ratio < 0.3 then
                      -1
                    elsif ratio >= 0.3 and ratio <= 0.5 then
                      0
                    elsif ratio > 0.5 and ratio < 1 then
                      +1
                    end

        # Reset counters for new loop
        dupl_msgs = new_msgs = 0

        # Update the retrieval delay and restart the event retriever
        if new_delay != 0

          # Stop the retriever task and adjust retrieval delay
          retriever.cancel
          retrieval_delay = retrieval_delay + new_delay
          info("Setting event retrieval delay to #{retrieval_delay} secs")

          # Restart the retriever
          retriever = EventMachine.add_periodic_timer(retrieval_delay) do
            (new, dupl) = retrieve exchange
            dupl_msgs += dupl
            new_msgs += new
          end
        end
      end
    end
  end
end
146
+
147
+ # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :