ghtorrent 0.5 → 0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,116 @@
1
+ require 'rubygems'
2
+ require 'time'
3
+
4
+ require 'ghtorrent/ghtorrent'
5
+ require 'ghtorrent/settings'
6
+ require 'ghtorrent/logging'
7
+ require 'ghtorrent/command'
8
+ require 'ghtorrent/retriever'
9
+
10
# Command that walks a repository's commit history backwards, page by page,
# and hands every commit it sees to GHTorrent::Mirror#ensure_commit.
#
# Usage: <command> [options] owner repo
class GHTMoreCommitsRetriever < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Retriever
  include GHTorrent::Persister

  # Default value of the :num and :full options, i.e. "switch not given".
  OPTION_UNSET = -1

  # Page budget used when no explicit commit limit applies.
  MAX_PAGES = 1024 * 1024

  # Divisor that turns a commit count into a page count.
  # NOTE(review): presumably the API page size -- confirm.
  COMMITS_PER_PAGE = 30

  # How many times in a row the same request may fail before the walk stops.
  MAX_STALLED_RETRIES = 3

  # Declares the banner and the -n / -f switches on the option parser.
  def prepare_options(options)
    options.banner <<-BANNER
Retrieves more commits for the provided repository

#{command_name} [options] owner repo

#{command_name} options:
    BANNER

    options.opt :num, 'Number of commits to retrieve',
                :short => 'n', :default => OPTION_UNSET, :type => :int
    options.opt :full, 'Retrieve all commits, filling in potential holes',
                :short => 'f', :default => OPTION_UNSET, :type => :int
  end

  # Aborts (through Trollop::die) unless both positional arguments are present
  # and at most one of -f / -n was supplied.
  def validate
    super
    owner, repo = args[0], args[1]
    if owner.nil? || owner.empty? || repo.nil? || repo.empty?
      Trollop::die "Two arguments are required"
    end

    # Both options default to OPTION_UNSET, never nil, so compare against it.
    if options[:full] != OPTION_UNSET && options[:num] != OPTION_UNSET
      Trollop::die "-f and -n cannot be defined at the same time"
    end
  end

  # Logger of the mirror instance; only usable once #go has created @ght.
  def logger
    @ght.logger
  end

  # Lazily opened :mongo persister.
  def persister
    @persister ||= connect(:mongo, settings)
    @persister
  end

  # Memoised value of the :uniq_id configuration entry.
  def ext_uniq
    @ext_uniq ||= config(:uniq_id)
    @ext_uniq
  end

  # Resolves owner and repository, then fetches commit pages starting at
  # +head+ until the page budget is used up or a page brings nothing new.
  def go
    owner     = ARGV[0]
    repo_name = ARGV[1]

    @ght ||= GHTorrent::Mirror.new(settings)

    user_entry = @ght.transaction { @ght.ensure_user(owner, false, false) }
    Trollop::die "Cannot find user #{owner}" if user_entry.nil?
    user = user_entry[:login]

    repo_entry = @ght.transaction do
      @ght.ensure_repo(owner, repo_name, false, false, false)
    end
    Trollop::die "Cannot find repository #{owner}/#{repo_name}" if repo_entry.nil?
    repo = repo_entry[:name]

    full = options[:full] != OPTION_UNSET

    # Round up so that asking for fewer commits than one page still fetches
    # a page instead of nothing.
    num_pages = if full || options[:num] == OPTION_UNSET
                  MAX_PAGES
                else
                  (options[:num] / COMMITS_PER_PAGE.to_f).ceil
                end

    # NOTE(review): with -f the walk starts at the oldest stored commit and
    # otherwise at "master"; this mirrors the original branches -- confirm
    # that the two are not meant to be the other way round.
    head = full ? oldest_known_sha(repo_entry[:id]) : "master"

    total_commits = 0
    stalled = 0
    page = 0
    while page < num_pages
      # Count the attempt first so a raising request still uses page budget.
      page += 1
      head_before = head
      begin
        logger.debug("Retrieving more commits for #{user}/#{repo} from head: #{head}")

        commits = retrieve_commits(repo, head, user, 1)
        # A single-element page ends the walk, exactly like an empty one.
        break if commits.nil? || commits.empty? || commits.size == 1

        total_commits += commits.size
        head = commits.last['sha']
        stalled = 0

        commits.each do |c|
          @ght.transaction { @ght.ensure_commit(repo, c['sha'], user) }
        end
      rescue StandardError => e
        logger.warn("Error processing: #{e}")
        logger.warn(e.backtrace.join("\n"))

        # An unchanged head means the next iteration would repeat the very
        # same failing request; stop after a few such attempts.
        if head == head_before
          stalled += 1
          if stalled >= MAX_STALLED_RETRIES
            logger.warn("Giving up after #{stalled} consecutive failures at head: #{head}")
            break
          end
        end
      end
    end
    logger.debug("Processed #{total_commits} commits for #{user}/#{repo}")
  end

  private

  # SHA of the stored commit with the smallest created_at for +project_id+,
  # or "master" when the query returns no row.
  # Assumes @ght.get_db returns a Sequel database -- TODO confirm.
  def oldest_known_sha(project_id)
    oldest = @ght.get_db.from(:commits).
               where(:commits__project_id => project_id).
               order(:created_at).
               select(:sha).
               first
    oldest.nil? ? "master" : oldest[:sha]
  end
end
114
+
115
+
116
+ #vim: set filetype=ruby expandtab tabstop=2 shiftwidth=2 autoindent smartindent:
@@ -0,0 +1,227 @@
1
+ require 'rubygems'
2
+ require 'mongo'
3
+ require 'amqp'
4
+ require 'set'
5
+ require 'eventmachine'
6
+ require 'pp'
7
+ require "amqp/extensions/rabbitmq"
8
+
9
+ require 'ghtorrent/settings'
10
+ require 'ghtorrent/logging'
11
+ require 'ghtorrent/persister'
12
+ require 'ghtorrent/command'
13
+ require 'ghtorrent/bson_orderedhash'
14
+
15
# Command that reads documents from a MongoDB collection and publishes one
# AMQP message per document on a topic exchange, using publisher confirms to
# pace the reads.
#
# Usage: <command> [options] collection
class GHTLoad < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Persister

  # Collection names accepted on the command line, mapped to #col_info keys.
  KNOWN_COLLECTIONS = { "events" => :events, "commits" => :commits }.freeze

  # Per-collection settings: the field paths handed to read_value for the
  # message payload (:payload) and the routing-key value (:unq), the Mongo
  # collection handle (:col) and the routing-key format (:routekey).
  def col_info()
    {
      :commits => {
        :name     => "commits",
        :payload  => "commit.id",
        :unq      => "commit.id",
        :col      => persister.get_underlying_connection.collection("commits"),
        :routekey => "commit.%s"
      },
      :events => {
        :name     => "events",
        :payload  => "",
        :unq      => "type",
        :col      => persister.get_underlying_connection.collection("events"),
        :routekey => "evt.%s"
      }
    }
  end

  # Lazily opened :mongo persister.
  def persister
    @persister ||= connect(:mongo, @settings)
    @persister
  end

  # Declares the banner and the -e / -n / -f switches on the option parser.
  def prepare_options(options)
    options.banner <<-BANNER
Loads object ids from a collection to a queue for further processing.

#{command_name} [options] collection

#{command_name} options:
    BANNER

    options.opt :earliest, 'Seconds since epoch of earliest item to load',
                :short => 'e', :default => 0, :type => :int
    options.opt :number, 'Number of items to load (-1 means all)',
                :short => 'n', :type => :int, :default => -1
    options.opt :filter,
                'Filter items by regexp on item attributes: item.attr=regexp',
                :short => 'f', :type => String, :multi => true
  end

  # Aborts (through Trollop::die) when no collection is named or when a
  # filter is not of the form attr=regexp.
  def validate
    super
    Trollop::die "no collection specified" unless args[0] && !args[0].empty?

    filter = options[:filter]
    case filter
    when nil
      # No filter given.
    when Array
      filter.each do |x|
        Trollop::die "not a valid filter #{x}" unless is_filter_valid?(x)
      end
    else
      Trollop::die "A filter can only be a string"
    end
  end

  # Connects to the broker and publishes matching documents in batches; a
  # new batch is read each time every outstanding message has been confirmed.
  def go
    # Message tags awaiting publisher ack.
    # NOTE(review): SortedSet is no longer part of the `set` standard library
    # from Ruby 3.0 on -- confirm the targeted Ruby version.
    awaiting_ack = SortedSet.new

    # Number of items read so far
    num_read = 0

    collection = KNOWN_COLLECTIONS[args[0]]
    # Fail here rather than with a NoMethodError inside the event loop.
    Trollop::die "unknown collection #{args[0]}" if collection.nil?

    puts "Loading from collection #{collection}"
    puts "Loading items after #{Time.at(options[:earliest])}" if options[:verbose]
    puts "Loading #{options[:number]} items" if options[:verbose] && options[:number] != -1

    # Turn every attr=regexp filter into a Mongo regexp condition. Split on
    # the first "=" only, so that the regexp itself may contain "=".
    what = Array(options[:filter]).reduce({}) do |acc, x|
      k, r = x.split('=', 2)
      acc[k] = Regexp.new(r)
      acc
    end

    from = {'_id' => {'$gte' => BSON::ObjectId.from_time(Time.at(options[:earliest]))}}

    if options[:verbose]
      puts "Mongo filter:"
      pp what.merge(from)
    end

    AMQP.start(:host => config(:amqp_host),
               :port => config(:amqp_port),
               :username => config(:amqp_username),
               :password => config(:amqp_password)) do |connection|

      channel = AMQP::Channel.new(connection)
      exchange = channel.topic(config(:amqp_exchange),
                               :durable => true, :auto_delete => false)

      # What to do when the user hits Ctrl+c
      show_stopper = Proc.new {
        connection.close { EventMachine.stop }
      }

      # Read the next batch (1000 items when no limit is set) and queue it
      read_and_publish = Proc.new {
        source = col_info[collection]

        # -1 is both the "stop after this batch" marker checked below and the
        # :limit handed to find.
        # NOTE(review): presumably the driver reads a negative limit as
        # "return that many documents and close the cursor", which would make
        # the last batch exactly one item -- confirm before changing this.
        to_read = if options[:number] == -1
                    1000
                  else
                    remaining = options[:number] - num_read - 1
                    remaining <= 0 ? -1 : remaining
                  end

        read = 0
        source[:col].find(what.merge(from),
                          :skip => num_read,
                          :limit => to_read).each do |e|

          payload = read_value(e, source[:payload])
          if payload.is_a?(BSON::OrderedHash)
            payload.delete "_id" # Inserted by MongoDB on event insert
            payload = payload.to_json
          end
          # Any other payload (e.g. the value under "commit.id") is published
          # unchanged instead of being replaced by nil.

          read += 1
          unq = read_value(e, source[:unq])
          unless unq.is_a?(String)
            raise ArgumentError, "Unique value can only be a String"
          end

          key = source[:routekey] % unq

          exchange.publish payload, :persistent => true, :routing_key => key

          num_read += 1
          puts("Publish id = #{unq} (#{num_read} total)") if options[:verbose]
          # The running count is stored as the tag that incoming acks are
          # matched against.
          awaiting_ack << num_read
        end

        # Nothing new in the DB and no msgs waiting ack
        if (read == 0 && awaiting_ack.size == 0) || to_read == -1
          puts("Finished reading, exiting")
          show_stopper.call
        end
      }

      # Remove acknowledged or failed msg tags from the queue
      # Trigger more messages to be read when ack msg queue size drops to zero
      publisher_event = Proc.new { |ack|
        if ack.multiple
          awaiting_ack.delete_if { |x| x <= ack.delivery_tag }
        else
          awaiting_ack.delete ack.delivery_tag
        end

        if awaiting_ack.size == 0
          puts("ACKS.size= #{awaiting_ack.size}") if options[:verbose]
          EventMachine.next_tick do
            read_and_publish.call
          end
        end
      }

      # Await publisher confirms
      channel.confirm_select

      # Callback when confirms have arrived
      channel.on_ack do |ack|
        puts "ACK: tag=#{ack.delivery_tag}, mul=#{ack.multiple}" if options[:verbose]
        publisher_event.call(ack)
      end

      # Callback when confirms failed.
      channel.on_nack do |nack|
        puts "NACK: tag=#{nack.delivery_tag}, mul=#{nack.multiple}" if options[:verbose]
        publisher_event.call(nack)
      end

      # Signal handlers
      Signal.trap('INT', show_stopper)
      Signal.trap('TERM', show_stopper)

      # Trigger start processing
      EventMachine.add_timer(0.1) do
        read_and_publish.call
      end
    end
  end

  private

  # True when +filter+ looks like attr=regexp with a non-empty, compilable
  # regexp part.
  def is_filter_valid?(filter)
    _, r = filter.split('=', 2)
    return false if r.nil? || r.empty?
    Regexp.new(r)
    true
  rescue RegexpError
    false
  end
end
226
+
227
+ #vim: set filetype=ruby expandtab tabstop=2 shiftwidth=2 autoindent smartindent:
@@ -0,0 +1,147 @@
1
+ require 'rubygems'
2
+ require 'yaml'
3
+ require 'amqp'
4
+ require 'eventmachine'
5
+ require 'json'
6
+ require 'logger'
7
+
8
+ require 'ghtorrent/api_client'
9
+ require 'ghtorrent/settings'
10
+ require 'ghtorrent/logging'
11
+ require 'ghtorrent/persister'
12
+ require 'ghtorrent/command'
13
+
14
# Command that periodically requests https://api.github.com/events, stores
# events not yet in the :events collection and publishes each newly stored
# event on an AMQP topic exchange. The polling interval is adjusted every
# two minutes from the share of duplicate events seen.
class GHTMirrorEvents < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Logging
  include GHTorrent::Persister
  include GHTorrent::APIClient

  # Smallest polling interval (seconds) the adjustment may reach.
  MIN_POLL_DELAY = 1

  # Logger created in #go.
  def logger
    @logger
  end

  # Stores every event whose 'id' is not yet in the :events collection.
  #
  # Returns three values: number of newly stored events, number of events
  # already present, and the array of newly stored events.
  def store_count(events)
    stored = []
    fresh = dupl = 0
    events.each do |e|
      if @persister.find(:events, {'id' => e['id']}).empty?
        stored << e
        fresh += 1
        @persister.store(:events, e)
        info "Added #{e['id']}"
      else
        info "Already got #{e['id']}"
        dupl += 1
      end
    end
    return fresh, dupl, stored
  end

  # Retrieve events from Github, store them in the DB and publish the newly
  # stored ones on +exchange+ with routing key "evt.<type>".
  #
  # Always returns [new_count, duplicate_count]. On error the message and
  # backtrace go to STDERR and the counts gathered so far are returned, so
  # callers can keep doing arithmetic on the result.
  def retrieve(exchange)
    fresh = dupl = 0
    events = api_request "https://api.github.com/events", false
    fresh, dupl, stored = store_count events

    # No duplicate on the first page means it may not hold all new events,
    # so go further back with the paged request (the original comment says
    # up to 10 pages). Duplicates found there are left out of the statistics,
    # as before.
    if dupl == 0
      events = paged_api_request "https://api.github.com/events"
      more_fresh, _, more_stored = store_count events
      stored |= more_stored
      fresh += more_fresh
    end

    stored.each do |e|
      msg = JSON.dump(e)
      key = "evt.%s" % e['type']
      exchange.publish msg, :persistent => true, :routing_key => key
    end
    [fresh, dupl]
  rescue StandardError => e
    STDERR.puts e.message
    STDERR.puts e.backtrace
    [fresh, dupl]
  end

  # Connects to MongoDB and the broker, then runs the polling loop until a
  # SIGINT or SIGTERM arrives.
  def go
    @persister = connect(:mongo, @settings)
    @logger = Logger.new(STDOUT)

    # Graceful exit
    Signal.trap('INT') {
      info "Received SIGINT, exiting"
      AMQP.stop { EM.stop }
    }
    Signal.trap('TERM') {
      info "Received SIGTERM, exiting"
      AMQP.stop { EM.stop }
    }

    # The event loop
    AMQP.start(:host => config(:amqp_host),
               :port => config(:amqp_port),
               :username => config(:amqp_username),
               :password => config(:amqp_password)) do |connection|

      # Statistics used to recalibrate event delays
      dupl_msgs = new_msgs = 1

      debug "connected to rabbit"

      channel = AMQP::Channel.new(connection)
      exchange = channel.topic(config(:amqp_exchange), :durable => true,
                               :auto_delete => false)

      # Initial delay for the retrieve event loop
      retrieval_delay = config(:mirror_pollevery)

      # One polling step; shared by the initial and every restarted timer.
      poll = lambda do
        fresh, dupl = retrieve exchange
        dupl_msgs += dupl
        new_msgs += fresh
      end

      # Retrieve events
      retriever = EventMachine.add_periodic_timer(retrieval_delay, &poll)

      # Adjust event retrieval delay time to reduce load to Github
      EventMachine.add_periodic_timer(120) do
        ratio = (dupl_msgs.to_f / (dupl_msgs + new_msgs).to_f)

        info("Stats: #{new_msgs} new, #{dupl_msgs} duplicate, ratio: #{ratio}")

        # Few duplicates: poll faster. Mostly duplicates (including a ratio
        # of exactly 1): poll slower. NaN means nothing was counted in this
        # window, so leave the delay alone.
        new_delay = if ratio.nan?
                      0
                    elsif ratio < 0.3
                      -1
                    elsif ratio <= 0.5
                      0
                    else
                      1
                    end

        # Reset counters for new loop
        dupl_msgs = new_msgs = 0

        # Update the retrieval delay and restart the event retriever
        if new_delay != 0
          # Never let the interval reach zero or go negative.
          adjusted = [retrieval_delay + new_delay, MIN_POLL_DELAY].max

          if adjusted != retrieval_delay
            # Stop the retriever task and adjust retrieval delay
            retriever.cancel
            retrieval_delay = adjusted
            info("Setting event retrieval delay to #{retrieval_delay} secs")

            # Restart the retriever
            retriever = EventMachine.add_periodic_timer(retrieval_delay, &poll)
          end
        end
      end
    end
  end
end
146
+
147
+ # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :