ghtorrent 0.5 → 0.6

data/CHANGELOG CHANGED
@@ -1,6 +1,21 @@
+ = Version 0.6
+
+ * Support retrieval of issues, issue events and issue history
+ * Support for setting username/password for performing requests
+ * Respect by default Github's x-ratelimit-remaining header
+ * Selective processing of events for user-specified repos
+ * New tool (ght-get-more-commits) to retrieve all commits for a repository
+ * New tool (ght-process-events) to process just one event by id
+ * Retrieve 100 items at once by default on multipage requests
+ * Rename watchers -> stargazers, as per Github API change
+ * Fixes to bugs that prevented efficient processing of multipage requests
+ * Several fixes on how pull requests are being processed
+ * Users with invalid git setups are now allowed
+ * Compatibility with Ruby 1.8 restored
+
  = Version 0.5

- * Generic methods for retrieving items that are bound to repositories
+ * Generic methods for retrieving items that are bound to repositories
  * Processing of pull requests with commits, comments and history
  * Processing of project forks
  * New tool (ght-load) to filter and load events to the queue
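A note on the rate-limit entry above: GitHub reports the remaining request budget in the `X-RateLimit-Remaining` response header, and 0.6 makes ghtorrent honour it by default. As a rough illustration of the idea only (plain `net/http`, placeholder function name and URL handling; not ghtorrent's implementation):

```ruby
require 'net/http'
require 'json'

# Minimal sketch: perform one GitHub API request and back off when the
# X-RateLimit-Remaining header says the current window is exhausted.
def rate_limited_get(url)
  response = Net::HTTP.get_response(URI(url))

  remaining = response['x-ratelimit-remaining']
  if remaining && remaining.to_i <= 0
    reset_at = response['x-ratelimit-reset'].to_i   # epoch seconds
    sleep([reset_at - Time.now.to_i, 0].max)        # wait out the window
  end

  JSON.parse(response.body)
end
```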
data/README.md CHANGED
@@ -129,9 +129,14 @@ please consider citing the following paper:

  > Georgios Gousios and Diomidis Spinellis, "GHTorrent: GitHub’s data from a firehose," in _MSR '12: Proceedings of the 9th Working Conference on Mining Software Repositories_, June 2–3, 2012. Zurich, Switzerland.

+ See also the following presentation:
+
+ <iframe src="http://www.slideshare.net/slideshow/embed_code/13184524?rel=0" width="342" height="291" frameborder="0" marginwidth="0" marginheight="0" scrolling="no" style="border:1px solid #CCC;border-width:1px 1px 0;margin-bottom:5px" allowfullscreen></iframe>
+ <div style="margin-bottom:5px"> <strong> <a href="http://www.slideshare.net/gousiosg/ghtorrent-githubs-data-from-a-firehose-13184524" title="GHTorrent: Github&#39;s Data from a Firehose" target="_blank">GHTorrent: Github&#39;s Data from a Firehose</a> </strong> </div>
+
  #### Authors

- Georgios Gousios <gousiosg@gmail.com>
+ [Georgios Gousios](http://istlab.dmst.aueb.gr/~george) <gousiosg@gmail.com>

  [Diomidis Spinellis](http://www.dmst.aueb.gr/dds) <dds@aueb.gr>

data/bin/ght-data-retrieval CHANGED
@@ -1,166 +1,6 @@
  #!/usr/bin/env ruby

  require 'rubygems'
- require 'amqp'
- require 'json'
- require 'pp'
+ require 'ghtorrent'

- require 'ghtorrent/ghtorrent'
- require 'ghtorrent/settings'
- require 'ghtorrent/logging'
- require 'ghtorrent/command'
-
- class GHTDataRetrieval < GHTorrent::Command
-
-   include GHTorrent::Settings
-   include GHTorrent::Logging
-
-   def parse(msg)
-     JSON.parse(msg)
-   end
-
-   def PushEvent(data)
-     data['payload']['commits'].each do |c|
-       url = c['url'].split(/\//)
-
-       @gh.get_commit url[4], url[5], url[7]
-     end
-   end
-
-   def WatchEvent(data)
-     owner = data['repo']['name'].split(/\//)[0]
-     repo = data['repo']['name'].split(/\//)[1]
-     watcher = data['actor']['login']
-     created_at = data['created_at']
-
-     @gh.get_watcher owner, repo, watcher, created_at
-   end
-
-   def FollowEvent(data)
-     follower = data['actor']['login']
-     followed = data['payload']['target']['login']
-     created_at = data['created_at']
-
-     @gh.get_follower(follower, followed, created_at)
-   end
-
-   def MemberEvent(data)
-     owner = data['actor']['login']
-     repo = data['repo']['name'].split(/\//)[1]
-     new_member = data['payload']['member']['login']
-     created_at = data['created_at']
-
-     @gh.get_project_member(owner, repo, new_member, created_at)
-   end
-
-   def CommitCommentEvent(data)
-     user = data['actor']['login']
-     repo = data['repo']['name'].split(/\//)[1]
-     id = data['payload']['comment']['id']
-     created_at = data['created_at']
-
-     @gh.get_commit_comment(user, repo, id, created_at)
-   end
-
-   def PullRequestEvent(data)
-     owner = data['payload']['pull_request']['base']['repo']['owner']['login']
-     repo = data['payload']['pull_request']['base']['repo']['name']
-     pullreq_id = data['payload']['number']
-     action = data['payload']['action']
-     created_at = data['created_at']
-
-     @gh.get_pull_request(owner, repo, pullreq_id, action, created_at)
-   end
-
-   def ForkEvent(data)
-     owner = data['repo']['name'].split(/\//)[0]
-     repo = data['repo']['name'].split(/\//)[1]
-     fork_id = data['payload']['forkee']['id']
-     created_at = data['created_at']
-
-     @gh.get_fork(owner, repo, fork_id, created_at)
-   end
-
-   def PullRequestReviewCommentEvent(data)
-     owner = data['repo']['name'].split(/\//)[0]
-     repo = data['repo']['name'].split(/\//)[1]
-     comment_id = data['payload']['comment']['id']
-     pullreq_id = data['payload']['comment']['_links']['pull_request']['href'].split(/\//)[-1]
-     created_at = data['created_at']
-
-     @gh.get_pullreq_comment(owner, repo, pullreq_id, comment_id, created_at)
-   end
-
-   def IssueCommentEvent(data)
-     owner = data['repo']['name'].split(/\//)[0]
-     repo = data['repo']['name'].split(/\//)[1]
-     # Note: this handler was broken as removed: it read the ForkEvent-specific
-     # 'forkee' payload field, issue_id and comment_id below were never defined,
-     # and the method was not registered in handlers, so it never ran.
-     pullreq_id = data['payload']['forkee']['id']
-     created_at = data['created_at']
-
-     @gh.get_issue_comment(owner, repo, issue_id, comment_id, created_at)
-   end
-
-   def handlers
-     %w(PushEvent WatchEvent FollowEvent MemberEvent CommitCommentEvent PullRequestEvent ForkEvent PullRequestReviewCommentEvent)
-     #%w(PullRequestReviewCommentEvent)
-   end
-
-   def logger
-     @gh.logger
-   end
-
-   def go
-     @gh = GHTorrent::Mirror.new(@settings)
-
-     # Graceful exit
-     Signal.trap('INT') {
-       info "GHTDataRetrieval: Received SIGINT, exiting"
-       AMQP.stop { EM.stop }
-     }
-     Signal.trap('TERM') {
-       info "GHTDataRetrieval: Received SIGTERM, exiting"
-       AMQP.stop { EM.stop }
-     }
-
-     AMQP.start(:host => config(:amqp_host),
-                :port => config(:amqp_port),
-                :username => config(:amqp_username),
-                :password => config(:amqp_password)) do |connection|
-
-       channel = AMQP::Channel.new(connection, :prefetch => config(:amqp_prefetch))
-       exchange = channel.topic(config(:amqp_exchange), :durable => true,
-                                :auto_delete => false)
-
-       handlers.each { |h|
-         queue = channel.queue("#{h}s", {:durable => true})\
-                 .bind(exchange, :routing_key => "evt.#{h}")
-
-         info "GHTDataRetrieval: Binding handler #{h} to routing key evt.#{h}"
-
-         queue.subscribe(:ack => true) do |headers, msg|
-           begin
-             data = parse(msg)
-             info "GHTDataRetrieval: Processing event: #{data['type']}-#{data['id']}"
-             send(h, data)
-             headers.ack
-             info "GHTDataRetrieval: Processed event: #{data['type']}-#{data['id']}"
-           rescue Exception => e
-             # Give a message a chance to be reprocessed
-             if headers.redelivered?
-               data = parse(msg)
-               warn "GHTDataRetrieval: Could not process event: #{data['type']}-#{data['id']}"
-               headers.reject(:requeue => false)
-             else
-               headers.reject(:requeue => true)
-             end
-
-             STDERR.puts e
-             STDERR.puts e.backtrace.join("\n")
-           end
-         end
-       }
-     end
-   end
- end
-
- GHTDataRetrieval.run
+ GHTDataRetrieval.run(ARGV)
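The net effect of the rewrite above: the AMQP wiring, the per-event-type handler methods dispatched via `send`, and the signal handling all moved out of the executable and into the gem, leaving a six-line script. Judging only from the interface visible in this diff (subclasses of `GHTorrent::Command` implement `go`, may override `prepare_options` and `validate`, mix in `GHTorrent::Settings` for `config(...)`, and start via `run`/`run(ARGV)`), a new command following the same pattern would look roughly like this; the class name and banner are illustrative, not part of the gem:

```ruby
#!/usr/bin/env ruby
require 'rubygems'
require 'ghtorrent'

# Hypothetical command built on the GHTorrent::Command pattern shown above.
class GHTPrintConfig < GHTorrent::Command
  include GHTorrent::Settings

  def prepare_options(options)
    options.banner 'Usage: ght-print-config [options]'
  end

  def go
    # @settings is prepared by GHTorrent::Command before go is called.
    puts "AMQP endpoint: #{config(:amqp_host)}:#{config(:amqp_port)}"
  end
end

GHTPrintConfig.run(ARGV)
```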
data/bin/ght-get-more-commits ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/env ruby
+
+ require 'rubygems'
+ require 'ghtorrent'
+
+ GHTMoreCommitsRetriever.run(ARGV)
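This new file is the `ght-get-more-commits` tool from the changelog; the actual retrieval logic lives in the gem's `GHTMoreCommitsRetriever`, which is not shown in this diff. Based only on pieces visible elsewhere here (the `paged_api_request` helper, `get_commit(owner, repo, sha)`, and the new 100-items-per-page default), its core plausibly amounts to a paged walk over a repository's commits endpoint, along these lines (a sketch under those assumptions, not the gem's code):

```ruby
# Illustrative only: fetch every commit of a repository by paging through
# the GitHub commits API, then mirror each one via the get_commit call
# seen in the PushEvent handler above.
def all_commits(owner, repo)
  url = "https://api.github.com/repos/#{owner}/#{repo}/commits"
  paged_api_request(url).each do |c|
    @gh.get_commit(owner, repo, c['sha'])
  end
end
```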
data/bin/ght-load CHANGED
@@ -1,230 +1,7 @@
  #!/usr/bin/env ruby

  require 'rubygems'
- require 'mongo'
- require 'amqp'
- require 'set'
- require 'eventmachine'
- require 'pp'
- require "amqp/extensions/rabbitmq"
-
- require 'ghtorrent/settings'
- require 'ghtorrent/logging'
- require 'ghtorrent/persister'
- require 'ghtorrent/command'
- require 'ghtorrent/bson_orderedhash'
-
- class GHTLoad < GHTorrent::Command
-
-   include GHTorrent::Settings
-   include GHTorrent::Persister
-
-   def col_info()
-     {
-       :commits => {
-         :name => "commits",
-         :payload => "commit.id",
-         :unq => "commit.id",
-         :col => persister.get_underlying_connection.collection(:commits.to_s),
-         :routekey => "commit.%s"
-       },
-       :events => {
-         :name => "events",
-         :payload => "",
-         :unq => "type",
-         :col => persister.get_underlying_connection.collection(:events.to_s),
-         :routekey => "evt.%s"
-       }
-     }
-   end
-
-   def persister
-     @persister ||= connect(:mongo, @settings)
-     @persister
-   end
-
-   def prepare_options(options)
-     options.banner <<-BANNER
- Loads object ids from a collection to a queue for further processing.
-
- #{command_name} [options] collection
-
- #{command_name} options:
- BANNER
-
-     options.opt :earliest, 'Seconds since epoch of earliest item to load',
-                 :short => 'e', :default => 0, :type => :int
-     options.opt :number, 'Number of items to load (-1 means all)',
-                 :short => 'n', :type => :int, :default => -1
-     options.opt :filter,
-                 'Filter items by regexp on item attributes: item.attr=regexp',
-                 :short => 'f', :type => String, :multi => true
-   end
-
-   def validate
-     super
-     Trollop::die "no collection specified" unless args[0] && !args[0].empty?
-     filter = options[:filter]
-     case
-     when filter.is_a?(Array)
-       options[:filter].each { |x|
-         Trollop::die "not a valid filter #{x}" unless is_filter_valid?(x)
-       }
-     when filter == []
-       # Noop
-     else
-       Trollop::die "A filter can only be a string"
-     end
-   end
-
-   def go
-     # Message tags await publisher ack
-     awaiting_ack = SortedSet.new
-
-     # Num events read
-     num_read = 0
-
-     collection = case args[0]
-                  when "events"
-                    :events
-                  when "commits"
-                    :commits
-                  end
-
-     puts "Loading from collection #{collection}"
-     puts "Loading items after #{Time.at(options[:earliest])}" if options[:verbose]
-     puts "Loading #{options[:number]} items" if options[:verbose] && options[:number] != -1
-
-     what = case
-            when options[:filter].is_a?(Array)
-              options[:filter].reduce({}) { |acc,x|
-                (k,r) = x.split(/=/)
-                acc[k] = Regexp.new(r)
-                acc
-              }
-            when filter == []
-              {}
-            end
-
-     from = {'_id' => {'$gte' => BSON::ObjectId.from_time(Time.at(options[:earliest]))}}
-
-     (puts "Mongo filter:"; pp what.merge(from)) if options[:verbose]
-
-     AMQP.start(:host => config(:amqp_host),
-                :port => config(:amqp_port),
-                :username => config(:amqp_username),
-                :password => config(:amqp_password)) do |connection|
-
-       channel = AMQP::Channel.new(connection)
-       exchange = channel.topic(config(:amqp_exchange),
-                                :durable => true, :auto_delete => false)
-
-       # What to do when the user hits Ctrl+c
-       show_stopper = Proc.new {
-         connection.close { EventMachine.stop }
-       }
-
-       # Read next 1000 items and queue them
-       read_and_publish = Proc.new {
-
-         to_read = if options.number == -1
-                     1000
-                   else
-                     if options.number - num_read - 1 <= 0
-                       -1
-                     else
-                       options.number - num_read - 1
-                     end
-                   end
-
-         read = 0
-         col_info[collection][:col].find(what.merge(from),
-                                         :skip => num_read,
-                                         :limit => to_read).each do |e|
-
-           payload = read_value(e, col_info[collection][:payload])
-           payload = if payload.class == BSON::OrderedHash
-                       payload.delete "_id" # Inserted by MongoDB on event insert
-                       payload.to_json
-                     end
-           read += 1
-           unq = read_value(e, col_info[collection][:unq])
-           if unq.class != String or unq.nil? then
-             # Bug in this (now removed) code: Ruby's `throw` takes a symbol;
-             # raising an error here would be `raise Exception.new(...)`.
-             throw Exception("Unique value can only be a String")
-           end
-
-           key = col_info[collection][:routekey] % unq
-
-           exchange.publish payload, :persistent => true, :routing_key => key
-
-           num_read += 1
-           puts("Publish id = #{payload[unq]} (#{num_read} total)") if options.verbose
-           awaiting_ack << num_read
-         end
-
-         # Nothing new in the DB and no msgs waiting ack
-         if (read == 0 and awaiting_ack.size == 0) or to_read == -1
-           puts("Finished reading, exiting")
-           show_stopper.call
-         end
-       }
-
-       # Remove acknowledged or failed msg tags from the queue
-       # Trigger more messages to be read when ack msg queue size drops to zero
-       publisher_event = Proc.new { |ack|
-         if ack.multiple then
-           awaiting_ack.delete_if { |x| x <= ack.delivery_tag }
-         else
-           awaiting_ack.delete ack.delivery_tag
-         end
-
-         if awaiting_ack.size == 0
-           puts("ACKS.size= #{awaiting_ack.size}") if options.verbose
-           EventMachine.next_tick do
-             read_and_publish.call
-           end
-         end
-       }
-
-       # Await publisher confirms
-       channel.confirm_select
-
-       # Callback when confirms have arrived
-       channel.on_ack do |ack|
-         puts "ACK: tag=#{ack.delivery_tag}, mul=#{ack.multiple}" if options.verbose
-         publisher_event.call(ack)
-       end
-
-       # Callback when confirms failed.
-       channel.on_nack do |nack|
-         puts "NACK: tag=#{nack.delivery_tag}, mul=#{nack.multiple}" if options.verbose
-         publisher_event.call(nack)
-       end
-
-       # Signal handlers
-       Signal.trap('INT', show_stopper)
-       Signal.trap('TERM', show_stopper)
-
-       # Trigger start processing
-       EventMachine.add_timer(0.1) do
-         read_and_publish.call
-       end
-     end
-   end
-
-   private
-
-   def is_filter_valid?(filter)
-     (k, r) = filter.split(/=/)
-     return false if r.nil?
-     begin
-       Regexp.new(r)
-       true
-     rescue
-       false
-     end
-   end
- end
+ require 'ghtorrent'

  GHTLoad.run

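For reference, the removed `GHTLoad` code above defines the tool's interface, which is unchanged: the positional argument selects the collection (`events` or `commits`), `-e` takes an epoch timestamp, and each `-f` takes an `item.attr=regexp` pair that is matched against fields of the stored MongoDB documents. For example (the field name is illustrative):

```
# Queue all stored events newer than Jan 1 2012 00:00 UTC (epoch 1325376000)
# whose repository name starts with "rails/".
ght-load -e 1325376000 -f repo.name=^rails/ events
```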
data/bin/ght-mirror-events CHANGED
@@ -1,151 +1,6 @@
  #!/usr/bin/env ruby

  require 'rubygems'
- require 'yaml'
- require 'amqp'
- require 'eventmachine'
- require 'json'
- require 'logger'
+ require 'ghtorrent'

- require 'ghtorrent/api_client'
- require 'ghtorrent/settings'
- require 'ghtorrent/logging'
- require 'ghtorrent/persister'
- require 'ghtorrent/command'
-
- class GHTMirrorEvents < GHTorrent::Command
-
-   include GHTorrent::Settings
-   include GHTorrent::Logging
-   include GHTorrent::Persister
-   include GHTorrent::APIClient
-
-   def logger
-     @logger
-   end
-
-   def store_count(events)
-     stored = Array.new
-     new = dupl = 0
-     events.each do |e|
-       if @persister.find(:events, {'id' => e['id']}).empty?
-         stored << e
-         new += 1
-         @persister.store(:events, e)
-         info "Added #{e['id']}"
-       else
-         info "Already got #{e['id']}"
-         dupl += 1
-       end
-     end
-     return new, dupl, stored
-   end
-
-   # Retrieve events from Github, store them in the DB
-   def retrieve(exchange)
-     begin
-       new = dupl = 0
-       events = api_request "https://api.github.com/events", false
-       (new, dupl, stored) = store_count events
-
-       # If no duplicates were found, the first page may not have contained all
-       # new events; go up to 10 pages back to pick up the ones it missed.
-       if dupl == 0
-         events = paged_api_request "https://api.github.com/events", 10
-         (new1, dupl1, stored1) = store_count events
-         stored = stored | stored1
-         new = new + new1
-         new
-       end
-
-       stored.each do |e|
-         msg = JSON.dump(e)
-         key = "evt.%s" % e['type']
-         exchange.publish msg, :persistent => true, :routing_key => key
-       end
-       return new, dupl
-     rescue Exception => e
-       STDERR.puts e.message
-       STDERR.puts e.backtrace
-     end
-   end
-
-   def go
-     @persister = connect(:mongo, @settings)
-     @logger = Logger.new(STDOUT)
-
-     # Graceful exit
-     Signal.trap('INT') {
-       info "Received SIGINT, exiting"
-       AMQP.stop { EM.stop }
-     }
-     Signal.trap('TERM') {
-       info "Received SIGTERM, exiting"
-       AMQP.stop { EM.stop }
-     }
-
-     # The event loop
-     AMQP.start(:host => config(:amqp_host),
-                :port => config(:amqp_port),
-                :username => config(:amqp_username),
-                :password => config(:amqp_password)) do |connection|
-
-       # Statistics used to recalibrate event delays
-       dupl_msgs = new_msgs = 1
-
-       debug "connected to rabbit"
-
-       channel = AMQP::Channel.new(connection)
-       exchange = channel.topic(config(:amqp_exchange), :durable => true,
-                                :auto_delete => false)
-
-       # Initial delay for the retrieve event loop
-       retrieval_delay = config(:mirror_pollevery)
-
-       # Retrieve events
-       retriever = EventMachine.add_periodic_timer(retrieval_delay) do
-         (new, dupl) = retrieve exchange
-         dupl_msgs += dupl
-         new_msgs += new
-       end
-
-       # Adjust event retrieval delay time to reduce load to Github
-       EventMachine.add_periodic_timer(120) do
-         ratio = (dupl_msgs.to_f / (dupl_msgs + new_msgs).to_f)
-
-         info("Stats: #{new_msgs} new, #{dupl_msgs} duplicate, ratio: #{ratio}")
-
-         new_delay = if ratio >= 0 and ratio < 0.3 then
-                       -1
-                     elsif ratio >= 0.3 and ratio <= 0.5 then
-                       0
-                     elsif ratio > 0.5 and ratio < 1 then
-                       +1
-                     end
-         # Note: a period with only duplicates (ratio == 1) matches no branch
-         # above, leaving new_delay nil and breaking the arithmetic below.
-
-         # Reset counters for new loop
-         dupl_msgs = new_msgs = 0
-
-         # Update the retrieval delay and restart the event retriever
-         if new_delay != 0
-
-           # Stop the retriever task and adjust retrieval delay
-           retriever.cancel
-           retrieval_delay = retrieval_delay + new_delay
-           info("Setting event retrieval delay to #{retrieval_delay} secs")
-
-           # Restart the retriever
-           retriever = EventMachine.add_periodic_timer(retrieval_delay) do
-             (new, dupl) = retrieve exchange
-             dupl_msgs += dupl
-             new_msgs += new
-           end
-         end
-       end
-     end
-   end
- end
-
- GHTMirrorEvents.run
-
- # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
+ GHTMirrorEvents.run(ARGV)
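To make the removed adaptive-polling logic above easier to follow: every 120 seconds the script computes the fraction of fetched events that were already in the store and nudges the polling delay by one second accordingly (few duplicates means the poller is falling behind and should speed up; mostly duplicates means it is wasting API calls and should back off). Distilled into a standalone function (a sketch of the policy, not gem API; note the final `else` also covers the all-duplicates case the original left unhandled):

```ruby
# Sketch of the delay-adjustment policy from the removed code above.
# Returns the new polling delay in seconds; assumes at least one event seen.
def adjust_delay(delay, dupl_msgs, new_msgs)
  ratio = dupl_msgs.to_f / (dupl_msgs + new_msgs)
  if ratio < 0.3     # mostly new events: we are lagging, poll faster
    delay - 1
  elsif ratio <= 0.5 # balanced: keep the current delay
    delay
  else               # mostly duplicates: slow down to spare the API
    delay + 1
  end
end

adjust_delay(10, 9, 1)  # 90% duplicates => back off from 10s to 11s
```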