ghtorrent 0.5 → 0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,6 +1,21 @@
1
+ = Version 0.6
2
+
3
+ * Support retrieval of issues, issue events and issue history
4
+ * Support for setting username/password for performing requests
5
+ * Respect by default Github's x-ratelimit-remaining header
6
+ * Selective processing of events for user-specified repos
7
+ * New tool (ght-get-more-commits) to retrieve all commits for a repository
8
+ * New tool (ght-process-events) to process just one event by id
9
+ * Retrieve 100 items at once by default on multipage requests
10
+ * Rename watchers -> stargazers, as per Github API change
11
+ * Fixes to bugs that prevented efficient processing of multipage requests
12
+ * Several fixes on how pull requests are being processed
13
+ * Users with invalid git setups are now allowed
14
+ * Compatibility with Ruby 1.8 restored
15
+
1
16
  = Version 0.5
2
17
 
3
- * Generic methods for retrieving items that are bound to repositories
18
+ * Generic methods for retrieving items that are bound to repositories
4
19
  * Processing of pull requests with commits, comments and history
5
20
  * Processing of project forks
6
21
  * New tool (ght-load) to filter and load events to the queue
data/README.md CHANGED
@@ -129,9 +129,14 @@ please consider citing the following paper:
129
129
 
130
130
 > Georgios Gousios and Diomidis Spinellis, "GHTorrent: GitHub’s data from a firehose," in _MSR '12: Proceedings of the 9th Working Conference on Mining Software Repositories_, June 2–3, 2012. Zurich, Switzerland.
131
131
 
132
+ See also the following presentation:
133
+
134
+ <iframe src="http://www.slideshare.net/slideshow/embed_code/13184524?rel=0" width="342" height="291" frameborder="0" marginwidth="0" marginheight="0" scrolling="no" style="border:1px solid #CCC;border-width:1px 1px 0;margin-bottom:5px" allowfullscreen/>
135
+ <div style="margin-bottom:5px"> <strong> <a href="http://www.slideshare.net/gousiosg/ghtorrent-githubs-data-from-a-firehose-13184524" title="GHTorrent: Github&#39;s Data from a Firehose" target="_blank">GHTorrent: Github&#39;s Data from a Firehose</a> </strong> </div>
136
+
132
137
  #### Authors
133
138
 
134
- Georgios Gousios <gousiosg@gmail.com>
139
+ [Georgios Gousios](http://istlab.dmst.aueb.gr/~george) <gousiosg@gmail.com>
135
140
 
136
141
  [Diomidis Spinellis](http://www.dmst.aueb.gr/dds) <dds@aueb.gr>
137
142
 
@@ -1,166 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'rubygems'
4
- require 'amqp'
5
- require 'json'
6
- require 'pp'
4
+ require 'ghtorrent'
7
5
 
8
- require 'ghtorrent/ghtorrent'
9
- require 'ghtorrent/settings'
10
- require 'ghtorrent/logging'
11
- require 'ghtorrent/command'
12
-
13
- class GHTDataRetrieval < GHTorrent::Command
14
-
15
- include GHTorrent::Settings
16
- include GHTorrent::Logging
17
-
18
- def parse(msg)
19
- JSON.parse(msg)
20
- end
21
-
22
- def PushEvent(data)
23
- data['payload']['commits'].each do |c|
24
- url = c['url'].split(/\//)
25
-
26
- @gh.get_commit url[4], url[5], url[7]
27
- end
28
- end
29
-
30
- def WatchEvent(data)
31
- owner = data['repo']['name'].split(/\//)[0]
32
- repo = data['repo']['name'].split(/\//)[1]
33
- watcher = data['actor']['login']
34
- created_at = data['created_at']
35
-
36
- @gh.get_watcher owner, repo, watcher, created_at
37
- end
38
-
39
- def FollowEvent(data)
40
- follower = data['actor']['login']
41
- followed = data['payload']['target']['login']
42
- created_at = data['created_at']
43
-
44
- @gh.get_follower(follower, followed, created_at)
45
- end
46
-
47
- def MemberEvent(data)
48
- owner = data['actor']['login']
49
- repo = data['repo']['name'].split(/\//)[1]
50
- new_member = data['payload']['member']['login']
51
- created_at = data['created_at']
52
-
53
- @gh.get_project_member(owner, repo, new_member, created_at)
54
- end
55
-
56
- def CommitCommentEvent(data)
57
- user = data['actor']['login']
58
- repo = data['repo']['name'].split(/\//)[1]
59
- id = data['payload']['comment']['id']
60
- created_at = data['created_at']
61
-
62
- @gh.get_commit_comment(user, repo, id, created_at)
63
- end
64
-
65
- def PullRequestEvent(data)
66
- owner = data['payload']['pull_request']['base']['repo']['owner']['login']
67
- repo = data['payload']['pull_request']['base']['repo']['name']
68
- pullreq_id = data['payload']['number']
69
- action = data['payload']['action']
70
- created_at = data['created_at']
71
-
72
- @gh.get_pull_request(owner, repo, pullreq_id, action, created_at)
73
- end
74
-
75
- def ForkEvent(data)
76
- owner = data['repo']['name'].split(/\//)[0]
77
- repo = data['repo']['name'].split(/\//)[1]
78
- fork_id = data['payload']['forkee']['id']
79
- created_at = data['created_at']
80
-
81
- @gh.get_fork(owner, repo, fork_id, created_at)
82
- end
83
-
84
- def PullRequestReviewCommentEvent(data)
85
- owner = data['repo']['name'].split(/\//)[0]
86
- repo = data['repo']['name'].split(/\//)[1]
87
- comment_id = data['payload']['comment']['id']
88
- pullreq_id = data['payload']['comment']['_links']['pull_request']['href'].split(/\//)[-1]
89
- created_at = data['created_at']
90
-
91
- @gh.get_pullreq_comment(owner, repo, pullreq_id, comment_id, created_at)
92
- end
93
-
94
- def IssueCommentEvent(data)
95
- owner = data['repo']['name'].split(/\//)[0]
96
- repo = data['repo']['name'].split(/\//)[1]
97
- pullreq_id = data['payload']['forkee']['id']
98
- created_at = data['created_at']
99
-
100
- @gh.get_issue_comment(owner, repo, issue_id, comment_id, created_at)
101
- end
102
-
103
- def handlers
104
- %w(PushEvent WatchEvent FollowEvent MemberEvent CommitCommentEvent PullRequestEvent ForkEvent PullRequestReviewCommentEvent)
105
- #%w(PullRequestReviewCommentEvent)
106
- end
107
-
108
- def logger
109
- @gh.logger
110
- end
111
-
112
- def go
113
- @gh = GHTorrent::Mirror.new(@settings)
114
-
115
- # Graceful exit
116
- Signal.trap('INT') {
117
- info "GHTDataRetrieval: Received SIGINT, exiting"
118
- AMQP.stop { EM.stop }
119
- }
120
- Signal.trap('TERM') {
121
- info "GHTDataRetrieval: Received SIGTERM, exiting"
122
- AMQP.stop { EM.stop }
123
- }
124
-
125
- AMQP.start(:host => config(:amqp_host),
126
- :port => config(:amqp_port),
127
- :username => config(:amqp_username),
128
- :password => config(:amqp_password)) do |connection|
129
-
130
- channel = AMQP::Channel.new(connection, :prefetch => config(:amqp_prefetch))
131
- exchange = channel.topic(config(:amqp_exchange), :durable => true,
132
- :auto_delete => false)
133
-
134
- handlers.each { |h|
135
- queue = channel.queue("#{h}s", {:durable => true})\
136
- .bind(exchange, :routing_key => "evt.#{h}")
137
-
138
- info "GHTDataRetrieval: Binding handler #{h} to routing key evt.#{h}"
139
-
140
- queue.subscribe(:ack => true) do |headers, msg|
141
- begin
142
- data = parse(msg)
143
- info "GHTDataRetrieval: Processing event: #{data['type']}-#{data['id']}"
144
- send(h, data)
145
- headers.ack
146
- info "GHTDataRetrieval: Processed event: #{data['type']}-#{data['id']}"
147
- rescue Exception => e
148
- # Give a message a chance to be reprocessed
149
- if headers.redelivered?
150
- data = parse(msg)
151
- warn "GHTDataRetrieval: Could not process event: #{data['type']}-#{data['id']}"
152
- headers.reject(:requeue => false)
153
- else
154
- headers.reject(:requeue => true)
155
- end
156
-
157
- STDERR.puts e
158
- STDERR.puts e.backtrace.join("\n")
159
- end
160
- end
161
- }
162
- end
163
- end
164
- end
165
-
166
- GHTDataRetrieval.run
6
+ GHTDataRetrieval.run(ARGV)
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+ require 'ghtorrent'
5
+
6
+ GHTMoreCommitsRetriever.run(ARGV)
data/bin/ght-load CHANGED
@@ -1,230 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'rubygems'
4
- require 'mongo'
5
- require 'amqp'
6
- require 'set'
7
- require 'eventmachine'
8
- require 'pp'
9
- require "amqp/extensions/rabbitmq"
10
-
11
- require 'ghtorrent/settings'
12
- require 'ghtorrent/logging'
13
- require 'ghtorrent/persister'
14
- require 'ghtorrent/command'
15
- require 'ghtorrent/bson_orderedhash'
16
-
17
- class GHTLoad < GHTorrent::Command
18
-
19
- include GHTorrent::Settings
20
- include GHTorrent::Persister
21
-
22
- def col_info()
23
- {
24
- :commits => {
25
- :name => "commits",
26
- :payload => "commit.id",
27
- :unq => "commit.id",
28
- :col => persister.get_underlying_connection.collection(:commits.to_s),
29
- :routekey => "commit.%s"
30
- },
31
- :events => {
32
- :name => "events",
33
- :payload => "",
34
- :unq => "type",
35
- :col => persister.get_underlying_connection.collection(:events.to_s),
36
- :routekey => "evt.%s"
37
- }
38
- }
39
- end
40
-
41
- def persister
42
- @persister ||= connect(:mongo, @settings)
43
- @persister
44
- end
45
-
46
- def prepare_options(options)
47
- options.banner <<-BANNER
48
- Loads object ids from a collection to a queue for further processing.
49
-
50
- #{command_name} [options] collection
51
-
52
- #{command_name} options:
53
- BANNER
54
-
55
- options.opt :earliest, 'Seconds since epoch of earliest item to load',
56
- :short => 'e', :default => 0, :type => :int
57
- options.opt :number, 'Number of items to load (-1 means all)',
58
- :short => 'n', :type => :int, :default => -1
59
- options.opt :filter,
60
- 'Filter items by regexp on item attributes: item.attr=regexp',
61
- :short => 'f', :type => String, :multi => true
62
- end
63
-
64
- def validate
65
- super
66
- Trollop::die "no collection specified" unless args[0] && !args[0].empty?
67
- filter = options[:filter]
68
- case
69
- when filter.is_a?(Array)
70
- options[:filter].each { |x|
71
- Trollop::die "not a valid filter #{x}" unless is_filter_valid?(x)
72
- }
73
- when filter == []
74
- # Noop
75
- else
76
- Trollop::die "A filter can only be a string"
77
- end
78
- end
79
-
80
- def go
81
- # Message tags await publisher ack
82
- awaiting_ack = SortedSet.new
83
-
84
- # Num events read
85
- num_read = 0
86
-
87
- collection = case args[0]
88
- when "events"
89
- :events
90
- when "commits"
91
- :commits
92
- end
93
-
94
- puts "Loading from collection #{collection}"
95
- puts "Loading items after #{Time.at(options[:earliest])}" if options[:verbose]
96
- puts "Loading #{options[:number]} items" if options[:verbose] && options[:number] != -1
97
-
98
- what = case
99
- when options[:filter].is_a?(Array)
100
- options[:filter].reduce({}) { |acc,x|
101
- (k,r) = x.split(/=/)
102
- acc[k] = Regexp.new(r)
103
- acc
104
- }
105
- when filter == []
106
- {}
107
- end
108
-
109
- from = {'_id' => {'$gte' => BSON::ObjectId.from_time(Time.at(options[:earliest]))}}
110
-
111
- (puts "Mongo filter:"; pp what.merge(from)) if options[:verbose]
112
-
113
- AMQP.start(:host => config(:amqp_host),
114
- :port => config(:amqp_port),
115
- :username => config(:amqp_username),
116
- :password => config(:amqp_password)) do |connection|
117
-
118
- channel = AMQP::Channel.new(connection)
119
- exchange = channel.topic(config(:amqp_exchange),
120
- :durable => true, :auto_delete => false)
121
-
122
- # What to do when the user hits Ctrl+c
123
- show_stopper = Proc.new {
124
- connection.close { EventMachine.stop }
125
- }
126
-
127
- # Read next 1000 items and queue them
128
- read_and_publish = Proc.new {
129
-
130
- to_read = if options.number == -1
131
- 1000
132
- else
133
- if options.number - num_read - 1 <= 0
134
- -1
135
- else
136
- options.number - num_read - 1
137
- end
138
- end
139
-
140
- read = 0
141
- col_info[collection][:col].find(what.merge(from),
142
- :skip => num_read,
143
- :limit => to_read).each do |e|
144
-
145
- payload = read_value(e, col_info[collection][:payload])
146
- payload = if payload.class == BSON::OrderedHash
147
- payload.delete "_id" # Inserted by MongoDB on event insert
148
- payload.to_json
149
- end
150
- read += 1
151
- unq = read_value(e, col_info[collection][:unq])
152
- if unq.class != String or unq.nil? then
153
- throw Exception("Unique value can only be a String")
154
- end
155
-
156
- key = col_info[collection][:routekey] % unq
157
-
158
- exchange.publish payload, :persistent => true, :routing_key => key
159
-
160
- num_read += 1
161
- puts("Publish id = #{payload[unq]} (#{num_read} total)") if options.verbose
162
- awaiting_ack << num_read
163
- end
164
-
165
- # Nothing new in the DB and no msgs waiting ack
166
- if (read == 0 and awaiting_ack.size == 0) or to_read == -1
167
- puts("Finished reading, exiting")
168
- show_stopper.call
169
- end
170
- }
171
-
172
- # Remove acknowledged or failed msg tags from the queue
173
- # Trigger more messages to be read when ack msg queue size drops to zero
174
- publisher_event = Proc.new { |ack|
175
- if ack.multiple then
176
- awaiting_ack.delete_if { |x| x <= ack.delivery_tag }
177
- else
178
- awaiting_ack.delete ack.delivery_tag
179
- end
180
-
181
- if awaiting_ack.size == 0
182
- puts("ACKS.size= #{awaiting_ack.size}") if options.verbose
183
- EventMachine.next_tick do
184
- read_and_publish.call
185
- end
186
- end
187
- }
188
-
189
- # Await publisher confirms
190
- channel.confirm_select
191
-
192
- # Callback when confirms have arrived
193
- channel.on_ack do |ack|
194
- puts "ACK: tag=#{ack.delivery_tag}, mul=#{ack.multiple}" if options.verbose
195
- publisher_event.call(ack)
196
- end
197
-
198
- # Callback when confirms failed.
199
- channel.on_nack do |nack|
200
- puts "NACK: tag=#{nack.delivery_tag}, mul=#{nack.multiple}" if options.verbose
201
- publisher_event.call(nack)
202
- end
203
-
204
- # Signal handlers
205
- Signal.trap('INT', show_stopper)
206
- Signal.trap('TERM', show_stopper)
207
-
208
- # Trigger start processing
209
- EventMachine.add_timer(0.1) do
210
- read_and_publish.call
211
- end
212
- end
213
- end
214
-
215
- private
216
-
217
- def is_filter_valid?(filter)
218
- (k, r) = filter.split(/=/)
219
- return false if r.nil?
220
- begin
221
- Regexp.new(r)
222
- true
223
- rescue
224
- false
225
- end
226
- end
227
- end
4
+ require 'ghtorrent'
228
5
 
229
6
  GHTLoad.run
230
7
 
@@ -1,151 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'rubygems'
4
- require 'yaml'
5
- require 'amqp'
6
- require 'eventmachine'
7
- require 'json'
8
- require 'logger'
4
+ require 'ghtorrent'
9
5
 
10
- require 'ghtorrent/api_client'
11
- require 'ghtorrent/settings'
12
- require 'ghtorrent/logging'
13
- require 'ghtorrent/persister'
14
- require 'ghtorrent/command'
15
-
16
- class GHTMirrorEvents < GHTorrent::Command
17
-
18
- include GHTorrent::Settings
19
- include GHTorrent::Logging
20
- include GHTorrent::Persister
21
- include GHTorrent::APIClient
22
-
23
- def logger
24
- @logger
25
- end
26
-
27
- def store_count(events)
28
- stored = Array.new
29
- new = dupl = 0
30
- events.each do |e|
31
- if @persister.find(:events, {'id' => e['id']}).empty?
32
- stored << e
33
- new += 1
34
- @persister.store(:events, e)
35
- info "Added #{e['id']}"
36
- else
37
- info "Already got #{e['id']}"
38
- dupl += 1
39
- end
40
- end
41
- return new, dupl, stored
42
- end
43
-
44
- # Retrieve events from Github, store them in the DB
45
- def retrieve(exchange)
46
- begin
47
- new = dupl = 0
48
- events = api_request "https://api.github.com/events", false
49
- (new, dupl, stored) = store_count events
50
-
51
- # This means that first page cannot contain all new events. Go
52
- # up to 10 pages back to find all new events not contained in first page.
53
- if dupl == 0
54
- events = paged_api_request "https://api.github.com/events", 10
55
- (new1, dupl1, stored1) = store_count events
56
- stored = stored | stored1
57
- new = new + new1
58
- new
59
- end
60
-
61
- stored.each do |e|
62
- msg = JSON.dump(e)
63
- key = "evt.%s" % e['type']
64
- exchange.publish msg, :persistent => true, :routing_key => key
65
- end
66
- return new, dupl
67
- rescue Exception => e
68
- STDERR.puts e.message
69
- STDERR.puts e.backtrace
70
- end
71
- end
72
-
73
- def go
74
- @persister = connect(:mongo, @settings)
75
- @logger = Logger.new(STDOUT)
76
-
77
- # Graceful exit
78
- Signal.trap('INT') {
79
- info "Received SIGINT, exiting"
80
- AMQP.stop { EM.stop }
81
- }
82
- Signal.trap('TERM') {
83
- info "Received SIGTERM, exiting"
84
- AMQP.stop { EM.stop }
85
- }
86
-
87
- # The event loop
88
- AMQP.start(:host => config(:amqp_host),
89
- :port => config(:amqp_port),
90
- :username => config(:amqp_username),
91
- :password => config(:amqp_password)) do |connection|
92
-
93
- # Statistics used to recalibrate event delays
94
- dupl_msgs = new_msgs = 1
95
-
96
- debug "connected to rabbit"
97
-
98
- channel = AMQP::Channel.new(connection)
99
- exchange = channel.topic(config(:amqp_exchange), :durable => true,
100
- :auto_delete => false)
101
-
102
- # Initial delay for the retrieve event loop
103
- retrieval_delay = config(:mirror_pollevery)
104
-
105
- # Retrieve events
106
- retriever = EventMachine.add_periodic_timer(retrieval_delay) do
107
- (new, dupl) = retrieve exchange
108
- dupl_msgs += dupl
109
- new_msgs += new
110
- end
111
-
112
- # Adjust event retrieval delay time to reduce load to Github
113
- EventMachine.add_periodic_timer(120) do
114
- ratio = (dupl_msgs.to_f / (dupl_msgs + new_msgs).to_f)
115
-
116
- info("Stats: #{new_msgs} new, #{dupl_msgs} duplicate, ratio: #{ratio}")
117
-
118
- new_delay = if ratio >= 0 and ratio < 0.3 then
119
- -1
120
- elsif ratio >= 0.3 and ratio <= 0.5 then
121
- 0
122
- elsif ratio > 0.5 and ratio < 1 then
123
- +1
124
- end
125
-
126
- # Reset counters for new loop
127
- dupl_msgs = new_msgs = 0
128
-
129
- # Update the retrieval delay and restart the event retriever
130
- if new_delay != 0
131
-
132
- # Stop the retriever task and adjust retrieval delay
133
- retriever.cancel
134
- retrieval_delay = retrieval_delay + new_delay
135
- info("Setting event retrieval delay to #{retrieval_delay} secs")
136
-
137
- # Restart the retriever
138
- retriever = EventMachine.add_periodic_timer(retrieval_delay) do
139
- (new, dupl) = retrieve exchange
140
- dupl_msgs += dupl
141
- new_msgs += new
142
- end
143
- end
144
- end
145
- end
146
- end
147
- end
148
-
149
- GHTMirrorEvents.run
150
-
151
- # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
6
+ GHTMirrorEvents.run(ARGV)