ghtorrent 0.7.3 → 0.8

data/CHANGELOG CHANGED
@@ -1,3 +1,13 @@
+ = Version 0.8
+ * Retrieve and process issue labels
+ * Retrieve and process actors for pull request events
+ * Retrieve pull request comments for projects with no issue tracker
+ * Better heuristics for avoiding duplicate entries in pull request histories
+ * The event loader now loads events by IDs to reduce pressure on the queue (see the sketch after this hunk)
+ * Compound indexes in MongoDB by default
+ * X-RateLimit-Reset header support
+ * Remove lots of dead code, general cleanups
+
  = Version 0.7.3
  * Support for running in standalone mode
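A minimal, illustrative sketch of the id-only loading scheme mentioned above (not the gem's code; the event hash and routing key are made up for the example): the loader publishes just the GitHub event id under a type-based routing key, and the consumer re-reads the full event from MongoDB.

    # Illustration only: why publishing ids keeps queue messages small.
    require 'json'

    event = {'id' => '1652857722', 'type' => 'IssuesEvent', 'payload' => {}}

    # Pre-0.8: the whole event JSON travelled through RabbitMQ.
    full_message = event.to_json

    # 0.8: only the id is published, routed by event type; the consumer
    # looks the event up in the events collection (see the hunks below).
    id_message  = event['id']
    routing_key = "evt.#{event['type']}"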
 
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    ghtorrent (0.7.2)
+    ghtorrent (0.7.3)
       amqp (~> 1.0.0)
       bson_ext (~> 1.8.0)
       daemons (~> 1.1.0)
@@ -4,7 +4,8 @@ module GHTorrent
 
   ENTITIES = [:users, :commits, :followers, :repos, :events, :org_members,
               :commit_comments, :repo_collaborators, :watchers, :pull_requests,
-              :forks, :pull_request_comments, :issue_comments, :issues, :issue_events
+              :forks, :pull_request_comments, :issue_comments, :issues, :issue_events,
+              :repo_labels
   ]
 
   # Stores +data+ into +entity+. Returns a unique key for the stored entry.
@@ -20,6 +20,25 @@ module GHTorrent
        :mongo_replicas => "mongo.replicas"
     }
 
+    IDXS = {
+        :events                => %w(id),
+        :users                 => %w(login),
+        :commits               => %w(sha),
+        :commit_comments       => %w(repo user commit_id),
+        :repos                 => %w(name owner.login),
+        :repo_labels           => %w(repo owner),
+        :repo_collaborators    => %w(repo owner login),
+        :followers             => %w(follows login),
+        :org_members           => %w(org),
+        :watchers              => %w(repo owner login),
+        :forks                 => %w(repo owner id),
+        :pull_requests         => %w(repo owner number),
+        :pull_request_comments => %w(repo owner pullreq_id id),
+        :issues                => %w(repo owner number),
+        :issue_events          => %w(repo owner issue_id id),
+        :issue_comments        => %w(repo owner issue_id id)
+    }
+
     attr_reader :settings
 
     # Creates a new instance of the MongoDB persistence adapter.
@@ -122,6 +141,8 @@ module GHTorrent
          get_collection("issue_comments")
        when :issue_events
          get_collection("issue_events")
+       when :repo_labels
+         get_collection("repo_labels")
      end
     end
 
@@ -140,68 +161,35 @@ module GHTorrent
                    Mongo::ReplSetConnection.new(repl_arr, :read => :secondary)\
                                            .db(config(:mongo_db))
                  end
-        init_db(@mongo) if @mongo.collections.size < ENTITIES.size
+
+        stats = @mongo.stats
+        init_db(@mongo) if stats['collections'] < ENTITIES.size + 2
+        init_db(@mongo) if stats['indexes'] < IDXS.keys.size + ENTITIES.size
+
         @mongo
       else
         @mongo
       end
     end
 
-    # Declare an index on +field+ for +collection+ if it does not exist
-    def ensure_index(collection, field)
-      col = get_entity(collection)
-
-      exists = col.index_information.find {|k,v|
-        k == "#{field}_1"
-      }
-
-      if exists.nil?
-        col.create_index(field, :background => true)
-        STDERR.puts "Creating index on #{collection}(#{field})"
-      end
-    end
-
    def init_db(mongo)
      ENTITIES.each {|x| mongo.collection(x.to_s)}
 
      # Ensure that the necessary indexes exist
-      ensure_index(:events, "id")
-      ensure_index(:users, "login")
-      ensure_index(:commits, "sha")
-      ensure_index(:repos, "name")
-      ensure_index(:repos, "owner.login")
-      ensure_index(:followers, "follows")
-      ensure_index(:followers, "login")
-      ensure_index(:org_members, "org")
-      ensure_index(:commit_comments, "repo")
-      ensure_index(:commit_comments, "user")
-      ensure_index(:commit_comments, "commit_id")
-      ensure_index(:repo_collaborators, "repo")
-      ensure_index(:repo_collaborators, "owner")
-      ensure_index(:repo_collaborators, "login")
-      ensure_index(:watchers, "repo")
-      ensure_index(:watchers, "owner")
-      ensure_index(:watchers, "login")
-      ensure_index(:pull_requests, "repo")
-      ensure_index(:pull_requests, "owner")
-      ensure_index(:forks, "repo")
-      ensure_index(:forks, "owner")
-      ensure_index(:forks, "id")
-      ensure_index(:issue_comments, "repo")
-      ensure_index(:issue_comments, "owner")
-      ensure_index(:issue_comments, "issue_id")
-      ensure_index(:issue_comments, "id")
-      ensure_index(:pull_request_comments, "repo")
-      ensure_index(:pull_request_comments, "owner")
-      ensure_index(:pull_request_comments, "pullreq_id")
-      ensure_index(:pull_request_comments, "id")
-      ensure_index(:issues, "repo")
-      ensure_index(:issues, "owner")
-      ensure_index(:issues, "issue_id")
-      ensure_index(:issue_events, "repo")
-      ensure_index(:issue_events, "owner")
-      ensure_index(:issue_events, "issue_id")
-      ensure_index(:issue_events, "id")
+      IDXS.each do |k, v|
+        col = get_entity(k)
+        name = v.join('_1_') + '_1'
+        exists = col.index_information.find {|idx, _| idx == name}
+
+        idx_fields = v.reduce({}) {|acc, x| acc.merge({x => 1})}
+        if exists.nil?
+          col.create_index(idx_fields, :background => true)
+          STDERR.puts "Creating index on #{k}(#{v})"
+        else
+          STDERR.puts "Index on #{k}(#{v}) exists"
+        end
+
+      end
    end
 
    def rescue_connection_failure(max_retries=60)
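For reference, the naming convention the IDXS loop above relies on: the pre-2.0 mongo driver (the one pinned in the Gemfile) names an ascending compound index by joining its fields with `_1`. A standalone sketch that mirrors the create_index call above; the local connection and `github` database name are illustrative:

    # Sketch: how a compound index spec maps to the name checked above.
    require 'mongo'   # mongo ~> 1.x era driver

    fields = %w(repo owner number)                          # e.g. the :pull_requests entry of IDXS
    name   = fields.join('_1_') + '_1'                      # => "repo_1_owner_1_number_1"
    spec   = fields.reduce({}) {|acc, f| acc.merge(f => 1)}

    col = Mongo::Connection.new('localhost', 27017).db('github')['pull_requests']
    col.create_index(spec, :background => true) unless col.index_information.key?(name)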
@@ -169,14 +169,7 @@ module GHTorrent
       end
 
       total = Time.now.to_ms - start_time.to_ms
-      debug "APIClient: Request: #{url} #{if from_cache then "from cache," else "(#{contents.meta['x-ratelimit-remaining']} remaining)," end} Total: #{total} ms"
-
-      if not from_cache and config(:respect_api_ratelimit) and
-          contents.meta['x-ratelimit-remaining'].to_i < 20
-        sleep = 61 - Time.now.min
-        debug "APIClient: Request limit reached, sleeping for #{sleep} min"
-        sleep(sleep * 60)
-      end
+      debug "APIClient: Request: #{url} #{if from_cache then "from cache," else "(#{@remaining} remaining)," end} Total: #{total} ms"
 
       contents
     rescue OpenURI::HTTPError => e
@@ -188,21 +181,29 @@ module GHTorrent
            404, # Not found
            422 then # Unprocessable entity
         warn "APIClient: #{url}: #{e.io.status[1]}"
+        @remaining = e.io.meta['x-ratelimit-remaining'].to_i
+        @reset = e.io.meta['x-ratelimit-reset'].to_i
         return nil
       else # Server error or HTTP conditions that Github does not report
         warn "APIClient: #{url}: #{e.io.status[1]}"
         raise e
       end
+    ensure
+      if not from_cache and config(:respect_api_ratelimit) and @remaining < 10
+        sleep = (@reset - Time.now.to_i) / 60
+        debug "APIClient: Request limit reached, sleeping for #{sleep} min"
+        sleep(@reset - Time.now.to_i)
+      end
      end
     end
 
     def do_request(url)
-      @attach_ip  ||= config(:attach_ip)
-      @username   ||= config(:github_username)
-      @passwd     ||= config(:github_passwd)
+      @attach_ip ||= config(:attach_ip)
+      @username ||= config(:github_username)
+      @passwd ||= config(:github_passwd)
       @user_agent ||= config(:user_agent)
 
-      @open_func ||= if @username.nil?
+      open_func ||= if @username.nil?
                       lambda {|url| open(url, 'User-Agent' => @user_agent)}
                     else
                       lambda {|url| open(url,
@@ -210,13 +211,16 @@ module GHTorrent
                               :http_basic_authentication => [@username, @passwd])}
                     end
 
-      if @attach_ip.nil? or @attach_ip.eql? '0.0.0.0'
-        @open_func.call(url)
-      else
-        attach_to(@attach_ip) do
-          @open_func.call(url)
+      result = if @attach_ip.nil? or @attach_ip.eql? '0.0.0.0'
+        open_func.call(url)
+      else
+        attach_to(@attach_ip) do
+          open_func.call(url)
+        end
       end
-      end
+      @remaining = result.meta['x-ratelimit-remaining'].to_i
+      @reset = result.meta['x-ratelimit-reset'].to_i
+      result
     end
 
     # Attach to a specific IP address if the machine has multiple
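Summarising the new rate-limit handling above: every response's x-ratelimit-remaining and x-ratelimit-reset headers are cached, and once fewer than 10 calls remain the client sleeps until the reset timestamp rather than until the next full hour. A hedged restatement with illustrative names, not the gem's API:

    # Illustration of the X-RateLimit-Reset logic above.
    def respect_rate_limit(remaining, reset_epoch)
      return if remaining >= 10              # same threshold as the ensure block above
      pause = reset_epoch - Time.now.to_i    # seconds until GitHub resets the quota
      return if pause <= 0
      STDERR.puts "Rate limit low (#{remaining} left), sleeping #{pause / 60} min"
      sleep(pause)
    end

    # e.g. respect_rate_limit(result.meta['x-ratelimit-remaining'].to_i,
    #                         result.meta['x-ratelimit-reset'].to_i)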
@@ -17,7 +17,6 @@ module GHTorrent
   class Command
 
     include GHTorrent::Settings
-    include GHTorrent::Settings
 
     # Specify the run method for subclasses.
     class << self
@@ -12,6 +12,12 @@ class GHTDataRetrieval < GHTorrent::Command
 
   include GHTorrent::Settings
   include GHTorrent::Logging
+  include GHTorrent::Persister
+
+  def persister
+    @persister ||= connect(:mongo, settings)
+    @persister
+  end
 
   def parse(msg)
     JSON.parse(msg)
@@ -64,9 +70,10 @@ class GHTDataRetrieval < GHTorrent::Command
     repo = data['payload']['pull_request']['base']['repo']['name']
     pullreq_id = data['payload']['number']
     action = data['payload']['action']
+    actor = data['actor']['login']
     created_at = data['created_at']
 
-    ghtorrent.get_pull_request(owner, repo, pullreq_id, action, created_at)
+    ghtorrent.get_pull_request(owner, repo, pullreq_id, action, actor, created_at)
   end
 
   def ForkEvent(data)
@@ -155,6 +162,7 @@ Retrieves events from queues and processes them through GHTorrent
       info "GHTDataRetrieval: Received SIGINT, exiting"
       AMQP.stop { EM.stop }
     }
+
     Signal.trap('TERM') {
       info "GHTDataRetrieval: Received SIGTERM, exiting"
       AMQP.stop { EM.stop }
@@ -165,7 +173,8 @@ Retrieves events from queues and processes them through GHTorrent
                :username => config(:amqp_username),
                :password => config(:amqp_password)) do |connection|
 
-      channel = AMQP::Channel.new(connection, :prefetch => config(:amqp_prefetch))
+      channel = AMQP::Channel.new(connection)
+      channel.prefetch(config(:amqp_prefetch))
       exchange = channel.topic(config(:amqp_exchange), :durable => true,
                                :auto_delete => false)
 
@@ -177,7 +186,10 @@ Retrieves events from queues and processes them through GHTorrent
 
       queue.subscribe(:ack => true) do |headers, msg|
         begin
-          data = parse(msg)
+
+          event = persister.get_underlying_connection[:events].find_one('id' => msg)
+          event.delete '_id'
+          data = parse(event.to_json)
           info "GHTDataRetrieval: Processing event: #{data['type']}-#{data['id']}"
 
           unless options[:filter].nil?
@@ -194,8 +206,7 @@ Retrieves events from queues and processes them through GHTorrent
         rescue Exception => e
           # Give a message a chance to be reprocessed
           if headers.redelivered?
-            data = parse(msg)
-            warn "GHTDataRetrieval: Could not process event: #{data['type']}-#{data['id']}"
+            warn "GHTDataRetrieval: Could not process event: #{msg}"
             headers.reject(:requeue => false)
           else
             headers.reject(:requeue => true)
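The consumer side of the id-only messages, restated as a self-contained sketch (a local MongoDB and the 1.x driver are assumed; load_event is an illustrative helper, not part of the gem): the AMQP message body is treated as a GitHub event id, the full event is re-read from the events collection, and MongoDB's own _id is dropped before parsing.

    # Sketch of the lookup performed in the subscribe block above.
    require 'mongo'
    require 'json'

    events = Mongo::Connection.new('localhost', 27017).db('github')['events']

    def load_event(events, msg)
      event = events.find_one('id' => msg)   # msg carries only the GitHub event id
      return nil if event.nil?
      event.delete '_id'                     # drop MongoDB's ObjectId before the JSON round-trip
      JSON.parse(event.to_json)
    end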
@@ -1,10 +1,8 @@
 require 'rubygems'
 require 'mongo'
 require 'amqp'
-require 'set'
 require 'eventmachine'
 require 'pp'
-require "amqp/extensions/rabbitmq"
 
 require 'ghtorrent/settings'
 require 'ghtorrent/logging'
@@ -17,27 +15,8 @@ class GHTLoad < GHTorrent::Command
   include GHTorrent::Settings
   include GHTorrent::Persister
 
-  def col_info()
-    {
-      :commits => {
-        :name => "commits",
-        :payload => "commit.id",
-        :unq => "commit.id",
-        :col => persister.get_underlying_connection.collection(:commits.to_s),
-        :routekey => "commit.%s"
-      },
-      :events => {
-        :name => "events",
-        :payload => "",
-        :unq => "type",
-        :col => persister.get_underlying_connection.collection(:events.to_s),
-        :routekey => "evt.%s"
-      }
-    }
-  end
-
   def persister
-    @persister ||= connect(:mongo, @settings)
+    @persister ||= connect(:mongo, settings)
     @persister
   end
 
@@ -52,8 +31,13 @@ Loads object ids from a collection to a queue for further processing.
 
     options.opt :earliest, 'Seconds since epoch of earliest item to load',
                 :short => 'e', :default => 0, :type => :int
-    options.opt :number, 'Number of items to load (-1 means all)',
-                :short => 'n', :type => :int, :default => -1
+    options.opt :latest, 'Seconds since epoch of latest item to load',
+                :short => 'l', :default => Time.now.to_i + (60 * 60 * 24 * 360 * 20),
+                :type => :int
+    options.opt :number, 'Total number of items to load',
+                :short => 'n', :type => :int, :default => 2**48
+    options.opt :batch, 'Number of items to process in a batch',
+                :short => 'b', :type => :int, :default => 10000
     options.opt :filter,
                 'Filter items by regexp on item attributes: item.attr=regexp',
                 :short => 'f', :type => String, :multi => true
@@ -61,7 +45,6 @@ Loads object ids from a collection to a queue for further processing.
 
   def validate
     super
-    Trollop::die "no collection specified" unless args[0] && !args[0].empty?
     filter = options[:filter]
     case
       when filter.is_a?(Array)
@@ -71,27 +54,23 @@ Loads object ids from a collection to a queue for further processing.
       when filter == []
         # Noop
       else
-        Trollop::die "A filter can only be a string"
+        Trollop::die 'A filter can only be a string'
     end
   end
 
   def go
-    # Message tags await publisher ack
-    awaiting_ack = SortedSet.new
-
     # Num events read
     num_read = 0
 
-    collection = case args[0]
-                   when "events"
-                     :events
-                   when "commits"
-                     :commits
-                 end
-
-    puts "Loading from collection #{collection}"
     puts "Loading items after #{Time.at(options[:earliest])}" if options[:verbose]
-    puts "Loading #{options[:number]} items" if options[:verbose] && options[:number] != -1
+    puts "Loading items before #{Time.at(options[:latest])}" if options[:verbose]
+    puts "Loading #{options[:batch]} items per batch" if options[:batch]
+    puts "Loading #{options[:number]} items" if options[:verbose]
+
+    if options[:batch] >= options[:number]
+      puts "Batch > number of items, setting batch to #{options[:number]}"
+      options[:batch] = options[:number]
+    end
 
     what = case
              when options[:filter].is_a?(Array)
@@ -104,9 +83,12 @@ Loads object ids from a collection to a queue for further processing.
              {}
            end
 
-    from = {'_id' => {'$gte' => BSON::ObjectId.from_time(Time.at(options[:earliest]))}}
+    from = {'_id' => {
+        '$gte' => BSON::ObjectId.from_time(Time.at(options[:earliest])),
+        '$lte' => BSON::ObjectId.from_time(Time.at(options[:latest]))}
+    }
 
-    (puts "Mongo filter:"; pp what.merge(from)) if options[:verbose]
+    (puts 'Mongo filter:'; pp what.merge(from)) if options[:verbose]
 
     AMQP.start(:host => config(:amqp_host),
                :port => config(:amqp_port),
@@ -119,94 +101,47 @@ Loads object ids from a collection to a queue for further processing.
 
       # What to do when the user hits Ctrl+c
       show_stopper = Proc.new {
+        puts('Closing connection')
        connection.close { EventMachine.stop }
       }
 
-      # Read next 100000 items and queue them
+      # Read next options[:batch] items and queue them
       read_and_publish = Proc.new {
 
-        to_read = if options.number == -1
-                    100000
-                  else
-                    if options.number - num_read - 1 <= 0
-                      -1
-                    else
-                      options.number - num_read - 1
-                    end
-                  end
-
-        read = 0
-        col_info[collection][:col].find(what.merge(from),
+        persister.get_underlying_connection[:events].find(what.merge(from),
                                         :skip => num_read,
-                                        :limit => to_read).each do |e|
-
-          payload = read_value(e, col_info[collection][:payload])
-          payload = if payload.class == BSON::OrderedHash
-                      payload.delete "_id" # Inserted by MongoDB on event insert
-                      payload.to_json
-                    end
-          read += 1
-          unq = read_value(e, col_info[collection][:unq])
+                                        :limit => options[:batch]).each do |e|
+          unq = read_value(e, 'type')
           if unq.class != String or unq.nil? then
-            throw Exception.new("Unique value can only be a String")
+            throw Exception.new('Unique value can only be a String')
          end
 
-          key = col_info[collection][:routekey] % unq
-
-          exchange.publish payload, :persistent => true, :routing_key => key
+          exchange.publish e['id'], :persistent => false,
+                           :routing_key => "evt.#{e['type']}"
 
          num_read += 1
-          puts("Publish id = #{payload[unq]} (#{num_read} total)") if options.verbose
-          awaiting_ack << num_read
+          puts "Publish id = #{e['id']} (#{num_read} total)" if options.verbose
        end
 
-        # Nothing new in the DB and no msgs waiting ack
-        if (read == 0 and awaiting_ack.size == 0) or to_read == -1
-          puts("Finished reading, exiting")
+        if num_read >= options[:number]
+          puts 'Finished reading, exiting'
          show_stopper.call
-        end
-      }
-
-      # Remove acknowledged or failed msg tags from the queue
-      # Trigger more messages to be read when ack msg queue size drops to zero
-      publisher_event = Proc.new { |ack|
-        if ack.multiple then
-          awaiting_ack.delete_if { |x| x <= ack.delivery_tag }
        else
-          awaiting_ack.delete ack.delivery_tag
-        end
-
-        if awaiting_ack.size == 0
-          puts("ACKS.size= #{awaiting_ack.size}") if options.verbose
+          # Schedule new event processing cycle
          EventMachine.next_tick do
            read_and_publish.call
          end
        end
      }
 
-      # Await publisher confirms
-      channel.confirm_select
-
-      # Callback when confirms have arrived
-      channel.on_ack do |ack|
-        puts "ACK: tag=#{ack.delivery_tag}, mul=#{ack.multiple}" if options.verbose
-        publisher_event.call(ack)
-      end
-
-      # Callback when confirms failed.
-      channel.on_nack do |nack|
-        puts "NACK: tag=#{nack.delivery_tag}, mul=#{nack.multiple}" if options.verbose
-        publisher_event.call(nack)
-      end
-
      # Signal handlers
      Signal.trap('INT', show_stopper)
      Signal.trap('TERM', show_stopper)
 
-      # Trigger start processing
      EventMachine.add_timer(0.1) do
        read_and_publish.call
      end
+
    end
  end
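Finally, how the new --earliest/--latest options become the MongoDB filter built in `go` above: ObjectIds embed their creation time, so a time window turns into an _id range query. A small standalone sketch with made-up dates:

    # Sketch of the _id range filter used by the loader (illustrative values).
    require 'mongo'   # pulls in the bson gem providing BSON::ObjectId

    earliest = Time.utc(2012, 1, 1).to_i   # --earliest, seconds since the epoch
    latest   = Time.utc(2012, 2, 1).to_i   # --latest, seconds since the epoch

    from = {'_id' => {
      '$gte' => BSON::ObjectId.from_time(Time.at(earliest)),
      '$lte' => BSON::ObjectId.from_time(Time.at(latest))
    }}

    # events.find(from, :skip => 0, :limit => 10000) would then page through the window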