ghtorrent 0.7.3 → 0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,13 @@
+ = Version 0.8
+ * Retrieve and process issue labels
+ * Retrieve and process actors for pull request events
+ * Retrieve pullreq comments for projects with no issue tracker
+ * Better heuristics for avoiding duplicate entries in pull request histories
+ * The event loader now loads events by IDs to reduce pressure on the queue
+ * Compound indexes in MongoDB by default
+ * X-RateLimit-Reset header support
+ * Remove lots of dead code, general cleanups
+
  = Version 0.7.3
  * Support for running in standalone mode
 
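The "Compound indexes in MongoDB by default" entry above is implemented by the IDXS table added to the Mongo persister further down in this diff. As a rough sketch (not code from the package), the snippet below shows the same pattern against the legacy mongo 1.x Ruby driver that this release's Gemfile.lock pins; the connection details and the 'github' database name are assumptions, and the per-collection field lists are copied from the IDXS map.

require 'rubygems'
require 'mongo' # legacy 1.x driver, matching bson_ext (~> 1.8.0) in the Gemfile.lock

# Hypothetical connection details; only the indexing pattern is the point here.
db = Mongo::Connection.new('localhost', 27017).db('github')

# A few of the compound indexes introduced in 0.8, declared as field lists
# per collection (copied from the IDXS map in the persister diff below).
{
  'commit_comments'       => %w(repo user commit_id),
  'pull_request_comments' => %w(repo owner pullreq_id id),
  'issues'                => %w(repo owner number)
}.each do |collection, fields|
  # Build a spec like {'repo' => 1, 'owner' => 1, 'number' => 1} and create the
  # index in the background so running deployments are not blocked.
  spec = fields.reduce({}) { |acc, f| acc.merge(f => 1) }
  db.collection(collection).create_index(spec, :background => true)
end

Keeping the field lists in one map and deriving the index spec as a hash of field => 1 is what lets init_db in the persister replace the long series of single-field ensure_index calls that this version removes.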
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
  PATH
  remote: .
  specs:
- ghtorrent (0.7.2)
+ ghtorrent (0.7.3)
  amqp (~> 1.0.0)
  bson_ext (~> 1.8.0)
  daemons (~> 1.1.0)
@@ -4,7 +4,8 @@ module GHTorrent
 
  ENTITIES = [:users, :commits, :followers, :repos, :events, :org_members,
  :commit_comments, :repo_collaborators, :watchers, :pull_requests,
- :forks, :pull_request_comments, :issue_comments, :issues, :issue_events
+ :forks, :pull_request_comments, :issue_comments, :issues, :issue_events,
+ :repo_labels
  ]
 
  # Stores +data+ into +entity+. Returns a unique key for the stored entry.
@@ -20,6 +20,25 @@ module GHTorrent
  :mongo_replicas => "mongo.replicas"
  }
 
+ IDXS = {
+ :events => %w(id),
+ :users => %w(login),
+ :commits => %w(sha),
+ :commit_comments => %w(repo user commit_id),
+ :repos => %w(name owner.login),
+ :repo_labels => %w(repo owner),
+ :repo_collaborators => %w(repo owner login),
+ :followers => %w(follows login),
+ :org_members => %w(org),
+ :watchers => %w(repo owner login),
+ :forks => %w(repo owner id),
+ :pull_requests => %w(repo owner number),
+ :pull_request_comments => %w(repo owner pullreq_id id),
+ :issues => %w(repo owner number),
+ :issue_events => %w(repo owner issue_id id),
+ :issue_comments => %w(repo owner issue_id id)
+ }
+
  attr_reader :settings
 
  # Creates a new instance of the MongoDB persistence adapter.
@@ -122,6 +141,8 @@ module GHTorrent
  get_collection("issue_comments")
  when :issue_events
  get_collection("issue_events")
+ when :repo_labels
+ get_collection("repo_labels")
  end
  end
 
@@ -140,68 +161,35 @@ module GHTorrent
  Mongo::ReplSetConnection.new(repl_arr, :read => :secondary)\
  .db(config(:mongo_db))
  end
- init_db(@mongo) if @mongo.collections.size < ENTITIES.size
+
+ stats = @mongo.stats
+ init_db(@mongo) if stats['collections'] < ENTITIES.size + 2
+ init_db(@mongo) if stats['indexes'] < IDXS.keys.size + ENTITIES.size
+
  @mongo
  else
  @mongo
  end
  end
 
- # Declare an index on +field+ for +collection+ if it does not exist
- def ensure_index(collection, field)
- col = get_entity(collection)
-
- exists = col.index_information.find {|k,v|
- k == "#{field}_1"
- }
-
- if exists.nil?
- col.create_index(field, :background => true)
- STDERR.puts "Creating index on #{collection}(#{field})"
- end
- end
-
  def init_db(mongo)
  ENTITIES.each {|x| mongo.collection(x.to_s)}
 
  # Ensure that the necessary indexes exist
- ensure_index(:events, "id")
- ensure_index(:users, "login")
- ensure_index(:commits, "sha")
- ensure_index(:repos, "name")
- ensure_index(:repos, "owner.login")
- ensure_index(:followers, "follows")
- ensure_index(:followers, "login")
- ensure_index(:org_members, "org")
- ensure_index(:commit_comments, "repo")
- ensure_index(:commit_comments, "user")
- ensure_index(:commit_comments, "commit_id")
- ensure_index(:repo_collaborators, "repo")
- ensure_index(:repo_collaborators, "owner")
- ensure_index(:repo_collaborators, "login")
- ensure_index(:watchers, "repo")
- ensure_index(:watchers, "owner")
- ensure_index(:watchers, "login")
- ensure_index(:pull_requests, "repo")
- ensure_index(:pull_requests, "owner")
- ensure_index(:forks, "repo")
- ensure_index(:forks, "owner")
- ensure_index(:forks, "id")
- ensure_index(:issue_comments, "repo")
- ensure_index(:issue_comments, "owner")
- ensure_index(:issue_comments, "issue_id")
- ensure_index(:issue_comments, "id")
- ensure_index(:pull_request_comments, "repo")
- ensure_index(:pull_request_comments, "owner")
- ensure_index(:pull_request_comments, "pullreq_id")
- ensure_index(:pull_request_comments, "id")
- ensure_index(:issues, "repo")
- ensure_index(:issues, "owner")
- ensure_index(:issues, "issue_id")
- ensure_index(:issue_events, "repo")
- ensure_index(:issue_events, "owner")
- ensure_index(:issue_events, "issue_id")
- ensure_index(:issue_events, "id")
+ IDXS.each do |k,v|
+ col = get_entity(k)
+ name = v.join('_1_') + '_1'
+ exists = col.index_information.find {|k,v| k == name}
+
+ idx_fields = v.reduce({}){|acc, x| acc.merge({x => 1})}
+ if exists.nil?
+ col.create_index(idx_fields, :background => true)
+ STDERR.puts "Creating index on #{collection}(#{v})"
+ else
+ STDERR.puts "Index on #{collection}(#{v}) exists"
+ end
+
+ end
  end
 
  def rescue_connection_failure(max_retries=60)
@@ -169,14 +169,7 @@ module GHTorrent
  end
 
  total = Time.now.to_ms - start_time.to_ms
- debug "APIClient: Request: #{url} #{if from_cache then "from cache," else "(#{contents.meta['x-ratelimit-remaining']} remaining)," end} Total: #{total} ms"
-
- if not from_cache and config(:respect_api_ratelimit) and
- contents.meta['x-ratelimit-remaining'].to_i < 20
- sleep = 61 - Time.now.min
- debug "APIClient: Request limit reached, sleeping for #{sleep} min"
- sleep(sleep * 60)
- end
+ debug "APIClient: Request: #{url} #{if from_cache then "from cache," else "(#{@remaining} remaining)," end} Total: #{total} ms"
 
  contents
  rescue OpenURI::HTTPError => e
@@ -188,21 +181,29 @@ module GHTorrent
  404, # Not found
  422 then # Unprocessable entity
  warn "APIClient: #{url}: #{e.io.status[1]}"
+ @remaining = e.io.meta['x-ratelimit-remaining'].to_i
+ @reset = e.io.meta['x-ratelimit-reset'].to_i
  return nil
  else # Server error or HTTP conditions that Github does not report
  warn "APIClient: #{url}: #{e.io.status[1]}"
  raise e
  end
+ ensure
+ if not from_cache and config(:respect_api_ratelimit) and @remaining < 10
+ sleep = (@reset - Time.now.to_i) / 60
+ debug "APIClient: Request limit reached, sleeping for #{sleep} min"
+ sleep(@reset - Time.now.to_i)
+ end
  end
  end
 
  def do_request(url)
- @attach_ip ||= config(:attach_ip)
- @username ||= config(:github_username)
- @passwd ||= config(:github_passwd)
+ @attach_ip ||= config(:attach_ip)
+ @username ||= config(:github_username)
+ @passwd ||= config(:github_passwd)
  @user_agent ||= config(:user_agent)
 
- @open_func ||= if @username.nil?
+ open_func ||= if @username.nil?
  lambda {|url| open(url, 'User-Agent' => @user_agent)}
  else
  lambda {|url| open(url,
@@ -210,13 +211,16 @@ module GHTorrent
  :http_basic_authentication => [@username, @passwd])}
  end
 
- if @attach_ip.nil? or @attach_ip.eql? '0.0.0.0'
- @open_func.call(url)
- else
- attach_to(@attach_ip) do
- @open_func.call(url)
+ result = if @attach_ip.nil? or @attach_ip.eql? '0.0.0.0'
+ open_func.call(url)
+ else
+ attach_to(@attach_ip) do
+ open_func.call(url)
+ end
  end
- end
+ @remaining = result.meta['x-ratelimit-remaining'].to_i
+ @reset = result.meta['x-ratelimit-reset'].to_i
+ result
  end
 
  # Attach to a specific IP address if the machine has multiple
@@ -17,7 +17,6 @@ module GHTorrent
  class Command
 
  include GHTorrent::Settings
- include GHTorrent::Settings
 
  # Specify the run method for subclasses.
  class << self
@@ -12,6 +12,12 @@ class GHTDataRetrieval < GHTorrent::Command
 
  include GHTorrent::Settings
  include GHTorrent::Logging
+ include GHTorrent::Persister
+
+ def persister
+ @persister ||= connect(:mongo, settings)
+ @persister
+ end
 
  def parse(msg)
  JSON.parse(msg)
@@ -64,9 +70,10 @@ class GHTDataRetrieval < GHTorrent::Command
  repo = data['payload']['pull_request']['base']['repo']['name']
  pullreq_id = data['payload']['number']
  action = data['payload']['action']
+ actor = data['actor']['login']
  created_at = data['created_at']
 
- ghtorrent.get_pull_request(owner, repo, pullreq_id, action, created_at)
+ ghtorrent.get_pull_request(owner, repo, pullreq_id, action, actor, created_at)
  end
 
  def ForkEvent(data)
@@ -155,6 +162,7 @@ Retrieves events from queues and processes them through GHTorrent
  info "GHTDataRetrieval: Received SIGINT, exiting"
  AMQP.stop { EM.stop }
  }
+
  Signal.trap('TERM') {
  info "GHTDataRetrieval: Received SIGTERM, exiting"
  AMQP.stop { EM.stop }
@@ -165,7 +173,8 @@ Retrieves events from queues and processes them through GHTorrent
  :username => config(:amqp_username),
  :password => config(:amqp_password)) do |connection|
 
- channel = AMQP::Channel.new(connection, :prefetch => config(:amqp_prefetch))
+ channel = AMQP::Channel.new(connection)
+ channel.prefetch(config(:amqp_prefetch))
  exchange = channel.topic(config(:amqp_exchange), :durable => true,
  :auto_delete => false)
 
@@ -177,7 +186,10 @@ Retrieves events from queues and processes them through GHTorrent
 
  queue.subscribe(:ack => true) do |headers, msg|
  begin
- data = parse(msg)
+
+ event = persister.get_underlying_connection[:events].find_one('id' => msg)
+ event.delete '_id'
+ data = parse(event.to_json)
  info "GHTDataRetrieval: Processing event: #{data['type']}-#{data['id']}"
 
  unless options[:filter].nil?
@@ -194,8 +206,7 @@ Retrieves events from queues and processes them through GHTorrent
  rescue Exception => e
  # Give a message a chance to be reprocessed
  if headers.redelivered?
- data = parse(msg)
- warn "GHTDataRetrieval: Could not process event: #{data['type']}-#{data['id']}"
+ warn "GHTDataRetrieval: Could not process event: #{msg}"
  headers.reject(:requeue => false)
  else
  headers.reject(:requeue => true)
@@ -1,10 +1,8 @@
  require 'rubygems'
  require 'mongo'
  require 'amqp'
- require 'set'
  require 'eventmachine'
  require 'pp'
- require "amqp/extensions/rabbitmq"
 
  require 'ghtorrent/settings'
  require 'ghtorrent/logging'
@@ -17,27 +15,8 @@ class GHTLoad < GHTorrent::Command
  include GHTorrent::Settings
  include GHTorrent::Persister
 
- def col_info()
- {
- :commits => {
- :name => "commits",
- :payload => "commit.id",
- :unq => "commit.id",
- :col => persister.get_underlying_connection.collection(:commits.to_s),
- :routekey => "commit.%s"
- },
- :events => {
- :name => "events",
- :payload => "",
- :unq => "type",
- :col => persister.get_underlying_connection.collection(:events.to_s),
- :routekey => "evt.%s"
- }
- }
- end
-
  def persister
- @persister ||= connect(:mongo, @settings)
+ @persister ||= connect(:mongo, settings)
  @persister
  end
 
@@ -52,8 +31,13 @@ Loads object ids from a collection to a queue for further processing.
 
  options.opt :earliest, 'Seconds since epoch of earliest item to load',
  :short => 'e', :default => 0, :type => :int
- options.opt :number, 'Number of items to load (-1 means all)',
- :short => 'n', :type => :int, :default => -1
+ options.opt :latest, 'Seconds since epoch of latest item to load',
+ :short => 'l', :default => Time.now.to_i + (60 * 60 * 24 * 360 * 20),
+ :type => :int
+ options.opt :number, 'Total number of items to load',
+ :short => 'n', :type => :int, :default => 2**48
+ options.opt :batch, 'Number of items to process in a batch',
+ :short => 'b', :type => :int, :default => 10000
  options.opt :filter,
  'Filter items by regexp on item attributes: item.attr=regexp',
  :short => 'f', :type => String, :multi => true
@@ -61,7 +45,6 @@ Loads object ids from a collection to a queue for further processing.
 
  def validate
  super
- Trollop::die "no collection specified" unless args[0] && !args[0].empty?
  filter = options[:filter]
  case
  when filter.is_a?(Array)
@@ -71,27 +54,23 @@ Loads object ids from a collection to a queue for further processing.
  when filter == []
  # Noop
  else
- Trollop::die "A filter can only be a string"
+ Trollop::die 'A filter can only be a string'
  end
  end
 
  def go
- # Message tags await publisher ack
- awaiting_ack = SortedSet.new
-
  # Num events read
  num_read = 0
 
- collection = case args[0]
- when "events"
- :events
- when "commits"
- :commits
- end
-
- puts "Loading from collection #{collection}"
  puts "Loading items after #{Time.at(options[:earliest])}" if options[:verbose]
- puts "Loading #{options[:number]} items" if options[:verbose] && options[:number] != -1
+ puts "Loading items before #{Time.at(options[:latest])}" if options[:verbose]
+ puts "Loading #{options[:batch]} items per batch" if options[:batch]
+ puts "Loading #{options[:number]} items" if options[:verbose]
+
+ if options[:batch] >= options[:number]
+ puts "Batch > number of items, setting batch to #{options[:number]}"
+ options[:batch] = options[:number]
+ end
 
  what = case
  when options[:filter].is_a?(Array)
@@ -104,9 +83,12 @@ Loads object ids from a collection to a queue for further processing.
  {}
  end
 
- from = {'_id' => {'$gte' => BSON::ObjectId.from_time(Time.at(options[:earliest]))}}
+ from = {'_id' => {
+ '$gte' => BSON::ObjectId.from_time(Time.at(options[:earliest])),
+ '$lte' => BSON::ObjectId.from_time(Time.at(options[:latest]))}
+ }
 
- (puts "Mongo filter:"; pp what.merge(from)) if options[:verbose]
+ (puts 'Mongo filter:'; pp what.merge(from)) if options[:verbose]
 
  AMQP.start(:host => config(:amqp_host),
  :port => config(:amqp_port),
@@ -119,94 +101,47 @@ Loads object ids from a collection to a queue for further processing.
 
  # What to do when the user hits Ctrl+c
  show_stopper = Proc.new {
+ puts('Closing connection')
  connection.close { EventMachine.stop }
  }
 
- # Read next 100000 items and queue them
+ # Read next options[:batch] items and queue them
  read_and_publish = Proc.new {
 
- to_read = if options.number == -1
- 100000
- else
- if options.number - num_read - 1 <= 0
- -1
- else
- options.number - num_read - 1
- end
- end
-
- read = 0
- col_info[collection][:col].find(what.merge(from),
+ persister.get_underlying_connection[:events].find(what.merge(from),
  :skip => num_read,
- :limit => to_read).each do |e|
-
- payload = read_value(e, col_info[collection][:payload])
- payload = if payload.class == BSON::OrderedHash
- payload.delete "_id" # Inserted by MongoDB on event insert
- payload.to_json
- end
- read += 1
- unq = read_value(e, col_info[collection][:unq])
+ :limit => options[:batch]).each do |e|
+ unq = read_value(e, 'type')
  if unq.class != String or unq.nil? then
- throw Exception.new("Unique value can only be a String")
+ throw Exception.new('Unique value can only be a String')
  end
 
- key = col_info[collection][:routekey] % unq
-
- exchange.publish payload, :persistent => true, :routing_key => key
+ exchange.publish e['id'], :persistent => false,
+ :routing_key => "evt.#{e['type']}"
 
  num_read += 1
- puts("Publish id = #{payload[unq]} (#{num_read} total)") if options.verbose
- awaiting_ack << num_read
+ puts "Publish id = #{e['id']} (#{num_read} total)" if options.verbose
  end
 
- # Nothing new in the DB and no msgs waiting ack
- if (read == 0 and awaiting_ack.size == 0) or to_read == -1
- puts("Finished reading, exiting")
+ if num_read >= options[:number]
+ puts 'Finished reading, exiting'
  show_stopper.call
- end
- }
-
- # Remove acknowledged or failed msg tags from the queue
- # Trigger more messages to be read when ack msg queue size drops to zero
- publisher_event = Proc.new { |ack|
- if ack.multiple then
- awaiting_ack.delete_if { |x| x <= ack.delivery_tag }
  else
- awaiting_ack.delete ack.delivery_tag
- end
-
- if awaiting_ack.size == 0
- puts("ACKS.size= #{awaiting_ack.size}") if options.verbose
+ # Schedule new event processing cycle
  EventMachine.next_tick do
  read_and_publish.call
  end
  end
  }
 
- # Await publisher confirms
- channel.confirm_select
-
- # Callback when confirms have arrived
- channel.on_ack do |ack|
- puts "ACK: tag=#{ack.delivery_tag}, mul=#{ack.multiple}" if options.verbose
- publisher_event.call(ack)
- end
-
- # Callback when confirms failed.
- channel.on_nack do |nack|
- puts "NACK: tag=#{nack.delivery_tag}, mul=#{nack.multiple}" if options.verbose
- publisher_event.call(nack)
- end
-
  # Signal handlers
  Signal.trap('INT', show_stopper)
  Signal.trap('TERM', show_stopper)
 
- # Trigger start processing
  EventMachine.add_timer(0.1) do
  read_and_publish.call
  end
+
  end
  end