ghtorrent 0.5 → 0.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,35 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+ require 'ghtorrent'
5
+
6
+
7
# Command-line tool that processes stored events by id.
#
# Every command-line argument is treated as an event id. The event is looked
# up through +ghtorrent.get_event+ and handed to the handler method named
# after the event's +type+ field. The handler methods, the +handlers+ list and
# the +ghtorrent+ accessor are inherited from GHTDataRetrieval.
class GHTProcessEvent < GHTDataRetrieval

  # Sets the usage banner. This command defines no options of its own.
  #
  # options:: the option parser object supplied by the command framework
  def prepare_options(options)
    options.banner <<-BANNER
Process one or more event ids
#{command_name} [options] eventid [...]
    BANNER
  end

  # Processes each event id in ARGV in order.
  #
  # A failure while processing one event is printed (message and backtrace)
  # and does not stop the remaining ids from being processed.
  def go
    ARGV.each do |a|
      data = ghtorrent.get_event(a)

      # NOTE(review): what get_event returns for an unknown id is not visible
      # here; both nil and an empty collection are treated as "not found".
      if data.nil? || data.empty?
        warn "GHTProcessEvent: No event with id #{a}"
        next
      end

      event = data[0]
      type = event['type']

      # Dispatch only to known handler names: the type string comes from
      # stored data and must not be able to invoke arbitrary methods.
      unless handlers.include?(type)
        warn "GHTProcessEvent: No handler for event type #{type} (event id #{a})"
        next
      end

      begin
        send(type, event)
      rescue StandardError => e
        # StandardError (not Exception) so that Ctrl-C and exit requests
        # still terminate the tool instead of being swallowed.
        puts e
        puts e.backtrace
      end
    end
  end
end
34
+
35
+ GHTProcessEvent.run
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+ require 'ghtorrent'
5
+
6
+ GHTRetrieveRepo.run(ARGV)
data/bin/ght-rm-dupl CHANGED
@@ -1,134 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'rubygems'
4
- require 'mongo'
4
+ require 'ghtorrent'
5
5
 
6
- require 'ghtorrent/settings'
7
- require 'ghtorrent/logging'
8
- require 'ghtorrent/command'
9
- require 'ghtorrent/persister'
10
-
11
- class GHRMDupl < GHTorrent::Command
12
-
13
- include GHTorrent::Settings
14
- include GHTorrent::Persister
15
-
16
- def col_info()
17
- {
18
- :commits => {
19
- :unq => "sha",
20
- :col => persister.get_underlying_connection.collection(:commits.to_s),
21
- },
22
- :events => {
23
- :unq => "id",
24
- :col => persister.get_underlying_connection.collection(:events.to_s),
25
- }
26
- }
27
- end
28
-
29
- def persister
30
- @persister ||= connect(:mongo, @settings)
31
- @persister
32
- end
33
-
34
- def prepare_options(options)
35
- options.banner <<-BANNER
36
- Removes duplicate entries from collections
37
-
38
- #{command_name} [options] collection
39
-
40
- #{command_name} options:
41
- BANNER
42
-
43
- options.opt :earliest, 'Seconds since epoch of earliest item to load',
44
- :short => 'e', :default => 0, :type => :int
45
- options.opt :snapshot, 'Perform clean up every x records',
46
- :short => 's', :default => -1, :type => :int
47
- end
48
-
49
- def validate
50
- super
51
- Trollop::die "no collection specified" unless args[0] && !args[0].empty?
52
- end
53
-
54
- # Print MongoDB remove statements that
55
- # remove all but one entries for each commit.
56
- def remove_duplicates(data, col)
57
- removed = 0
58
- data.select { |k, v| v.size > 1 }.each do |k, v|
59
- v.slice(0..(v.size - 2)).map do |x|
60
- removed += 1 if delete_by_id col, x
61
- end
62
- end
63
- removed
64
- end
65
-
66
- def delete_by_id(col, id)
67
- begin
68
- col.remove({'_id' => id})
69
- true
70
- rescue Mongo::OperationFailure
71
- puts "Cannot remove record with id #{id} from #{col.name}"
72
- false
73
- end
74
- end
75
-
76
- def go
77
- collection = case ARGV[0]
78
- when "commits" then
79
- :commits
80
- when "events" then
81
- :events
82
- else
83
- puts "Not a known collection name: #{ARGV[0]}\n"
84
- end
85
-
86
- from = {'_id' => {'$gte' => BSON::ObjectId.from_time(Time.at(options[:earliest]))}}
87
-
88
- snapshot = options[:snapshot]
89
-
90
- puts "Deleting duplicates from collection #{collection}"
91
- puts "Deleting duplicates after #{Time.at(options[:earliest])}"
92
- puts "Perform clean up every #{snapshot} records"
93
-
94
- # Various counters to report stats
95
- processed = total_processed = removed = 0
96
-
97
- data = Hash.new
98
-
99
- # The following code needs to save intermediate results to cope
100
- # with large datasets
101
- col_info[collection][:col].find(from, :fields => col_info[collection][:unq]).each do |r|
102
- _id = r["_id"]
103
- commit = read_value(r, col_info[collection][:unq])
104
-
105
- # If entries cannot be parsed, remove them
106
- if commit.empty?
107
- puts "Deleting unknown entry #{_id}"
108
- removed += 1 if delete_by_id col_info[collection][:col], _id
109
- else
110
- data[commit] = [] if data[commit].nil?
111
- data[commit] << _id
112
- end
113
-
114
- processed += 1
115
- total_processed += 1
116
-
117
- print "\rProcessed #{processed} records"
118
-
119
- # Calculate duplicates, save intermediate result
120
- if snapshot > 0 and processed > snapshot
121
- puts "\nLoaded #{data.size} values, cleaning"
122
- removed += remove_duplicates data, col_info[collection][:col]
123
- data = Hash.new
124
- processed = 0
125
- end
126
- end
127
-
128
- removed += remove_duplicates data, col_info[collection][:col]
129
-
130
- puts "\nProcessed #{total_processed}, deleted #{removed} duplicates"
131
- end
132
- end
133
-
134
- GHRMDupl.run
6
+ GHRMDupl.run
data/lib/ghtorrent.rb CHANGED
@@ -48,3 +48,13 @@ require 'ghtorrent/retriever'
48
48
 
49
49
  # SQL database fillup methods
50
50
  require 'ghtorrent/ghtorrent'
51
+
52
+ # Commands
53
+ require 'ghtorrent/commands/ght_data_retrieval'
54
+ require 'ghtorrent/commands/ght_mirror_events'
55
+ require 'ghtorrent/commands/ght_get_more_commits'
56
+ require 'ghtorrent/commands/ght_rm_dupl'
57
+ require 'ghtorrent/commands/ght_load'
58
+ require 'ghtorrent/commands/ght_retrieve_repo'
59
+
60
+ # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
@@ -4,7 +4,7 @@ module GHTorrent
4
4
 
5
5
  ENTITIES = [:users, :commits, :followers, :repos, :events, :org_members,
6
6
  :commit_comments, :repo_collaborators, :watchers, :pull_requests,
7
- :forks, :pull_request_comments, :issue_comments, :issues
7
+ :forks, :pull_request_comments, :issue_comments, :issues, :issue_events
8
8
  ]
9
9
 
10
10
  # Stores +data+ into +entity+. Returns a unique key for the stored entry.
@@ -110,8 +110,12 @@ module GHTorrent
110
110
  get_collection("forks")
111
111
  when :pull_request_comments
112
112
  get_collection("pull_request_comments")
113
+ when :issues
114
+ get_collection("issues")
113
115
  when :issue_comments
114
116
  get_collection("issue_comments")
117
+ when :issue_events
118
+ get_collection("issue_events")
115
119
  end
116
120
  end
117
121
 
@@ -130,7 +134,7 @@ module GHTorrent
130
134
  Mongo::ReplSetConnection.new(repl_arr, :read => :secondary)\
131
135
  .db(config(:mongo_db))
132
136
  end
133
- init_db(@mongo) if @mongo.collections.size <= 0
137
+ init_db(@mongo) if @mongo.collections.size < ENTITIES.size
134
138
  @mongo
135
139
  else
136
140
  @mongo
@@ -183,6 +187,13 @@ module GHTorrent
183
187
  ensure_index(:pull_request_comments, "owner")
184
188
  ensure_index(:pull_request_comments, "pullreq_id")
185
189
  ensure_index(:pull_request_comments, "id")
190
+ ensure_index(:issues, "repo")
191
+ ensure_index(:issues, "owner")
192
+ ensure_index(:issues, "issue_id")
193
+ ensure_index(:issue_events, "repo")
194
+ ensure_index(:issue_events, "owner")
195
+ ensure_index(:issue_events, "issue_id")
196
+ ensure_index(:issue_events, "id")
186
197
  end
187
198
 
188
199
  def rescue_connection_failure(max_retries=60)
@@ -8,12 +8,14 @@ require 'ghtorrent/logging'
8
8
  require 'ghtorrent/settings'
9
9
  require 'ghtorrent/time'
10
10
  require 'ghtorrent/cache'
11
+ require 'version'
11
12
 
12
13
  module GHTorrent
13
14
  module APIClient
14
15
  include GHTorrent::Logging
15
16
  include GHTorrent::Settings
16
17
  include GHTorrent::Cache
18
+ include GHTorrent::Logging
17
19
 
18
20
  # This is to fix an annoying bug in JRuby's SSL not being able to
19
21
  # verify a valid certificate.
@@ -25,10 +27,20 @@ module GHTorrent
25
27
  # result pages.
26
28
  def paged_api_request(url, pages = -1, cache = true, last = nil)
27
29
 
28
- data = if URI.parse(url).query.nil? # Top level request, no params
29
- api_request_raw(url, false)
30
- else
30
+ url = if not url.include?("per_page")
31
+ if url.include?("?")
32
+ url + "&per_page=100"
33
+ else
34
+ url + "?per_page=100"
35
+ end
36
+ else
37
+ url
38
+ end
39
+
40
+ data = if CGI::parse(URI::parse(url).query).has_key?("page")
31
41
  api_request_raw(url, use_cache?(cache, method = :paged))
42
+ else
43
+ api_request_raw(url, false)
32
44
  end
33
45
 
34
46
  return [] if data.nil?
@@ -79,7 +91,7 @@ module GHTorrent
79
91
  when "prod"
80
92
  :prod
81
93
  else
82
- raise GHTorrentException("")
94
+ raise GHTorrentException.new("Don't know cache configuration #{@cache_mode}")
83
95
  end
84
96
  case @cache_mode
85
97
  when :dev
@@ -128,21 +140,22 @@ module GHTorrent
128
140
  # Do the actual request and return the result object
129
141
  def api_request_raw(url, use_cache = false)
130
142
  @num_api_calls ||= 0
131
- @ts ||= Time.now().tv_sec()
143
+ @ts ||= Time.now.to_i
144
+ @started_min ||= Time.now.min
132
145
 
133
146
  #Rate limiting to avoid error requests
134
147
  if Time.now().tv_sec() - @ts < 60 then
135
148
  if @num_api_calls >= @settings['mirror']['reqrate'].to_i
136
- sleep = 60 - (Time.now().tv_sec() - @ts)
149
+ sleep = 60 - (Time.now.to_i - @ts)
137
150
  debug "APIClient: Sleeping for #{sleep}"
138
151
  sleep (sleep)
139
152
  @num_api_calls = 0
140
- @ts = Time.now().tv_sec()
153
+ @ts = Time.now.to_i
141
154
  end
142
155
  else
143
156
  debug "APIClient: Tick, num_calls = #{@num_api_calls}, zeroing"
144
157
  @num_api_calls = 0
145
- @ts = Time.now().tv_sec()
158
+ @ts = Time.now.to_i
146
159
  end
147
160
 
148
161
  begin
@@ -166,7 +179,17 @@ module GHTorrent
166
179
  end
167
180
 
168
181
  total = Time.now.to_ms - start_time.to_ms
169
- debug "APIClient: Request: #{url} (#{@num_api_calls} calls,#{if from_cache then " from cache," end} Total: #{total} ms)"
182
+ debug "APIClient: Request: #{url} (#{@num_api_calls} calls #{if from_cache then " from cache," else "(#{contents.meta['x-ratelimit-remaining']} remaining)," end} Total: #{total} ms)"
183
+
184
+ if not from_cache and config(:respect_api_ratelimit) and
185
+ contents.meta['x-ratelimit-remaining'].to_i < 400
186
+ sleep = 60 - @started_min
187
+ debug "APIClient: Request limit reached, sleeping for #{sleep} min"
188
+ sleep(sleep * 60)
189
+ @started_min = Time.now.min
190
+ @num_api_calls = 0
191
+ end
192
+
170
193
  contents
171
194
  rescue OpenURI::HTTPError => e
172
195
  case e.io.status[0].to_i
@@ -176,10 +199,10 @@ module GHTorrent
176
199
  403, # Forbidden
177
200
  404, # Not found
178
201
  422 then # Unprocessable entity
179
- STDERR.puts "#{url}: #{e.io.status[1]}"
202
+ warn "#{url}: #{e.io.status[1]}"
180
203
  return nil
181
204
  else # Server error or HTTP conditions that Github does not report
182
- STDERR.puts "#{url}"
205
+ warn "#{url}"
183
206
  raise e
184
207
  end
185
208
  end
@@ -187,12 +210,23 @@ module GHTorrent
187
210
 
188
211
  def do_request(url)
189
212
  @attach_ip ||= config(:attach_ip)
213
+ @username ||= config(:github_username)
214
+ @passwd ||= config(:github_passwd)
215
+ @user_agent ||= "ghtorrent-v#{GHTorrent::VERSION}"
216
+
217
+ @open_func ||= if @username.nil?
218
+ lambda {|url| open(url, 'User-Agent' => @user_agent)}
219
+ else
220
+ lambda {|url| open(url,
221
+ 'User-Agent' => @user_agent,
222
+ :http_basic_authentication => [@username, @passwd])}
223
+ end
190
224
 
191
225
  if @attach_ip.nil? or @attach_ip.eql? "0.0.0.0"
192
- open(url)
226
+ @open_func.call(url)
193
227
  else
194
228
  attach_to(@attach_ip) do
195
- open(url)
229
+ @open_func.call(url)
196
230
  end
197
231
  end
198
232
  end
@@ -1,4 +1,5 @@
1
1
  require 'json'
2
+ require 'bson'
2
3
 
3
4
  class BSON::OrderedHash
4
5
 
@@ -20,4 +21,4 @@ class BSON::OrderedHash
20
21
  def to_json
21
22
  to_h.to_json
22
23
  end
23
- end
24
+ end
@@ -4,6 +4,7 @@ require 'daemons'
4
4
  require 'etc'
5
5
 
6
6
  require 'ghtorrent/settings'
7
+ require 'version'
7
8
 
8
9
  module GHTorrent
9
10
 
@@ -16,6 +17,7 @@ module GHTorrent
16
17
  class Command
17
18
 
18
19
  include GHTorrent::Settings
20
+ include GHTorrent::Settings
19
21
 
20
22
  # Specify the run method for subclasses.
21
23
  class << self
@@ -33,6 +35,8 @@ module GHTorrent
33
35
  command.process_options
34
36
  command.validate
35
37
 
38
+ puts "GHTorrent version: #{GHTorrent::VERSION}"
39
+
36
40
  command.settings = YAML::load_file command.options[:config]
37
41
 
38
42
  unless command.options[:addr].nil?
@@ -41,6 +45,18 @@ module GHTorrent
41
45
  command.options[:addr])
42
46
  end
43
47
 
48
+ unless command.options[:username].nil?
49
+ command.settings = command.override_config(command.settings,
50
+ :github_username,
51
+ command.options[:username])
52
+ end
53
+
54
+ unless command.options[:password].nil?
55
+ command.settings = command.override_config(command.settings,
56
+ :github_passwd,
57
+ command.options[:password])
58
+ end
59
+
44
60
  if command.options[:daemon]
45
61
  if Process.uid == 0
46
62
  # Daemonize as a proper system daemon
@@ -97,6 +113,8 @@ Standard options:
97
113
  opt :daemon, 'run as daemon', :short => 'd'
98
114
  opt :user, 'run as the specified user (only when started as root)',
99
115
  :short => 'u', :type => String
116
+ opt :username, 'Username at Github', :type => String
117
+ opt :password, 'Password at Github', :type => String
100
118
  end
101
119
  end
102
120
 
@@ -0,0 +1,218 @@
1
+ require 'rubygems'
2
+ require 'amqp'
3
+ require 'json'
4
+ require 'pp'
5
+
6
+ require 'ghtorrent/ghtorrent'
7
+ require 'ghtorrent/settings'
8
+ require 'ghtorrent/logging'
9
+ require 'ghtorrent/command'
10
+
11
# Consumes GitHub event messages from AMQP queues and processes them through
# GHTorrent.
#
# For every name returned by +handlers+ a durable queue is bound to the
# configured topic exchange. Each received message is JSON-decoded and passed
# to the instance method that carries the same name as the event type, which
# is why the handler methods below use CamelCase names.
class GHTDataRetrieval < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Logging

  # Decodes a raw message (JSON text) into Ruby data.
  def parse(msg)
    JSON.parse(msg)
  end

  # Retrieves every commit listed in a push event.
  def PushEvent(data)
    data['payload']['commits'].each do |c|
      # presumably URLs look like
      # https://api.github.com/repos/<owner>/<repo>/commits/<sha>, which puts
      # owner, repo and sha at indices 4, 5 and 7 -- TODO confirm
      url = c['url'].split(/\//)

      ghtorrent.get_commit url[4], url[5], url[7]
    end
  end

  # Records that the acting user started watching a repository.
  def WatchEvent(data)
    owner, repo = data['repo']['name'].split(/\//)
    watcher = data['actor']['login']
    created_at = data['created_at']

    ghtorrent.get_watcher owner, repo, watcher, created_at
  end

  # Records that the acting user followed the target user.
  def FollowEvent(data)
    follower = data['actor']['login']
    followed = data['payload']['target']['login']
    created_at = data['created_at']

    ghtorrent.get_follower(follower, followed, created_at)
  end

  # Records a member being added to a repository.
  def MemberEvent(data)
    # NOTE(review): the acting user is passed as the repository owner;
    # confirm this is intended for repositories owned by someone else.
    owner = data['actor']['login']
    repo = data['repo']['name'].split(/\//)[1]
    new_member = data['payload']['member']['login']
    created_at = data['created_at']

    ghtorrent.get_project_member(owner, repo, new_member, created_at)
  end

  # Retrieves a comment made on a commit.
  def CommitCommentEvent(data)
    # NOTE(review): the acting (commenting) user is passed as the first
    # argument; confirm get_commit_comment does not expect the repo owner.
    user = data['actor']['login']
    repo = data['repo']['name'].split(/\//)[1]
    id = data['payload']['comment']['id']
    created_at = data['created_at']

    ghtorrent.get_commit_comment(user, repo, id, created_at)
  end

  # Retrieves a pull request, identified through its base repository.
  def PullRequestEvent(data)
    base_repo = data['payload']['pull_request']['base']['repo']
    owner = base_repo['owner']['login']
    repo = base_repo['name']
    pullreq_id = data['payload']['number']
    action = data['payload']['action']
    created_at = data['created_at']

    ghtorrent.get_pull_request(owner, repo, pullreq_id, action, created_at)
  end

  # Retrieves a fork of a repository.
  def ForkEvent(data)
    owner, repo = data['repo']['name'].split(/\//)
    fork_id = data['payload']['forkee']['id']
    created_at = data['created_at']

    ghtorrent.get_fork(owner, repo, fork_id, created_at)
  end

  # Retrieves a review comment made on a pull request.
  def PullRequestReviewCommentEvent(data)
    owner, repo = data['repo']['name'].split(/\//)
    comment = data['payload']['comment']
    comment_id = comment['id']
    # The pull request id is the last path segment of the comment's
    # pull_request link.
    pullreq_id = comment['_links']['pull_request']['href'].split(/\//)[-1]
    created_at = data['created_at']

    ghtorrent.get_pullreq_comment(owner, repo, pullreq_id, comment_id, created_at)
  end

  # Retrieves an issue.
  def IssuesEvent(data)
    owner, repo = data['repo']['name'].split(/\//)
    issue_id = data['payload']['issue']['number']
    created_at = data['created_at']

    ghtorrent.get_issue(owner, repo, issue_id, created_at)
  end

  # Retrieves a comment made on an issue.
  def IssueCommentEvent(data)
    owner, repo = data['repo']['name'].split(/\//)
    issue_id = data['payload']['issue']['number']
    comment_id = data['payload']['comment']['id']

    ghtorrent.get_issue_comment(owner, repo, issue_id, comment_id)
  end

  # Names of the supported event types. Each name doubles as the name of the
  # handler method and as part of the queue name and routing key.
  def handlers
    %w(PushEvent WatchEvent FollowEvent MemberEvent
    CommitCommentEvent PullRequestEvent ForkEvent
    PullRequestReviewCommentEvent IssuesEvent IssueCommentEvent)
  end

  # Sets the usage banner and declares the --filter option.
  def prepare_options(options)
    options.banner <<-BANNER
Retrieves events from queues and processes them through GHTorrent
#{command_name} [options]

#{command_name} options:
    BANNER

    options.opt :filter,
                'Only process messages for repos in the provided file',
                :short => 'f', :type => String
  end

  # Aborts when a filter file was named but does not exist.
  def validate
    super
    if options[:filter] && !File.exist?(options[:filter])
      Trollop::die "Filter file does not exist"
    end
  end

  # Logging goes through the logger of the GHTorrent mirror object.
  def logger
    ghtorrent.logger
  end

  # Lazily created GHTorrent::Mirror built from this command's settings.
  def ghtorrent
    @gh ||= GHTorrent::Mirror.new(@settings)
  end

  # Connects to the AMQP broker, binds one queue per handler and processes
  # messages until SIGINT or SIGTERM is received.
  def go
    filter = read_filter

    # Graceful exit
    %w(INT TERM).each do |sig|
      Signal.trap(sig) {
        info "GHTDataRetrieval: Received SIG#{sig}, exiting"
        AMQP.stop { EM.stop }
      }
    end

    AMQP.start(:host => config(:amqp_host),
               :port => config(:amqp_port),
               :username => config(:amqp_username),
               :password => config(:amqp_password)) do |connection|

      channel = AMQP::Channel.new(connection, :prefetch => config(:amqp_prefetch))
      exchange = channel.topic(config(:amqp_exchange), :durable => true,
                               :auto_delete => false)

      handlers.each do |h|
        queue = channel.queue("#{h}s", {:durable => true})\
                       .bind(exchange, :routing_key => "evt.#{h}")

        info "GHTDataRetrieval: Binding handler #{h} to routing key evt.#{h}"

        queue.subscribe(:ack => true) do |headers, msg|
          # Declared outside begin so the rescue clause can tell whether
          # decoding succeeded without decoding the message a second time.
          data = nil
          begin
            data = parse(msg)
            info "GHTDataRetrieval: Processing event: #{data['type']}-#{data['id']}"

            if options[:filter].nil? || filter.include?(data['repo']['name'])
              send(h, data)
            else
              info "GHTDataRetrieval: Repo #{data['repo']['name']} not in process list. Ignoring event #{data['type']}-#{data['id']}"
            end

            headers.ack
            info "GHTDataRetrieval: Processed event: #{data['type']}-#{data['id']}"
          rescue Exception => e
            # Deliberately broad: whatever goes wrong, the message must be
            # rejected instead of being left unacknowledged.
            # Give a message a chance to be reprocessed
            if headers.redelivered?
              warn "GHTDataRetrieval: Could not process event: #{describe_event(data)}"
              headers.reject(:requeue => false)
            else
              headers.reject(:requeue => true)
            end

            STDERR.puts e
            STDERR.puts e.backtrace.join("\n")
          end
        end
      end
    end
  end

  private

  # Builds the list of "owner/repo" names from the file named by the :filter
  # option. Returns an empty list when no filter file was given. Lines whose
  # first non-space character is '#' and lines with fewer than two
  # space-separated fields are skipped.
  def read_filter
    entries = []
    return entries unless options[:filter]

    # File.foreach closes the file once all lines have been read.
    File.foreach(options[:filter]) do |l|
      next if l.match(/^ *#/)
      # Drop the line terminator first; otherwise the repo name of a
      # two-column line would end in "\n" and never match an event.
      parts = l.chomp.split(/ /)
      next if parts.size < 2
      entry = parts[0] + "/" + parts[1]
      debug "GHTDataRetrieval: Filtering events by #{entry}"
      entries << entry
    end

    entries
  end

  # Short "<type>-<id>" label for log messages. Safe to call when the
  # message could not be decoded into a Hash.
  def describe_event(data)
    data.is_a?(Hash) ? "#{data['type']}-#{data['id']}" : 'unparseable message'
  end
end
217
+
218
+ # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :