ghtorrent 0.5 → 0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,35 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+ require 'ghtorrent'
5
+
6
+
7
# Command that re-processes already stored events: every command line
# argument is treated as an event id, the event is looked up through
# +ghtorrent.get_event+ and handed to the handler method (inherited from
# GHTDataRetrieval) that is named after the event's +type+ field.
class GHTProcessEvent < GHTDataRetrieval

  # Sets the usage banner shown by the option parser.
  def prepare_options(options)
    options.banner <<-BANNER
Process one or more event ids
#{command_name} [options] eventid [...]
    BANNER
  end

  # Processes each event id given on the command line.
  #
  # An id for which no event is found, or whose type has no registered
  # handler, is reported and skipped. A failure while handling one event
  # is printed and does not stop processing of the remaining ids.
  def go
    ARGV.each do |event_id|
      data = ghtorrent.get_event(event_id)

      if data.empty?
        warn "GHTProcessEvent: No event with id #{event_id}"
        next
      end

      event = data[0]
      type = event['type']

      # Only dispatch to the known handler methods; the type string comes
      # from stored data and must not be able to invoke arbitrary methods.
      unless handlers.include?(type)
        warn "GHTProcessEvent: No handler for event type #{type} (event #{event_id})"
        next
      end

      begin
        send(type, event)
      rescue StandardError => e
        # StandardError only: SystemExit and signals must still be able
        # to terminate the command.
        puts e
        puts e.backtrace
      end
    end
  end
end
34
+
35
+ GHTProcessEvent.run
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+ require 'ghtorrent'
5
+
6
+ GHTRetrieveRepo.run(ARGV)
data/bin/ght-rm-dupl CHANGED
@@ -1,134 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'rubygems'
4
- require 'mongo'
4
+ require 'ghtorrent'
5
5
 
6
- require 'ghtorrent/settings'
7
- require 'ghtorrent/logging'
8
- require 'ghtorrent/command'
9
- require 'ghtorrent/persister'
10
-
11
# Command that removes duplicate documents from the +commits+ or +events+
# MongoDB collection. Documents are grouped by the collection's unique
# field (see #col_info); for every group with more than one document all
# but the last one seen are deleted.
class GHRMDupl < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Persister

  # Describes the supported collections: for each one the name of the field
  # that should be unique (+:unq+) and the underlying collection object
  # (+:col+).
  def col_info()
    connection = persister.get_underlying_connection
    {
      :commits => {
        :unq => "sha",
        :col => connection.collection(:commits.to_s),
      },
      :events => {
        :unq => "id",
        :col => connection.collection(:events.to_s),
      }
    }
  end

  # Lazily created persister connection, built from the command settings.
  def persister
    @persister ||= connect(:mongo, @settings)
  end

  # Sets the usage banner and declares the command specific options.
  def prepare_options(options)
    options.banner <<-BANNER
Removes duplicate entries from collections

#{command_name} [options] collection

#{command_name} options:
    BANNER

    options.opt :earliest, 'Seconds since epoch of earliest item to load',
                :short => 'e', :default => 0, :type => :int
    options.opt :snapshot, 'Perform clean up every x records',
                :short => 's', :default => -1, :type => :int
  end

  # Requires a non-empty collection name as first argument.
  def validate
    super
    Trollop::die "no collection specified" unless args[0] && !args[0].empty?
  end

  # Deletes all but the last id of every entry in +data+ that lists more
  # than one id.
  #
  # data:: Hash mapping a unique-field value to the array of +_id+ values
  #        that carry it.
  # col::  collection to delete from.
  #
  # Returns the number of documents that were actually removed.
  def remove_duplicates(data, col)
    removed = 0
    data.each do |_key, ids|
      next unless ids.size > 1

      # Keep the last id of the group, drop the rest.
      ids[0..-2].each do |id|
        removed += 1 if delete_by_id(col, id)
      end
    end
    removed
  end

  # Removes the document with the given +_id+ from +col+.
  # Returns true on success, false (after printing a message) when the
  # driver reports an operation failure.
  def delete_by_id(col, id)
    col.remove({'_id' => id})
    true
  rescue Mongo::OperationFailure
    puts "Cannot remove record with id #{id} from #{col.name}"
    false
  end

  # Scans the chosen collection and removes duplicates.
  #
  # When the +snapshot+ option is positive, duplicates are cleaned and the
  # in-memory index is reset every time more than that many records have
  # been read, which bounds memory use; duplicates whose copies fall into
  # different snapshots are not detected in that mode.
  def go
    collection = case ARGV[0]
                 when "commits" then :commits
                 when "events" then :events
                 end

    if collection.nil?
      # Stop here: continuing would dereference a missing col_info entry.
      puts "Not a known collection name: #{ARGV[0]}\n"
      return
    end

    # Look the collection up once instead of once per record.
    info = col_info[collection]
    col = info[:col]
    unq = info[:unq]

    from = {'_id' => {'$gte' => BSON::ObjectId.from_time(Time.at(options[:earliest]))}}

    snapshot = options[:snapshot]

    puts "Deleting duplicates from collection #{collection}"
    puts "Deleting duplicates after #{Time.at(options[:earliest])}"
    puts "Perform clean up every #{snapshot} records"

    # Various counters to report stats
    processed = total_processed = removed = 0

    data = {}

    col.find(from, :fields => unq).each do |r|
      _id = r["_id"]
      # NOTE(review): read_value is defined outside this class; it is
      # assumed to return the value of field +unq+ - confirm whether it can
      # return nil.
      value = read_value(r, unq)

      # If entries cannot be parsed, remove them
      if value.nil? || value.empty?
        puts "Deleting unknown entry #{_id}"
        removed += 1 if delete_by_id(col, _id)
      else
        (data[value] ||= []) << _id
      end

      processed += 1
      total_processed += 1

      print "\rProcessed #{processed} records"

      # Calculate duplicates, save intermediate result
      if snapshot > 0 and processed > snapshot
        puts "\nLoaded #{data.size} values, cleaning"
        removed += remove_duplicates(data, col)
        data = {}
        processed = 0
      end
    end

    removed += remove_duplicates(data, col)

    puts "\nProcessed #{total_processed}, deleted #{removed} duplicates"
  end
end
133
-
134
- GHRMDupl.run
6
+ GHRMDupl.run
data/lib/ghtorrent.rb CHANGED
@@ -48,3 +48,13 @@ require 'ghtorrent/retriever'
48
48
 
49
49
  # SQL database fillup methods
50
50
  require 'ghtorrent/ghtorrent'
51
+
52
+ # Commands
53
+ require 'ghtorrent/commands/ght_data_retrieval'
54
+ require 'ghtorrent/commands/ght_mirror_events'
55
+ require 'ghtorrent/commands/ght_get_more_commits'
56
+ require 'ghtorrent/commands/ght_rm_dupl'
57
+ require 'ghtorrent/commands/ght_load'
58
+ require 'ghtorrent/commands/ght_retrieve_repo'
59
+
60
+ # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
@@ -4,7 +4,7 @@ module GHTorrent
4
4
 
5
5
  ENTITIES = [:users, :commits, :followers, :repos, :events, :org_members,
6
6
  :commit_comments, :repo_collaborators, :watchers, :pull_requests,
7
- :forks, :pull_request_comments, :issue_comments, :issues
7
+ :forks, :pull_request_comments, :issue_comments, :issues, :issue_events
8
8
  ]
9
9
 
10
10
  # Stores +data+ into +entity+. Returns a unique key for the stored entry.
@@ -110,8 +110,12 @@ module GHTorrent
110
110
  get_collection("forks")
111
111
  when :pull_request_comments
112
112
  get_collection("pull_request_comments")
113
+ when :issues
114
+ get_collection("issues")
113
115
  when :issue_comments
114
116
  get_collection("issue_comments")
117
+ when :issue_events
118
+ get_collection("issue_events")
115
119
  end
116
120
  end
117
121
 
@@ -130,7 +134,7 @@ module GHTorrent
130
134
  Mongo::ReplSetConnection.new(repl_arr, :read => :secondary)\
131
135
  .db(config(:mongo_db))
132
136
  end
133
- init_db(@mongo) if @mongo.collections.size <= 0
137
+ init_db(@mongo) if @mongo.collections.size < ENTITIES.size
134
138
  @mongo
135
139
  else
136
140
  @mongo
@@ -183,6 +187,13 @@ module GHTorrent
183
187
  ensure_index(:pull_request_comments, "owner")
184
188
  ensure_index(:pull_request_comments, "pullreq_id")
185
189
  ensure_index(:pull_request_comments, "id")
190
+ ensure_index(:issues, "repo")
191
+ ensure_index(:issues, "owner")
192
+ ensure_index(:issues, "issue_id")
193
+ ensure_index(:issue_events, "repo")
194
+ ensure_index(:issue_events, "owner")
195
+ ensure_index(:issue_events, "issue_id")
196
+ ensure_index(:issue_events, "id")
186
197
  end
187
198
 
188
199
  def rescue_connection_failure(max_retries=60)
@@ -8,12 +8,14 @@ require 'ghtorrent/logging'
8
8
  require 'ghtorrent/settings'
9
9
  require 'ghtorrent/time'
10
10
  require 'ghtorrent/cache'
11
+ require 'version'
11
12
 
12
13
  module GHTorrent
13
14
  module APIClient
14
15
  include GHTorrent::Logging
15
16
  include GHTorrent::Settings
16
17
  include GHTorrent::Cache
18
+ include GHTorrent::Logging
17
19
 
18
20
  # This is to fix an annoying bug in JRuby's SSL not being able to
19
21
  # verify a valid certificate.
@@ -25,10 +27,20 @@ module GHTorrent
25
27
  # result pages.
26
28
  def paged_api_request(url, pages = -1, cache = true, last = nil)
27
29
 
28
- data = if URI.parse(url).query.nil? # Top level request, no params
29
- api_request_raw(url, false)
30
- else
30
+ url = if not url.include?("per_page")
31
+ if url.include?("?")
32
+ url + "&per_page=100"
33
+ else
34
+ url + "?per_page=100"
35
+ end
36
+ else
37
+ url
38
+ end
39
+
40
+ data = if CGI::parse(URI::parse(url).query).has_key?("page")
31
41
  api_request_raw(url, use_cache?(cache, method = :paged))
42
+ else
43
+ api_request_raw(url, false)
32
44
  end
33
45
 
34
46
  return [] if data.nil?
@@ -79,7 +91,7 @@ module GHTorrent
79
91
  when "prod"
80
92
  :prod
81
93
  else
82
- raise GHTorrentException("")
94
+ raise GHTorrentException.new("Don't know cache configuration #{@cache_mode}")
83
95
  end
84
96
  case @cache_mode
85
97
  when :dev
@@ -128,21 +140,22 @@ module GHTorrent
128
140
  # Do the actual request and return the result object
129
141
  def api_request_raw(url, use_cache = false)
130
142
  @num_api_calls ||= 0
131
- @ts ||= Time.now().tv_sec()
143
+ @ts ||= Time.now.to_i
144
+ @started_min ||= Time.now.min
132
145
 
133
146
  #Rate limiting to avoid error requests
134
147
  if Time.now().tv_sec() - @ts < 60 then
135
148
  if @num_api_calls >= @settings['mirror']['reqrate'].to_i
136
- sleep = 60 - (Time.now().tv_sec() - @ts)
149
+ sleep = 60 - (Time.now.to_i - @ts)
137
150
  debug "APIClient: Sleeping for #{sleep}"
138
151
  sleep (sleep)
139
152
  @num_api_calls = 0
140
- @ts = Time.now().tv_sec()
153
+ @ts = Time.now.to_i
141
154
  end
142
155
  else
143
156
  debug "APIClient: Tick, num_calls = #{@num_api_calls}, zeroing"
144
157
  @num_api_calls = 0
145
- @ts = Time.now().tv_sec()
158
+ @ts = Time.now.to_i
146
159
  end
147
160
 
148
161
  begin
@@ -166,7 +179,17 @@ module GHTorrent
166
179
  end
167
180
 
168
181
  total = Time.now.to_ms - start_time.to_ms
169
- debug "APIClient: Request: #{url} (#{@num_api_calls} calls,#{if from_cache then " from cache," end} Total: #{total} ms)"
182
+ debug "APIClient: Request: #{url} (#{@num_api_calls} calls #{if from_cache then " from cache," else "(#{contents.meta['x-ratelimit-remaining']} remaining)," end} Total: #{total} ms)"
183
+
184
+ if not from_cache and config(:respect_api_ratelimit) and
185
+ contents.meta['x-ratelimit-remaining'].to_i < 400
186
+ sleep = 60 - @started_min
187
+ debug "APIClient: Request limit reached, sleeping for #{sleep} min"
188
+ sleep(sleep * 60)
189
+ @started_min = Time.now.min
190
+ @num_api_calls = 0
191
+ end
192
+
170
193
  contents
171
194
  rescue OpenURI::HTTPError => e
172
195
  case e.io.status[0].to_i
@@ -176,10 +199,10 @@ module GHTorrent
176
199
  403, # Forbidden
177
200
  404, # Not found
178
201
  422 then # Unprocessable entity
179
- STDERR.puts "#{url}: #{e.io.status[1]}"
202
+ warn "#{url}: #{e.io.status[1]}"
180
203
  return nil
181
204
  else # Server error or HTTP conditions that Github does not report
182
- STDERR.puts "#{url}"
205
+ warn "#{url}"
183
206
  raise e
184
207
  end
185
208
  end
@@ -187,12 +210,23 @@ module GHTorrent
187
210
 
188
211
  def do_request(url)
189
212
  @attach_ip ||= config(:attach_ip)
213
+ @username ||= config(:github_username)
214
+ @passwd ||= config(:github_passwd)
215
+ @user_agent ||= "ghtorrent-v#{GHTorrent::VERSION}"
216
+
217
+ @open_func ||= if @username.nil?
218
+ lambda {|url| open(url, 'User-Agent' => @user_agent)}
219
+ else
220
+ lambda {|url| open(url,
221
+ 'User-Agent' => @user_agent,
222
+ :http_basic_authentication => [@username, @passwd])}
223
+ end
190
224
 
191
225
  if @attach_ip.nil? or @attach_ip.eql? "0.0.0.0"
192
- open(url)
226
+ @open_func.call(url)
193
227
  else
194
228
  attach_to(@attach_ip) do
195
- open(url)
229
+ @open_func.call(url)
196
230
  end
197
231
  end
198
232
  end
@@ -1,4 +1,5 @@
1
1
  require 'json'
2
+ require 'bson'
2
3
 
3
4
  class BSON::OrderedHash
4
5
 
@@ -20,4 +21,4 @@ class BSON::OrderedHash
20
21
  def to_json
21
22
  to_h.to_json
22
23
  end
23
- end
24
+ end
@@ -4,6 +4,7 @@ require 'daemons'
4
4
  require 'etc'
5
5
 
6
6
  require 'ghtorrent/settings'
7
+ require 'version'
7
8
 
8
9
  module GHTorrent
9
10
 
@@ -16,6 +17,7 @@ module GHTorrent
16
17
  class Command
17
18
 
18
19
  include GHTorrent::Settings
20
+ include GHTorrent::Settings
19
21
 
20
22
  # Specify the run method for subclasses.
21
23
  class << self
@@ -33,6 +35,8 @@ module GHTorrent
33
35
  command.process_options
34
36
  command.validate
35
37
 
38
+ puts "GHTorrent version: #{GHTorrent::VERSION}"
39
+
36
40
  command.settings = YAML::load_file command.options[:config]
37
41
 
38
42
  unless command.options[:addr].nil?
@@ -41,6 +45,18 @@ module GHTorrent
41
45
  command.options[:addr])
42
46
  end
43
47
 
48
+ unless command.options[:username].nil?
49
+ command.settings = command.override_config(command.settings,
50
+ :github_username,
51
+ command.options[:username])
52
+ end
53
+
54
+ unless command.options[:password].nil?
55
+ command.settings = command.override_config(command.settings,
56
+ :github_passwd,
57
+ command.options[:password])
58
+ end
59
+
44
60
  if command.options[:daemon]
45
61
  if Process.uid == 0
46
62
  # Daemonize as a proper system daemon
@@ -97,6 +113,8 @@ Standard options:
97
113
  opt :daemon, 'run as daemon', :short => 'd'
98
114
  opt :user, 'run as the specified user (only when started as root)',
99
115
  :short => 'u', :type => String
116
+ opt :username, 'Username at Github', :type => String
117
+ opt :password, 'Password at Github', :type => String
100
118
  end
101
119
  end
102
120
 
@@ -0,0 +1,218 @@
1
+ require 'rubygems'
2
+ require 'amqp'
3
+ require 'json'
4
+ require 'pp'
5
+
6
+ require 'ghtorrent/ghtorrent'
7
+ require 'ghtorrent/settings'
8
+ require 'ghtorrent/logging'
9
+ require 'ghtorrent/command'
10
+
11
+ class GHTDataRetrieval < GHTorrent::Command
12
+
13
+ include GHTorrent::Settings
14
+ include GHTorrent::Logging
15
+
16
+ def parse(msg)
17
+ JSON.parse(msg)
18
+ end
19
+
20
+ def PushEvent(data)
21
+ data['payload']['commits'].each do |c|
22
+ url = c['url'].split(/\//)
23
+
24
+ ghtorrent.get_commit url[4], url[5], url[7]
25
+ end
26
+ end
27
+
28
+ def WatchEvent(data)
29
+ owner = data['repo']['name'].split(/\//)[0]
30
+ repo = data['repo']['name'].split(/\//)[1]
31
+ watcher = data['actor']['login']
32
+ created_at = data['created_at']
33
+
34
+ ghtorrent.get_watcher owner, repo, watcher, created_at
35
+ end
36
+
37
+ def FollowEvent(data)
38
+ follower = data['actor']['login']
39
+ followed = data['payload']['target']['login']
40
+ created_at = data['created_at']
41
+
42
+ ghtorrent.get_follower(follower, followed, created_at)
43
+ end
44
+
45
+ def MemberEvent(data)
46
+ owner = data['actor']['login']
47
+ repo = data['repo']['name'].split(/\//)[1]
48
+ new_member = data['payload']['member']['login']
49
+ created_at = data['created_at']
50
+
51
+ ghtorrent.get_project_member(owner, repo, new_member, created_at)
52
+ end
53
+
54
+ def CommitCommentEvent(data)
55
+ user = data['actor']['login']
56
+ repo = data['repo']['name'].split(/\//)[1]
57
+ id = data['payload']['comment']['id']
58
+ created_at = data['created_at']
59
+
60
+ ghtorrent.get_commit_comment(user, repo, id, created_at)
61
+ end
62
+
63
+ def PullRequestEvent(data)
64
+ owner = data['payload']['pull_request']['base']['repo']['owner']['login']
65
+ repo = data['payload']['pull_request']['base']['repo']['name']
66
+ pullreq_id = data['payload']['number']
67
+ action = data['payload']['action']
68
+ created_at = data['created_at']
69
+
70
+ ghtorrent.get_pull_request(owner, repo, pullreq_id, action, created_at)
71
+ end
72
+
73
+ def ForkEvent(data)
74
+ owner = data['repo']['name'].split(/\//)[0]
75
+ repo = data['repo']['name'].split(/\//)[1]
76
+ fork_id = data['payload']['forkee']['id']
77
+ created_at = data['created_at']
78
+
79
+ ghtorrent.get_fork(owner, repo, fork_id, created_at)
80
+ end
81
+
82
+ def PullRequestReviewCommentEvent(data)
83
+ owner = data['repo']['name'].split(/\//)[0]
84
+ repo = data['repo']['name'].split(/\//)[1]
85
+ comment_id = data['payload']['comment']['id']
86
+ pullreq_id = data['payload']['comment']['_links']['pull_request']['href'].split(/\//)[-1]
87
+ created_at = data['created_at']
88
+
89
+ ghtorrent.get_pullreq_comment(owner, repo, pullreq_id, comment_id, created_at)
90
+ end
91
+
92
+ def IssuesEvent(data)
93
+ owner = data['repo']['name'].split(/\//)[0]
94
+ repo = data['repo']['name'].split(/\//)[1]
95
+ issue_id = data['payload']['issue']['number']
96
+ created_at = data['created_at']
97
+
98
+ ghtorrent.get_issue(owner, repo, issue_id, created_at)
99
+ end
100
+
101
+ def IssueCommentEvent(data)
102
+ owner = data['repo']['name'].split(/\//)[0]
103
+ repo = data['repo']['name'].split(/\//)[1]
104
+ issue_id = data['payload']['issue']['number']
105
+ comment_id = data['payload']['comment']['id']
106
+ created_at = data['created_at']
107
+
108
+ ghtorrent.get_issue_comment(owner, repo, issue_id, comment_id)
109
+ end
110
+
111
+ def handlers
112
+ %w(PushEvent WatchEvent FollowEvent MemberEvent
113
+ CommitCommentEvent PullRequestEvent ForkEvent
114
+ PullRequestReviewCommentEvent IssuesEvent IssueCommentEvent)
115
+ #%w(IssuesEvent IssueCommentEvent)
116
+ end
117
+
118
+ def prepare_options(options)
119
+ options.banner <<-BANNER
120
+ Retrieves events from queues and processes them through GHTorrent
121
+ #{command_name} [options]
122
+
123
+ #{command_name} options:
124
+ BANNER
125
+
126
+ options.opt :filter,
127
+ 'Only process messages for repos in the provided file',
128
+ :short => 'f', :type => String
129
+ end
130
+
131
+ def validate
132
+ super
133
+ Trollop::die "Filter file does not exist" if options[:filter] and not File.exist?(options[:filter])
134
+ end
135
+
136
+ def logger
137
+ ghtorrent.logger
138
+ end
139
+
140
+ def ghtorrent
141
+ @gh ||= GHTorrent::Mirror.new(@settings)
142
+ @gh
143
+ end
144
+
145
+ def go
146
+ filter = Array.new
147
+
148
+ if options[:filter]
149
+ File.open(options[:filter]).each { |l|
150
+ next if l.match(/^ *#/)
151
+ parts = l.split(/ /)
152
+ next if parts.size < 2
153
+ debug "GHTDataRetrieval: Filtering events by #{parts[0] + "/" + parts[1]}"
154
+ filter << parts[0] + "/" + parts[1]
155
+ }
156
+ end
157
+
158
+ # Graceful exit
159
+ Signal.trap('INT') {
160
+ info "GHTDataRetrieval: Received SIGINT, exiting"
161
+ AMQP.stop { EM.stop }
162
+ }
163
+ Signal.trap('TERM') {
164
+ info "GHTDataRetrieval: Received SIGTERM, exiting"
165
+ AMQP.stop { EM.stop }
166
+ }
167
+
168
+ AMQP.start(:host => config(:amqp_host),
169
+ :port => config(:amqp_port),
170
+ :username => config(:amqp_username),
171
+ :password => config(:amqp_password)) do |connection|
172
+
173
+ channel = AMQP::Channel.new(connection, :prefetch => config(:amqp_prefetch))
174
+ exchange = channel.topic(config(:amqp_exchange), :durable => true,
175
+ :auto_delete => false)
176
+
177
+ handlers.each { |h|
178
+ queue = channel.queue("#{h}s", {:durable => true})\
179
+ .bind(exchange, :routing_key => "evt.#{h}")
180
+
181
+ info "GHTDataRetrieval: Binding handler #{h} to routing key evt.#{h}"
182
+
183
+ queue.subscribe(:ack => true) do |headers, msg|
184
+ begin
185
+ data = parse(msg)
186
+ info "GHTDataRetrieval: Processing event: #{data['type']}-#{data['id']}"
187
+
188
+ unless options[:filter].nil?
189
+ if filter.include?(data['repo']['name'])
190
+ send(h, data)
191
+ else
192
+ info "GHTDataRetrieval: Repo #{data['repo']['name']} not in process list. Ignoring event #{data['type']}-#{data['id']}"
193
+ end
194
+ else
195
+ send(h, data)
196
+ end
197
+ headers.ack
198
+ info "GHTDataRetrieval: Processed event: #{data['type']}-#{data['id']}"
199
+ rescue Exception => e
200
+ # Give a message a chance to be reprocessed
201
+ if headers.redelivered?
202
+ data = parse(msg)
203
+ warn "GHTDataRetrieval: Could not process event: #{data['type']}-#{data['id']}"
204
+ headers.reject(:requeue => false)
205
+ else
206
+ headers.reject(:requeue => true)
207
+ end
208
+
209
+ STDERR.puts e
210
+ STDERR.puts e.backtrace.join("\n")
211
+ end
212
+ end
213
+ }
214
+ end
215
+ end
216
+ end
217
+
218
+ # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :