ghtorrent 0.10 → 0.11

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +16 -0
  3. data/Gemfile.lock +12 -27
  4. data/README.md +20 -33
  5. data/Rakefile +1 -9
  6. data/bin/ght-log-analyzer +11 -6
  7. data/bin/ght-log-influx +190 -0
  8. data/bin/ght-queue-grep.rb +55 -0
  9. data/bin/ght-retrieve-users +6 -0
  10. data/bin/{ght-rm-dupl → ght-update-repo} +1 -1
  11. data/lib/ghtorrent.rb +4 -4
  12. data/lib/ghtorrent/adapters/base_adapter.rb +4 -11
  13. data/lib/ghtorrent/adapters/mongo_persister.rb +5 -9
  14. data/lib/ghtorrent/adapters/noop_persister.rb +0 -5
  15. data/lib/ghtorrent/api_client.rb +45 -119
  16. data/lib/ghtorrent/command.rb +25 -8
  17. data/lib/ghtorrent/commands/full_user_retriever.rb +50 -0
  18. data/lib/ghtorrent/commands/ght_data_retrieval.rb +12 -98
  19. data/lib/ghtorrent/commands/ght_get_more_commits.rb +13 -17
  20. data/lib/ghtorrent/commands/ght_load.rb +1 -2
  21. data/lib/ghtorrent/commands/ght_mirror_events.rb +8 -12
  22. data/lib/ghtorrent/commands/ght_retrieve_dependents.rb +0 -5
  23. data/lib/ghtorrent/commands/ght_retrieve_one.rb +1 -6
  24. data/lib/ghtorrent/commands/ght_retrieve_repo.rb +56 -26
  25. data/lib/ghtorrent/commands/ght_retrieve_repos.rb +5 -15
  26. data/lib/ghtorrent/commands/ght_retrieve_user.rb +13 -54
  27. data/lib/ghtorrent/commands/ght_retrieve_users.rb +49 -0
  28. data/lib/ghtorrent/commands/ght_update_repo.rb +126 -0
  29. data/lib/ghtorrent/event_processing.rb +140 -0
  30. data/lib/ghtorrent/ghtorrent.rb +330 -396
  31. data/lib/ghtorrent/logging.rb +65 -12
  32. data/lib/ghtorrent/migrations/014_add_deleted_to_projects.rb +1 -1
  33. data/lib/ghtorrent/migrations/019_add_fake_to_users.rb +1 -1
  34. data/lib/ghtorrent/migrations/020_add_deleted_to_users.rb +19 -0
  35. data/lib/ghtorrent/migrations/021_remove_ext_ref_id.rb +42 -0
  36. data/lib/ghtorrent/migrations/022_add_project_languages.rb +24 -0
  37. data/lib/ghtorrent/multiprocess_queue_client.rb +25 -5
  38. data/lib/ghtorrent/retriever.rb +100 -57
  39. data/lib/ghtorrent/settings.rb +14 -17
  40. data/lib/ghtorrent/{transacted_ghtorrent.rb → transacted_gh_torrent.rb} +28 -5
  41. data/lib/version.rb +1 -1
  42. metadata +14 -46
  43. data/bin/ght-process-event +0 -35
  44. data/lib/ghtorrent/cache.rb +0 -97
  45. data/lib/ghtorrent/commands/ght_rm_dupl.rb +0 -132
  46. data/lib/ghtorrent/gh_torrent_exception.rb +0 -6
  47. data/spec/api_client_spec.rb +0 -42
  48. data/spec/spec_helper.rb +0 -21
@@ -17,6 +17,7 @@ module GHTorrent
17
17
  class Command
18
18
 
19
19
  include GHTorrent::Settings
20
+ include GHTorrent::Logging
20
21
 
21
22
  # Specify the run method for subclasses.
22
23
  class << self
@@ -60,6 +61,19 @@ module GHTorrent
60
61
  command.options[:token])
61
62
  end
62
63
 
64
+ unless command.options[:req_limit].nil?
65
+ command.settings = command.override_config(command.settings,
66
+ :req_limit,
67
+ command.options[:req_limit])
68
+ end
69
+
70
+ unless command.options[:uniq].nil?
71
+ command.settings = command.override_config(command.settings,
72
+ :logging_uniq,
73
+ command.options[:uniq])
74
+ end
75
+
76
+
63
77
  begin
64
78
  command.go
65
79
  rescue => e
@@ -88,12 +102,16 @@ Standard options:
88
102
  opt :config, 'config.yaml file location', :short => 'c',
89
103
  :default => 'config.yaml'
90
104
  opt :verbose, 'verbose mode', :short => 'v'
91
- opt :addr, 'ip address to use for performing requests', :short => 'a',
105
+ opt :addr, 'IP address to use for performing requests', :short => 'a',
92
106
  :type => String
93
107
  opt :username, 'Username at Github', :short => 's', :type => String
94
108
  opt :password, 'Password at Github', :type => String
95
109
  opt :token, 'OAuth Github token (use instead of username/password)',
96
110
  :type => String, :short => 't'
111
+ opt :req_limit, 'Request limit for provided account (in reqs/hour)',
112
+ :type => Integer, :short => 'l'
113
+ opt :uniq, 'Unique name for this command. Will appear in logs.',
114
+ :type => String, :short => 'u'
97
115
  end
98
116
  end
99
117
 
@@ -143,12 +161,11 @@ Standard options:
143
161
  def go
144
162
  end
145
163
 
146
- # Specify a handler to incoming messages from a connection to
147
- # a queue.
148
- # [queue]: The queue name to bind to
149
- # [ack]: :before or :after when should acks be send, before or after
150
- # the block returns
151
- # [block]: A block with one argument (the message)
164
+ # Specify a handler to incoming messages from a connection to a queue.
165
+ #
166
+ # @param queue [String] the queue name to bind to
167
+ # @param ack [Symbol] when acks should be sent: :before or :after the block returns
168
+ # @param block [Block] a block accepting one argument (the message)
152
169
  def queue_client(queue, ack = :after, block)
153
170
 
154
171
  stopped = false
@@ -194,7 +211,7 @@ Standard options:
194
211
  sleep(1)
195
212
  rescue Interrupt => _
196
213
  stopped = true
197
- rescue Exception => e
214
+ rescue StandardError => e
198
215
  raise e
199
216
  end
200
217
  end
@@ -0,0 +1,50 @@
1
+ module GHTorrent
2
+ module Commands
3
+ # Defines a process to download the full data available for a single user
4
+ module FullUserRetriever
5
+
6
+ def retrieve_user(login)
7
+ #self.settings = override_config(settings, :mirror_history_pages_back, -1)
8
+
9
+ user_entry = ght.transaction { ght.ensure_user(login, false, false) }
10
+ on_github = api_request(ghurl ("users/#{login}"))
11
+
12
+ if on_github.empty?
13
+ if user_entry.nil?
14
+ warn "User #{login} does not exist on GitHub"
15
+ exit
16
+ else
17
+ ght.transaction do
18
+ ght.get_db.from(:users).where(:login => login).update(:users__deleted => true)
19
+ end
20
+ warn "User #{login} marked as deleted"
21
+ return
22
+ end
23
+ else
24
+ if user_entry.nil?
25
+ warn "Error retrieving user #{login}"
26
+ exit
27
+ end
28
+ end
29
+
30
+ user = user_entry[:login]
31
+
32
+ def send_message(function, user)
33
+ begin
34
+ ght.send(function, user)
35
+ rescue StandardError => e
36
+ puts STDERR, e.message
37
+ puts STDERR, e.backtrace
38
+ end
39
+ end
40
+
41
+ functions = %w(ensure_user_following ensure_user_followers ensure_orgs ensure_org)
42
+
43
+ functions.each do |x|
44
+ send_message(x, user)
45
+ end
46
+
47
+ end
48
+ end
49
+ end
50
+ end
@@ -12,6 +12,7 @@ class GHTDataRetrieval < GHTorrent::Command
12
12
  include GHTorrent::Settings
13
13
  include GHTorrent::Logging
14
14
  include GHTorrent::Persister
15
+ include GHTorrent::EventProcessing
15
16
 
16
17
  def persister
17
18
  @persister ||= connect(:mongo, settings)
@@ -22,99 +23,11 @@ class GHTDataRetrieval < GHTorrent::Command
22
23
  JSON.parse(msg)
23
24
  end
24
25
 
25
- def PushEvent(data)
26
- data['payload']['commits'].each do |c|
27
- url = c['url'].split(/\//)
28
-
29
- ghtorrent.get_commit url[4], url[5], url[7]
30
- end
31
- end
32
-
33
- def WatchEvent(data)
34
- owner = data['repo']['name'].split(/\//)[0]
35
- repo = data['repo']['name'].split(/\//)[1]
36
- watcher = data['actor']['login']
37
- created_at = data['created_at']
38
-
39
- ghtorrent.get_watcher owner, repo, watcher, created_at
40
- end
41
-
42
- def FollowEvent(data)
43
- follower = data['actor']['login']
44
- followed = data['payload']['target']['login']
45
- created_at = data['created_at']
46
-
47
- ghtorrent.get_follower(follower, followed, created_at)
48
- end
49
-
50
- def MemberEvent(data)
51
- owner = data['actor']['login']
52
- repo = data['repo']['name'].split(/\//)[1]
53
- new_member = data['payload']['member']['login']
54
- created_at = data['created_at']
55
-
56
- ghtorrent.get_project_member(owner, repo, new_member, created_at)
57
- end
58
-
59
- def CommitCommentEvent(data)
60
- user = data['repo']['name'].split(/\//)[0]
61
- repo = data['repo']['name'].split(/\//)[1]
62
- id = data['payload']['comment']['id']
63
- sha = data['payload']['comment']['commit_id']
64
-
65
- ghtorrent.get_commit_comment(user, repo, sha, id)
66
- end
67
-
68
- def PullRequestEvent(data)
69
- owner = data['payload']['pull_request']['base']['repo']['owner']['login']
70
- repo = data['payload']['pull_request']['base']['repo']['name']
71
- pullreq_id = data['payload']['number']
72
- action = data['payload']['action']
73
- actor = data['actor']['login']
74
- created_at = data['created_at']
75
-
76
- ghtorrent.get_pull_request(owner, repo, pullreq_id, action, actor, created_at)
77
- end
78
-
79
- def ForkEvent(data)
80
- owner = data['repo']['name'].split(/\//)[0]
81
- repo = data['repo']['name'].split(/\//)[1]
82
- fork_id = data['payload']['forkee']['id']
83
-
84
- #ghtorrent.get_fork(owner, repo, fork_id)
85
- end
86
-
87
- def PullRequestReviewCommentEvent(data)
88
- owner = data['repo']['name'].split(/\//)[0]
89
- repo = data['repo']['name'].split(/\//)[1]
90
- comment_id = data['payload']['comment']['id']
91
- pullreq_id = data['payload']['comment']['_links']['pull_request']['href'].split(/\//)[-1]
92
-
93
- ghtorrent.get_pullreq_comment(owner, repo, pullreq_id, comment_id)
94
- end
95
-
96
- def IssuesEvent(data)
97
- owner = data['repo']['name'].split(/\//)[0]
98
- repo = data['repo']['name'].split(/\//)[1]
99
- issue_id = data['payload']['issue']['number']
100
-
101
- ghtorrent.get_issue(owner, repo, issue_id)
102
- end
103
-
104
- def IssueCommentEvent(data)
105
- owner = data['repo']['name'].split(/\//)[0]
106
- repo = data['repo']['name'].split(/\//)[1]
107
- issue_id = data['payload']['issue']['number']
108
- comment_id = data['payload']['comment']['id']
109
-
110
- ghtorrent.get_issue_comment(owner, repo, issue_id, comment_id)
111
- end
112
-
113
26
  def handlers
114
- %w(PushEvent WatchEvent FollowEvent MemberEvent
27
+ %w(PushEvent WatchEvent FollowEvent MemberEvent CreateEvent
115
28
  CommitCommentEvent PullRequestEvent ForkEvent
116
29
  PullRequestReviewCommentEvent IssuesEvent IssueCommentEvent)
117
- #%w(PullRequestEvent)
30
+ #%w(ForkEvent)
118
31
  end
119
32
 
120
33
  def prepare_options(options)
@@ -135,7 +48,8 @@ If event_id is provided, only this event is processed.
135
48
  end
136
49
 
137
50
  def ghtorrent
138
- @gh ||= GHTorrent::Mirror.new(@settings)
51
+ #@gh ||= GHTorrent::Mirror.new(@settings)
52
+ @gh ||= TransactedGHTorrent.new(settings)
139
53
  @gh
140
54
  end
141
55
 
@@ -143,7 +57,7 @@ If event_id is provided, only this event is processed.
143
57
  event = persister.get_underlying_connection[:events].find_one('id' => evt_id)
144
58
  event.delete '_id'
145
59
  data = parse(event.to_json)
146
- info "GHTDataRetrieval: Processing event: #{data['type']}-#{data['id']}"
60
+ debug "Processing event: #{data['type']}-#{data['id']}"
147
61
  data
148
62
  end
149
63
 
@@ -153,7 +67,7 @@ If event_id is provided, only this event is processed.
153
67
  event = retrieve_event(ARGV[0])
154
68
 
155
69
  if event.nil?
156
- warn "GHTDataRetrieval: No event with id: #{ARGV[0]}"
70
+ warn "No event with id: #{ARGV[0]}"
157
71
  else
158
72
  send(event['type'], event)
159
73
  end
@@ -178,20 +92,20 @@ If event_id is provided, only this event is processed.
178
92
  queue = channel.queue("#{h}s", {:durable => true})\
179
93
  .bind(exchange, :routing_key => "evt.#{h}")
180
94
 
181
- info "GHTDataRetrieval: Binding handler #{h} to routing key evt.#{h}"
95
+ info "Binding handler #{h} to routing key evt.#{h}"
182
96
 
183
97
  queue.subscribe(:ack => true) do |headers, properties, msg|
98
+ start = Time.now
184
99
  begin
185
-
186
100
  data = retrieve_event(msg)
187
101
  send(h, data)
188
102
 
189
103
  channel.acknowledge(headers.delivery_tag, false)
190
- info "GHTDataRetrieval: Processed event: #{data['type']}-#{data['id']}"
191
- rescue Exception => e
104
+ info "Success processing event. Type: #{data['type']}, ID: #{data['id']}, Time: #{Time.now.to_ms - start.to_ms} ms"
105
+ rescue StandardError => e
192
106
  # Give a message a chance to be reprocessed
193
107
  if headers.redelivered?
194
- warn "GHTDataRetrieval: Could not process event: #{msg}"
108
+ warn "Error processing event. Type: #{data['type']}, ID: #{data['id']}, Time: #{Time.now.to_ms - start.to_ms} ms"
195
109
  channel.reject(headers.delivery_tag, false)
196
110
  else
197
111
  channel.reject(headers.delivery_tag, true)
@@ -12,6 +12,7 @@ class GHTMoreCommitsRetriever < GHTorrent::Command
12
12
  include GHTorrent::Settings
13
13
  include GHTorrent::Retriever
14
14
  include GHTorrent::Persister
15
+ include GHTorrent::Logging
15
16
 
16
17
  def prepare_options(options)
17
18
  options.banner <<-BANNER
@@ -28,7 +29,7 @@ Retrieves more commits for the provided repository
28
29
  If not set, will start from latest stored commit',
29
30
  :short => 'f', :default => false, :type => :boolean
30
31
  options.opt :upto, 'Get all commits up to the provided timestamp',
31
- :short => 't', :default => 0, :type => :int
32
+ :short => 'x', :default => 0, :type => :int
32
33
  end
33
34
 
34
35
  def validate
@@ -36,20 +37,15 @@ Retrieves more commits for the provided repository
36
37
  Trollop::die "Two arguments are required" unless args[0] && !args[0].empty?
37
38
  end
38
39
 
39
- def logger
40
- @ght.logger
41
- end
40
+ #def logger
41
+ # @ght.logger
42
+ #end
42
43
 
43
44
  def persister
44
45
  @persister ||= connect(:mongo, settings)
45
46
  @persister
46
47
  end
47
48
 
48
- def ext_uniq
49
- @ext_uniq ||= config(:uniq_id)
50
- @ext_uniq
51
- end
52
-
53
49
  def go
54
50
 
55
51
  @ght ||= GHTorrent::Mirror.new(settings)
@@ -82,7 +78,7 @@ Retrieves more commits for the provided repository
82
78
  old_head = nil
83
79
  while (true)
84
80
  begin
85
- logger.debug("Retrieving more commits for #{user}/#{repo} from head: #{head}")
81
+ debug("Retrieving more commits for #{user}/#{repo} from head: #{head}")
86
82
 
87
83
  @settings = override_config(@settings, :mirror_history_pages_back, 1)
88
84
  commits = retrieve_commits(repo, head, user)
@@ -97,12 +93,12 @@ Retrieves more commits for the provided repository
97
93
  total_commits += 1
98
94
 
99
95
  if options[:num] < total_commits
100
- logger.info("Already retrieved #{total_commits} commits. Stopping.")
96
+ info("Already retrieved #{total_commits} commits. Stopping.")
101
97
  return
102
98
  end
103
99
 
104
100
  if Time.parse(c['commit']['author']['date']) < Time.at(options[:upto])
105
- logger.info("Commit #{c['sha']} older than #{Time.at(options[:upto])}. Stopping.")
101
+ info("Commit #{c['sha']} older than #{Time.at(options[:upto])}. Stopping.")
106
102
  return
107
103
  end
108
104
 
@@ -110,17 +106,17 @@ Retrieves more commits for the provided repository
110
106
  @ght.ensure_commit(repo, c['sha'], user)
111
107
  end
112
108
  end
113
- rescue Exception => e
114
- logger.warn("Error processing: #{e}")
115
- logger.warn(e.backtrace.join("\n"))
109
+ rescue StandardError => e
110
+ warn("Error processing: #{e}")
111
+ warn(e.backtrace.join("\n"))
116
112
  if old_head == head
117
- logger.info("Commit #{c['sha']} older than #{Time.at(options[:upto])}. Stopping.")
113
+ info("Commit #{c['sha']} older than #{Time.at(options[:upto])}. Stopping.")
118
114
  fail("Cannot retrieve commits from head: #{head}")
119
115
  end
120
116
  old_head = head
121
117
  end
122
118
  end
123
- logger.debug("Processed #{total_commits} commits for #{user}/#{repo}")
119
+ debug("Processed #{total_commits} commits for #{user}/#{repo}")
124
120
  end
125
121
  end
126
122
 
@@ -4,7 +4,6 @@ require 'pp'
4
4
  require 'bunny'
5
5
 
6
6
  require 'ghtorrent/settings'
7
- require 'ghtorrent/logging'
8
7
  require 'ghtorrent/persister'
9
8
  require 'ghtorrent/command'
10
9
  require 'ghtorrent/bson_orderedhash'
@@ -31,7 +30,7 @@ Loads object ids from a collection to a queue for further processing.
31
30
  options.opt :earliest, 'Seconds since epoch of earliest item to load',
32
31
  :short => 'e', :default => 0, :type => :int
33
32
  options.opt :latest, 'Seconds since epoch of latest item to load',
34
- :short => 'l', :default => Time.now.to_i + (60 * 60 * 24 * 360 * 20),
33
+ :short => 'x', :default => Time.now.to_i + (60 * 60 * 24 * 360 * 20),
35
34
  :type => :int
36
35
  options.opt :number, 'Total number of items to load',
37
36
  :short => 'n', :type => :int, :default => 2**48
@@ -15,10 +15,7 @@ class GHTMirrorEvents < GHTorrent::Command
15
15
  include GHTorrent::Logging
16
16
  include GHTorrent::Persister
17
17
  include GHTorrent::APIClient
18
-
19
- def logger
20
- @logger
21
- end
18
+ include GHTorrent::Logging
22
19
 
23
20
  def store_count(events)
24
21
  stored = Array.new
@@ -41,13 +38,13 @@ class GHTMirrorEvents < GHTorrent::Command
41
38
  def retrieve(exchange)
42
39
  begin
43
40
  new = dupl = 0
44
- events = api_request "https://api.github.com/events", false
41
+ events = api_request "https://api.github.com/events?per_page=100"
45
42
  (new, dupl, stored) = store_count events
46
43
 
47
- # This means that first page cannot contain all new events. Go
48
- # up to 10 pages back to find all new events not contained in first page.
44
+ # This means that the first page does not contain all new events. Do
45
+ # a paged request and get everything on the queue
49
46
  if dupl == 0
50
- events = paged_api_request "https://api.github.com/events"
47
+ events = paged_api_request "https://api.github.com/events?per_page=100"
51
48
  (new1, dupl1, stored1) = store_count events
52
49
  stored = stored | stored1
53
50
  new = new + new1
@@ -58,7 +55,7 @@ class GHTMirrorEvents < GHTorrent::Command
58
55
  exchange.publish e['id'], :persistent => true, :routing_key => key
59
56
  end
60
57
  return new, dupl
61
- rescue Exception => e
58
+ rescue StandardError => e
62
59
  STDERR.puts e.message
63
60
  STDERR.puts e.backtrace
64
61
  end
@@ -66,7 +63,6 @@ class GHTMirrorEvents < GHTorrent::Command
66
63
 
67
64
  def go
68
65
  @persister = connect(:mongo, @settings)
69
- @logger = Logger.new(STDOUT)
70
66
 
71
67
  conn = Bunny.new(:host => config(:amqp_host),
72
68
  :port => config(:amqp_port),
@@ -75,7 +71,7 @@ class GHTMirrorEvents < GHTorrent::Command
75
71
  conn.start
76
72
 
77
73
  ch = conn.create_channel
78
- @logger.debug "Connection to #{config(:amqp_host)} succeded"
74
+ debug "Connection to #{config(:amqp_host)} succeded"
79
75
 
80
76
  exchange = ch.topic(config(:amqp_exchange), :durable => true,
81
77
  :auto_delete => false)
@@ -97,7 +93,7 @@ class GHTMirrorEvents < GHTorrent::Command
97
93
  end
98
94
  rescue Interrupt
99
95
  stopped = true
100
- rescue Exception => e
96
+ rescue StandardError => e
101
97
  @logger.error e
102
98
  end
103
99
  end