ghtorrent 0.10 → 0.11

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +16 -0
  3. data/Gemfile.lock +12 -27
  4. data/README.md +20 -33
  5. data/Rakefile +1 -9
  6. data/bin/ght-log-analyzer +11 -6
  7. data/bin/ght-log-influx +190 -0
  8. data/bin/ght-queue-grep.rb +55 -0
  9. data/bin/ght-retrieve-users +6 -0
  10. data/bin/{ght-rm-dupl → ght-update-repo} +1 -1
  11. data/lib/ghtorrent.rb +4 -4
  12. data/lib/ghtorrent/adapters/base_adapter.rb +4 -11
  13. data/lib/ghtorrent/adapters/mongo_persister.rb +5 -9
  14. data/lib/ghtorrent/adapters/noop_persister.rb +0 -5
  15. data/lib/ghtorrent/api_client.rb +45 -119
  16. data/lib/ghtorrent/command.rb +25 -8
  17. data/lib/ghtorrent/commands/full_user_retriever.rb +50 -0
  18. data/lib/ghtorrent/commands/ght_data_retrieval.rb +12 -98
  19. data/lib/ghtorrent/commands/ght_get_more_commits.rb +13 -17
  20. data/lib/ghtorrent/commands/ght_load.rb +1 -2
  21. data/lib/ghtorrent/commands/ght_mirror_events.rb +8 -12
  22. data/lib/ghtorrent/commands/ght_retrieve_dependents.rb +0 -5
  23. data/lib/ghtorrent/commands/ght_retrieve_one.rb +1 -6
  24. data/lib/ghtorrent/commands/ght_retrieve_repo.rb +56 -26
  25. data/lib/ghtorrent/commands/ght_retrieve_repos.rb +5 -15
  26. data/lib/ghtorrent/commands/ght_retrieve_user.rb +13 -54
  27. data/lib/ghtorrent/commands/ght_retrieve_users.rb +49 -0
  28. data/lib/ghtorrent/commands/ght_update_repo.rb +126 -0
  29. data/lib/ghtorrent/event_processing.rb +140 -0
  30. data/lib/ghtorrent/ghtorrent.rb +330 -396
  31. data/lib/ghtorrent/logging.rb +65 -12
  32. data/lib/ghtorrent/migrations/014_add_deleted_to_projects.rb +1 -1
  33. data/lib/ghtorrent/migrations/019_add_fake_to_users.rb +1 -1
  34. data/lib/ghtorrent/migrations/020_add_deleted_to_users.rb +19 -0
  35. data/lib/ghtorrent/migrations/021_remove_ext_ref_id.rb +42 -0
  36. data/lib/ghtorrent/migrations/022_add_project_languages.rb +24 -0
  37. data/lib/ghtorrent/multiprocess_queue_client.rb +25 -5
  38. data/lib/ghtorrent/retriever.rb +100 -57
  39. data/lib/ghtorrent/settings.rb +14 -17
  40. data/lib/ghtorrent/{transacted_ghtorrent.rb → transacted_gh_torrent.rb} +28 -5
  41. data/lib/version.rb +1 -1
  42. metadata +14 -46
  43. data/bin/ght-process-event +0 -35
  44. data/lib/ghtorrent/cache.rb +0 -97
  45. data/lib/ghtorrent/commands/ght_rm_dupl.rb +0 -132
  46. data/lib/ghtorrent/gh_torrent_exception.rb +0 -6
  47. data/spec/api_client_spec.rb +0 -42
  48. data/spec/spec_helper.rb +0 -21
@@ -0,0 +1,55 @@
1
+ require 'json'
2
+ require 'ghtorrent'
3
+
4
+
5
# Command-line tool that consumes event ids from the 'comments' queue, loads
# the corresponding events from MongoDB and prints "<repo name> <comment url>"
# for every comment whose body mentions a security-related keyword.
class GHTQueueGrep < GHTorrent::Command

  include GHTorrent::Persister

  # Event types that carry a user comment. For each of them the comment
  # (including its body and html_url) is read from payload['comment'].
  COMMENT_EVENT = /CommitCommentEvent|PullRequestReviewCommentEvent|IssueCommentEvent/

  # Keywords searched for in comment bodies (case-insensitive).
  KEYWORDS = %r[security|csrf|xss|injection]im

  # Returns the MongoDB persister, connecting on first use.
  def persister
    @persister ||= connect(:mongo, settings)
  end

  # Loads the event whose 'id' field equals +evt_id+ from the events
  # collection.
  #
  # Returns the event as a plain Hash (the '_id' key removed and the document
  # round-tripped through JSON), or nil when no such event is stored.
  def retrieve_event(evt_id)
    event = persister.get_underlying_connection[:events].find_one('id' => evt_id)
    # find_one yields nil for an unknown id; calling #delete on it would raise.
    return nil if event.nil?

    event.delete '_id'
    JSON.parse(event.to_json)
  end

  # Returns the comment Hash embedded in +event+, or nil when the event is not
  # one of the comment-bearing types or carries no comment.
  def comment_of(event)
    # Regexp#=== is false for nil / non-String types, like the original case/when.
    return nil unless COMMENT_EVENT === event['type']

    (event['payload'] || {})['comment']
  end

  # Entry point: registers a processor on the 'comments' queue that greps
  # each comment body for KEYWORDS and prints the matches.
  def go
    processor = Proc.new do |evt_id|
      event = retrieve_event(evt_id)
      next if event.nil?

      comment = comment_of(event)
      next if comment.nil?

      if comment['body'] =~ KEYWORDS
        puts "#{event['repo']['name']} #{comment['html_url']}"
      end
    end

    queue_client 'comments', processor
  end

end
54
+
55
+ GHTQueueGrep.run
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+ require 'ghtorrent'
5
+
6
+ GHTRetrieveUsers.run(ARGV)
@@ -3,4 +3,4 @@
3
3
  require 'rubygems'
4
4
  require 'ghtorrent'
5
5
 
6
- GHRMDupl.run
6
+ GHTUpdateRepo.run(ARGV)
@@ -27,11 +27,9 @@ require 'ghtorrent/bson_orderedhash'
27
27
 
28
28
  # Basic utility modules
29
29
  require 'version'
30
- require 'ghtorrent/gh_torrent_exception'
31
30
  require 'ghtorrent/utils'
32
31
  require 'ghtorrent/logging'
33
32
  require 'ghtorrent/settings'
34
- require 'ghtorrent/cache'
35
33
  require 'ghtorrent/api_client'
36
34
 
37
35
  # Support for command line utilities offered by this gem
@@ -47,8 +45,9 @@ require 'ghtorrent/persister'
47
45
  require 'ghtorrent/retriever'
48
46
 
49
47
  # SQL database fillup methods
48
+ require 'ghtorrent/event_processing'
50
49
  require 'ghtorrent/ghtorrent'
51
- require 'ghtorrent/transacted_ghtorrent'
50
+ require 'ghtorrent/transacted_gh_torrent'
52
51
 
53
52
  # Multi-process queue clients
54
53
  require 'ghtorrent/multiprocess_queue_client'
@@ -57,12 +56,13 @@ require 'ghtorrent/multiprocess_queue_client'
57
56
  require 'ghtorrent/commands/ght_data_retrieval'
58
57
  require 'ghtorrent/commands/ght_mirror_events'
59
58
  require 'ghtorrent/commands/ght_get_more_commits'
60
- require 'ghtorrent/commands/ght_rm_dupl'
61
59
  require 'ghtorrent/commands/ght_load'
62
60
  require 'ghtorrent/commands/ght_retrieve_repo'
63
61
  require 'ghtorrent/commands/ght_retrieve_user'
64
62
  require 'ghtorrent/commands/ght_retrieve_dependents'
65
63
  require 'ghtorrent/commands/ght_retrieve_repos'
66
64
  require 'ghtorrent/commands/ght_retrieve_one'
65
+ require 'ghtorrent/commands/ght_retrieve_users'
66
+ require 'ghtorrent/commands/ght_update_repo'
67
67
 
68
68
  # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
@@ -11,7 +11,7 @@ module GHTorrent
11
11
  # Stores +data+ into +entity+. Returns a unique key for the stored entry.
12
12
  def store(entity, data = {})
13
13
  unless ENTITIES.include?(entity)
14
- raise GHTorrentException.new("Perister: Entity #{entity} not known")
14
+ raise "Perister: Entity #{entity} not known"
15
15
  end
16
16
  end
17
17
 
@@ -52,14 +52,7 @@ module GHTorrent
52
52
  # matching JSON object.
53
53
  def find(entity, query = {})
54
54
  unless ENTITIES.include?(entity)
55
- raise GHTorrentException.new("Perister: Entity #{entity} not known")
56
- end
57
- end
58
-
59
- # Find the record identified by +id+ in +entity+
60
- def find_by_ext_ref_id(entity, id)
61
- unless ENTITIES.include?(entity)
62
- raise GHTorrentException.new("Perister: Entity #{entity} not known")
55
+ raise "Perister: Entity #{entity} not known"
63
56
  end
64
57
  end
65
58
 
@@ -67,13 +60,13 @@ module GHTorrent
67
60
  # The +query+ can be any query supported by +find+.
68
61
  def count(entity, query = {})
69
62
  unless ENTITIES.include?(entity)
70
- raise GHTorrentException.new("Perister: Entity #{entity} not known")
63
+ raise "Perister: Entity #{entity} not known"
71
64
  end
72
65
  end
73
66
 
74
67
  def del(entity, query = {})
75
68
  unless ENTITIES.include?(entity)
76
- raise GHTorrentException.new("Perister: Entity #{entity} not known")
69
+ raise "Perister: Entity #{entity} not known"
77
70
  end
78
71
  end
79
72
 
@@ -70,12 +70,6 @@ module GHTorrent
70
70
  }
71
71
  end
72
72
 
73
- # Find the record identified by +id+ in +entity+
74
- def find_by_ext_ref_id(entity, id)
75
- super
76
- raise NotImplementedError
77
- end
78
-
79
73
  # Count the number of items returned by +query+
80
74
  def count(entity, query)
81
75
  super
@@ -162,10 +156,12 @@ module GHTorrent
162
156
  .db(config(:mongo_db))
163
157
  end
164
158
 
165
- @mongo.authenticate(config(:mongo_username), config(:mongo_passwd))
159
+ unless config(:mongo_username).nil?
160
+ @mongo.authenticate(config(:mongo_username), config(:mongo_passwd))
161
+ end
166
162
  stats = @mongo.stats
167
- init_db(@mongo) if stats['collections'] < ENTITIES.size + 2
168
- init_db(@mongo) if stats['indexes'] < IDXS.keys.size + ENTITIES.size
163
+ #init_db(@mongo) if stats['collections'] < ENTITIES.size + 2
164
+ #init_db(@mongo) if stats['indexes'] < IDXS.keys.size + ENTITIES.size
169
165
 
170
166
  @mongo
171
167
  else
@@ -18,11 +18,6 @@ module GHTorrent
18
18
  []
19
19
  end
20
20
 
21
- def find_by_ext_ref_id(entity, id)
22
- super
23
- nil
24
- end
25
-
26
21
  def get_id
27
22
  0
28
23
  end
@@ -7,14 +7,12 @@ require 'json'
7
7
  require 'ghtorrent/logging'
8
8
  require 'ghtorrent/settings'
9
9
  require 'ghtorrent/time'
10
- require 'ghtorrent/cache'
11
10
  require 'version'
12
11
 
13
12
  module GHTorrent
14
13
  module APIClient
15
14
  include GHTorrent::Logging
16
15
  include GHTorrent::Settings
17
- include GHTorrent::Cache
18
16
  include GHTorrent::Logging
19
17
 
20
18
  # This is to fix an annoying bug in JRuby's SSL not being able to
@@ -26,10 +24,10 @@ module GHTorrent
26
24
  # A paged request. Used when the result can expand to more than one
27
25
  # result pages.
28
26
  def paged_api_request(url, pages = config(:mirror_history_pages_back),
29
- cache = true, last = nil)
27
+ last = nil)
30
28
 
31
29
  url = ensure_max_per_page(url)
32
- data = determine_cache_and_do_request(cache, url)
30
+ data = api_request_raw(url)
33
31
 
34
32
  return [] if data.nil?
35
33
 
@@ -47,15 +45,7 @@ module GHTorrent
47
45
  if links['next'].nil?
48
46
  parse_request_result(data)
49
47
  else
50
- parse_request_result(data) |
51
- if links['next'] == last
52
- if last != links['last']
53
- warn "APIClient: Last header mismatch: method=#{last}, cache=#{links['last']}"
54
- end
55
- paged_api_request(links['next'], pages, false, last)
56
- else
57
- paged_api_request(links['next'], pages, cache, last)
58
- end
48
+ parse_request_result(data) | paged_api_request(links['next'], pages, last)
59
49
  end
60
50
  else
61
51
  parse_request_result(data)
@@ -65,16 +55,16 @@ module GHTorrent
65
55
 
66
56
  # A normal request. Returns a hash or an array of hashes representing the
67
57
  # parsed JSON result.
68
- def api_request(url, cache = true)
69
- parse_request_result api_request_raw(ensure_max_per_page(url), use_cache?(cache))
58
+ def api_request(url)
59
+ parse_request_result api_request_raw(ensure_max_per_page(url))
70
60
  end
71
61
 
72
62
  # Determine the number of pages contained in a multi-page API response
73
63
  def num_pages(url)
74
64
  url = ensure_max_per_page(url)
75
- data = determine_cache_and_do_request(true, url)
65
+ data = api_request_raw(url)
76
66
 
77
- if data.meta.nil? or data.meta['link'].nil?
67
+ if data.nil? or data.meta.nil? or data.meta['link'].nil?
78
68
  return 1
79
69
  end
80
70
 
@@ -106,60 +96,6 @@ module GHTorrent
106
96
  end
107
97
  end
108
98
 
109
- def determine_cache_and_do_request(cache, url)
110
- query = URI::parse(url).query
111
- params = unless query.nil?
112
- CGI::parse(query)
113
- else
114
- {}
115
- end
116
- if params.has_key?('page') or (params.has_key?('last_sha'))
117
- api_request_raw(url, use_cache?(cache, method = :paged))
118
- else
119
- api_request_raw(url, use_cache?(cache, method = :non_paged))
120
- end
121
- end
122
-
123
- # Determine whether to use cache or not, depending on the type of the
124
- # request
125
- def use_cache?(client_request, method = :non_paged)
126
- @cache_mode ||= case config(:cache_mode)
127
- when 'dev'
128
- :dev
129
- when 'prod'
130
- :prod
131
- when 'all'
132
- :all
133
- when 'off'
134
- when false
135
- :off
136
- else
137
- raise GHTorrentException.new("Don't know cache configuration #{config(:cache_mode)}")
138
- end
139
- case @cache_mode
140
- when :off
141
- return false
142
- when :all
143
- return true
144
- when :dev
145
- unless client_request
146
- return false
147
- end
148
- return true
149
- when :prod
150
- if client_request
151
- return true
152
- else
153
- case method
154
- when :non_paged
155
- return false
156
- when :paged
157
- return true
158
- end
159
- end
160
- end
161
- end
162
-
163
99
  # Parse a Github link header
164
100
  def parse_links(links)
165
101
  links.split(/,/).reduce({}) do |acc, x|
@@ -184,33 +120,39 @@ module GHTorrent
184
120
  end
185
121
  end
186
122
 
123
# Returns the credential that identifies the requester in log messages: the
# OAuth token when one is set and non-empty, the username otherwise.
# NOTE(review): this puts the raw token into the log output, exactly as the
# previous inline expression did; confirm whether it should be masked.
def request_credential
  (@token.nil? || @token.empty?) ? @username : @token
end
private :request_credential

# Builds the log message for a request that failed with an HTTP error.
#
# url       - the URL that was requested
# exception - an error exposing +io.status+ as [code, reason]
#             (the visible caller passes an OpenURI::HTTPError)
#
# Returns a single-line String: surrounding whitespace is stripped and every
# whitespace run is collapsed to one space, so callers that normalise the
# result again get the same text.
def request_error_msg(url, exception)
  status = exception.io.status
  msg = "Failed request. URL: #{url}, Status code: #{status[0]}, " \
        "Status: #{status[1]}, Access: #{request_credential}, " \
        "IP: #{@attach_ip}, Remaining: #{@remaining}"
  msg.strip.gsub(/\s+/, ' ')
end

# Builds the log message for a request that failed with a non-HTTP error.
#
# url       - the URL that was requested
# exception - any error responding to +message+
#
# Returns a single-line String, normalised like request_error_msg.
def error_msg(url, exception)
  msg = "Failed request. URL: #{url}, Exception: #{exception.message}, " \
        "Access: #{request_credential}, " \
        "IP: #{@attach_ip}, Remaining: #{@remaining}"
  msg.strip.gsub(/\s+/, ' ')
end
139
+
187
140
  # Do the actual request and return the result object
188
- def api_request_raw(url, use_cache = false)
141
+ def api_request_raw(url)
189
142
 
190
143
  begin
191
144
  start_time = Time.now
192
- from_cache = false
193
-
194
- contents =
195
- if use_cache
196
- if not (cached = cache_get(url)).nil?
197
- from_cache = true
198
- cached
199
- else
200
- tocache = Cachable.new(do_request(url))
201
- cache_put(url, tocache)
202
- tocache
203
- end
204
- else
205
- do_request(url)
206
- end
207
145
 
146
+ contents = do_request(url)
208
147
  total = Time.now.to_ms - start_time.to_ms
209
- debug "APIClient[#{@attach_ip}]: Request: #{url} #{if from_cache then "from cache," else "(#{@remaining} remaining)," end} Total: #{total} ms"
148
+ info "Successful request. URL: #{url}, Remaining: #{@remaining}, Total: #{total} ms"
210
149
 
211
150
  contents
212
151
  rescue OpenURI::HTTPError => e
213
- case e.io.status[0].to_i
152
+ @remaining = e.io.meta['x-ratelimit-remaining'].to_i
153
+ @reset = e.io.meta['x-ratelimit-reset'].to_i
154
+
155
+ case e.io.status[0].to_i
214
156
  # The following indicate valid Github return codes
215
157
  when 400, # Bad request
216
158
  401, # Unauthorized
@@ -218,22 +160,24 @@ module GHTorrent
218
160
  404, # Not found
219
161
  422 then # Unprocessable entity
220
162
  total = Time.now.to_ms - start_time.to_ms
221
- warn "APIClient[#{@attach_ip}]: Request: #{url} (#{@remaining} remaining), Total: #{total} ms, Status: #{e.io.status[1]}"
222
- @remaining = e.io.meta['x-ratelimit-remaining'].to_i
223
- @reset = e.io.meta['x-ratelimit-reset'].to_i
224
- return nil
163
+ warn request_error_msg(url, e).strip.gsub(/\s+/,' ').gsub("\n", ' ')
164
+ return nil
225
165
  else # Server error or HTTP conditions that Github does not report
226
- warn "APIClient: #{url}: #{e.io.status[1]}"
166
+ warn request_error_msg(url, e).strip.gsub(/\s+/,' ').gsub("\n", ' ')
227
167
  raise e
228
168
  end
169
+ rescue StandardError => e
170
+ warn error_msg(url, e).strip.gsub(/\s+/,' ').gsub("\n", ' ')
171
+ raise e
229
172
  ensure
230
- if not from_cache and config(:respect_api_ratelimit) and @remaining < 10
173
+ # The exact limit is only enforced upon the first @reset
174
+ if 5000 - @remaining >= @req_limit
231
175
  to_sleep = @reset - Time.now.to_i + 2
232
- debug "APIClient[#{@attach_ip}]: Request limit reached, sleeping for #{to_sleep} secs"
176
+ debug "Request limit reached, sleeping for #{to_sleep} secs"
233
177
  t = Thread.new do
234
178
  slept = 0
235
179
  while true do
236
- debug "APIClient[#{@attach_ip}]: sleeping for #{to_sleep - slept} seconds"
180
+ debug "Sleeping for #{to_sleep - slept} seconds"
237
181
  sleep 1
238
182
  slept += 1
239
183
  end
@@ -262,9 +206,10 @@ module GHTorrent
262
206
  @username ||= config(:github_username)
263
207
  @passwd ||= config(:github_passwd)
264
208
  @user_agent ||= config(:user_agent)
265
- @remaining ||= 10
209
+ @remaining ||= 5000
266
210
  @reset ||= Time.now.to_i + 3600
267
211
  @auth_type ||= auth_method(@username, @token)
212
+ @req_limit ||= config(:req_limit)
268
213
 
269
214
  open_func ||=
270
215
  case @auth_type
@@ -276,7 +221,7 @@ module GHTorrent
276
221
  when :token
277
222
  # As per: https://developer.github.com/v3/auth/#via-oauth-tokens
278
223
  lambda {|url| open(url, 'User-Agent' => @user_agent,
279
- :http_basic_authentication => [@token, 'x-oauth-basic'])}
224
+ 'Authorization' => "token #{@token}") }
280
225
  end
281
226
 
282
227
  result = if @attach_ip.nil? or @attach_ip.eql? '0.0.0.0'
@@ -312,7 +257,7 @@ module GHTorrent
312
257
 
313
258
  result = begin
314
259
  yield
315
- rescue Exception => e
260
+ rescue StandardError => e
316
261
  raise e
317
262
  ensure
318
263
  TCPSocket.instance_eval do
@@ -328,22 +273,3 @@ module GHTorrent
328
273
 
329
274
  end
330
275
  end
331
-
332
- class Cachable
333
-
334
- include OpenURI::Meta
335
-
336
- attr_reader :base_uri, :meta, :status
337
-
338
- def initialize(response)
339
- @data = response.read
340
- @base_uri = response.base_uri
341
- @meta = response.meta
342
- @status = response.status
343
- end
344
-
345
- def read
346
- @data
347
- end
348
-
349
- end