ghtorrent 0.10 → 0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +16 -0
- data/Gemfile.lock +12 -27
- data/README.md +20 -33
- data/Rakefile +1 -9
- data/bin/ght-log-analyzer +11 -6
- data/bin/ght-log-influx +190 -0
- data/bin/ght-queue-grep.rb +55 -0
- data/bin/ght-retrieve-users +6 -0
- data/bin/{ght-rm-dupl → ght-update-repo} +1 -1
- data/lib/ghtorrent.rb +4 -4
- data/lib/ghtorrent/adapters/base_adapter.rb +4 -11
- data/lib/ghtorrent/adapters/mongo_persister.rb +5 -9
- data/lib/ghtorrent/adapters/noop_persister.rb +0 -5
- data/lib/ghtorrent/api_client.rb +45 -119
- data/lib/ghtorrent/command.rb +25 -8
- data/lib/ghtorrent/commands/full_user_retriever.rb +50 -0
- data/lib/ghtorrent/commands/ght_data_retrieval.rb +12 -98
- data/lib/ghtorrent/commands/ght_get_more_commits.rb +13 -17
- data/lib/ghtorrent/commands/ght_load.rb +1 -2
- data/lib/ghtorrent/commands/ght_mirror_events.rb +8 -12
- data/lib/ghtorrent/commands/ght_retrieve_dependents.rb +0 -5
- data/lib/ghtorrent/commands/ght_retrieve_one.rb +1 -6
- data/lib/ghtorrent/commands/ght_retrieve_repo.rb +56 -26
- data/lib/ghtorrent/commands/ght_retrieve_repos.rb +5 -15
- data/lib/ghtorrent/commands/ght_retrieve_user.rb +13 -54
- data/lib/ghtorrent/commands/ght_retrieve_users.rb +49 -0
- data/lib/ghtorrent/commands/ght_update_repo.rb +126 -0
- data/lib/ghtorrent/event_processing.rb +140 -0
- data/lib/ghtorrent/ghtorrent.rb +330 -396
- data/lib/ghtorrent/logging.rb +65 -12
- data/lib/ghtorrent/migrations/014_add_deleted_to_projects.rb +1 -1
- data/lib/ghtorrent/migrations/019_add_fake_to_users.rb +1 -1
- data/lib/ghtorrent/migrations/020_add_deleted_to_users.rb +19 -0
- data/lib/ghtorrent/migrations/021_remove_ext_ref_id.rb +42 -0
- data/lib/ghtorrent/migrations/022_add_project_languages.rb +24 -0
- data/lib/ghtorrent/multiprocess_queue_client.rb +25 -5
- data/lib/ghtorrent/retriever.rb +100 -57
- data/lib/ghtorrent/settings.rb +14 -17
- data/lib/ghtorrent/{transacted_ghtorrent.rb → transacted_gh_torrent.rb} +28 -5
- data/lib/version.rb +1 -1
- metadata +14 -46
- data/bin/ght-process-event +0 -35
- data/lib/ghtorrent/cache.rb +0 -97
- data/lib/ghtorrent/commands/ght_rm_dupl.rb +0 -132
- data/lib/ghtorrent/gh_torrent_exception.rb +0 -6
- data/spec/api_client_spec.rb +0 -42
- data/spec/spec_helper.rb +0 -21
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'ghtorrent'
|
3
|
+
|
4
|
+
|
5
|
+
class GHTQueueGrep < GHTorrent::Command
|
6
|
+
|
7
|
+
include GHTorrent::Persister
|
8
|
+
|
9
|
+
def persister
|
10
|
+
@persister ||= connect(:mongo, settings)
|
11
|
+
@persister
|
12
|
+
end
|
13
|
+
|
14
|
+
def retrieve_event(evt_id)
|
15
|
+
event = persister.get_underlying_connection[:events].find_one('id' => evt_id)
|
16
|
+
event.delete '_id'
|
17
|
+
data = JSON.parse(event.to_json)
|
18
|
+
data
|
19
|
+
end
|
20
|
+
|
21
|
+
def go
|
22
|
+
|
23
|
+
processor = Proc.new do |evt_id|
|
24
|
+
e = retrieve_event(evt_id)
|
25
|
+
|
26
|
+
comment = case e['type']
|
27
|
+
when /CommitCommentEvent/
|
28
|
+
e['payload']['comment']['body']
|
29
|
+
when /PullRequestReviewCommentEvent/
|
30
|
+
e['payload']['body']
|
31
|
+
when /IssueCommentEvent/
|
32
|
+
e['payload']['comment']['body']
|
33
|
+
end
|
34
|
+
|
35
|
+
url = case e['type']
|
36
|
+
when /CommitCommentEvent/
|
37
|
+
e['payload']['comment']['html_url']
|
38
|
+
when /PullRequestReviewCommentEvent/
|
39
|
+
e['payload']['comment']['html_url']
|
40
|
+
when /IssueCommentEvent/
|
41
|
+
e['payload']['comment']['html_url']
|
42
|
+
end
|
43
|
+
|
44
|
+
if comment =~ %r[security|csrf|xss|injection]im
|
45
|
+
puts "#{e['repo']['name']} #{url}"
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
49
|
+
|
50
|
+
queue_client 'comments', processor
|
51
|
+
end
|
52
|
+
|
53
|
+
end
|
54
|
+
|
55
|
+
GHTQueueGrep.run
|
data/lib/ghtorrent.rb
CHANGED
@@ -27,11 +27,9 @@ require 'ghtorrent/bson_orderedhash'
|
|
27
27
|
|
28
28
|
# Basic utility modules
|
29
29
|
require 'version'
|
30
|
-
require 'ghtorrent/gh_torrent_exception'
|
31
30
|
require 'ghtorrent/utils'
|
32
31
|
require 'ghtorrent/logging'
|
33
32
|
require 'ghtorrent/settings'
|
34
|
-
require 'ghtorrent/cache'
|
35
33
|
require 'ghtorrent/api_client'
|
36
34
|
|
37
35
|
# Support for command line utilities offered by this gem
|
@@ -47,8 +45,9 @@ require 'ghtorrent/persister'
|
|
47
45
|
require 'ghtorrent/retriever'
|
48
46
|
|
49
47
|
# SQL database fillup methods
|
48
|
+
require 'ghtorrent/event_processing'
|
50
49
|
require 'ghtorrent/ghtorrent'
|
51
|
-
require 'ghtorrent/transacted_ghtorrent'
|
50
|
+
require 'ghtorrent/transacted_gh_torrent'
|
52
51
|
|
53
52
|
# Multi-process queue clients
|
54
53
|
require 'ghtorrent/multiprocess_queue_client'
|
@@ -57,12 +56,13 @@ require 'ghtorrent/multiprocess_queue_client'
|
|
57
56
|
require 'ghtorrent/commands/ght_data_retrieval'
|
58
57
|
require 'ghtorrent/commands/ght_mirror_events'
|
59
58
|
require 'ghtorrent/commands/ght_get_more_commits'
|
60
|
-
require 'ghtorrent/commands/ght_rm_dupl'
|
61
59
|
require 'ghtorrent/commands/ght_load'
|
62
60
|
require 'ghtorrent/commands/ght_retrieve_repo'
|
63
61
|
require 'ghtorrent/commands/ght_retrieve_user'
|
64
62
|
require 'ghtorrent/commands/ght_retrieve_dependents'
|
65
63
|
require 'ghtorrent/commands/ght_retrieve_repos'
|
66
64
|
require 'ghtorrent/commands/ght_retrieve_one'
|
65
|
+
require 'ghtorrent/commands/ght_retrieve_users'
|
66
|
+
require 'ghtorrent/commands/ght_update_repo'
|
67
67
|
|
68
68
|
# vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
|
@@ -11,7 +11,7 @@ module GHTorrent
|
|
11
11
|
# Stores +data+ into +entity+. Returns a unique key for the stored entry.
|
12
12
|
def store(entity, data = {})
|
13
13
|
unless ENTITIES.include?(entity)
|
14
|
-
raise
|
14
|
+
raise "Perister: Entity #{entity} not known"
|
15
15
|
end
|
16
16
|
end
|
17
17
|
|
@@ -52,14 +52,7 @@ module GHTorrent
|
|
52
52
|
# matching JSON object.
|
53
53
|
def find(entity, query = {})
|
54
54
|
unless ENTITIES.include?(entity)
|
55
|
-
raise
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
# Find the record identified by +id+ in +entity+
|
60
|
-
def find_by_ext_ref_id(entity, id)
|
61
|
-
unless ENTITIES.include?(entity)
|
62
|
-
raise GHTorrentException.new("Perister: Entity #{entity} not known")
|
55
|
+
raise "Perister: Entity #{entity} not known"
|
63
56
|
end
|
64
57
|
end
|
65
58
|
|
@@ -67,13 +60,13 @@ module GHTorrent
|
|
67
60
|
# The +query+ can be any query supported by +find+.
|
68
61
|
def count(entity, query = {})
|
69
62
|
unless ENTITIES.include?(entity)
|
70
|
-
raise
|
63
|
+
raise "Perister: Entity #{entity} not known"
|
71
64
|
end
|
72
65
|
end
|
73
66
|
|
74
67
|
def del(entity, query = {})
|
75
68
|
unless ENTITIES.include?(entity)
|
76
|
-
raise
|
69
|
+
raise "Perister: Entity #{entity} not known"
|
77
70
|
end
|
78
71
|
end
|
79
72
|
|
@@ -70,12 +70,6 @@ module GHTorrent
|
|
70
70
|
}
|
71
71
|
end
|
72
72
|
|
73
|
-
# Find the record identified by +id+ in +entity+
|
74
|
-
def find_by_ext_ref_id(entity, id)
|
75
|
-
super
|
76
|
-
raise NotImplementedError
|
77
|
-
end
|
78
|
-
|
79
73
|
# Count the number of items returned by +query+
|
80
74
|
def count(entity, query)
|
81
75
|
super
|
@@ -162,10 +156,12 @@ module GHTorrent
|
|
162
156
|
.db(config(:mongo_db))
|
163
157
|
end
|
164
158
|
|
165
|
-
|
159
|
+
unless config(:mongo_username).nil?
|
160
|
+
@mongo.authenticate(config(:mongo_username), config(:mongo_passwd))
|
161
|
+
end
|
166
162
|
stats = @mongo.stats
|
167
|
-
init_db(@mongo) if stats['collections'] < ENTITIES.size + 2
|
168
|
-
init_db(@mongo) if stats['indexes'] < IDXS.keys.size + ENTITIES.size
|
163
|
+
#init_db(@mongo) if stats['collections'] < ENTITIES.size + 2
|
164
|
+
#init_db(@mongo) if stats['indexes'] < IDXS.keys.size + ENTITIES.size
|
169
165
|
|
170
166
|
@mongo
|
171
167
|
else
|
data/lib/ghtorrent/api_client.rb
CHANGED
@@ -7,14 +7,12 @@ require 'json'
|
|
7
7
|
require 'ghtorrent/logging'
|
8
8
|
require 'ghtorrent/settings'
|
9
9
|
require 'ghtorrent/time'
|
10
|
-
require 'ghtorrent/cache'
|
11
10
|
require 'version'
|
12
11
|
|
13
12
|
module GHTorrent
|
14
13
|
module APIClient
|
15
14
|
include GHTorrent::Logging
|
16
15
|
include GHTorrent::Settings
|
17
|
-
include GHTorrent::Cache
|
18
16
|
include GHTorrent::Logging
|
19
17
|
|
20
18
|
# This is to fix an annoying bug in JRuby's SSL not being able to
|
@@ -26,10 +24,10 @@ module GHTorrent
|
|
26
24
|
# A paged request. Used when the result can expand to more than one
|
27
25
|
# result pages.
|
28
26
|
def paged_api_request(url, pages = config(:mirror_history_pages_back),
|
29
|
-
|
27
|
+
last = nil)
|
30
28
|
|
31
29
|
url = ensure_max_per_page(url)
|
32
|
-
data = determine_cache_and_do_request(cache, url)
|
30
|
+
data = api_request_raw(url)
|
33
31
|
|
34
32
|
return [] if data.nil?
|
35
33
|
|
@@ -47,15 +45,7 @@ module GHTorrent
|
|
47
45
|
if links['next'].nil?
|
48
46
|
parse_request_result(data)
|
49
47
|
else
|
50
|
-
parse_request_result(data) |
|
51
|
-
if links['next'] == last
|
52
|
-
if last != links['last']
|
53
|
-
warn "APIClient: Last header mismatch: method=#{last}, cache=#{links['last']}"
|
54
|
-
end
|
55
|
-
paged_api_request(links['next'], pages, false, last)
|
56
|
-
else
|
57
|
-
paged_api_request(links['next'], pages, cache, last)
|
58
|
-
end
|
48
|
+
parse_request_result(data) | paged_api_request(links['next'], pages, last)
|
59
49
|
end
|
60
50
|
else
|
61
51
|
parse_request_result(data)
|
@@ -65,16 +55,16 @@ module GHTorrent
|
|
65
55
|
|
66
56
|
# A normal request. Returns a hash or an array of hashes representing the
|
67
57
|
# parsed JSON result.
|
68
|
-
def api_request(url
|
69
|
-
parse_request_result api_request_raw(ensure_max_per_page(url)
|
58
|
+
def api_request(url)
|
59
|
+
parse_request_result api_request_raw(ensure_max_per_page(url))
|
70
60
|
end
|
71
61
|
|
72
62
|
# Determine the number of pages contained in a multi-page API response
|
73
63
|
def num_pages(url)
|
74
64
|
url = ensure_max_per_page(url)
|
75
|
-
data =
|
65
|
+
data = api_request_raw(url)
|
76
66
|
|
77
|
-
if data.meta.nil? or data.meta['link'].nil?
|
67
|
+
if data.nil? or data.meta.nil? or data.meta['link'].nil?
|
78
68
|
return 1
|
79
69
|
end
|
80
70
|
|
@@ -106,60 +96,6 @@ module GHTorrent
|
|
106
96
|
end
|
107
97
|
end
|
108
98
|
|
109
|
-
def determine_cache_and_do_request(cache, url)
|
110
|
-
query = URI::parse(url).query
|
111
|
-
params = unless query.nil?
|
112
|
-
CGI::parse(query)
|
113
|
-
else
|
114
|
-
{}
|
115
|
-
end
|
116
|
-
if params.has_key?('page') or (params.has_key?('last_sha'))
|
117
|
-
api_request_raw(url, use_cache?(cache, method = :paged))
|
118
|
-
else
|
119
|
-
api_request_raw(url, use_cache?(cache, method = :non_paged))
|
120
|
-
end
|
121
|
-
end
|
122
|
-
|
123
|
-
# Determine whether to use cache or not, depending on the type of the
|
124
|
-
# request
|
125
|
-
def use_cache?(client_request, method = :non_paged)
|
126
|
-
@cache_mode ||= case config(:cache_mode)
|
127
|
-
when 'dev'
|
128
|
-
:dev
|
129
|
-
when 'prod'
|
130
|
-
:prod
|
131
|
-
when 'all'
|
132
|
-
:all
|
133
|
-
when 'off'
|
134
|
-
when false
|
135
|
-
:off
|
136
|
-
else
|
137
|
-
raise GHTorrentException.new("Don't know cache configuration #{config(:cache_mode)}")
|
138
|
-
end
|
139
|
-
case @cache_mode
|
140
|
-
when :off
|
141
|
-
return false
|
142
|
-
when :all
|
143
|
-
return true
|
144
|
-
when :dev
|
145
|
-
unless client_request
|
146
|
-
return false
|
147
|
-
end
|
148
|
-
return true
|
149
|
-
when :prod
|
150
|
-
if client_request
|
151
|
-
return true
|
152
|
-
else
|
153
|
-
case method
|
154
|
-
when :non_paged
|
155
|
-
return false
|
156
|
-
when :paged
|
157
|
-
return true
|
158
|
-
end
|
159
|
-
end
|
160
|
-
end
|
161
|
-
end
|
162
|
-
|
163
99
|
# Parse a Github link header
|
164
100
|
def parse_links(links)
|
165
101
|
links.split(/,/).reduce({}) do |acc, x|
|
@@ -184,33 +120,39 @@ module GHTorrent
|
|
184
120
|
end
|
185
121
|
end
|
186
122
|
|
123
|
+
def request_error_msg(url, exception)
|
124
|
+
<<-MSG
|
125
|
+
Failed request. URL: #{url}, Status code: #{exception.io.status[0]},
|
126
|
+
Status: #{exception.io.status[1]},
|
127
|
+
Access: #{if (@token.nil? or @token.empty?) then @username else @token end},
|
128
|
+
IP: #{@attach_ip}, Remaining: #{@remaining}
|
129
|
+
MSG
|
130
|
+
end
|
131
|
+
|
132
|
+
def error_msg(url, exception)
|
133
|
+
<<-MSG
|
134
|
+
Failed request. URL: #{url}, Exception: #{exception.message},
|
135
|
+
Access: #{if (@token.nil? or @token.empty?) then @username else @token end},
|
136
|
+
IP: #{@attach_ip}, Remaining: #{@remaining}
|
137
|
+
MSG
|
138
|
+
end
|
139
|
+
|
187
140
|
# Do the actual request and return the result object
|
188
|
-
def api_request_raw(url
|
141
|
+
def api_request_raw(url)
|
189
142
|
|
190
143
|
begin
|
191
144
|
start_time = Time.now
|
192
|
-
from_cache = false
|
193
|
-
|
194
|
-
contents =
|
195
|
-
if use_cache
|
196
|
-
if not (cached = cache_get(url)).nil?
|
197
|
-
from_cache = true
|
198
|
-
cached
|
199
|
-
else
|
200
|
-
tocache = Cachable.new(do_request(url))
|
201
|
-
cache_put(url, tocache)
|
202
|
-
tocache
|
203
|
-
end
|
204
|
-
else
|
205
|
-
do_request(url)
|
206
|
-
end
|
207
145
|
|
146
|
+
contents = do_request(url)
|
208
147
|
total = Time.now.to_ms - start_time.to_ms
|
209
|
-
|
148
|
+
info "Successful request. URL: #{url}, Remaining: #{@remaining}, Total: #{total} ms"
|
210
149
|
|
211
150
|
contents
|
212
151
|
rescue OpenURI::HTTPError => e
|
213
|
-
|
152
|
+
@remaining = e.io.meta['x-ratelimit-remaining'].to_i
|
153
|
+
@reset = e.io.meta['x-ratelimit-reset'].to_i
|
154
|
+
|
155
|
+
case e.io.status[0].to_i
|
214
156
|
# The following indicate valid Github return codes
|
215
157
|
when 400, # Bad request
|
216
158
|
401, # Unauthorized
|
@@ -218,22 +160,24 @@ module GHTorrent
|
|
218
160
|
404, # Not found
|
219
161
|
422 then # Unprocessable entity
|
220
162
|
total = Time.now.to_ms - start_time.to_ms
|
221
|
-
warn
|
222
|
-
|
223
|
-
@reset = e.io.meta['x-ratelimit-reset'].to_i
|
224
|
-
return nil
|
163
|
+
warn request_error_msg(url, e).strip.gsub(/\s+/,' ').gsub("\n", ' ')
|
164
|
+
return nil
|
225
165
|
else # Server error or HTTP conditions that Github does not report
|
226
|
-
warn
|
166
|
+
warn request_error_msg(url, e).strip.gsub(/\s+/,' ').gsub("\n", ' ')
|
227
167
|
raise e
|
228
168
|
end
|
169
|
+
rescue StandardError => e
|
170
|
+
warn error_msg(url, e).strip.gsub(/\s+/,' ').gsub("\n", ' ')
|
171
|
+
raise e
|
229
172
|
ensure
|
230
|
-
|
173
|
+
# The exact limit is only enforced upon the first @reset
|
174
|
+
if 5000 - @remaining >= @req_limit
|
231
175
|
to_sleep = @reset - Time.now.to_i + 2
|
232
|
-
debug "
|
176
|
+
debug "Request limit reached, sleeping for #{to_sleep} secs"
|
233
177
|
t = Thread.new do
|
234
178
|
slept = 0
|
235
179
|
while true do
|
236
|
-
debug "
|
180
|
+
debug "Sleeping for #{to_sleep - slept} seconds"
|
237
181
|
sleep 1
|
238
182
|
slept += 1
|
239
183
|
end
|
@@ -262,9 +206,10 @@ module GHTorrent
|
|
262
206
|
@username ||= config(:github_username)
|
263
207
|
@passwd ||= config(:github_passwd)
|
264
208
|
@user_agent ||= config(:user_agent)
|
265
|
-
@remaining ||=
|
209
|
+
@remaining ||= 5000
|
266
210
|
@reset ||= Time.now.to_i + 3600
|
267
211
|
@auth_type ||= auth_method(@username, @token)
|
212
|
+
@req_limit ||= config(:req_limit)
|
268
213
|
|
269
214
|
open_func ||=
|
270
215
|
case @auth_type
|
@@ -276,7 +221,7 @@ module GHTorrent
|
|
276
221
|
when :token
|
277
222
|
# As per: https://developer.github.com/v3/auth/#via-oauth-tokens
|
278
223
|
lambda {|url| open(url, 'User-Agent' => @user_agent,
|
279
|
-
|
224
|
+
'Authorization' => "token #{@token}") }
|
280
225
|
end
|
281
226
|
|
282
227
|
result = if @attach_ip.nil? or @attach_ip.eql? '0.0.0.0'
|
@@ -312,7 +257,7 @@ module GHTorrent
|
|
312
257
|
|
313
258
|
result = begin
|
314
259
|
yield
|
315
|
-
rescue
|
260
|
+
rescue StandardError => e
|
316
261
|
raise e
|
317
262
|
ensure
|
318
263
|
TCPSocket.instance_eval do
|
@@ -328,22 +273,3 @@ module GHTorrent
|
|
328
273
|
|
329
274
|
end
|
330
275
|
end
|
331
|
-
|
332
|
-
class Cachable
|
333
|
-
|
334
|
-
include OpenURI::Meta
|
335
|
-
|
336
|
-
attr_reader :base_uri, :meta, :status
|
337
|
-
|
338
|
-
def initialize(response)
|
339
|
-
@data = response.read
|
340
|
-
@base_uri = response.base_uri
|
341
|
-
@meta = response.meta
|
342
|
-
@status = response.status
|
343
|
-
end
|
344
|
-
|
345
|
-
def read
|
346
|
-
@data
|
347
|
-
end
|
348
|
-
|
349
|
-
end
|