ghtorrent 0.9 → 0.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,7 @@
1
1
  ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- MzBhZTlkNGYxYTc3ZTg4NzQ3OTQyMWQ3NTMxZGY1YzljYmE0MzIwZg==
5
- data.tar.gz: !binary |-
6
- NmE3OGNlNGY4Y2QzMGJkY2E3NWY0MTNhMTU2MDAyYTA3MDUzN2IzZQ==
2
+ SHA1:
3
+ metadata.gz: 1394c5a3869bab8d02cf61fc8108ca1f10afc0b8
4
+ data.tar.gz: 9d82ecf68d0316db32f962af8cb7d0070dd74c82
7
5
  SHA512:
8
- metadata.gz: !binary |-
9
- MTkwZWQwNmQ2MjE1OTU5ZWQ5NWRmZWFhYjY4ZGM3NTA5ZmY2ZDcyMTI0OGZj
10
- ZGY0MDczZGFmYTZlZTEyYmMxYWI4YTU5ODQyZGFmM2YwZmNhODljMDZmMWJk
11
- ZThiYWQ5OGUxMmI3Yjg1Nzk1MjBkZTFiNzkxZmE1NTAzNGFmNzM=
12
- data.tar.gz: !binary |-
13
- MmJiODg3NmU2OTdjZTA2Njc0MmFmZWRhM2JkZjA4NjY5OTRiZTJhOTEzZGJk
14
- NmJhN2JkNGFjODdhZDk0YWY4MDIzY2NkODU5MDhjZDQxNjA4NzU0ZWFkYmMx
15
- ZDE2Y2U3NzQ0ZDAxN2YxNmNjZDRiZTUzMDQ5OWY5OWY3ZDc3Yjc=
6
+ metadata.gz: 7dbfe453c542137cafafa641c83bb680749e946c86193447e2b6aee829b88370bc66e90a8ea3d64a33d3c0061a7c5b01fa292ee96b94fba6dcca910d472d0f7f
7
+ data.tar.gz: d5c6487264b3a4d17af08e33ac2babd17b63ca6849e1ebd694a1d6b639797e8e870664f7dcde79718d61350551971a0c4b99b5d0ec56923b31984227c69b49bc
data/CHANGELOG CHANGED
@@ -1,5 +1,20 @@
1
+ = Version 0.10
2
+ * Base class for multiprocess queue clients
3
+ * Make retrieval of pull request commits faster
4
+ * Drop the forks table
5
+ * Remove merged and user_id fields from pull_request
6
+ * Ruby 2.0.0 support
7
+ * Fast(er) path to project forking: copy commits from the base repo instead of retrieving them from GitHub.
8
+ * GitHub API tokens for authentication
9
+ * Support for disabling the cache
10
+ * A new real-time log analyzer
11
+ * Mark fake users as such
12
+ * Organization members are now retrieved
13
+ * Authenticated MongoDB connections
14
+ * General bug fixes and cleanups
15
+
1
16
  = Version 0.9
2
- * Remove dependency to EventMachine-based AMQP client. We now use bunny.
17
+ * Remove dependency to EventMachine-based AMQP client. We now use bunny.
3
18
 
4
19
  = Version 0.8.1
5
20
  * New tool to retrieve specific entities and their dependencies
@@ -8,7 +23,7 @@
8
23
  * Support for finer grained transactions when processing large entities
9
24
  * Commit comments are now indexed per owner/repo (was just by comment id)
10
25
  * Remove the unused daemon mode
11
- * Various exception fixes and more detailed logging
26
+ * Various exception fixes and more detailed logging
12
27
 
13
28
  = Version 0.8
14
29
  * Retrieve and process issue labels
data/Gemfile.lock CHANGED
@@ -1,28 +1,28 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- ghtorrent (0.8.1)
5
- bson_ext (~> 1.9.0)
6
- bunny (~> 1.0.0)
7
- mongo (~> 1.9.0)
8
- sequel (~> 4.5.0)
9
- trollop (~> 2.0.0)
4
+ ghtorrent (0.9)
5
+ bson_ext (~> 1.9, >= 1.9.0)
6
+ bunny (~> 1.0, >= 1.0.0)
7
+ mongo (~> 1.9, >= 1.9.0)
8
+ sequel (~> 4.5, >= 4.5.0)
9
+ trollop (~> 2.0, >= 2.0.0)
10
10
 
11
11
  GEM
12
12
  remote: https://rubygems.org/
13
13
  specs:
14
14
  addressable (2.3.5)
15
15
  amq-protocol (1.9.2)
16
- bson (1.9.2)
17
- bson_ext (1.9.2)
18
- bson (~> 1.9.2)
19
- bunny (1.0.7)
16
+ bson (1.10.0)
17
+ bson_ext (1.10.0)
18
+ bson (~> 1.10.0)
19
+ bunny (1.2.1)
20
20
  amq-protocol (>= 1.9.2)
21
21
  crack (0.4.1)
22
22
  safe_yaml (~> 0.9.0)
23
23
  diff-lcs (1.2.5)
24
- mongo (1.9.2)
25
- bson (~> 1.9.2)
24
+ mongo (1.10.0)
25
+ bson (~> 1.10.0)
26
26
  rspec (2.14.1)
27
27
  rspec-core (~> 2.14.0)
28
28
  rspec-expectations (~> 2.14.0)
@@ -32,7 +32,7 @@ GEM
32
32
  diff-lcs (>= 1.1.3, < 2.0)
33
33
  rspec-mocks (2.14.4)
34
34
  safe_yaml (0.9.7)
35
- sequel (4.5.0)
35
+ sequel (4.10.0)
36
36
  trollop (2.0)
37
37
  webmock (1.16.0)
38
38
  addressable (>= 2.2.7)
@@ -44,5 +44,5 @@ PLATFORMS
44
44
  DEPENDENCIES
45
45
  ghtorrent!
46
46
  jdbc-mysql
47
- rspec (~> 2.14.0)
47
+ rspec (~> 2.14, >= 2.14.0)
48
48
  webmock (~> 1.16)
@@ -0,0 +1,133 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'pp'
4
+ require 'time'
5
+
6
+ stats = Hash.new { |hash, key| hash[key] = Hash.new }
7
+
8
+ Thread.new do
9
+ puts "Collecting data..."
10
+ while (true) do
11
+ sleep(1)
12
+ system "clear" or system "cls"
13
+
14
+ stats.each do |k,v|
15
+ unless v[:time_in].nil?
16
+ v[:time_in] += 1
17
+ end
18
+ end
19
+
20
+ puts ' '
21
+ puts "ACTIVE"
22
+ puts ' '
23
+
24
+ data = stats.select { |k, v| v[:status] == 'A' }
25
+ ml = [:ip, :repo, :stage, :remaining, :not_found, :time_in].reduce({}) do |acc, x|
26
+ max = [data.map { |k, v| v[x].size }.max, x.to_s.length].max
27
+ acc.merge({x => max})
28
+ end
29
+
30
+ ml[:pid] = data.keys.map { |x| x.length }.max
31
+
32
+ fmt = "%-#{ml[:pid]}s %-#{ml[:ip]}s %-#{ml[:repo]}s %-#{ml[:stage]}s %-#{ml[:time_in]}s %-#{ml[:not_found]}s %-#{ml[:remaining]}s"
33
+
34
+ puts sprintf(fmt, 'pid', 'ip', 'repo', 'stage', 'time_in', 'not_found', 'remaining')
35
+ data.sort { |a, b| b[1][:time_in].to_i <=> a[1][:time_in].to_i }.each do |x|
36
+ k, v = x[0], x[1]
37
+ puts sprintf(fmt, k, v[:ip], v[:repo], v[:stage], v[:time_in], v[:not_found], v[:remaining])
38
+ end
39
+
40
+
41
+ puts ' '
42
+ puts "SLEEPING"
43
+ puts ' '
44
+
45
+ data = stats.select { |k, v| v[:status] == 'S' }
46
+ ml = [:ip, :repo, :stage, :sleep_remaining, :time_in].reduce({}) do |acc, x|
47
+ max = [data.map { |k, v|
48
+ if v[x].nil? then
49
+ 1
50
+ else
51
+ v[x].size
52
+ end }.max, x.to_s.length].max
53
+ acc.merge({x => max})
54
+ end
55
+
56
+ ml[:pid] = data.keys.map { |x| x.length }.max
57
+
58
+ fmt = "%-#{ml[:pid]}s %-#{ml[:ip]}s %-#{ml[:repo]}s %-#{ml[:stage]}s %-#{ml[:time_in]}s %-#{ml[:sleep_remaining]}s"
59
+
60
+ puts sprintf(fmt, 'pid', 'ip', 'repo', 'stage', 'time_in', 'remaining_sleep')
61
+ data.sort { |a, b| a[1][:sleep_remaining].to_i <=> b[1][:sleep_remaining].to_i }.each do |x|
62
+ k, v = x[0], x[1]
63
+ puts sprintf(fmt, k, v[:ip], v[:repo], v[:stage], v[:time_in], v[:sleep_remaining])
64
+ end
65
+ end
66
+ end
67
+
68
+
69
+ ARGF.each do |x|
70
+
71
+ next unless x =~ /APIClient/
72
+
73
+ if x =~ /sleeping/
74
+ ts, pid, remaining = x.match(/\[([^.]+).*#([0-9]+)\].*for ([0-9]+).*/).captures
75
+
76
+ ts = Time.parse(ts).to_i
77
+ stats[pid][:status] = 'S'
78
+ stats[pid][:sleep_remaining] = remaining
79
+
80
+ unless stats[pid][:repo].nil?
81
+ stats[pid][:time_in] = ts - stats[pid][:start_pr_ts]
82
+ end
83
+
84
+ elsif x =~ /Not Found|Gone|Conflict/
85
+ pid = x.match(/.*#([0-9]+).*APIClient.*/).captures[0]
86
+ if stats[pid][:not_found].nil?
87
+ stats[pid][:not_found] = 0
88
+ end
89
+ stats[pid][:not_found] += 1
90
+ else
91
+ begin
92
+ ts, pid, ip, url, remaining, time =
93
+ x.match(/.*\[([^.]+).*#([0-9]+)\].*APIClient\[(.*)\].*(https:\/\/.*) \(([0-9]+) remaining\).* ([0-9]+) ms$/).captures
94
+ rescue
95
+ puts x
96
+ next
97
+ end
98
+ ts = Time.parse(ts).to_i
99
+ url_chunks = url.split(/\//)
100
+ owner = url_chunks[4]
101
+ repo = url_chunks[5]
102
+ begin
103
+ stage = url_chunks[6].split(/\?/)[0]
104
+ rescue
105
+ stage = 'repo'
106
+ end
107
+
108
+ stats[pid][:status] = 'A'
109
+ stats[pid][:ip] = ip
110
+
111
+ unless stats[pid][:repo] == "#{owner}/#{repo}"
112
+ stats[pid][:repo] = "#{owner}/#{repo}"
113
+ stats[pid][:not_found] = 0
114
+ stats[pid][:start_pr_ts] = ts
115
+ stats[pid][:time_in] = 0
116
+ end
117
+
118
+ stats[pid][:time_in] = ts - stats[pid][:start_pr_ts]
119
+ stats[pid][:stage] = stage
120
+ stats[pid][:remaining] = remaining
121
+
122
+ if stats[pid][:time].nil?
123
+ stats[pid][:time] = Queue.new
124
+ end
125
+
126
+ if stats[pid][:time].length > 100
127
+ stats[pid][:time].pop
128
+ end
129
+
130
+ stats[pid][:time].push(time)
131
+ end
132
+ end
133
+
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+ require 'ghtorrent'
5
+
6
+ GHTRetrieveOne.run(ARGV)
data/lib/ghtorrent.rb CHANGED
@@ -50,6 +50,9 @@ require 'ghtorrent/retriever'
50
50
  require 'ghtorrent/ghtorrent'
51
51
  require 'ghtorrent/transacted_ghtorrent'
52
52
 
53
+ # Multi-process queue clients
54
+ require 'ghtorrent/multiprocess_queue_client'
55
+
53
56
  # Commands
54
57
  require 'ghtorrent/commands/ght_data_retrieval'
55
58
  require 'ghtorrent/commands/ght_mirror_events'
@@ -60,5 +63,6 @@ require 'ghtorrent/commands/ght_retrieve_repo'
60
63
  require 'ghtorrent/commands/ght_retrieve_user'
61
64
  require 'ghtorrent/commands/ght_retrieve_dependents'
62
65
  require 'ghtorrent/commands/ght_retrieve_repos'
66
+ require 'ghtorrent/commands/ght_retrieve_one'
63
67
 
64
68
  # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
@@ -86,7 +86,7 @@ module GHTorrent
86
86
 
87
87
  def del(entity, query)
88
88
  super
89
- raise Exception 'No filter was specifed. Cowardily refusing to remove all entries' if query == {}
89
+ raise Exception 'No filter was specified. Cowardily refusing to remove all entries' if query == {}
90
90
  get_entity(entity).remove(query)
91
91
  end
92
92
 
@@ -154,7 +154,7 @@ module GHTorrent
154
154
  @mongo = if replicas.nil?
155
155
  Mongo::Connection.new(config(:mongo_host),
156
156
  config(:mongo_port))\
157
- .db(config(:mongo_db))
157
+ .db(config(:mongo_db))\
158
158
  else
159
159
  repl_arr = replicas.strip.split(/ /).map{|x| "#{x}:#{config(:mongo_port)}"}
160
160
  repl_arr << "#{config(:mongo_host)}:#{config(:mongo_port)}"
@@ -162,6 +162,7 @@ module GHTorrent
162
162
  .db(config(:mongo_db))
163
163
  end
164
164
 
165
+ @mongo.authenticate(config(:mongo_username), config(:mongo_passwd))
165
166
  stats = @mongo.stats
166
167
  init_db(@mongo) if stats['collections'] < ENTITIES.size + 2
167
168
  init_db(@mongo) if stats['indexes'] < IDXS.keys.size + ENTITIES.size
@@ -28,26 +28,8 @@ module GHTorrent
28
28
  def paged_api_request(url, pages = config(:mirror_history_pages_back),
29
29
  cache = true, last = nil)
30
30
 
31
- url = if not url.include?('per_page')
32
- if url.include?('?')
33
- url + '&per_page=100'
34
- else
35
- url + '?per_page=100'
36
- end
37
- else
38
- url
39
- end
40
-
41
- params = CGI::parse(URI::parse(url).query)
42
- data = if params.has_key?('page') or (params.has_key?('last_sha'))
43
- api_request_raw(url, use_cache?(cache, method = :paged))
44
- else
45
- if @cache_mode == :all
46
- api_request_raw(url, true)
47
- else
48
- api_request_raw(url, false)
49
- end
50
- end
31
+ url = ensure_max_per_page(url)
32
+ data = determine_cache_and_do_request(cache, url)
51
33
 
52
34
  return [] if data.nil?
53
35
 
@@ -80,14 +62,64 @@ module GHTorrent
80
62
  end
81
63
  end
82
64
 
65
+
83
66
  # A normal request. Returns a hash or an array of hashes representing the
84
67
  # parsed JSON result.
85
68
  def api_request(url, cache = true)
86
- parse_request_result api_request_raw(url, use_cache?(cache))
69
+ parse_request_result api_request_raw(ensure_max_per_page(url), use_cache?(cache))
70
+ end
71
+
72
+ # Determine the number of pages contained in a multi-page API response
73
+ def num_pages(url)
74
+ url = ensure_max_per_page(url)
75
+ data = determine_cache_and_do_request(true, url)
76
+
77
+ if data.meta.nil? or data.meta['link'].nil?
78
+ return 1
79
+ end
80
+
81
+ links = parse_links(data.meta['link'])
82
+
83
+ if links.nil? or links['last'].nil?
84
+ return 1
85
+ end
86
+
87
+ params = CGI::parse(URI::parse(links['last']).query)
88
+ params['page'][0].to_i
87
89
  end
88
90
 
89
91
  private
90
92
 
93
+ def ensure_max_per_page(url)
94
+ if url.include?('page')
95
+ if not url.include?('per_page')
96
+ if url.include?('?')
97
+ url + '&per_page=100'
98
+ else
99
+ url + '?per_page=100'
100
+ end
101
+ else
102
+ url
103
+ end
104
+ else
105
+ url
106
+ end
107
+ end
108
+
109
+ def determine_cache_and_do_request(cache, url)
110
+ query = URI::parse(url).query
111
+ params = unless query.nil?
112
+ CGI::parse(query)
113
+ else
114
+ {}
115
+ end
116
+ if params.has_key?('page') or (params.has_key?('last_sha'))
117
+ api_request_raw(url, use_cache?(cache, method = :paged))
118
+ else
119
+ api_request_raw(url, use_cache?(cache, method = :non_paged))
120
+ end
121
+ end
122
+
91
123
  # Determine whether to use cache or not, depending on the type of the
92
124
  # request
93
125
  def use_cache?(client_request, method = :non_paged)
@@ -98,10 +130,15 @@ module GHTorrent
98
130
  :prod
99
131
  when 'all'
100
132
  :all
133
+ when 'off'
134
+ when false
135
+ :off
101
136
  else
102
- raise GHTorrentException.new("Don't know cache configuration #{@cache_mode}")
137
+ raise GHTorrentException.new("Don't know cache configuration #{config(:cache_mode)}")
103
138
  end
104
139
  case @cache_mode
140
+ when :off
141
+ return false
105
142
  when :all
106
143
  return true
107
144
  when :dev
@@ -180,7 +217,8 @@ module GHTorrent
180
217
  403, # Forbidden
181
218
  404, # Not found
182
219
  422 then # Unprocessable entity
183
- warn "APIClient: #{url}: #{e.io.status[1]}"
220
+ total = Time.now.to_ms - start_time.to_ms
221
+ warn "APIClient[#{@attach_ip}]: Request: #{url} (#{@remaining} remaining), Total: #{total} ms, Status: #{e.io.status[1]}"
184
222
  @remaining = e.io.meta['x-ratelimit-remaining'].to_i
185
223
  @reset = e.io.meta['x-ratelimit-reset'].to_i
186
224
  return nil
@@ -191,27 +229,55 @@ module GHTorrent
191
229
  ensure
192
230
  if not from_cache and config(:respect_api_ratelimit) and @remaining < 10
193
231
  to_sleep = @reset - Time.now.to_i + 2
194
- debug "APIClient: Request limit reached, sleeping for #{to_sleep} secs"
232
+ debug "APIClient[#{@attach_ip}]: Request limit reached, sleeping for #{to_sleep} secs"
233
+ t = Thread.new do
234
+ slept = 0
235
+ while true do
236
+ debug "APIClient[#{@attach_ip}]: sleeping for #{to_sleep - slept} seconds"
237
+ sleep 1
238
+ slept += 1
239
+ end
240
+ end
195
241
  sleep(to_sleep)
242
+ t.exit
243
+ end
244
+ end
245
+ end
246
+
247
+ def auth_method(username, token)
248
+ if token.nil? or token.empty?
249
+ if username.nil? or username.empty?
250
+ :none
251
+ else
252
+ :username
196
253
  end
254
+ else
255
+ :token
197
256
  end
198
257
  end
199
258
 
200
259
  def do_request(url)
201
260
  @attach_ip ||= config(:attach_ip)
261
+ @token ||= config(:github_token)
202
262
  @username ||= config(:github_username)
203
263
  @passwd ||= config(:github_passwd)
204
264
  @user_agent ||= config(:user_agent)
205
265
  @remaining ||= 10
206
266
  @reset ||= Time.now.to_i + 3600
207
-
208
- open_func ||= if @username.nil?
209
- lambda {|url| open(url, 'User-Agent' => @user_agent)}
210
- else
211
- lambda {|url| open(url,
212
- 'User-Agent' => @user_agent,
213
- :http_basic_authentication => [@username, @passwd])}
214
- end
267
+ @auth_type ||= auth_method(@username, @token)
268
+
269
+ open_func ||=
270
+ case @auth_type
271
+ when :none
272
+ lambda {|url| open(url, 'User-Agent' => @user_agent)}
273
+ when :username
274
+ lambda {|url| open(url, 'User-Agent' => @user_agent,
275
+ :http_basic_authentication => [@username, @passwd])}
276
+ when :token
277
+ # As per: https://developer.github.com/v3/auth/#via-oauth-tokens
278
+ lambda {|url| open(url, 'User-Agent' => @user_agent,
279
+ :http_basic_authentication => [@token, 'x-oauth-basic'])}
280
+ end
215
281
 
216
282
  result = if @attach_ip.nil? or @attach_ip.eql? '0.0.0.0'
217
283
  open_func.call(url)
@@ -231,8 +297,15 @@ module GHTorrent
231
297
  (class << self; self; end).instance_eval do
232
298
  alias_method :original_open, :open
233
299
 
234
- define_method(:open) do |conn_address, conn_port|
235
- original_open(conn_address, conn_port, ip)
300
+ case RUBY_VERSION
301
+ when /1.9/
302
+ define_method(:open) do |conn_address, conn_port|
303
+ original_open(conn_address, conn_port, ip)
304
+ end
305
+ when /2.0/
306
+ define_method(:open) do |conn_address, conn_port, local_host, local_port|
307
+ original_open(conn_address, conn_port, ip, local_port)
308
+ end
236
309
  end
237
310
  end
238
311
  end