ghtorrent 0.4 → 0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +24 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +40 -0
- data/README.md +23 -22
- data/bin/ght-data-retrieval +66 -24
- data/bin/ght-load +41 -19
- data/bin/ght-mirror-events +13 -16
- data/bin/ght-rm-dupl +119 -77
- data/lib/ghtorrent.rb +14 -4
- data/lib/ghtorrent/adapters/base_adapter.rb +17 -5
- data/lib/ghtorrent/adapters/mongo_persister.rb +122 -56
- data/lib/ghtorrent/api_client.rb +151 -16
- data/lib/ghtorrent/bson_orderedhash.rb +23 -0
- data/lib/ghtorrent/cache.rb +97 -0
- data/lib/ghtorrent/command.rb +43 -25
- data/lib/ghtorrent/gh_torrent_exception.rb +6 -0
- data/lib/ghtorrent/ghtorrent.rb +615 -164
- data/lib/ghtorrent/hash.rb +11 -0
- data/lib/ghtorrent/logging.rb +11 -7
- data/lib/ghtorrent/migrations/001_init_schema.rb +3 -3
- data/lib/ghtorrent/migrations/002_add_external_ref_ids.rb +2 -0
- data/lib/ghtorrent/migrations/003_add_orgs.rb +4 -1
- data/lib/ghtorrent/migrations/004_add_commit_comments.rb +4 -2
- data/lib/ghtorrent/migrations/005_add_repo_collaborators.rb +2 -0
- data/lib/ghtorrent/migrations/006_add_watchers.rb +2 -0
- data/lib/ghtorrent/migrations/007_add_pull_requests.rb +64 -0
- data/lib/ghtorrent/migrations/008_add_project_unq.rb +23 -0
- data/lib/ghtorrent/migrations/009_add_project_commit.rb +27 -0
- data/lib/ghtorrent/migrations/010_add_forks.rb +28 -0
- data/lib/ghtorrent/migrations/mysql_defaults.rb +6 -0
- data/lib/ghtorrent/persister.rb +3 -0
- data/lib/ghtorrent/retriever.rb +298 -102
- data/lib/ghtorrent/settings.rb +20 -1
- data/lib/ghtorrent/time.rb +5 -0
- data/lib/ghtorrent/utils.rb +22 -4
- data/lib/version.rb +5 -0
- metadata +173 -145
- data/lib/ghtorrent/call_stack.rb +0 -91
data/lib/ghtorrent/api_client.rb
CHANGED
@@ -1,28 +1,41 @@
|
|
1
|
-
require 'net/http'
|
2
|
-
require 'set'
|
3
1
|
require 'open-uri'
|
2
|
+
require 'net/http'
|
3
|
+
require 'digest/sha1'
|
4
|
+
require 'fileutils'
|
4
5
|
require 'json'
|
5
6
|
|
7
|
+
require 'ghtorrent/logging'
|
8
|
+
require 'ghtorrent/settings'
|
9
|
+
require 'ghtorrent/time'
|
10
|
+
require 'ghtorrent/cache'
|
11
|
+
|
6
12
|
module GHTorrent
|
7
13
|
module APIClient
|
8
14
|
include GHTorrent::Logging
|
9
15
|
include GHTorrent::Settings
|
16
|
+
include GHTorrent::Cache
|
10
17
|
|
11
|
-
|
12
|
-
|
13
|
-
|
18
|
+
# This is to fix an annoying bug in JRuby's SSL not being able to
|
19
|
+
# verify a valid certificate.
|
20
|
+
if defined? JRUBY_VERSION
|
21
|
+
OpenSSL::SSL::VERIFY_PEER = OpenSSL::SSL::VERIFY_NONE
|
14
22
|
end
|
15
23
|
|
16
24
|
# A paged request. Used when the result can expand to more than one
|
17
25
|
# result pages.
|
18
|
-
def paged_api_request(url, pages = -1)
|
26
|
+
def paged_api_request(url, pages = -1, cache = true, last = nil)
|
19
27
|
|
20
|
-
data =
|
28
|
+
data = if URI.parse(url).query.nil? # Top level request, no params
|
29
|
+
api_request_raw(url, false)
|
30
|
+
else
|
31
|
+
api_request_raw(url, use_cache?(cache, method = :paged))
|
32
|
+
end
|
21
33
|
|
22
34
|
return [] if data.nil?
|
23
35
|
|
24
36
|
unless data.meta['link'].nil?
|
25
37
|
links = parse_links(data.meta['link'])
|
38
|
+
last = links['last'] if last.nil?
|
26
39
|
|
27
40
|
if pages > 0
|
28
41
|
pages = pages - 1
|
@@ -34,7 +47,15 @@ module GHTorrent
|
|
34
47
|
if links['next'].nil?
|
35
48
|
parse_request_result(data)
|
36
49
|
else
|
37
|
-
parse_request_result(data) |
|
50
|
+
parse_request_result(data) |
|
51
|
+
if links['next'] == last
|
52
|
+
if last != links['last']
|
53
|
+
warn "APIClient: Last header mismatch: method=#{last}, cache=#{links['last']}"
|
54
|
+
end
|
55
|
+
paged_api_request(links['next'], pages, false, last)
|
56
|
+
else
|
57
|
+
paged_api_request(links['next'], pages, cache, last)
|
58
|
+
end
|
38
59
|
end
|
39
60
|
else
|
40
61
|
parse_request_result(data)
|
@@ -43,12 +64,43 @@ module GHTorrent
|
|
43
64
|
|
44
65
|
# A normal request. Returns a hash or an array of hashes representing the
|
45
66
|
# parsed JSON result.
|
46
|
-
def api_request(url)
|
47
|
-
parse_request_result api_request_raw(url)
|
67
|
+
# Perform a single, non-paged request for +url+ and return the parsed result.
# +cache+ is the caller's caching preference; the final decision is delegated
# to use_cache?, and the raw response is handed to parse_request_result.
def api_request(url, cache = false)
  raw_response = api_request_raw(url, use_cache?(cache))
  parse_request_result(raw_response)
end
|
49
70
|
|
50
71
|
private
|
51
72
|
|
73
|
+
# Determine whether to use the cache or not, depending on the configured
# cache mode and the type of the request.
#
# client_request:: the caller's own preference (truthy means "use the cache")
# method::         :non_paged (default) or :paged
#
# Decision table:
#   dev mode  -> follow +client_request+
#   prod mode -> true when +client_request+ is truthy; otherwise true only
#                for :paged requests (nil for any other +method+ value)
#
# The mode is read from config(:cache_mode) on first use and memoized in
# @cache_mode. Raises GHTorrentException when the configured value is neither
# "dev" nor "prod".
def use_cache?(client_request, method = :non_paged)
  @cache_mode ||= case (configured = config(:cache_mode))
                  when "dev"  then :dev
                  when "prod" then :prod
                  else
                    # Raise the exception class itself; calling
                    # GHTorrentException("") would be a method call and fail
                    # with NoMethodError instead.
                    raise GHTorrentException,
                          "Unknown cache_mode #{configured.inspect}, expected \"dev\" or \"prod\""
                  end

  case @cache_mode
  when :dev
    client_request ? true : false
  when :prod
    if client_request
      true
    else
      case method
      when :non_paged then false
      when :paged     then true
      end
    end
  end
end
|
103
|
+
|
52
104
|
# Parse a Github link header
|
53
105
|
def parse_links(links)
|
54
106
|
links.split(/,/).reduce({}) do |acc, x|
|
@@ -64,6 +116,7 @@ module GHTorrent
|
|
64
116
|
[]
|
65
117
|
else
|
66
118
|
json = result.read
|
119
|
+
|
67
120
|
if json.nil?
|
68
121
|
[]
|
69
122
|
else
|
@@ -73,7 +126,10 @@ module GHTorrent
|
|
73
126
|
end
|
74
127
|
|
75
128
|
# Do the actual request and return the result object
|
76
|
-
def api_request_raw(url)
|
129
|
+
def api_request_raw(url, use_cache = false)
|
130
|
+
@num_api_calls ||= 0
|
131
|
+
@ts ||= Time.now().tv_sec()
|
132
|
+
|
77
133
|
#Rate limiting to avoid error requests
|
78
134
|
if Time.now().tv_sec() - @ts < 60 then
|
79
135
|
if @num_api_calls >= @settings['mirror']['reqrate'].to_i
|
@@ -89,10 +145,29 @@ module GHTorrent
|
|
89
145
|
@ts = Time.now().tv_sec()
|
90
146
|
end
|
91
147
|
|
92
|
-
@num_api_calls += 1
|
93
|
-
debug "APIClient: Request: #{url} (num_calls = #{@num_api_calls})"
|
94
148
|
begin
|
95
|
-
|
149
|
+
start_time = Time.now
|
150
|
+
from_cache = false
|
151
|
+
|
152
|
+
contents =
|
153
|
+
if use_cache
|
154
|
+
if not (cached = cache_get(url)).nil?
|
155
|
+
from_cache = true
|
156
|
+
cached
|
157
|
+
else
|
158
|
+
tocache = Cachable.new(do_request(url))
|
159
|
+
@num_api_calls += 1
|
160
|
+
cache_put(url, tocache)
|
161
|
+
tocache
|
162
|
+
end
|
163
|
+
else
|
164
|
+
@num_api_calls += 1
|
165
|
+
do_request(url)
|
166
|
+
end
|
167
|
+
|
168
|
+
total = Time.now.to_ms - start_time.to_ms
|
169
|
+
debug "APIClient: Request: #{url} (#{@num_api_calls} calls,#{if from_cache then " from cache," end} Total: #{total} ms)"
|
170
|
+
contents
|
96
171
|
rescue OpenURI::HTTPError => e
|
97
172
|
case e.io.status[0].to_i
|
98
173
|
# The following indicate valid Github return codes
|
@@ -100,7 +175,7 @@ module GHTorrent
|
|
100
175
|
401, # Unauthorized
|
101
176
|
403, # Forbidden
|
102
177
|
404, # Not found
|
103
|
-
422
|
178
|
+
422 then # Unprocessable entity
|
104
179
|
STDERR.puts "#{url}: #{e.io.status[1]}"
|
105
180
|
return nil
|
106
181
|
else # Server error or HTTP conditions that Github does not report
|
@@ -109,5 +184,65 @@ module GHTorrent
|
|
109
184
|
end
|
110
185
|
end
|
111
186
|
end
|
187
|
+
|
188
|
+
# Open +url+ and return whatever +open+ returns.
#
# The local address to bind to is read from config(:attach_ip) and memoized
# in @attach_ip. When it is unset or "0.0.0.0" the request is issued directly;
# otherwise the request runs inside attach_to(@attach_ip).
def do_request(url)
  @attach_ip ||= config(:attach_ip)

  no_binding = @attach_ip.nil? || @attach_ip.eql?("0.0.0.0")
  return open(url) if no_binding

  attach_to(@attach_ip) { open(url) }
end
|
199
|
+
|
200
|
+
# Attach to a specific IP address if the machine has multiple.
#
# While the given block runs, TCPSocket.open is replaced by a wrapper that
# forwards the remote address and port to the original implementation and
# passes +ip+ as the third argument. The original method is restored when the
# block finishes, whether it returns normally or raises.
#
# The patch is installed on TCPSocket itself, so it is visible to every caller
# in the process for the duration of the block.
#
# Returns the value of the block; exceptions raised by the block propagate
# unchanged.
def attach_to(ip)
  tcp_singleton = (class << TCPSocket; self; end)

  tcp_singleton.instance_eval do
    alias_method :original_open, :open

    # Extra positional arguments supplied by the caller are accepted and
    # ignored so the wrapper does not fail on arity; +ip+ always wins.
    define_method(:open) do |conn_address, conn_port, *_ignored|
      original_open(conn_address, conn_port, ip)
    end
  end

  begin
    yield
  ensure
    # Always undo the patch, even when the block raised.
    tcp_singleton.instance_eval do
      alias_method :open, :original_open
      remove_method :original_open
    end
  end
end
|
227
|
+
|
228
|
+
end
|
229
|
+
end
|
230
|
+
|
231
|
+
# In-memory copy of a response object. The body is read once in the
# constructor and kept alongside the response's base_uri, meta and status,
# so that +read+ can be called on the copy at any later time.
class Cachable
  include OpenURI::Meta

  attr_reader :base_uri, :meta, :status

  # +response+ must respond to read, base_uri, meta and status.
  def initialize(response)
    @data, @base_uri = response.read, response.base_uri
    @meta, @status = response.meta, response.status
  end

  # Returns the body captured at construction time.
  def read
    @data
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
class BSON::OrderedHash

  # Convert a BSON result to a +Hash+.
  #
  # Values that are BSON::OrderedHash instances are converted recursively,
  # including those found inside arrays at any nesting depth. All other
  # values are copied over unchanged.
  def to_h
    unwrap = lambda do |value|
      case value
      when BSON::OrderedHash then value.to_h
      when Array             then value.map { |item| unwrap.call(item) }
      else value
      end
    end

    inject({}) do |acc, (key, value)|
      acc[key] = unwrap.call(value)
      acc
    end
  end

  # Serialize through the plain-Hash representation. Any arguments are
  # forwarded, because the JSON generator passes its state object to nested
  # to_json calls; a zero-arity to_json would raise ArgumentError there.
  def to_json(*args)
    to_h.to_json(*args)
  end
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
require 'digest/sha1'
|
2
|
+
require 'fileutils'
|
3
|
+
|
4
|
+
require 'ghtorrent/logging'
|
5
|
+
require 'ghtorrent/settings'
|
6
|
+
|
7
|
+
require 'yaml'

module GHTorrent
  # File based cache. Each object is stored as a YAML document in a file whose
  # name is the SHA1 hex digest of its key, inside a sub-directory of
  # +cache_dir+ named after the first two characters of that digest.
  module Cache
    include GHTorrent::Logging
    include GHTorrent::Settings

    # Root dir for cached objects. Read from config(:cache_dir) and memoized.
    def cache_dir
      @cache_dir ||= config(:cache_dir)
    end

    # The maximum time an item can be cached before being considered stale.
    # Read from config(:cache_stale_age) and memoized; it is compared against
    # the age of the cache file (Time.now minus the file's mtime).
    def max_life
      @max_life ||= config(:cache_stale_age)
    end

    # Put an object to the cache.
    #
    # Failures while writing the file are logged with +warn+ and otherwise
    # ignored; failures while creating the directory propagate.
    def cache_put(key, object)
      file = cache_location(key)
      FileUtils.mkdir_p(File.dirname(file))

      begin
        File.open(file, 'w') do |f|
          f.flocked? do
            YAML::dump object, f
          end
        end
      rescue
        warn "Could not cache object #{file} for key #{key}"
      end
    end

    # Get the object indexed by +key+ from the cache. Returns nil if the
    # key is not found, the object is too old, or the cache file cannot be
    # read (in which case the file is also removed).
    def cache_get(key)
      file = cache_location(key)

      return nil unless File.exist?(file)

      unless (Time.now - File.mtime(file)) < max_life
        debug "Cached object for key #{key} too old"
        return nil
      end

      begin
        cached = File.open(file, 'r') do |f|
          f.flocked? do
            # Cache files are written by cache_put and hold arbitrary Ruby
            # objects. Where YAML.unsafe_load exists, plain YAML.load may
            # refuse such objects, so prefer it when available.
            # NOTE(review): this trusts the contents of cache_dir.
            YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(f) : YAML::load(f)
          end
        end

        # An empty file loads as false; report that as a miss.
        cached || nil
      rescue
        warn "Could not read object from cache location #{file}"
        # rm_f tolerates the file having disappeared in the meantime.
        FileUtils.rm_f(file)
        # Explicit nil: the result of the delete call must not be mistaken
        # for a cached object by the caller.
        nil
      end
    end

    private

    # Path of the cache file for +key+: <cache_dir>/<first 2 hex chars>/<sha1>.
    def cache_location(key)
      hash = hashkey(key)
      start = hash[0, 2]
      File.join(cache_dir, start, hash)
    end

    # SHA1 hex digest of +key+.
    def hashkey(key)
      Digest::SHA1.hexdigest key
    end

  end
end
|
80
|
+
|
81
|
+
class File
  # Run the given block while holding an exclusive lock (LOCK_EX) on this
  # file, and release the lock afterwards even if the block raises.
  #
  # The lock request is blocking (LOCK_NB is not used), so flock either
  # succeeds or raises; there is no "would block" result to handle.
  #
  # Returns the value of the block, or false when no block is given.
  def flocked?(&block)
    flock LOCK_EX
    begin
      block ? block.call : false
    ensure
      flock LOCK_UN
    end
  end
end
|
data/lib/ghtorrent/command.rb
CHANGED
@@ -3,25 +3,44 @@ require 'trollop'
|
|
3
3
|
require 'daemons'
|
4
4
|
require 'etc'
|
5
5
|
|
6
|
-
|
7
|
-
# line argument parsing and command bootstraping support. The order of
|
8
|
-
# initialization is the following:
|
9
|
-
# prepare_options
|
10
|
-
# validate
|
11
|
-
# go
|
6
|
+
require 'ghtorrent/settings'
|
12
7
|
|
13
8
|
module GHTorrent
|
9
|
+
|
10
|
+
# Base class for all GHTorrent command line utilities. Provides basic command
|
11
|
+
# line argument parsing and command bootstrapping support. The order of
|
12
|
+
# initialization is the following:
|
13
|
+
# prepare_options
|
14
|
+
# validate
|
15
|
+
# go
|
14
16
|
class Command
|
15
17
|
|
16
|
-
|
18
|
+
include GHTorrent::Settings
|
17
19
|
|
18
20
|
# Specify the run method for subclasses.
|
19
21
|
class << self
|
20
22
|
def run(args = ARGV)
|
21
|
-
|
23
|
+
attr_accessor :args
|
24
|
+
attr_accessor :settings
|
25
|
+
attr_accessor :name
|
26
|
+
attr_accessor :options
|
27
|
+
|
28
|
+
command = new()
|
29
|
+
|
30
|
+
command.name = self.class.name
|
31
|
+
command.args = args
|
32
|
+
|
22
33
|
command.process_options
|
23
34
|
command.validate
|
24
35
|
|
36
|
+
command.settings = YAML::load_file command.options[:config]
|
37
|
+
|
38
|
+
unless command.options[:addr].nil?
|
39
|
+
command.settings = command.override_config(command.settings,
|
40
|
+
:attach_ip,
|
41
|
+
command.options[:addr])
|
42
|
+
end
|
43
|
+
|
25
44
|
if command.options[:daemon]
|
26
45
|
if Process.uid == 0
|
27
46
|
# Daemonize as a proper system daemon
|
@@ -59,15 +78,10 @@ module GHTorrent
|
|
59
78
|
end
|
60
79
|
end
|
61
80
|
|
62
|
-
|
63
|
-
@args = args
|
64
|
-
@name = self.class.name
|
65
|
-
end
|
66
|
-
|
67
|
-
# Specify and parse supported command line options.
|
81
|
+
# Specify and parse top-level command line options.
|
68
82
|
def process_options
|
69
83
|
command = self
|
70
|
-
@options = Trollop::options(
|
84
|
+
@options = Trollop::options(command.args) do
|
71
85
|
|
72
86
|
command.prepare_options(self)
|
73
87
|
|
@@ -78,13 +92,12 @@ Standard options:
|
|
78
92
|
opt :config, 'config.yaml file location', :short => 'c',
|
79
93
|
:default => 'config.yaml'
|
80
94
|
opt :verbose, 'verbose mode', :short => 'v'
|
95
|
+
opt :addr, 'ip address to use for performing requests', :short => 'a',
|
96
|
+
:type => String
|
81
97
|
opt :daemon, 'run as daemon', :short => 'd'
|
82
98
|
opt :user, 'run as the specified user (only when started as root)',
|
83
99
|
:short => 'u', :type => String
|
84
100
|
end
|
85
|
-
|
86
|
-
@args = @args.dup
|
87
|
-
ARGV.clear
|
88
101
|
end
|
89
102
|
|
90
103
|
# Get the version of the project
|
@@ -102,18 +115,19 @@ Standard options:
|
|
102
115
|
# provided by this class.
|
103
116
|
def validate
|
104
117
|
if options[:config].nil?
|
105
|
-
unless (file_exists?("config.yaml")
|
106
|
-
Trollop::die "No config file in default
|
107
|
-
|
108
|
-
|
118
|
+
unless (file_exists?("config.yaml"))
|
119
|
+
Trollop::die "No config file in default location (#{Dir.pwd}). You
|
120
|
+
need to specify the #{:config} parameter. Read the
|
121
|
+
documentation on how to create a config.yaml file."
|
109
122
|
end
|
110
123
|
else
|
111
|
-
Trollop::die "Cannot find file #{options[:config]}"
|
124
|
+
Trollop::die "Cannot find file #{options[:config]}" \
|
125
|
+
unless file_exists?(options[:config])
|
112
126
|
end
|
113
127
|
|
114
128
|
unless @options[:user].nil?
|
115
129
|
if not Process.uid == 0
|
116
|
-
Trollop::die "Option --user (-u)
|
130
|
+
Trollop::die "Option --user (-u) can only be specified by root"
|
117
131
|
end
|
118
132
|
begin
|
119
133
|
Etc.getpwnam(@options[:user])
|
@@ -132,6 +146,11 @@ Standard options:
|
|
132
146
|
def go
|
133
147
|
end
|
134
148
|
|
149
|
+
# Override one configuration +setting+ with +new_value+.
#
# Prints the current value (as reported by config) and the new value to
# STDERR, then passes a one-entry hash to merge_config_values and returns its
# result. The +config_file+ argument is not used by this method.
def override_config(config_file, setting, new_value)
  STDERR.puts "Overriding configuration #{setting}=#{config(setting)} with cmd line #{new_value}"

  overrides = {}
  overrides[setting] = new_value
  merge_config_values(overrides)
end
|
153
|
+
|
135
154
|
private
|
136
155
|
|
137
156
|
def file_exists?(file)
|
@@ -142,7 +161,6 @@ Standard options:
|
|
142
161
|
false
|
143
162
|
end
|
144
163
|
end
|
145
|
-
|
146
164
|
end
|
147
165
|
|
148
166
|
end
|