ghtorrent 0.4 → 0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +24 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +40 -0
- data/README.md +23 -22
- data/bin/ght-data-retrieval +66 -24
- data/bin/ght-load +41 -19
- data/bin/ght-mirror-events +13 -16
- data/bin/ght-rm-dupl +119 -77
- data/lib/ghtorrent.rb +14 -4
- data/lib/ghtorrent/adapters/base_adapter.rb +17 -5
- data/lib/ghtorrent/adapters/mongo_persister.rb +122 -56
- data/lib/ghtorrent/api_client.rb +151 -16
- data/lib/ghtorrent/bson_orderedhash.rb +23 -0
- data/lib/ghtorrent/cache.rb +97 -0
- data/lib/ghtorrent/command.rb +43 -25
- data/lib/ghtorrent/gh_torrent_exception.rb +6 -0
- data/lib/ghtorrent/ghtorrent.rb +615 -164
- data/lib/ghtorrent/hash.rb +11 -0
- data/lib/ghtorrent/logging.rb +11 -7
- data/lib/ghtorrent/migrations/001_init_schema.rb +3 -3
- data/lib/ghtorrent/migrations/002_add_external_ref_ids.rb +2 -0
- data/lib/ghtorrent/migrations/003_add_orgs.rb +4 -1
- data/lib/ghtorrent/migrations/004_add_commit_comments.rb +4 -2
- data/lib/ghtorrent/migrations/005_add_repo_collaborators.rb +2 -0
- data/lib/ghtorrent/migrations/006_add_watchers.rb +2 -0
- data/lib/ghtorrent/migrations/007_add_pull_requests.rb +64 -0
- data/lib/ghtorrent/migrations/008_add_project_unq.rb +23 -0
- data/lib/ghtorrent/migrations/009_add_project_commit.rb +27 -0
- data/lib/ghtorrent/migrations/010_add_forks.rb +28 -0
- data/lib/ghtorrent/migrations/mysql_defaults.rb +6 -0
- data/lib/ghtorrent/persister.rb +3 -0
- data/lib/ghtorrent/retriever.rb +298 -102
- data/lib/ghtorrent/settings.rb +20 -1
- data/lib/ghtorrent/time.rb +5 -0
- data/lib/ghtorrent/utils.rb +22 -4
- data/lib/version.rb +5 -0
- metadata +173 -145
- data/lib/ghtorrent/call_stack.rb +0 -91
data/lib/ghtorrent/api_client.rb
CHANGED
@@ -1,28 +1,41 @@
|
|
1
|
-
require 'net/http'
|
2
|
-
require 'set'
|
3
1
|
require 'open-uri'
|
2
|
+
require 'net/http'
|
3
|
+
require 'digest/sha1'
|
4
|
+
require 'fileutils'
|
4
5
|
require 'json'
|
5
6
|
|
7
|
+
require 'ghtorrent/logging'
|
8
|
+
require 'ghtorrent/settings'
|
9
|
+
require 'ghtorrent/time'
|
10
|
+
require 'ghtorrent/cache'
|
11
|
+
|
6
12
|
module GHTorrent
|
7
13
|
module APIClient
|
8
14
|
include GHTorrent::Logging
|
9
15
|
include GHTorrent::Settings
|
16
|
+
include GHTorrent::Cache
|
10
17
|
|
11
|
-
|
12
|
-
|
13
|
-
|
18
|
+
# This is to fix an annoying bug in JRuby's SSL not being able to
|
19
|
+
# verify a valid certificate.
|
20
|
+
if defined? JRUBY_VERSION
|
21
|
+
OpenSSL::SSL::VERIFY_PEER = OpenSSL::SSL::VERIFY_NONE
|
14
22
|
end
|
15
23
|
|
16
24
|
# A paged request. Used when the result can expand to more than one
|
17
25
|
# result pages.
|
18
|
-
def paged_api_request(url, pages = -1)
|
26
|
+
def paged_api_request(url, pages = -1, cache = true, last = nil)
|
19
27
|
|
20
|
-
data =
|
28
|
+
data = if URI.parse(url).query.nil? # Top level request, no params
|
29
|
+
api_request_raw(url, false)
|
30
|
+
else
|
31
|
+
api_request_raw(url, use_cache?(cache, method = :paged))
|
32
|
+
end
|
21
33
|
|
22
34
|
return [] if data.nil?
|
23
35
|
|
24
36
|
unless data.meta['link'].nil?
|
25
37
|
links = parse_links(data.meta['link'])
|
38
|
+
last = links['last'] if last.nil?
|
26
39
|
|
27
40
|
if pages > 0
|
28
41
|
pages = pages - 1
|
@@ -34,7 +47,15 @@ module GHTorrent
|
|
34
47
|
if links['next'].nil?
|
35
48
|
parse_request_result(data)
|
36
49
|
else
|
37
|
-
parse_request_result(data) |
|
50
|
+
parse_request_result(data) |
|
51
|
+
if links['next'] == last
|
52
|
+
if last != links['last']
|
53
|
+
warn "APIClient: Last header mismatch: method=#{last}, cache=#{links['last']}"
|
54
|
+
end
|
55
|
+
paged_api_request(links['next'], pages, false, last)
|
56
|
+
else
|
57
|
+
paged_api_request(links['next'], pages, cache, last)
|
58
|
+
end
|
38
59
|
end
|
39
60
|
else
|
40
61
|
parse_request_result(data)
|
@@ -43,12 +64,43 @@ module GHTorrent
|
|
43
64
|
|
44
65
|
# Perform a single, non-paged request against +url+ and return the parsed
# JSON result: a hash or an array of hashes.
#
# +cache+ is the caller's caching preference; whether the cache is really
# consulted is decided by use_cache?.
def api_request(url, cache = false)
  cache_allowed = use_cache?(cache)
  raw_response = api_request_raw(url, cache_allowed)
  parse_request_result(raw_response)
end
|
49
70
|
|
50
71
|
private
|
51
72
|
|
73
|
+
# Decide whether a request should go through the cache.
#
# client_request - the caller's preference (truthy to ask for caching).
# method         - the kind of request, :paged or :non_paged.
#
# The mode is read once from the :cache_mode setting and memoized:
#   "dev"  - cache exactly when the caller asked for it.
#   "prod" - cache when the caller asked for it; otherwise cache paged
#            requests only.
#
# Returns true or false. Raises GHTorrentException when :cache_mode is
# neither "dev" nor "prod".
def use_cache?(client_request, method = :non_paged)
  @cache_mode ||= case (mode = config(:cache_mode))
                  when "dev"  then :dev
                  when "prod" then :prod
                  else
                    # Raise the exception class with a message; writing
                    # GHTorrentException("") is a method call and fails with
                    # NoMethodError instead of reporting the bad setting.
                    raise GHTorrentException,
                          "Unsupported cache_mode #{mode.inspect}, expected \"dev\" or \"prod\""
                  end

  # An explicit request from the caller is honoured in both modes.
  return true if client_request

  # Without an explicit request, only paged requests in prod mode are cached.
  @cache_mode == :prod && method == :paged
end
|
103
|
+
|
52
104
|
# Parse a Github link header
|
53
105
|
def parse_links(links)
|
54
106
|
links.split(/,/).reduce({}) do |acc, x|
|
@@ -64,6 +116,7 @@ module GHTorrent
|
|
64
116
|
[]
|
65
117
|
else
|
66
118
|
json = result.read
|
119
|
+
|
67
120
|
if json.nil?
|
68
121
|
[]
|
69
122
|
else
|
@@ -73,7 +126,10 @@ module GHTorrent
|
|
73
126
|
end
|
74
127
|
|
75
128
|
# Do the actual request and return the result object
|
76
|
-
def api_request_raw(url)
|
129
|
+
def api_request_raw(url, use_cache = false)
|
130
|
+
@num_api_calls ||= 0
|
131
|
+
@ts ||= Time.now().tv_sec()
|
132
|
+
|
77
133
|
#Rate limiting to avoid error requests
|
78
134
|
if Time.now().tv_sec() - @ts < 60 then
|
79
135
|
if @num_api_calls >= @settings['mirror']['reqrate'].to_i
|
@@ -89,10 +145,29 @@ module GHTorrent
|
|
89
145
|
@ts = Time.now().tv_sec()
|
90
146
|
end
|
91
147
|
|
92
|
-
@num_api_calls += 1
|
93
|
-
debug "APIClient: Request: #{url} (num_calls = #{@num_api_calls})"
|
94
148
|
begin
|
95
|
-
|
149
|
+
start_time = Time.now
|
150
|
+
from_cache = false
|
151
|
+
|
152
|
+
contents =
|
153
|
+
if use_cache
|
154
|
+
if not (cached = cache_get(url)).nil?
|
155
|
+
from_cache = true
|
156
|
+
cached
|
157
|
+
else
|
158
|
+
tocache = Cachable.new(do_request(url))
|
159
|
+
@num_api_calls += 1
|
160
|
+
cache_put(url, tocache)
|
161
|
+
tocache
|
162
|
+
end
|
163
|
+
else
|
164
|
+
@num_api_calls += 1
|
165
|
+
do_request(url)
|
166
|
+
end
|
167
|
+
|
168
|
+
total = Time.now.to_ms - start_time.to_ms
|
169
|
+
debug "APIClient: Request: #{url} (#{@num_api_calls} calls,#{if from_cache then " from cache," end} Total: #{total} ms)"
|
170
|
+
contents
|
96
171
|
rescue OpenURI::HTTPError => e
|
97
172
|
case e.io.status[0].to_i
|
98
173
|
# The following indicate valid Github return codes
|
@@ -100,7 +175,7 @@ module GHTorrent
|
|
100
175
|
401, # Unauthorized
|
101
176
|
403, # Forbidden
|
102
177
|
404, # Not found
|
103
|
-
422
|
178
|
+
422 then # Unprocessable entity
|
104
179
|
STDERR.puts "#{url}: #{e.io.status[1]}"
|
105
180
|
return nil
|
106
181
|
else # Server error or HTTP conditions that Github does not report
|
@@ -109,5 +184,65 @@ module GHTorrent
|
|
109
184
|
end
|
110
185
|
end
|
111
186
|
end
|
187
|
+
|
188
|
+
# Open +url+ and return the response object. When the :attach_ip setting
# names a specific address (anything other than nil or "0.0.0.0"), the
# request is made inside attach_to so it is bound to that address.
def do_request(url)
  @attach_ip ||= config(:attach_ip)

  unbound = @attach_ip.nil? || @attach_ip.eql?("0.0.0.0")
  return open(url) if unbound

  attach_to(@attach_ip) { open(url) }
end
|
199
|
+
|
200
|
+
# Run the given block with outgoing TCP connections bound to the local
# address +ip+, for machines that have more than one.
#
# TCPSocket.open is temporarily replaced by a version that passes +ip+ as
# the local host. The original method is restored when the block finishes,
# whether it returns normally or raises. The replacement affects the whole
# process while the block runs.
#
# Returns the value of the block; exceptions raised by it propagate.
def attach_to(ip)
  tcp_meta = (class << TCPSocket; self; end)

  tcp_meta.instance_eval do
    alias_method :original_open, :open

    # Accept (and drop) extra arguments so callers that also pass a local
    # host/port do not fail with an ArgumentError.
    define_method(:open) do |conn_address, conn_port, *_ignored|
      original_open(conn_address, conn_port, ip)
    end
  end

  begin
    yield
  ensure
    # Always undo the patch, even when the block raised.
    tcp_meta.instance_eval do
      alias_method :open, :original_open
      remove_method :original_open
    end
  end
end
|
227
|
+
|
228
|
+
end
|
229
|
+
end
|
230
|
+
|
231
|
+
# A plain-object copy of an OpenURI response: the body is read once and kept
# together with the response metadata, so the object can be handed to the
# cache and later used in place of the live response (it answers +read+,
# +meta+, +status+ and +base_uri+).
class Cachable

  include OpenURI::Meta

  attr_reader :base_uri, :meta, :metas, :status

  # response - an object answering read, base_uri, meta and status, such as
  #            the result of an open-uri request.
  def initialize(response)
    @data     = response.read
    @base_uri = response.base_uri
    @meta     = response.meta
    @status   = response.status

    # Newer open-uri versions implement the OpenURI::Meta helpers
    # (content_type, charset, ...) on top of @metas, a hash of header name
    # to array of values. Copy it when the response has it, otherwise derive
    # it from @meta so those helpers do not fail on a nil @metas.
    @metas = if response.respond_to?(:metas)
               response.metas
             else
               (@meta || {}).each_with_object({}) do |(name, value), acc|
                 acc[name] = [value]
               end
             end
  end

  # The response body captured at construction time.
  def read
    @data
  end

end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
class BSON::OrderedHash

  # Convert a BSON result to a plain +Hash+. Nested ordered hashes are
  # converted too, including those held inside (nested) arrays.
  def to_h
    inject({}) do |acc, (key, value)|
      acc[key] = ght_plain_value(value)
      acc
    end
  end

  # Serialize through the plain Hash form. The optional arguments are
  # forwarded because JSON generators call to_json with a state/options
  # argument; a zero-argument to_json raises ArgumentError in that case.
  def to_json(*args)
    to_h.to_json(*args)
  end

  private

  # Recursively replace ordered hashes with plain hashes inside +value+;
  # any other value is returned unchanged.
  def ght_plain_value(value)
    case value
    when BSON::OrderedHash then value.to_h
    when Array             then value.map { |item| ght_plain_value(item) }
    else value
    end
  end

end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
require 'digest/sha1'
|
2
|
+
require 'fileutils'
|
3
|
+
|
4
|
+
require 'ghtorrent/logging'
|
5
|
+
require 'ghtorrent/settings'
|
6
|
+
|
7
|
+
module GHTorrent
  # A small file-system cache. Objects are serialized with YAML into files
  # under cache_dir, named after the SHA1 of their key and grouped into
  # sub-directories by the first two characters of that hash.
  module Cache
    include GHTorrent::Logging
    include GHTorrent::Settings

    # Root dir for cached objects, read once from the :cache_dir setting.
    def cache_dir
      @cache_dir ||= config(:cache_dir)
    end

    # The maximum time an item can be cached before being considered stale,
    # read once from the :cache_stale_age setting. It is compared against a
    # file age in seconds.
    def max_life
      @max_life ||= config(:cache_stale_age)
    end

    # Put an object to the cache under +key+. Failures to write are logged
    # as warnings and otherwise ignored.
    def cache_put(key, object)
      file = cache_location(key)
      FileUtils.mkdir_p(File.dirname(file))

      begin
        # Open without truncating: mode 'w' would empty the file before the
        # lock is held, letting a concurrent reader see a blank entry.
        File.open(file, File::RDWR | File::CREAT, 0666) do |f|
          f.flocked? do
            f.truncate(0)
            YAML::dump object, f
            # Push buffered data to the file before the lock is released.
            f.flush
          end
        end
      rescue
        warn "Could not cache object #{file} for key #{key}"
      end
    end

    # Get the object indexed by +key+ from the cache. Returns nil if the
    # key is not found, the object is too old, or the entry cannot be read
    # (in which case the entry is also removed).
    def cache_get(key)
      file = cache_location(key)

      return nil unless File.exist?(file)

      unless (Time.now() - File.mtime(file)) < max_life
        debug "Cached object for key #{key} too old"
        return nil
      end

      begin
        File.open(file, 'r') do |f|
          f.flocked? do
            load_cached(f)
          end
        end
      rescue
        warn "Could not read object from cache location #{file}"
        # Best-effort removal of the unreadable entry. Return nil explicitly:
        # the result of the delete call must not be mistaken for a cache hit.
        FileUtils.rm_f(file)
        nil
      end
    end

    private

    # Path of the cache file for +key+: <cache_dir>/<first 2 hash chars>/<hash>
    def cache_location(key)
      hash = hashkey(key)
      start = hash[0,2]
      File.join(cache_dir, start, hash)
    end

    # Hex SHA1 digest of +key+, used as the cache file name.
    def hashkey(key)
      Digest::SHA1.hexdigest key
    end

    # Deserialize a cached object from the open file +io+.
    #
    # NOTE(security): this restores arbitrary Ruby objects from YAML, so the
    # cache directory must only be writable by trusted users.
    def load_cached(io)
      # Where YAML.load refuses non-basic classes, unsafe_load is needed to
      # get back the objects that cache_put stored.
      if YAML.respond_to?(:unsafe_load)
        YAML.unsafe_load(io)
      else
        YAML::load(io)
      end
    end

  end
end
|
80
|
+
|
81
|
+
class File
  # Run the given block while holding an exclusive lock (LOCK_EX) on this
  # file, releasing the lock afterwards even if the block raises.
  #
  # Returns the block's value, or false when no block is given. Returns true
  # without running the block if flock reports false; flock only does that
  # for non-blocking requests, so the branch is kept for parity with the
  # lock call's contract rather than expected here.
  def flocked?
    status = flock(LOCK_EX)

    return true if status == false
    raise SystemCallError, status unless status == 0

    begin
      block_given? ? yield : false
    ensure
      flock(LOCK_UN)
    end
  end
end
|
data/lib/ghtorrent/command.rb
CHANGED
@@ -3,25 +3,44 @@ require 'trollop'
|
|
3
3
|
require 'daemons'
|
4
4
|
require 'etc'
|
5
5
|
|
6
|
-
|
7
|
-
# line argument parsing and command bootstraping support. The order of
|
8
|
-
# initialization is the following:
|
9
|
-
# prepare_options
|
10
|
-
# validate
|
11
|
-
# go
|
6
|
+
require 'ghtorrent/settings'
|
12
7
|
|
13
8
|
module GHTorrent
|
9
|
+
|
10
|
+
# Base class for all GHTorrent command line utilities. Provides basic command
|
11
|
+
# line argument parsing and command bootstrapping support. The order of
|
12
|
+
# initialization is the following:
|
13
|
+
# prepare_options
|
14
|
+
# validate
|
15
|
+
# go
|
14
16
|
class Command
|
15
17
|
|
16
|
-
|
18
|
+
include GHTorrent::Settings
|
17
19
|
|
18
20
|
# Specify the run method for subclasses.
|
19
21
|
class << self
|
20
22
|
def run(args = ARGV)
|
21
|
-
|
23
|
+
attr_accessor :args
|
24
|
+
attr_accessor :settings
|
25
|
+
attr_accessor :name
|
26
|
+
attr_accessor :options
|
27
|
+
|
28
|
+
command = new()
|
29
|
+
|
30
|
+
command.name = self.class.name
|
31
|
+
command.args = args
|
32
|
+
|
22
33
|
command.process_options
|
23
34
|
command.validate
|
24
35
|
|
36
|
+
command.settings = YAML::load_file command.options[:config]
|
37
|
+
|
38
|
+
unless command.options[:addr].nil?
|
39
|
+
command.settings = command.override_config(command.settings,
|
40
|
+
:attach_ip,
|
41
|
+
command.options[:addr])
|
42
|
+
end
|
43
|
+
|
25
44
|
if command.options[:daemon]
|
26
45
|
if Process.uid == 0
|
27
46
|
# Daemonize as a proper system daemon
|
@@ -59,15 +78,10 @@ module GHTorrent
|
|
59
78
|
end
|
60
79
|
end
|
61
80
|
|
62
|
-
|
63
|
-
@args = args
|
64
|
-
@name = self.class.name
|
65
|
-
end
|
66
|
-
|
67
|
-
# Specify and parse supported command line options.
|
81
|
+
# Specify and parse top-level command line options.
|
68
82
|
def process_options
|
69
83
|
command = self
|
70
|
-
@options = Trollop::options(
|
84
|
+
@options = Trollop::options(command.args) do
|
71
85
|
|
72
86
|
command.prepare_options(self)
|
73
87
|
|
@@ -78,13 +92,12 @@ Standard options:
|
|
78
92
|
opt :config, 'config.yaml file location', :short => 'c',
|
79
93
|
:default => 'config.yaml'
|
80
94
|
opt :verbose, 'verbose mode', :short => 'v'
|
95
|
+
opt :addr, 'ip address to use for performing requests', :short => 'a',
|
96
|
+
:type => String
|
81
97
|
opt :daemon, 'run as daemon', :short => 'd'
|
82
98
|
opt :user, 'run as the specified user (only when started as root)',
|
83
99
|
:short => 'u', :type => String
|
84
100
|
end
|
85
|
-
|
86
|
-
@args = @args.dup
|
87
|
-
ARGV.clear
|
88
101
|
end
|
89
102
|
|
90
103
|
# Get the version of the project
|
@@ -102,18 +115,19 @@ Standard options:
|
|
102
115
|
# provided by this class.
|
103
116
|
def validate
|
104
117
|
if options[:config].nil?
|
105
|
-
unless (file_exists?("config.yaml")
|
106
|
-
Trollop::die "No config file in default
|
107
|
-
|
108
|
-
|
118
|
+
unless (file_exists?("config.yaml"))
|
119
|
+
Trollop::die "No config file in default location (#{Dir.pwd}). You
|
120
|
+
need to specify the #{:config} parameter. Read the
|
121
|
+
documentation on how to create a config.yaml file."
|
109
122
|
end
|
110
123
|
else
|
111
|
-
Trollop::die "Cannot find file #{options[:config]}"
|
124
|
+
Trollop::die "Cannot find file #{options[:config]}" \
|
125
|
+
unless file_exists?(options[:config])
|
112
126
|
end
|
113
127
|
|
114
128
|
unless @options[:user].nil?
|
115
129
|
if not Process.uid == 0
|
116
|
-
Trollop::die "Option --user (-u)
|
130
|
+
Trollop::die "Option --user (-u) can only be specified by root"
|
117
131
|
end
|
118
132
|
begin
|
119
133
|
Etc.getpwnam(@options[:user])
|
@@ -132,6 +146,11 @@ Standard options:
|
|
132
146
|
def go
|
133
147
|
end
|
134
148
|
|
149
|
+
# Replace the configured value of +setting+ with +new_value+ (taken from the
# command line), announcing the change on STDERR. Returns the result of
# merge_config_values. The +config_file+ argument is not used by the body.
def override_config(config_file, setting, new_value)
  current_value = config(setting)
  STDERR.puts "Overriding configuration #{setting}=#{current_value} with cmd line #{new_value}"

  overrides = { setting => new_value }
  merge_config_values(overrides)
end
|
153
|
+
|
135
154
|
private
|
136
155
|
|
137
156
|
def file_exists?(file)
|
@@ -142,7 +161,6 @@ Standard options:
|
|
142
161
|
false
|
143
162
|
end
|
144
163
|
end
|
145
|
-
|
146
164
|
end
|
147
165
|
|
148
166
|
end
|