ghtorrent 0.4 → 0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. data/CHANGELOG +24 -0
  2. data/Gemfile +17 -0
  3. data/Gemfile.lock +40 -0
  4. data/README.md +23 -22
  5. data/bin/ght-data-retrieval +66 -24
  6. data/bin/ght-load +41 -19
  7. data/bin/ght-mirror-events +13 -16
  8. data/bin/ght-rm-dupl +119 -77
  9. data/lib/ghtorrent.rb +14 -4
  10. data/lib/ghtorrent/adapters/base_adapter.rb +17 -5
  11. data/lib/ghtorrent/adapters/mongo_persister.rb +122 -56
  12. data/lib/ghtorrent/api_client.rb +151 -16
  13. data/lib/ghtorrent/bson_orderedhash.rb +23 -0
  14. data/lib/ghtorrent/cache.rb +97 -0
  15. data/lib/ghtorrent/command.rb +43 -25
  16. data/lib/ghtorrent/gh_torrent_exception.rb +6 -0
  17. data/lib/ghtorrent/ghtorrent.rb +615 -164
  18. data/lib/ghtorrent/hash.rb +11 -0
  19. data/lib/ghtorrent/logging.rb +11 -7
  20. data/lib/ghtorrent/migrations/001_init_schema.rb +3 -3
  21. data/lib/ghtorrent/migrations/002_add_external_ref_ids.rb +2 -0
  22. data/lib/ghtorrent/migrations/003_add_orgs.rb +4 -1
  23. data/lib/ghtorrent/migrations/004_add_commit_comments.rb +4 -2
  24. data/lib/ghtorrent/migrations/005_add_repo_collaborators.rb +2 -0
  25. data/lib/ghtorrent/migrations/006_add_watchers.rb +2 -0
  26. data/lib/ghtorrent/migrations/007_add_pull_requests.rb +64 -0
  27. data/lib/ghtorrent/migrations/008_add_project_unq.rb +23 -0
  28. data/lib/ghtorrent/migrations/009_add_project_commit.rb +27 -0
  29. data/lib/ghtorrent/migrations/010_add_forks.rb +28 -0
  30. data/lib/ghtorrent/migrations/mysql_defaults.rb +6 -0
  31. data/lib/ghtorrent/persister.rb +3 -0
  32. data/lib/ghtorrent/retriever.rb +298 -102
  33. data/lib/ghtorrent/settings.rb +20 -1
  34. data/lib/ghtorrent/time.rb +5 -0
  35. data/lib/ghtorrent/utils.rb +22 -4
  36. data/lib/version.rb +5 -0
  37. metadata +173 -145
  38. data/lib/ghtorrent/call_stack.rb +0 -91
@@ -1,28 +1,41 @@
1
- require 'net/http'
2
- require 'set'
3
1
  require 'open-uri'
2
+ require 'net/http'
3
+ require 'digest/sha1'
4
+ require 'fileutils'
4
5
  require 'json'
5
6
 
7
+ require 'ghtorrent/logging'
8
+ require 'ghtorrent/settings'
9
+ require 'ghtorrent/time'
10
+ require 'ghtorrent/cache'
11
+
6
12
  module GHTorrent
7
13
  module APIClient
8
14
  include GHTorrent::Logging
9
15
  include GHTorrent::Settings
16
+ include GHTorrent::Cache
10
17
 
11
- def initialize(settings)
12
- @num_api_calls = 0
13
- @ts = Time.now().tv_sec()
18
+ # This is to fix an annoying bug in JRuby's SSL not being able to
19
+ # verify a valid certificate.
20
+ if defined? JRUBY_VERSION
21
+ OpenSSL::SSL::VERIFY_PEER = OpenSSL::SSL::VERIFY_NONE
14
22
  end
15
23
 
16
24
  # A paged request. Used when the result can expand to more than one
17
25
  # result pages.
18
- def paged_api_request(url, pages = -1)
26
+ def paged_api_request(url, pages = -1, cache = true, last = nil)
19
27
 
20
- data = api_request_raw(url)
28
+ data = if URI.parse(url).query.nil? # Top level request, no params
29
+ api_request_raw(url, false)
30
+ else
31
+ api_request_raw(url, use_cache?(cache, method = :paged))
32
+ end
21
33
 
22
34
  return [] if data.nil?
23
35
 
24
36
  unless data.meta['link'].nil?
25
37
  links = parse_links(data.meta['link'])
38
+ last = links['last'] if last.nil?
26
39
 
27
40
  if pages > 0
28
41
  pages = pages - 1
@@ -34,7 +47,15 @@ module GHTorrent
34
47
  if links['next'].nil?
35
48
  parse_request_result(data)
36
49
  else
37
- parse_request_result(data) | paged_api_request(links['next'], pages)
50
+ parse_request_result(data) |
51
+ if links['next'] == last
52
+ if last != links['last']
53
+ warn "APIClient: Last header mismatch: method=#{last}, cache=#{links['last']}"
54
+ end
55
+ paged_api_request(links['next'], pages, false, last)
56
+ else
57
+ paged_api_request(links['next'], pages, cache, last)
58
+ end
38
59
  end
39
60
  else
40
61
  parse_request_result(data)
@@ -43,12 +64,43 @@ module GHTorrent
43
64
 
44
65
  # A normal request. Returns a hash or an array of hashes representing the
45
66
  # parsed JSON result.
46
- def api_request(url)
47
- parse_request_result api_request_raw(url)
67
+ def api_request(url, cache = false)
68
+ parse_request_result api_request_raw(url, use_cache?(cache))
48
69
  end
49
70
 
50
71
  private
51
72
 
73
+ # Determine whether to use cache or not, depending on the type of the
74
+ # request
75
+ def use_cache?(client_request, method = :non_paged)
76
+ @cache_mode ||= case config(:cache_mode)
77
+ when "dev"
78
+ :dev
79
+ when "prod"
80
+ :prod
81
+ else
82
+ raise GHTorrentException("")
83
+ end
84
+ case @cache_mode
85
+ when :dev
86
+ unless client_request
87
+ return false
88
+ end
89
+ return true
90
+ when :prod
91
+ if client_request
92
+ return true
93
+ else
94
+ case method
95
+ when :non_paged
96
+ return false
97
+ when :paged
98
+ return true
99
+ end
100
+ end
101
+ end
102
+ end
103
+
52
104
  # Parse a Github link header
53
105
  def parse_links(links)
54
106
  links.split(/,/).reduce({}) do |acc, x|
@@ -64,6 +116,7 @@ module GHTorrent
64
116
  []
65
117
  else
66
118
  json = result.read
119
+
67
120
  if json.nil?
68
121
  []
69
122
  else
@@ -73,7 +126,10 @@ module GHTorrent
73
126
  end
74
127
 
75
128
  # Do the actual request and return the result object
76
- def api_request_raw(url)
129
+ def api_request_raw(url, use_cache = false)
130
+ @num_api_calls ||= 0
131
+ @ts ||= Time.now().tv_sec()
132
+
77
133
  #Rate limiting to avoid error requests
78
134
  if Time.now().tv_sec() - @ts < 60 then
79
135
  if @num_api_calls >= @settings['mirror']['reqrate'].to_i
@@ -89,10 +145,29 @@ module GHTorrent
89
145
  @ts = Time.now().tv_sec()
90
146
  end
91
147
 
92
- @num_api_calls += 1
93
- debug "APIClient: Request: #{url} (num_calls = #{@num_api_calls})"
94
148
  begin
95
- open(url)
149
+ start_time = Time.now
150
+ from_cache = false
151
+
152
+ contents =
153
+ if use_cache
154
+ if not (cached = cache_get(url)).nil?
155
+ from_cache = true
156
+ cached
157
+ else
158
+ tocache = Cachable.new(do_request(url))
159
+ @num_api_calls += 1
160
+ cache_put(url, tocache)
161
+ tocache
162
+ end
163
+ else
164
+ @num_api_calls += 1
165
+ do_request(url)
166
+ end
167
+
168
+ total = Time.now.to_ms - start_time.to_ms
169
+ debug "APIClient: Request: #{url} (#{@num_api_calls} calls,#{if from_cache then " from cache," end} Total: #{total} ms)"
170
+ contents
96
171
  rescue OpenURI::HTTPError => e
97
172
  case e.io.status[0].to_i
98
173
  # The following indicate valid Github return codes
@@ -100,7 +175,7 @@ module GHTorrent
100
175
  401, # Unauthorized
101
176
  403, # Forbidden
102
177
  404, # Not found
103
- 422 : # Unprocessable entity
178
+ 422 then # Unprocessable entity
104
179
  STDERR.puts "#{url}: #{e.io.status[1]}"
105
180
  return nil
106
181
  else # Server error or HTTP conditions that Github does not report
@@ -109,5 +184,65 @@ module GHTorrent
109
184
  end
110
185
  end
111
186
  end
187
+
188
+ def do_request(url)
189
+ @attach_ip ||= config(:attach_ip)
190
+
191
+ if @attach_ip.nil? or @attach_ip.eql? "0.0.0.0"
192
+ open(url)
193
+ else
194
+ attach_to(@attach_ip) do
195
+ open(url)
196
+ end
197
+ end
198
+ end
199
+
200
+ # Attach to a specific IP address if the machine has multiple
201
+ def attach_to(ip)
202
+ TCPSocket.instance_eval do
203
+ (class << self; self; end).instance_eval do
204
+ alias_method :original_open, :open
205
+
206
+ define_method(:open) do |conn_address, conn_port|
207
+ original_open(conn_address, conn_port, ip)
208
+ end
209
+ end
210
+ end
211
+
212
+ result = begin
213
+ yield
214
+ rescue Exception => e
215
+ raise e
216
+ ensure
217
+ TCPSocket.instance_eval do
218
+ (class << self; self; end).instance_eval do
219
+ alias_method :open, :original_open
220
+ remove_method :original_open
221
+ end
222
+ end
223
+ end
224
+
225
+ result
226
+ end
227
+
228
+ end
229
+ end
230
+
231
+ class Cachable
232
+
233
+ include OpenURI::Meta
234
+
235
+ attr_reader :base_uri, :meta, :status
236
+
237
+ def initialize(response)
238
+ @data = response.read
239
+ @base_uri = response.base_uri
240
+ @meta = response.meta
241
+ @status = response.status
112
242
  end
113
- end
243
+
244
+ def read
245
+ @data
246
+ end
247
+
248
+ end
@@ -0,0 +1,23 @@
1
+ require 'json'
2
+
3
+ class BSON::OrderedHash
4
+
5
+ # Convert a BSON result to a +Hash+
6
+ def to_h
7
+ inject({}) do |acc, element|
8
+ k, v = element;
9
+ acc[k] = if v.class == Array then
10
+ v.map{|x| if x.class == BSON::OrderedHash then x.to_h else x end}
11
+ elsif v.class == BSON::OrderedHash then
12
+ v.to_h
13
+ else
14
+ v
15
+ end;
16
+ acc
17
+ end
18
+ end
19
+
20
+ def to_json
21
+ to_h.to_json
22
+ end
23
+ end
@@ -0,0 +1,97 @@
1
+ require 'digest/sha1'
2
+ require 'fileutils'
3
+
4
+ require 'ghtorrent/logging'
5
+ require 'ghtorrent/settings'
6
+
7
+ module GHTorrent
8
+ module Cache
9
+ include GHTorrent::Logging
10
+ include GHTorrent::Settings
11
+
12
+ # Root dir for cached objects.
13
+ def cache_dir
14
+ @cache_dir ||= config(:cache_dir)
15
+ @cache_dir
16
+ end
17
+
18
+ # The maximum time an item can be cached before being considered stale
19
+ def max_life
20
+ @max_life ||= config(:cache_stale_age)
21
+ @max_life
22
+ end
23
+
24
+ # Put an object to the cache
25
+ def cache_put(key, object)
26
+ file = cache_location(key)
27
+ FileUtils.mkdir_p(File.dirname (file))
28
+
29
+ begin
30
+ File.open(file, 'w') do |f|
31
+ f.flocked? do
32
+ YAML::dump object, f
33
+ end
34
+ end
35
+ rescue
36
+ warn "Could not cache object #{file} for key #{key}"
37
+ end
38
+ end
39
+
40
+ # Get the object indexed by +key+ from the cache. Returns nil if the
41
+ # key is not found or the object is too old.
42
+ def cache_get(key)
43
+ file = cache_location(key)
44
+
45
+ unless File.exist?(file)
46
+ return nil
47
+ end
48
+
49
+ unless (Time.now() - File.mtime(file)) < max_life
50
+ debug "Cached object for key #{key} too old"
51
+ return nil
52
+ end
53
+
54
+ begin
55
+ File.open(file, 'r') do |f|
56
+ f.flocked? do
57
+ YAML::load(f)
58
+ end
59
+ end
60
+ rescue
61
+ warn "Could not read object from cache location #{file}"
62
+ File.delete(file)
63
+ end
64
+ end
65
+
66
+ private
67
+
68
+ def cache_location(key)
69
+ hash = hashkey(key)
70
+ start = hash[0,2]
71
+ File.join(cache_dir, start, hash)
72
+ end
73
+
74
+ def hashkey(key)
75
+ Digest::SHA1.hexdigest key
76
+ end
77
+
78
+ end
79
+ end
80
+
81
+ class File
82
+ def flocked? &block
83
+ status = flock LOCK_EX
84
+ case status
85
+ when false
86
+ return true
87
+ when 0
88
+ begin
89
+ block ? block.call : false
90
+ ensure
91
+ flock LOCK_UN
92
+ end
93
+ else
94
+ raise SystemCallError, status
95
+ end
96
+ end
97
+ end
@@ -3,25 +3,44 @@ require 'trollop'
3
3
  require 'daemons'
4
4
  require 'etc'
5
5
 
6
- # Base class for all GHTorrent command line utilities. Provides basic command
7
- # line argument parsing and command bootstraping support. The order of
8
- # initialization is the following:
9
- # prepare_options
10
- # validate
11
- # go
6
+ require 'ghtorrent/settings'
12
7
 
13
8
  module GHTorrent
9
+
10
+ # Base class for all GHTorrent command line utilities. Provides basic command
11
+ # line argument parsing and command bootstraping support. The order of
12
+ # initialization is the following:
13
+ # prepare_options
14
+ # validate
15
+ # go
14
16
  class Command
15
17
 
16
- attr_reader :args, :options, :name
18
+ include GHTorrent::Settings
17
19
 
18
20
  # Specify the run method for subclasses.
19
21
  class << self
20
22
  def run(args = ARGV)
21
- command = new(args)
23
+ attr_accessor :args
24
+ attr_accessor :settings
25
+ attr_accessor :name
26
+ attr_accessor :options
27
+
28
+ command = new()
29
+
30
+ command.name = self.class.name
31
+ command.args = args
32
+
22
33
  command.process_options
23
34
  command.validate
24
35
 
36
+ command.settings = YAML::load_file command.options[:config]
37
+
38
+ unless command.options[:addr].nil?
39
+ command.settings = command.override_config(command.settings,
40
+ :attach_ip,
41
+ command.options[:addr])
42
+ end
43
+
25
44
  if command.options[:daemon]
26
45
  if Process.uid == 0
27
46
  # Daemonize as a proper system daemon
@@ -59,15 +78,10 @@ module GHTorrent
59
78
  end
60
79
  end
61
80
 
62
- def initialize(args)
63
- @args = args
64
- @name = self.class.name
65
- end
66
-
67
- # Specify and parse supported command line options.
81
+ # Specify and parse top-level command line options.
68
82
  def process_options
69
83
  command = self
70
- @options = Trollop::options(@args) do
84
+ @options = Trollop::options(command.args) do
71
85
 
72
86
  command.prepare_options(self)
73
87
 
@@ -78,13 +92,12 @@ Standard options:
78
92
  opt :config, 'config.yaml file location', :short => 'c',
79
93
  :default => 'config.yaml'
80
94
  opt :verbose, 'verbose mode', :short => 'v'
95
+ opt :addr, 'ip address to use for performing requests', :short => 'a',
96
+ :type => String
81
97
  opt :daemon, 'run as daemon', :short => 'd'
82
98
  opt :user, 'run as the specified user (only when started as root)',
83
99
  :short => 'u', :type => String
84
100
  end
85
-
86
- @args = @args.dup
87
- ARGV.clear
88
101
  end
89
102
 
90
103
  # Get the version of the project
@@ -102,18 +115,19 @@ Standard options:
102
115
  # provided by this class.
103
116
  def validate
104
117
  if options[:config].nil?
105
- unless (file_exists?("config.yaml") or file_exists?("/etc/ghtorrent/config.yaml"))
106
- Trollop::die "No config file in default locations (., /etc/ghtorrent)
107
- you need to specify the #{:config} parameter. Read the
108
- documentation on how to create a config.yaml file."
118
+ unless (file_exists?("config.yaml"))
119
+ Trollop::die "No config file in default location (#{Dir.pwd}). You
120
+ need to specify the #{:config} parameter. Read the
121
+ documentation on how to create a config.yaml file."
109
122
  end
110
123
  else
111
- Trollop::die "Cannot find file #{options[:config]}" unless file_exists?(options[:config])
124
+ Trollop::die "Cannot find file #{options[:config]}" \
125
+ unless file_exists?(options[:config])
112
126
  end
113
127
 
114
128
  unless @options[:user].nil?
115
129
  if not Process.uid == 0
116
- Trollop::die "Option --user (-u) cannot be specified by normal users"
130
+ Trollop::die "Option --user (-u) can only be specified by root"
117
131
  end
118
132
  begin
119
133
  Etc.getpwnam(@options[:user])
@@ -132,6 +146,11 @@ Standard options:
132
146
  def go
133
147
  end
134
148
 
149
+ def override_config(config_file, setting, new_value)
150
+ STDERR.puts "Overriding configuration #{setting}=#{config(setting)} with cmd line #{new_value}"
151
+ merge_config_values({setting => new_value})
152
+ end
153
+
135
154
  private
136
155
 
137
156
  def file_exists?(file)
@@ -142,7 +161,6 @@ Standard options:
142
161
  false
143
162
  end
144
163
  end
145
-
146
164
  end
147
165
 
148
166
  end