ghtorrent 0.4 → 0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. data/CHANGELOG +24 -0
  2. data/Gemfile +17 -0
  3. data/Gemfile.lock +40 -0
  4. data/README.md +23 -22
  5. data/bin/ght-data-retrieval +66 -24
  6. data/bin/ght-load +41 -19
  7. data/bin/ght-mirror-events +13 -16
  8. data/bin/ght-rm-dupl +119 -77
  9. data/lib/ghtorrent.rb +14 -4
  10. data/lib/ghtorrent/adapters/base_adapter.rb +17 -5
  11. data/lib/ghtorrent/adapters/mongo_persister.rb +122 -56
  12. data/lib/ghtorrent/api_client.rb +151 -16
  13. data/lib/ghtorrent/bson_orderedhash.rb +23 -0
  14. data/lib/ghtorrent/cache.rb +97 -0
  15. data/lib/ghtorrent/command.rb +43 -25
  16. data/lib/ghtorrent/gh_torrent_exception.rb +6 -0
  17. data/lib/ghtorrent/ghtorrent.rb +615 -164
  18. data/lib/ghtorrent/hash.rb +11 -0
  19. data/lib/ghtorrent/logging.rb +11 -7
  20. data/lib/ghtorrent/migrations/001_init_schema.rb +3 -3
  21. data/lib/ghtorrent/migrations/002_add_external_ref_ids.rb +2 -0
  22. data/lib/ghtorrent/migrations/003_add_orgs.rb +4 -1
  23. data/lib/ghtorrent/migrations/004_add_commit_comments.rb +4 -2
  24. data/lib/ghtorrent/migrations/005_add_repo_collaborators.rb +2 -0
  25. data/lib/ghtorrent/migrations/006_add_watchers.rb +2 -0
  26. data/lib/ghtorrent/migrations/007_add_pull_requests.rb +64 -0
  27. data/lib/ghtorrent/migrations/008_add_project_unq.rb +23 -0
  28. data/lib/ghtorrent/migrations/009_add_project_commit.rb +27 -0
  29. data/lib/ghtorrent/migrations/010_add_forks.rb +28 -0
  30. data/lib/ghtorrent/migrations/mysql_defaults.rb +6 -0
  31. data/lib/ghtorrent/persister.rb +3 -0
  32. data/lib/ghtorrent/retriever.rb +298 -102
  33. data/lib/ghtorrent/settings.rb +20 -1
  34. data/lib/ghtorrent/time.rb +5 -0
  35. data/lib/ghtorrent/utils.rb +22 -4
  36. data/lib/version.rb +5 -0
  37. metadata +173 -145
  38. data/lib/ghtorrent/call_stack.rb +0 -91
@@ -1,28 +1,41 @@
1
- require 'net/http'
2
- require 'set'
3
1
  require 'open-uri'
2
+ require 'net/http'
3
+ require 'digest/sha1'
4
+ require 'fileutils'
4
5
  require 'json'
5
6
 
7
+ require 'ghtorrent/logging'
8
+ require 'ghtorrent/settings'
9
+ require 'ghtorrent/time'
10
+ require 'ghtorrent/cache'
11
+
6
12
  module GHTorrent
7
13
  module APIClient
8
14
  include GHTorrent::Logging
9
15
  include GHTorrent::Settings
16
+ include GHTorrent::Cache
10
17
 
11
- def initialize(settings)
12
- @num_api_calls = 0
13
- @ts = Time.now().tv_sec()
18
+ # This is to fix an annoying bug in JRuby's SSL not being able to
19
+ # verify a valid certificate.
20
+ if defined? JRUBY_VERSION
21
+ OpenSSL::SSL::VERIFY_PEER = OpenSSL::SSL::VERIFY_NONE
14
22
  end
15
23
 
16
24
  # A paged request. Used when the result can expand to more than one
17
25
  # result pages.
18
- def paged_api_request(url, pages = -1)
26
+ def paged_api_request(url, pages = -1, cache = true, last = nil)
19
27
 
20
- data = api_request_raw(url)
28
+ data = if URI.parse(url).query.nil? # Top level request, no params
29
+ api_request_raw(url, false)
30
+ else
31
+ api_request_raw(url, use_cache?(cache, method = :paged))
32
+ end
21
33
 
22
34
  return [] if data.nil?
23
35
 
24
36
  unless data.meta['link'].nil?
25
37
  links = parse_links(data.meta['link'])
38
+ last = links['last'] if last.nil?
26
39
 
27
40
  if pages > 0
28
41
  pages = pages - 1
@@ -34,7 +47,15 @@ module GHTorrent
34
47
  if links['next'].nil?
35
48
  parse_request_result(data)
36
49
  else
37
- parse_request_result(data) | paged_api_request(links['next'], pages)
50
+ parse_request_result(data) |
51
+ if links['next'] == last
52
+ if last != links['last']
53
+ warn "APIClient: Last header mismatch: method=#{last}, cache=#{links['last']}"
54
+ end
55
+ paged_api_request(links['next'], pages, false, last)
56
+ else
57
+ paged_api_request(links['next'], pages, cache, last)
58
+ end
38
59
  end
39
60
  else
40
61
  parse_request_result(data)
@@ -43,12 +64,43 @@ module GHTorrent
43
64
 
44
65
  # A normal request. Returns a hash or an array of hashes representing the
45
66
  # parsed JSON result.
46
- def api_request(url)
47
- parse_request_result api_request_raw(url)
67
+ def api_request(url, cache = false)
68
+ parse_request_result api_request_raw(url, use_cache?(cache))
48
69
  end
49
70
 
50
71
  private
51
72
 
73
+ # Determine whether to use cache or not, depending on the type of the
74
+ # request
75
+ def use_cache?(client_request, method = :non_paged)
76
+ @cache_mode ||= case config(:cache_mode)
77
+ when "dev"
78
+ :dev
79
+ when "prod"
80
+ :prod
81
+ else
82
+ raise GHTorrentException("")
83
+ end
84
+ case @cache_mode
85
+ when :dev
86
+ unless client_request
87
+ return false
88
+ end
89
+ return true
90
+ when :prod
91
+ if client_request
92
+ return true
93
+ else
94
+ case method
95
+ when :non_paged
96
+ return false
97
+ when :paged
98
+ return true
99
+ end
100
+ end
101
+ end
102
+ end
103
+
52
104
  # Parse a Github link header
53
105
  def parse_links(links)
54
106
  links.split(/,/).reduce({}) do |acc, x|
@@ -64,6 +116,7 @@ module GHTorrent
64
116
  []
65
117
  else
66
118
  json = result.read
119
+
67
120
  if json.nil?
68
121
  []
69
122
  else
@@ -73,7 +126,10 @@ module GHTorrent
73
126
  end
74
127
 
75
128
  # Do the actual request and return the result object
76
- def api_request_raw(url)
129
+ def api_request_raw(url, use_cache = false)
130
+ @num_api_calls ||= 0
131
+ @ts ||= Time.now().tv_sec()
132
+
77
133
  #Rate limiting to avoid error requests
78
134
  if Time.now().tv_sec() - @ts < 60 then
79
135
  if @num_api_calls >= @settings['mirror']['reqrate'].to_i
@@ -89,10 +145,29 @@ module GHTorrent
89
145
  @ts = Time.now().tv_sec()
90
146
  end
91
147
 
92
- @num_api_calls += 1
93
- debug "APIClient: Request: #{url} (num_calls = #{@num_api_calls})"
94
148
  begin
95
- open(url)
149
+ start_time = Time.now
150
+ from_cache = false
151
+
152
+ contents =
153
+ if use_cache
154
+ if not (cached = cache_get(url)).nil?
155
+ from_cache = true
156
+ cached
157
+ else
158
+ tocache = Cachable.new(do_request(url))
159
+ @num_api_calls += 1
160
+ cache_put(url, tocache)
161
+ tocache
162
+ end
163
+ else
164
+ @num_api_calls += 1
165
+ do_request(url)
166
+ end
167
+
168
+ total = Time.now.to_ms - start_time.to_ms
169
+ debug "APIClient: Request: #{url} (#{@num_api_calls} calls,#{if from_cache then " from cache," end} Total: #{total} ms)"
170
+ contents
96
171
  rescue OpenURI::HTTPError => e
97
172
  case e.io.status[0].to_i
98
173
  # The following indicate valid Github return codes
@@ -100,7 +175,7 @@ module GHTorrent
100
175
  401, # Unauthorized
101
176
  403, # Forbidden
102
177
  404, # Not found
103
- 422 : # Unprocessable entity
178
+ 422 then # Unprocessable entity
104
179
  STDERR.puts "#{url}: #{e.io.status[1]}"
105
180
  return nil
106
181
  else # Server error or HTTP conditions that Github does not report
@@ -109,5 +184,65 @@ module GHTorrent
109
184
  end
110
185
  end
111
186
  end
187
+
188
+ def do_request(url)
189
+ @attach_ip ||= config(:attach_ip)
190
+
191
+ if @attach_ip.nil? or @attach_ip.eql? "0.0.0.0"
192
+ open(url)
193
+ else
194
+ attach_to(@attach_ip) do
195
+ open(url)
196
+ end
197
+ end
198
+ end
199
+
200
+ # Attach to a specific IP address if the machine has multiple
201
+ def attach_to(ip)
202
+ TCPSocket.instance_eval do
203
+ (class << self; self; end).instance_eval do
204
+ alias_method :original_open, :open
205
+
206
+ define_method(:open) do |conn_address, conn_port|
207
+ original_open(conn_address, conn_port, ip)
208
+ end
209
+ end
210
+ end
211
+
212
+ result = begin
213
+ yield
214
+ rescue Exception => e
215
+ raise e
216
+ ensure
217
+ TCPSocket.instance_eval do
218
+ (class << self; self; end).instance_eval do
219
+ alias_method :open, :original_open
220
+ remove_method :original_open
221
+ end
222
+ end
223
+ end
224
+
225
+ result
226
+ end
227
+
228
+ end
229
+ end
230
+
231
+ class Cachable
232
+
233
+ include OpenURI::Meta
234
+
235
+ attr_reader :base_uri, :meta, :status
236
+
237
+ def initialize(response)
238
+ @data = response.read
239
+ @base_uri = response.base_uri
240
+ @meta = response.meta
241
+ @status = response.status
112
242
  end
113
- end
243
+
244
+ def read
245
+ @data
246
+ end
247
+
248
+ end
@@ -0,0 +1,23 @@
1
+ require 'json'
2
+
3
+ class BSON::OrderedHash
4
+
5
+ # Convert a BSON result to a +Hash+
6
+ def to_h
7
+ inject({}) do |acc, element|
8
+ k, v = element;
9
+ acc[k] = if v.class == Array then
10
+ v.map{|x| if x.class == BSON::OrderedHash then x.to_h else x end}
11
+ elsif v.class == BSON::OrderedHash then
12
+ v.to_h
13
+ else
14
+ v
15
+ end;
16
+ acc
17
+ end
18
+ end
19
+
20
+ def to_json
21
+ to_h.to_json
22
+ end
23
+ end
@@ -0,0 +1,97 @@
1
+ require 'digest/sha1'
2
+ require 'fileutils'
3
+
4
+ require 'ghtorrent/logging'
5
+ require 'ghtorrent/settings'
6
+
7
+ module GHTorrent
8
+ module Cache
9
+ include GHTorrent::Logging
10
+ include GHTorrent::Settings
11
+
12
+ # Root dir for cached objects.
13
+ def cache_dir
14
+ @cache_dir ||= config(:cache_dir)
15
+ @cache_dir
16
+ end
17
+
18
+ # The maximum time an item can be cached before being considered stale
19
+ def max_life
20
+ @max_life ||= config(:cache_stale_age)
21
+ @max_life
22
+ end
23
+
24
+ # Put an object to the cache
25
+ def cache_put(key, object)
26
+ file = cache_location(key)
27
+ FileUtils.mkdir_p(File.dirname (file))
28
+
29
+ begin
30
+ File.open(file, 'w') do |f|
31
+ f.flocked? do
32
+ YAML::dump object, f
33
+ end
34
+ end
35
+ rescue
36
+ warn "Could not cache object #{file} for key #{key}"
37
+ end
38
+ end
39
+
40
+ # Get the object indexed by +key+ from the cache. Returns nil if the
41
+ # key is not found or the object is too old.
42
+ def cache_get(key)
43
+ file = cache_location(key)
44
+
45
+ unless File.exist?(file)
46
+ return nil
47
+ end
48
+
49
+ unless (Time.now() - File.mtime(file)) < max_life
50
+ debug "Cached object for key #{key} too old"
51
+ return nil
52
+ end
53
+
54
+ begin
55
+ File.open(file, 'r') do |f|
56
+ f.flocked? do
57
+ YAML::load(f)
58
+ end
59
+ end
60
+ rescue
61
+ warn "Could not read object from cache location #{file}"
62
+ File.delete(file)
63
+ end
64
+ end
65
+
66
+ private
67
+
68
+ def cache_location(key)
69
+ hash = hashkey(key)
70
+ start = hash[0,2]
71
+ File.join(cache_dir, start, hash)
72
+ end
73
+
74
+ def hashkey(key)
75
+ Digest::SHA1.hexdigest key
76
+ end
77
+
78
+ end
79
+ end
80
+
81
+ class File
82
+ def flocked? &block
83
+ status = flock LOCK_EX
84
+ case status
85
+ when false
86
+ return true
87
+ when 0
88
+ begin
89
+ block ? block.call : false
90
+ ensure
91
+ flock LOCK_UN
92
+ end
93
+ else
94
+ raise SystemCallError, status
95
+ end
96
+ end
97
+ end
@@ -3,25 +3,44 @@ require 'trollop'
3
3
  require 'daemons'
4
4
  require 'etc'
5
5
 
6
- # Base class for all GHTorrent command line utilities. Provides basic command
7
- # line argument parsing and command bootstraping support. The order of
8
- # initialization is the following:
9
- # prepare_options
10
- # validate
11
- # go
6
+ require 'ghtorrent/settings'
12
7
 
13
8
  module GHTorrent
9
+
10
+ # Base class for all GHTorrent command line utilities. Provides basic command
11
+ # line argument parsing and command bootstraping support. The order of
12
+ # initialization is the following:
13
+ # prepare_options
14
+ # validate
15
+ # go
14
16
  class Command
15
17
 
16
- attr_reader :args, :options, :name
18
+ include GHTorrent::Settings
17
19
 
18
20
  # Specify the run method for subclasses.
19
21
  class << self
20
22
  def run(args = ARGV)
21
- command = new(args)
23
+ attr_accessor :args
24
+ attr_accessor :settings
25
+ attr_accessor :name
26
+ attr_accessor :options
27
+
28
+ command = new()
29
+
30
+ command.name = self.class.name
31
+ command.args = args
32
+
22
33
  command.process_options
23
34
  command.validate
24
35
 
36
+ command.settings = YAML::load_file command.options[:config]
37
+
38
+ unless command.options[:addr].nil?
39
+ command.settings = command.override_config(command.settings,
40
+ :attach_ip,
41
+ command.options[:addr])
42
+ end
43
+
25
44
  if command.options[:daemon]
26
45
  if Process.uid == 0
27
46
  # Daemonize as a proper system daemon
@@ -59,15 +78,10 @@ module GHTorrent
59
78
  end
60
79
  end
61
80
 
62
- def initialize(args)
63
- @args = args
64
- @name = self.class.name
65
- end
66
-
67
- # Specify and parse supported command line options.
81
+ # Specify and parse top-level command line options.
68
82
  def process_options
69
83
  command = self
70
- @options = Trollop::options(@args) do
84
+ @options = Trollop::options(command.args) do
71
85
 
72
86
  command.prepare_options(self)
73
87
 
@@ -78,13 +92,12 @@ Standard options:
78
92
  opt :config, 'config.yaml file location', :short => 'c',
79
93
  :default => 'config.yaml'
80
94
  opt :verbose, 'verbose mode', :short => 'v'
95
+ opt :addr, 'ip address to use for performing requests', :short => 'a',
96
+ :type => String
81
97
  opt :daemon, 'run as daemon', :short => 'd'
82
98
  opt :user, 'run as the specified user (only when started as root)',
83
99
  :short => 'u', :type => String
84
100
  end
85
-
86
- @args = @args.dup
87
- ARGV.clear
88
101
  end
89
102
 
90
103
  # Get the version of the project
@@ -102,18 +115,19 @@ Standard options:
102
115
  # provided by this class.
103
116
  def validate
104
117
  if options[:config].nil?
105
- unless (file_exists?("config.yaml") or file_exists?("/etc/ghtorrent/config.yaml"))
106
- Trollop::die "No config file in default locations (., /etc/ghtorrent)
107
- you need to specify the #{:config} parameter. Read the
108
- documentation on how to create a config.yaml file."
118
+ unless (file_exists?("config.yaml"))
119
+ Trollop::die "No config file in default location (#{Dir.pwd}). You
120
+ need to specify the #{:config} parameter. Read the
121
+ documentation on how to create a config.yaml file."
109
122
  end
110
123
  else
111
- Trollop::die "Cannot find file #{options[:config]}" unless file_exists?(options[:config])
124
+ Trollop::die "Cannot find file #{options[:config]}" \
125
+ unless file_exists?(options[:config])
112
126
  end
113
127
 
114
128
  unless @options[:user].nil?
115
129
  if not Process.uid == 0
116
- Trollop::die "Option --user (-u) cannot be specified by normal users"
130
+ Trollop::die "Option --user (-u) can only be specified by root"
117
131
  end
118
132
  begin
119
133
  Etc.getpwnam(@options[:user])
@@ -132,6 +146,11 @@ Standard options:
132
146
  def go
133
147
  end
134
148
 
149
+ def override_config(config_file, setting, new_value)
150
+ STDERR.puts "Overriding configuration #{setting}=#{config(setting)} with cmd line #{new_value}"
151
+ merge_config_values({setting => new_value})
152
+ end
153
+
135
154
  private
136
155
 
137
156
  def file_exists?(file)
@@ -142,7 +161,6 @@ Standard options:
142
161
  false
143
162
  end
144
163
  end
145
-
146
164
  end
147
165
 
148
166
  end