ghtorrent 0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,136 @@
1
+ # Copyright 2012 Georgios Gousios <gousiosg@gmail.com>
2
+ #
3
+ # Redistribution and use in source and binary forms, with or
4
+ # without modification, are permitted provided that the following
5
+ # conditions are met:
6
+ #
7
+ # 1. Redistributions of source code must retain the above
8
+ # copyright notice, this list of conditions and the following
9
+ # disclaimer.
10
+ #
11
+ # 2. Redistributions in binary form must reproduce the above
12
+ # copyright notice, this list of conditions and the following
13
+ # disclaimer in the documentation and/or other materials
14
+ # provided with the distribution.
15
+ #
16
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17
+ # AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18
+ # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
20
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
23
+ # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
24
+ # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25
+ # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26
+ # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27
+ # POSSIBILITY OF SUCH DAMAGE.
28
+
29
+ require 'trollop'
30
+
31
+ # Base class for all GHTorrent command line utilities. Provides basic command
32
+ # line argument parsing and command bootstrapping support. The order of
33
+ # initialization is the following:
34
+ # prepare_options
35
+ # validate
36
+ # go
37
+
38
+ module GHTorrent
39
class Command

  # args::    the command line arguments left over after option parsing
  # options:: the parsed options (set by process_options)
  attr_reader :args, :options

  class << self
    # Entry point for command line utilities: instantiates the command,
    # parses and validates its options and then executes it.
    #
    # Any StandardError raised by #go is reported on STDERR (with the full
    # backtrace in verbose mode, its first line otherwise) and the process
    # exits with status 1.
    #
    # ==Parameters:
    #  args::
    #    The arguments to parse; defaults to ARGV
    def run(args = ARGV)
      command = new(args)
      command.process_options
      command.validate

      begin
        command.go
      rescue => e
        STDERR.puts e.message
        # Hash-style access works for any Hash-like options object, not
        # only for one that answers to option names as methods.
        trace = Array(e.backtrace)
        if command.options[:verbose]
          STDERR.puts trace.join("\n")
        else
          STDERR.puts trace[0]
        end
        exit 1
      end
    end
  end

  # ==Parameters:
  #  args::
  #    The raw command line arguments
  def initialize(args)
    @args = args
  end

  # Specify and parse supported command line options. Options contributed
  # by the subclass (through prepare_options) are declared first, followed
  # by the standard :config and :verbose options. Afterwards @args is
  # replaced by a copy of itself and ARGV is emptied.
  def process_options
    command = self
    @options = Trollop::options(@args) do

      command.prepare_options(self)

      banner <<-END
Standard options:
      END

      opt :config, 'config.yaml file location', :short => 'c',
          :default => 'config.yaml'
      opt :verbose, 'verbose mode', :short => 'v'
    end

    @args = @args.dup
    ARGV.clear
  end

  # Get the version of the project, i.e. the contents of the VERSION file
  # located two directories above this file.
  def version
    File.read(File.join(File.dirname(__FILE__), '..', '..', 'VERSION'))
  end

  # This method should be overridden by subclasses in order to specify,
  # using trollop, the supported command line options.
  #
  # ==Parameters:
  #  options::
  #    The object on which to declare options (process_options passes the
  #    receiver of its Trollop::options block)
  def prepare_options(options)
  end

  # Examine the validity of the provided options in the context of the
  # executed command. Subclasses can also call super to also invoke the
  # checks provided by this class.
  #
  # Terminates through Trollop::die when no usable configuration file can
  # be found.
  def validate
    if options[:config].nil?
      # process_options gives :config a default value, so this branch is
      # only reached when the options were built without that default.
      unless file_exists?('config.yaml') || file_exists?('/etc/ghtorrent/config.yaml')
        Trollop::die 'No config file in default locations (., /etc/ghtorrent), ' \
                     'you need to specify the --config parameter. Read the ' \
                     'documentation on how to create a config.yaml file'
      end
    else
      Trollop::die "Cannot find file #{options[:config]}" unless file_exists?(options[:config])
    end
  end

  # Name of the command that is currently being executed.
  def command_name
    File.basename($0)
  end

  # The actual command code. To be overridden by subclasses.
  def go
  end

  private

  # Returns true when +file+ names an existing file system entry and false
  # otherwise (including when +file+ is nil).
  def file_exists?(file)
    return false if file.nil?
    File.exist?(file)
  end

end
134
+
135
+ end
136
+ # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
@@ -0,0 +1,396 @@
1
+ # Copyright 2012 Georgios Gousios <gousiosg@gmail.com>
2
+ #
3
+ # Redistribution and use in source and binary forms, with or
4
+ # without modification, are permitted provided that the following
5
+ # conditions are met:
6
+ #
7
+ # 1. Redistributions of source code must retain the above
8
+ # copyright notice, this list of conditions and the following
9
+ # disclaimer.
10
+ #
11
+ # 2. Redistributions in binary form must reproduce the above
12
+ # copyright notice, this list of conditions and the following
13
+ # disclaimer in the documentation and/or other materials
14
+ # provided with the distribution.
15
+ #
16
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17
+ # AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18
+ # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
20
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
23
+ # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
24
+ # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25
+ # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26
+ # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27
+ # POSSIBILITY OF SUCH DAMAGE.
28
+
29
+ require 'sequel'
30
+
31
+ module GHTorrent
32
+ class Mirror
33
+
34
+ include GHTorrent::Logging
35
+ include GHTorrent::Settings
36
+ include GHTorrent::Retriever
37
+ include GHTorrent::Persister
38
+
39
+ attr_reader :settings, :persister
40
+
41
# Set up a mirror from a YAML settings file: loads the settings, passes
# them up the initialisation chain, reads the :uniq_id configuration
# entry, creates a STDOUT logger, connects the :mongo persister and
# finally opens the SQL database (see get_db).
#
# ==Parameters:
#  configuration::
#    Path of the YAML file holding the settings
def initialize(configuration)
  @settings = YAML.load_file(configuration)
  super(@settings)

  @ext_uniq = config(:uniq_id)
  @logger = Logger.new(STDOUT)
  @persister = connect(:mongo, @settings)

  get_db
end
50
+
51
+ # db related functions
52
# Connect to the SQL database named by the :sql_url configuration entry
# and remember the connection in @db. A database without any tables is
# initialised by applying the migrations found in the 'migrations'
# directory next to this file.
#
# == Returns:
#  The Sequel database connection
def get_db
  @db = Sequel.connect(config(:sql_url))
  return @db unless @db.tables.empty?

  migrations = File.join(File.dirname(__FILE__), 'migrations')
  puts "Database empty, running migrations from #{migrations}"
  Sequel.extension :migration
  Sequel::Migrator.apply(@db, migrations)

  @db
end
64
+
65
##
# Ensure that a commit is recorded in the commits table. When no entry
# with the given SHA exists, the repository is ensured, the commit is
# retrieved and a new entry is inserted together with its author and
# committer (see commit_user); all of this happens in one transaction.
#
# ==Parameters:
#  user::
#    The email or login name to which the repository belongs
#  repo::
#    The repo name
#  sha::
#    The commit SHA; must consist of exactly 40 lower-case hex digits
#
# == Returns:
#  nil when +sha+ is malformed (the commit is then ignored and an error
#  is logged). The return value is otherwise unspecified.
def get_commit(user, repo, sha)

  # Anchor both ends of the string: the value must *be* a SHA, not merely
  # end in one (and must not carry a trailing newline).
  unless sha.match(/\A[a-f0-9]{40}\z/)
    error "GHTorrent: Ignoring commit #{sha}"
    return
  end

  commits = @db[:commits]
  commit = commits.first(:sha => sha)

  if commit.nil?
    @db.transaction(:rollback => :reraise) do
      ensure_repo(user, repo)
      c = retrieve_commit(repo, sha, user)

      author = commit_user(c['author'], c['commit']['author'])
      committer = commit_user(c['committer'], c['commit']['committer'])

      commits.insert(:sha => sha,
                     :author_id => author[:id],
                     :committer_id => committer[:id],
                     :created_at => date(c['commit']['author']['date']),
                     :ext_ref_id => c[@ext_uniq]
      )

      # Parent linking is currently disabled:
      #c['parents'].each do |p|
      #  url = p['url'].split(/\//)
      #  get_commit url[4], url[5], url[7]
      #
      #  commit = commits.first(:sha => sha)
      #  parent = commits.first(:sha => url[7])
      #  @db[:commit_parents].insert(:commit_id => commit[:id],
      #                              :parent_id => parent[:id])
      #  @log.info "Added parent #{parent[:sha]} to commit #{sha}"
      #end
    end
    debug "GHTorrent: Transaction committed"
  else
    debug "GHTorrent: Commit #{sha} exists"
  end
end
115
+
116
##
# Add (or update) an entry for a commit author. This method uses information
# in the JSON object returned by Github to add (or update) a user in the
# metadata database with a full user entry (both Git and Github details).
#
# Resolution works as follows:
# * Without a Github login, the user is ensured by "name<email>".
# * With a login unknown to the database, the user is ensured by login.
#   If an entry with the commit email already exists, the freshly added
#   entry is deleted again and its details are merged into the existing
#   one; otherwise missing name/email fields are filled in from the commit.
# * With a known login, missing name/email fields are filled in.
#
# ==Parameters:
#  githubuser::
#    A hash containing the user's Github login
#  commituser::
#    A hash containing the Git commit's user name and email
# == Returns:
#  The (added/modified) user entry as a Hash.
# == Raises:
#  GHTorrentException when +commituser+ is nil
def commit_user(githubuser, commituser)

  raise GHTorrentException, "git user is null" if commituser.nil?

  users = @db[:users]

  name = commituser['name']
  email = commituser['email'] #if is_valid_email(commituser['email'])
  # Github user can be null when the commit email has not been associated
  # with any account in Github.
  login = githubuser['login'] unless githubuser.nil?

  return ensure_user("#{name}<#{email}>", true) if login.nil?

  # Both lookups must happen before ensure_user possibly adds a new row.
  dbuser = users.first(:login => login)
  byemail = users.first(:email => email)

  if dbuser.nil?
    # We do not have the user in the database yet. Add him
    added = ensure_user(login, true)
    if byemail.nil?
      users.filter(:login => login).update(:name => name) if added[:name].nil?
      users.filter(:login => login).update(:email => email) if added[:email].nil?
    else
      # There is a previous entry for the user, currently identified by
      # email. This means that the user has updated his account and now
      # Github is able to associate his commits with his git credentials.
      # As the previous entry might have already associated records, just
      # delete the new one and update the existing with any extra data.
      users.filter(:login => login).delete
      # Database rows are keyed by symbols (as the name/email checks above
      # rely on); string keys would copy nil into every merged column.
      users.filter(:email => email).update(
        :login => login,
        :company => added[:company],
        :location => added[:location],
        :hireable => added[:hireable],
        :bio => added[:bio],
        :created_at => added[:created_at]
      )
    end
  else
    users.filter(:login => login).update(:name => name) if dbuser[:name].nil?
    users.filter(:login => login).update(:email => email) if dbuser[:email].nil?
  end

  users.first(:login => login)
end
176
+
177
##
# Ensure that a user exists, or fetch its latest state from Github
# ==Parameters:
#  user::
#    Either a login name, or a name and email address written as
#    "name<email>"
#  followers::
#    A boolean value indicating whether to retrieve the user's followers
# == Returns:
#  Whatever ensure_user_byuname (for login names) or ensure_user_byemail
#  (for everything else) returns.
# == Raises:
#  GHTorrentException when +user+ is neither a login name nor of the
#  form "name<email>"
def ensure_user(user, followers)
  # Github only supports alpha-nums and dashes in its usernames.
  # All other symbols are treated as emails. \A and \z (rather than ^ and $)
  # make sure the whole string is checked, not just one of its lines.
  return ensure_user_byuname(user, followers) if user.match(/\A[A-Za-z0-9\-]*\z/)

  name, email = user.split("<")
  email = email.split(">")[0] unless email.nil?

  if name.nil? || email.nil?
    raise GHTorrentException, "Not a valid email address: #{user}"
  end

  ensure_user_byemail(email.strip, name.strip, followers)
end
204
+
205
##
# Ensure that a user exists, or fetch its latest state from Github
# ==Parameters:
#  user::
#    The login name to lookup the user by
#  followers::
#    When true, the followers of a newly added user are retrieved as well
#
# == Returns:
#  The user's entry in the users table as a Hash: the existing one when
#  the login is already known, the newly inserted one otherwise.
def ensure_user_byuname(user, followers)
  users = @db[:users]
  usr = users.first(:login => user)

  unless usr.nil?
    debug "GHTorrent: User #{user} exists"
    return usr
  end

  u = retrieve_user_byusername(user)

  # Missing or blank email addresses are stored as NULL.
  email = u['email'].to_s.strip
  email = nil if email.empty?

  # The field is spelled 'hireable'. Non-nil values are stringified so
  # that both JSON booleans and their string form reach boolean as
  # 'true'/'false'.
  hireable = u['hireable']
  hireable = hireable.to_s unless hireable.nil?

  users.insert(:login => u['login'],
               :name => u['name'],
               :company => u['company'],
               :email => email,
               :hireable => boolean(hireable),
               :bio => u['bio'],
               :location => u['location'],
               :created_at => date(u['created_at']),
               :ext_ref_id => u[@ext_uniq])

  info "GHTorrent: New user #{user}"

  # Get the user's followers
  ensure_user_followers(user) if followers

  users.first(:login => user)
end
249
+
250
##
# Get all followers for a user. Since we do not know when the actual
# follow event took place, we set the created_at field to the timestamp
# of the method call.
#
# ==Parameters:
#  [user] The user login to find followers by
#  [ts]   The timestamp to record for new follow entries (default: now)
#
# == Returns:
#  The list returned by retrieve_new_user_followers
def ensure_user_followers(user, ts = Time.now)
  users = @db[:users]
  follows = @db[:followers]
  userid = nil

  retrieve_new_user_followers(user).each do |f|
    follower = f['login']

    # The followed user is the same for every entry: ensure and look it up
    # once, and only when there is at least one follower to record.
    userid ||= begin
      ensure_user(user, false)
      users.select(:id).first(:login => user)[:id]
    end

    ensure_user(follower, false)
    followerid = users.select(:id).first(:login => follower)[:id]

    if follows.first(:user_id => userid, :follower_id => followerid).nil?
      follows.insert(:user_id => userid,
                     :follower_id => followerid,
                     :created_at => ts,
                     :ext_ref_id => f[@ext_uniq]
      )
      info "GHTorrent: User #{follower} follows #{user}"
    else
      info "User #{follower} already follows #{user}"
    end
  end
end
281
+
282
##
# Try to retrieve a user by email. Search the DB first, fall back to
# Github API v2 if unsuccessful.
#
# ==Parameters:
#  email::
#    The email to lookup the user by
#  name::
#    The user's name; stored when the user cannot be found through the API
#  followers::
#    When true, the followers of a user found through the API are
#    retrieved as well
#
# == Returns:
#  The entry in the users table whose email is +email+, as a Hash, or nil
#  when there is no such entry after the lookup.
def ensure_user_byemail(email, name, followers)
  users = @db[:users]
  usr = users.first(:email => email)

  unless usr.nil?
    debug "GHTorrent: User with email #{email} exists"
    return usr
  end

  u = retrieve_user_byemail(email, name)

  if u.nil? || u['user'].nil? || u['user']['login'].nil?
    debug "GHTorrent: Cannot find #{email} through API v2 query"
    # No Github account is known: store the user under a random
    # placeholder login of 8 upper-case letters (A-Z).
    users.insert(:email => email,
                 :name => name,
                 :login => (0...8).map { (65 + rand(26)).chr }.join,
                 :created_at => Time.now,
                 :ext_ref_id => ""
    )
  else
    found = u['user']
    users.insert(:login => found['login'],
                 :name => found['name'],
                 :company => found['company'],
                 :email => found['email'],
                 :hireable => nil,
                 :bio => nil,
                 :location => found['location'],
                 :created_at => date(found['created_at']),
                 :ext_ref_id => u[@ext_uniq])
    debug "GHTorrent: Found #{email} through API v2 query"
    # Followers are looked up by login, which is only known from the
    # API result.
    ensure_user_followers(found['login']) if followers
  end

  users.first(:email => email)
end
329
+
330
##
# Ensure that a repo exists, or fetch its latest state from Github
#
# ==Parameters:
#  [user] The email or login name to which this repo belongs
#  [repo] The repo name
#
# == Returns:
#  The repo's entry in the projects table as a Hash: the existing one
#  when the owner already has a repo of that name, the newly inserted one
#  otherwise.
def ensure_repo(user, repo)
  owner = ensure_user(user, false)
  repos = @db[:projects]

  # Repo names are only unique per owner, so the owner has to be part of
  # the lookup; otherwise same-named repos of different users collide.
  key = { :name => repo, :owner_id => owner[:id] }
  currepo = repos.first(key)

  unless currepo.nil?
    debug "GHTorrent: Repo #{repo} exists"
    return currepo
  end

  r = retrieve_repo(user, repo)
  repos.insert(:url => r['url'],
               :owner_id => owner[:id],
               :name => r['name'],
               :description => r['description'],
               :language => r['language'],
               :created_at => date(r['created_at']),
               :ext_ref_id => r[@ext_uniq])

  info "GHTorrent: New repo #{repo}"
  repos.first(key)
end
362
+
363
+ private
364
+
365
##
# Convert a boolean value to its SQL integer form.
#
# ==Parameters:
#  arg::
#    true/false, their string forms 'true'/'false', or nil
#
# == Returns:
#  1 for true, 0 for false and for nil; nil for any other value
def boolean(arg)
  case arg
  when true, 'true' then 1
  when false, 'false', nil then 0
  end
end
377
+
378
# Convert a date string to an integer number of seconds since the epoch.
# Dates returned by Github are formatted as:
# - yyyy-mm-ddThh:mm:ssZ
# - yyyy/mm/dd hh:mm:ss {+/-}hhmm
def date(arg)
  parsed = Time.parse(arg)
  parsed.to_i
end
384
+
385
# Check whether +email+ looks like an email address.
#
# == Returns:
#  0 (the match position, truthy) when the whole string is a well-formed
#  address, nil otherwise
def is_valid_email(email)
  # \A and \z anchor the whole string; ^ and $ would accept any input
  # that merely contains one line looking like an address.
  email =~ /\A[a-zA-Z][\w\.-]*[a-zA-Z0-9]@[a-zA-Z0-9][\w\.-]*[a-zA-Z0-9]\.[a-zA-Z][a-zA-Z\.]*[a-zA-Z]\z/
end
388
+ end
389
# Base exception for all GHTorrent exceptions. Derives from StandardError
# so that a plain `rescue` (which does not catch direct Exception
# subclasses) handles it.
class GHTorrentException < StandardError

end
393
+
394
+ end
395
+
396
+ # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :