ghtorrent 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,136 @@
1
+ # Copyright 2012 Georgios Gousios <gousiosg@gmail.com>
2
+ #
3
+ # Redistribution and use in source and binary forms, with or
4
+ # without modification, are permitted provided that the following
5
+ # conditions are met:
6
+ #
7
+ # 1. Redistributions of source code must retain the above
8
+ # copyright notice, this list of conditions and the following
9
+ # disclaimer.
10
+ #
11
+ # 2. Redistributions in binary form must reproduce the above
12
+ # copyright notice, this list of conditions and the following
13
+ # disclaimer in the documentation and/or other materials
14
+ # provided with the distribution.
15
+ #
16
+ # THIS SOFTWARE IS PROVIDED BY BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17
+ # AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18
+ # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
20
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
23
+ # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
24
+ # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25
+ # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26
+ # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27
+ # POSSIBILITY OF SUCH DAMAGE.
28
+
29
+ require 'trollop'
30
+
31
+ # Base class for all GHTorrent command line utilities. Provides basic command
32
+ # line argument parsing and command bootstrapping support. The order of
33
+ # initialization is the following:
34
+ # prepare_options
35
+ # validate
36
+ # go
37
+
38
+ module GHTorrent
# Base class for GHTorrent command line utilities. Subclasses override
# prepare_options (to declare extra Trollop options), validate (to check
# them) and go (the actual work); Command.run drives them in that order.
class Command

  attr_reader :args, :options

  class << self
    # Instantiate the command, parse and validate its options and execute it.
    #
    # ==Parameters:
    # args::
    #   The command line arguments to parse (ARGV by default)
    #
    # Any StandardError raised by #go is reported on $stderr (full backtrace
    # in verbose mode, first frame otherwise) and the process exits with
    # status 1. Errors raised while parsing or validating options are not
    # intercepted here.
    def run(args = ARGV)
      command = new(args)
      command.process_options
      command.validate

      begin
        command.go
      rescue => e
        # $stderr (not the STDERR constant) so that redirection is honoured.
        $stderr.puts e.message
        trace = e.backtrace || []
        # Hash access, consistent with #validate; does not depend on the
        # accessor-style lookup of the object returned by Trollop.
        $stderr.puts(command.options[:verbose] ? trace.join("\n") : trace.first)
        exit 1
      end
    end
  end

  # ==Parameters:
  # args::
  #   The raw command line arguments for this command
  def initialize(args)
    @args = args
  end

  # Specify and parse supported command line options. The subclass hook
  # prepare_options is invoked first, with the Trollop parser as argument,
  # followed by the standard --config and --verbose options. The result is
  # stored in @options.
  def process_options
    command = self
    @options = Trollop::options(@args) do

      command.prepare_options(self)

      banner <<-END
Standard options:
      END

      opt :config, 'config.yaml file location', :short => 'c',
          :default => 'config.yaml'
      opt :verbose, 'verbose mode', :short => 'v'
    end

    # Keep a private copy of the arguments and empty the global ARGV.
    @args = @args.dup
    ARGV.clear
  end

  # Get the version of the project, i.e. the raw contents of the VERSION
  # file located two directories above this file.
  def version
    File.read(File.join(File.dirname(__FILE__), '..', '..', 'VERSION'))
  end

  # This method should be overridden by subclasses in order to specify,
  # using trollop, the supported command line options
  def prepare_options(options)
  end

  # Examine the validity of the provided options in the context of the
  # executed command. Subclasses can also call super to also invoke the checks
  # provided by this class.
  #
  # Aborts through Trollop::die when no usable configuration file is found.
  def validate
    if options[:config].nil?
      # process_options gives :config a default value, so this branch is only
      # reached when a subclass produces options without it.
      unless file_exists?("config.yaml") || file_exists?("/etc/ghtorrent/config.yaml")
        Trollop::die "No config file in default locations (., /etc/ghtorrent); " \
                     "you need to specify the --config parameter. Read the " \
                     "documentation on how to create a config.yaml file."
      end
    else
      Trollop::die "Cannot find file #{options[:config]}" unless file_exists?(options[:config])
    end
  end

  # Name of the command that is currently being executed.
  def command_name
    File.basename($0)
  end

  # The actual command code. Does nothing by default.
  def go
  end

  private

  # Returns true if +file+ names an existing filesystem entry, false
  # otherwise. Values that cannot be used as a path (e.g. nil) yield false
  # rather than an exception.
  def file_exists?(file)
    File.exist?(file)
  rescue TypeError, ArgumentError
    false
  end

end
134
+
135
+ end
136
+ # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
@@ -0,0 +1,396 @@
1
+ # Copyright 2012 Georgios Gousios <gousiosg@gmail.com>
2
+ #
3
+ # Redistribution and use in source and binary forms, with or
4
+ # without modification, are permitted provided that the following
5
+ # conditions are met:
6
+ #
7
+ # 1. Redistributions of source code must retain the above
8
+ # copyright notice, this list of conditions and the following
9
+ # disclaimer.
10
+ #
11
+ # 2. Redistributions in binary form must reproduce the above
12
+ # copyright notice, this list of conditions and the following
13
+ # disclaimer in the documentation and/or other materials
14
+ # provided with the distribution.
15
+ #
16
+ # THIS SOFTWARE IS PROVIDED BY BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17
+ # AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18
+ # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
20
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
23
+ # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
24
+ # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25
+ # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26
+ # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27
+ # POSSIBILITY OF SUCH DAMAGE.
28
+
29
+ require 'sequel'
30
+
31
+ module GHTorrent
32
+ class Mirror
33
+
34
+ include GHTorrent::Logging
35
+ include GHTorrent::Settings
36
+ include GHTorrent::Retriever
37
+ include GHTorrent::Persister
38
+
39
+ attr_reader :settings, :persister
40
+
##
# Set up a mirror instance from a YAML configuration file.
#
# ==Parameters:
# configuration::
#   Path of the YAML file to load the settings from
#
# The parsed settings are handed to super, the :uniq_id setting is cached
# in @ext_uniq, a logger writing to STDOUT is created, the persister is
# obtained through connect(:mongo, ...) and finally the SQL database is
# opened with get_db.
def initialize(configuration)
  @settings = YAML.load_file(configuration)
  super(@settings)

  @ext_uniq  = config(:uniq_id)
  @logger    = Logger.new(STDOUT)
  @persister = connect(:mongo, @settings)
  get_db
end
50
+
# db related functions

##
# Connect to the SQL database named by the :sql_url setting and remember
# the connection in @db. When the database contains no tables, the
# migrations found in the 'migrations' directory next to this file are
# applied first.
#
# == Returns:
# The Sequel database connection
def get_db
  @db = Sequel.connect(config(:sql_url))
  return @db unless @db.tables.empty?

  migrations = File.join(File.dirname(__FILE__), 'migrations')
  puts "Database empty, running migrations from #{migrations}"
  Sequel.extension :migration
  Sequel::Migrator.apply(@db, migrations)
  @db
end
64
+
##
# Ensure that a commit exists in the metadata database. If it is not stored
# yet, the repository is ensured, the commit is retrieved and a row is
# inserted in the commits table, all within one database transaction.
#
# ==Parameters:
# user::
#   The user the repository is looked up by
# repo::
#   The repo name
# sha::
#   The commit identifier; must consist of exactly 40 lowercase hex digits
#
# == Returns:
# nil when +sha+ is not a valid commit identifier (the commit is ignored
# and an error is logged). Otherwise the result of the final log call;
# callers should not rely on it.
def get_commit(user, repo, sha)

  # Anchor on the whole string: a partial match (junk prefix, trailing
  # newline) must not be accepted as a commit identifier.
  unless sha.to_s =~ /\A[a-f0-9]{40}\z/
    error "GHTorrent: Ignoring commit #{sha}"
    return
  end

  commits = @db[:commits]

  if commits.first(:sha => sha).nil?
    @db.transaction(:rollback => :reraise) do
      ensure_repo(user, repo)
      c = retrieve_commit(repo, sha, user)

      author = commit_user(c['author'], c['commit']['author'])
      committer = commit_user(c['committer'], c['commit']['committer'])

      commits.insert(:sha => sha,
                     :author_id => author[:id],
                     :committer_id => committer[:id],
                     :created_at => date(c['commit']['author']['date']),
                     :ext_ref_id => c[@ext_uniq]
      )

      # Retrieval of parent commits is currently disabled:
      #c['parents'].each do |p|
      #  url = p['url'].split(/\//)
      #  get_commit url[4], url[5], url[7]
      #
      #  commit = commits.first(:sha => sha)
      #  parent = commits.first(:sha => url[7])
      #  @db[:commit_parents].insert(:commit_id => commit[:id],
      #                              :parent_id => parent[:id])
      #  @log.info "Added parent #{parent[:sha]} to commit #{sha}"
      #end
    end
    debug "GHTorrent: Transaction committed"
  else
    debug "GHTorrent: Commit #{sha} exists"
  end
end
115
+
##
# Add (or update) an entry for a commit author. This method uses information
# in the JSON object returned by Github to add (or update) a user in the
# metadata database with a full user entry (both Git and Github details).
#
# ==Parameters:
# githubuser::
#   A hash containing the user's Github login. May be nil when the commit
#   email has not been associated with any Github account.
# commituser::
#   A hash containing the Git commit's user name and email
# == Returns:
# The (added/modified) user entry as a Hash.
#
# Raises GHTorrentException when +commituser+ is nil.
def commit_user(githubuser, commituser)

  raise GHTorrentException.new("git user is null") if commituser.nil?

  users = @db[:users]

  name = commituser['name']
  email = commituser['email'] #if is_valid_email(commituser['email'])
  # Github user can be null when the commit email has not been associated
  # with any account in Github.
  login = githubuser['login'] unless githubuser.nil?

  # No Github account: fall back to a lookup by "name<email>"
  return ensure_user("#{name}<#{email}>", true) if login.nil?

  dbuser = users.first(:login => login)
  byemail = users.first(:email => email)

  if dbuser.nil?
    # We do not have the user in the database yet. Add him
    added = ensure_user(login, true)
    if byemail.nil?
      # Complete the new entry with the Git details, where missing
      users.filter(:login => login).update(:name => name) if added[:name].nil?
      users.filter(:login => login).update(:email => email) if added[:email].nil?
    else
      # There is a previous entry for the user, currently identified by
      # email. This means that the user has updated his account and now
      # Github is able to associate his commits with his git credentials.
      # As the previous entry might have already associated records, just
      # delete the new one and update the existing with any extra data.
      users.filter(:login => login).delete
      # User rows are keyed by symbols (see added[:name] above); string
      # keys would silently overwrite the existing entry with nils.
      users.filter(:email => email).update(
          :login => login,
          :company => added[:company],
          :location => added[:location],
          :hireable => added[:hireable],
          :bio => added[:bio],
          :created_at => added[:created_at]
      )
    end
  else
    users.filter(:login => login).update(:name => name) if dbuser[:name].nil?
    users.filter(:login => login).update(:email => email) if dbuser[:email].nil?
  end
  users.first(:login => login)
end
176
+
##
# Ensure that a user exists, or fetch its latest state from Github
# ==Parameters:
# user::
#   The full email address in RFC 822 format ("name<email>")
#   or a login name to lookup the user by
# followers::
#   A boolean value indicating whether to retrieve the user's followers
# == Returns:
# The result of ensure_user_byuname (for login names) or of
# ensure_user_byemail (for "name<email>" values).
#
# Raises GHTorrentException when +user+ is neither a login name nor a
# "name<email>" string.
def ensure_user(user, followers)
  # Github only supports alphanumerics and dashes in its usernames.
  # All other symbols are treated as emails. The pattern is anchored on the
  # whole string so that multi-line input cannot pass as a login name.
  return ensure_user_byuname(user, followers) if user =~ /\A[A-Za-z0-9\-]*\z/

  name, email = user.split("<")
  email = email.split(">")[0] unless email.nil?

  if name.nil? || email.nil?
    raise GHTorrentException.new("Not a valid email address: #{user}")
  end

  ensure_user_byemail(email.strip, name.strip, followers)
end
204
+
##
# Ensure that a user exists, or fetch its latest state from Github
# ==Parameters:
# user::
#   The login name to lookup the user by
# followers::
#   A boolean value indicating whether to also retrieve the user's followers
#   when the user is newly added
#
# == Returns:
# If the user can be retrieved, it is returned as a Hash. Otherwise,
# the result is nil
def ensure_user_byuname(user, followers)
  users = @db[:users]
  usr = users.first(:login => user)

  unless usr.nil?
    debug "GHTorrent: User #{user} exists"
    return usr
  end

  u = retrieve_user_byusername(user)
  # Honour the documented contract instead of failing on u['email'] below
  return nil if u.nil?

  # Blank emails are stored as NULL
  email = u['email'].to_s.strip
  email = nil if email.empty?

  users.insert(:login => u['login'],
               :name => u['name'],
               :company => u['company'],
               :email => email,
               # The Github API field is spelled 'hireable'
               :hireable => boolean(u['hireable']),
               :bio => u['bio'],
               :location => u['location'],
               :created_at => date(u['created_at']),
               :ext_ref_id => u[@ext_uniq])

  info "GHTorrent: New user #{user}"

  # Get the user's followers
  ensure_user_followers(user) if followers

  users.first(:login => user)
end
249
+
##
# Get all followers for a user. Since we do not know when the actual
# follow event took place, we set the created_at field to the timestamp
# of the method call.
#
# ==Parameters:
# [user] The user login to find followers by
# [ts]   The timestamp to record for new follow entries (defaults to now)
#
# == Returns:
# The list returned by retrieve_new_user_followers
def ensure_user_followers(user, ts = Time.now)

  new_followers = retrieve_new_user_followers(user)
  # Nothing to record; the followed user is not touched in that case
  return new_followers if new_followers.empty?

  users = @db[:users]
  follows = @db[:followers]

  # The followed user is the same for every entry: resolve it once
  ensure_user(user, false)
  userid = users.select(:id).first(:login => user)[:id]

  new_followers.each do |f|
    follower = f['login']
    ensure_user(follower, false)
    followerid = users.select(:id).first(:login => follower)[:id]

    if follows.first(:user_id => userid, :follower_id => followerid).nil?
      follows.insert(:user_id => userid,
                     :follower_id => followerid,
                     :created_at => ts,
                     :ext_ref_id => f[@ext_uniq]
      )
      info "GHTorrent: User #{follower} follows #{user}"
    else
      info "User #{follower} already follows #{user}"
    end
  end
end
281
+
##
# Try to retrieve a user by email. Search the DB first, fall back to
# Github API v2 if unsuccessful.
#
# ==Parameters:
# email::
#   The email to lookup the user by
# name::
#   The user's name; stored when the user cannot be found through the API
# followers::
#   A boolean value indicating whether to also retrieve the followers of a
#   user newly found through the API
#
# == Returns:
# The user entry as a Hash. When the API lookup is unsuccessful, a
# placeholder entry with a random 8 letter login is created and returned.
def ensure_user_byemail(email, name, followers)
  users = @db[:users]
  usr = users.first(:email => email)

  unless usr.nil?
    debug "GHTorrent: User with email #{email} exists"
    return usr
  end

  u = retrieve_user_byemail(email, name)

  if u.nil? || u['user'].nil? || u['user']['login'].nil?
    debug "GHTorrent: Cannot find #{email} through API v2 query"
    users.insert(:email => email,
                 :name => name,
                 # Random login made of 8 letters in A..Z (65 is 'A')
                 :login => (0...8).map { (65 + rand(26)).chr }.join,
                 :created_at => Time.now,
                 :ext_ref_id => ""
    )
    users.first(:email => email)
  else
    login = u['user']['login']
    users.insert(:login => login,
                 :name => u['user']['name'],
                 :company => u['user']['company'],
                 :email => u['user']['email'],
                 :hireable => nil,
                 :bio => nil,
                 :location => u['user']['location'],
                 :created_at => date(u['user']['created_at']),
                 :ext_ref_id => u[@ext_uniq])
    debug "GHTorrent: Found #{email} through API v2 query"
    ensure_user_followers(login) if followers
    # The stored email comes from the API answer and may differ from the
    # one searched for; fall back to the login in that case.
    users.first(:email => email) || users.first(:login => login)
  end
end
329
+
##
# Ensure that a repo exists, or fetch its latest state from Github
#
# ==Parameters:
# [user] The email or login name to which this repo belongs
# [repo] The repo name
#
# == Returns:
# The repo entry as a Hash
def ensure_repo(user, repo)

  owner = ensure_user(user, false)
  repos = @db[:projects]
  # Repo names are only unique per owner, so the owner is part of the key
  key = {:name => repo, :owner_id => owner[:id]}
  currepo = repos.first(key)

  if currepo.nil?
    r = retrieve_repo(user, repo)
    repos.insert(:url => r['url'],
                 :owner_id => owner[:id],
                 :name => r['name'],
                 :description => r['description'],
                 :language => r['language'],
                 :created_at => date(r['created_at']),
                 :ext_ref_id => r[@ext_uniq])

    info "GHTorrent: New repo #{repo}"
    repos.first(key)
  else
    debug "GHTorrent: Repo #{repo} exists"
    currepo
  end
end
362
+
363
+ private
364
+
##
# Convert a boolean-like value to its SQL representation.
#
# ==Parameters:
# arg::
#   true/false, the strings 'true'/'false', or nil
#
# == Returns:
# 1 for true values, 0 for false values and nil; nil for anything else
def boolean(arg)
  case arg
    when true, 'true'
      1
    when false, 'false', nil
      0
  end
end
377
+
##
# Convert a date string to seconds since the Unix epoch.
#
# Dates returned by Github are formatted as:
# - yyyy-mm-ddThh:mm:ssZ
# - yyyy/mm/dd hh:mm:ss {+/-}hhmm
#
# ==Parameters:
# arg::
#   The date string to parse
#
# == Returns:
# The parsed time as an Integer
def date(arg)
  parsed = Time.parse(arg)
  parsed.to_i
end
384
+
##
# Check whether a string looks like an email address.
#
# ==Parameters:
# email::
#   The string to check
#
# == Returns:
# 0 (truthy) when the whole string is an email address, nil otherwise
def is_valid_email(email)
  # \A and \z anchor on the whole string; ^ and $ would accept any
  # multi-line value that merely contains an address on one of its lines.
  email =~ /\A[a-zA-Z][\w\.-]*[a-zA-Z0-9]@[a-zA-Z0-9][\w\.-]*[a-zA-Z0-9]\.[a-zA-Z][a-zA-Z\.]*[a-zA-Z]\z/
end
388
+ end
# Base exception for all GHTorrent exceptions. Derives from StandardError
# so that a plain `rescue` / `rescue => e` clause catches it; code that
# rescues Exception keeps working unchanged.
class GHTorrentException < StandardError

end
393
+
394
+ end
395
+
396
+ # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :