ghtorrent 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +132 -0
- data/Rakefile +20 -0
- data/bin/ght-data-retrieval +119 -0
- data/bin/ght-load +242 -0
- data/bin/ght-mirror-events +154 -0
- data/bin/ght-periodic-dump +92 -0
- data/bin/ght-rm-dupl +124 -0
- data/bin/ght-torrent-index +180 -0
- data/lib/ghtorrent.rb +22 -0
- data/lib/ghtorrent/adapters/base_adapter.rb +91 -0
- data/lib/ghtorrent/adapters/mongo_persister.rb +126 -0
- data/lib/ghtorrent/adapters/noop_persister.rb +58 -0
- data/lib/ghtorrent/api_client.rb +106 -0
- data/lib/ghtorrent/call_stack.rb +119 -0
- data/lib/ghtorrent/command.rb +136 -0
- data/lib/ghtorrent/ghtorrent.rb +396 -0
- data/lib/ghtorrent/logging.rb +69 -0
- data/lib/ghtorrent/migrations/001_init_schema.rb +60 -0
- data/lib/ghtorrent/migrations/002_add_followers_created_at.rb +15 -0
- data/lib/ghtorrent/migrations/003_add_external_ref_ids.rb +40 -0
- data/lib/ghtorrent/persister.rb +48 -0
- data/lib/ghtorrent/retriever.rb +148 -0
- data/lib/ghtorrent/settings.rb +63 -0
- data/lib/ghtorrent/utils.rb +58 -0
- data/test/callstack_test.rb +67 -0
- metadata +181 -0
@@ -0,0 +1,69 @@
|
|
1
|
+
# Copyright 2012 Georgios Gousios <gousiosg@gmail.com>
|
2
|
+
#
|
3
|
+
# Redistribution and use in source and binary forms, with or
|
4
|
+
# without modification, are permitted provided that the following
|
5
|
+
# conditions are met:
|
6
|
+
#
|
7
|
+
# 1. Redistributions of source code must retain the above
|
8
|
+
# copyright notice, this list of conditions and the following
|
9
|
+
# disclaimer.
|
10
|
+
#
|
11
|
+
# 2. Redistributions in binary form must reproduce the above
|
12
|
+
# copyright notice, this list of conditions and the following
|
13
|
+
# disclaimer in the documentation and/or other materials
|
14
|
+
# provided with the distribution.
|
15
|
+
#
|
16
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
17
|
+
# AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
18
|
+
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
19
|
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
|
20
|
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
21
|
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
22
|
+
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
23
|
+
# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
24
|
+
# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
25
|
+
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
26
|
+
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
27
|
+
# POSSIBILITY OF SUCH DAMAGE.
|
28
|
+
|
29
|
+
require 'logger'
|
30
|
+
|
31
|
+
module GHTorrent
  # Mixin that forwards messages to the logger stored in the including
  # object's @logger instance variable. While @logger is unset (nil or
  # false) every call is a silent no-op.
  #
  # Note: the +warn+ defined here takes precedence over Kernel#warn inside
  # any class that includes this module.
  module Logging

    DEBUG_LEVEL = defined?(Logger) ? Logger::DEBUG : 0

    # Log +msg+ at warning level.
    def warn(msg)
      log(:warn, msg)
    end

    # Log +msg+ at info level.
    def info(msg)
      log(:info, msg)
    end

    # Log +msg+ at debug level.
    def debug(msg)
      log(:debug, msg)
    end

    private

    # Send +msg+ to the @logger method named by +level+ (:fatal, :error,
    # :warn, :info or :debug). Any other +level+ is logged as debug.
    # Returns nil without logging when @logger is not set.
    def log(level, msg)
      return unless @logger

      severity = [:fatal, :error, :warn, :info].include?(level) ? level : :debug
      @logger.public_send(severity, msg)
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'sequel'
|
2
|
+
|
3
|
+
# Initial relational schema: users, projects, commits, commit_parents and
# followers.
Sequel.migration do
  up do
    puts("Creating table users")
    create_table :users do
      primary_key :id
      String :login, :unique => true, :null => false
      String :name
      String :company, :null => true
      String :location, :null => true
      String :email, :null => true, :unique => true
      TrueClass :hireable, :null => true
      String :bio, :null => true
      Time :created_at, :null => false
    end

    puts("Creating table projects")
    create_table :projects do
      primary_key :id
      String :url
      foreign_key :owner_id, :users
      String :name, :null => false
      String :description
      String :language
      Time :created_at, :null => false
    end

    puts("Creating table commits")
    create_table :commits do
      primary_key :id
      String :sha, :size => 40, :unique => true
      foreign_key :author_id, :users
      foreign_key :committer_id, :users
      Time :created_at, :null => false
    end

    puts("Creating table commit_parents")
    create_table :commit_parents do
      foreign_key :commit_id, :commits, :null => false
      foreign_key :parent_id, :commits, :null => false
      primary_key [:commit_id, :parent_id]
    end

    puts("Creating table followers")
    create_table :followers do
      foreign_key :user_id, :users, :null => false
      foreign_key :follower_id, :users, :null => false
      primary_key [:user_id, :follower_id]
    end
  end

  down do
    # Drop in reverse dependency order: a table can only be dropped once no
    # remaining table holds a foreign key to it. followers, commits and
    # projects reference users; commit_parents references commits.
    drop_table :followers
    drop_table :commit_parents
    drop_table :commits
    drop_table :projects
    drop_table :users
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'sequel'
|
2
|
+
|
3
|
+
# Adds an ext_ref_id column (24 characters, NOT NULL, default "0") to the
# users, projects, commits and followers tables; the down step removes it.
Sequel.migration do
  # Tables touched by this migration, in the order they are altered.
  tables = [:users, :projects, :commits, :followers]

  up do
    tables.each do |table|
      alter_table table do
        add_column :ext_ref_id, String, :null => false, :size => 24, :default => "0"
      end
    end
  end

  down do
    tables.each do |table|
      alter_table table do
        drop_column :ext_ref_id
      end
    end
  end
end
|
40
|
+
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# Copyright 2012 Georgios Gousios <gousiosg@gmail.com>
|
2
|
+
#
|
3
|
+
# Redistribution and use in source and binary forms, with or
|
4
|
+
# without modification, are permitted provided that the following
|
5
|
+
# conditions are met:
|
6
|
+
#
|
7
|
+
# 1. Redistributions of source code must retain the above
|
8
|
+
# copyright notice, this list of conditions and the following
|
9
|
+
# disclaimer.
|
10
|
+
#
|
11
|
+
# 2. Redistributions in binary form must reproduce the above
|
12
|
+
# copyright notice, this list of conditions and the following
|
13
|
+
# disclaimer in the documentation and/or other materials
|
14
|
+
# provided with the distribution.
|
15
|
+
#
|
16
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
17
|
+
# AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
18
|
+
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
19
|
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
|
20
|
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
21
|
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
22
|
+
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
23
|
+
# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
24
|
+
# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
25
|
+
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
26
|
+
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
27
|
+
# POSSIBILITY OF SUCH DAMAGE.
|
28
|
+
|
29
|
+
module GHTorrent
|
30
|
+
|
31
|
+
#
|
32
|
+
module Persister
|
33
|
+
|
34
|
+
# Persistence back-ends, keyed by adapter name.
ADAPTERS = Hash[
  :mongo, GHTorrent::MongoPersister,
  :noop,  GHTorrent::NoopPersister
]
|
38
|
+
|
39
|
+
# Factory method for retrieving persistence connections.
#
# adapter  - key into ADAPTERS selecting the persister class.
# settings - fully parsed YAML document, passed on unchanged to the
#            adapter's constructor.
#
# Returns a new instance of the selected adapter class.
# Raises ArgumentError when +adapter+ is not a key of ADAPTERS (previously
# this surfaced as a NoMethodError on nil).
def connect(adapter, settings)
  driver = ADAPTERS.fetch(adapter) do
    raise ArgumentError,
          "Unknown persister adapter #{adapter.inspect}; " \
          "available adapters: #{ADAPTERS.keys.inspect}"
  end
  driver.new(settings)
end
|
46
|
+
|
47
|
+
end
|
48
|
+
end
|
@@ -0,0 +1,148 @@
|
|
1
|
+
# Copyright 2012 Georgios Gousios <gousiosg@gmail.com>
|
2
|
+
#
|
3
|
+
# Redistribution and use in source and binary forms, with or
|
4
|
+
# without modification, are permitted provided that the following
|
5
|
+
# conditions are met:
|
6
|
+
#
|
7
|
+
# 1. Redistributions of source code must retain the above
|
8
|
+
# copyright notice, this list of conditions and the following
|
9
|
+
# disclaimer.
|
10
|
+
#
|
11
|
+
# 2. Redistributions in binary form must reproduce the above
|
12
|
+
# copyright notice, this list of conditions and the following
|
13
|
+
# disclaimer in the documentation and/or other materials
|
14
|
+
# provided with the distribution.
|
15
|
+
#
|
16
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
17
|
+
# AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
18
|
+
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
19
|
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
|
20
|
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
21
|
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
22
|
+
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
23
|
+
# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
24
|
+
# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
25
|
+
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
26
|
+
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
27
|
+
# POSSIBILITY OF SUCH DAMAGE.
|
28
|
+
|
29
|
+
module GHTorrent
|
30
|
+
module Retriever
|
31
|
+
|
32
|
+
include GHTorrent::APIClient
|
33
|
+
include GHTorrent::Settings
|
34
|
+
|
35
|
+
# Forwards +settings+ up the ancestor chain, then remembers it and caches
# the value configured under :uniq_id in @uniq.
def initialize(settings)
  super
  # @settings is assigned before config is called; keep this order.
  @settings = settings
  @uniq     = config(:uniq_id)
end
|
40
|
+
|
41
|
+
# Fetch a user by login, preferring the copy already held by the persister.
#
# The :users collection is searched first. Only when nothing is stored is
# the API queried; the response is then stored and the value returned by
# the persister is recorded in the document under the @uniq key.
#
# user - login of the user to retrieve.
#
# Returns the stored or freshly retrieved user document.
# Raises GHTorrentException when the API request returns nil.
def retrieve_user_byusername(user)
  stored_user = @persister.find(:users, {'login' => user})

  unless stored_user.empty?
    debug "Retriever: Already got user #{user}"
    return stored_user.first
  end

  u = api_request(ghurl("users/#{user}"))

  # Must be `raise`: `throw` is catch/throw control flow and would surface
  # as an uncaught-throw error instead of a rescuable GHTorrentException.
  raise GHTorrentException.new("Cannot find user #{user}") if u.nil?

  u[@uniq] = @persister.store(:users, u)
  info "Retriever: New user #{user}"
  u
end
|
60
|
+
|
61
|
+
# Look a user up by email through the Github API v2 user search
# (http://develop.github.com/p/users.html). The information is optional,
# so the request may not return any data.
#
# email - address appended to the v2 "user/email/" path.
# name  - accepted but not used by this method.
#
# Returns whatever api_request returns for the constructed URL.
def retrieve_user_byemail(email, name)
  api_request(ghurl_v2("user/email/#{email}"))
end
|
68
|
+
|
69
|
+
# Fetch the followers of +user+ from the API and store those not yet held
# in the :followers collection.
#
# Each follower returned by the API is tagged with 'follows' => user and
# compared, by 'follows' and 'login', against the entries that were stored
# before the request was made.
#
# user - login whose followers are retrieved.
#
# Returns the result of querying :followers for +user+ after the update.
def retrieve_new_user_followers(user)
  known = @persister.find(:followers, {'follows' => user})

  paged_api_request(ghurl("users/#{user}/followers")).each do |follower|
    follower['follows'] = user
    login = follower['login']

    already_stored = known.any? do |f|
      f['follows'] == user && f['login'] == login
    end

    if already_stored
      debug "Retriever: Follower #{user} -> #{login} exists"
    else
      @persister.store(:followers, follower)
      info "Retriever: Added follower #{user} -> #{login}"
    end
  end

  @persister.find(:followers, {'follows' => user})
end
|
90
|
+
|
91
|
+
# Fetch a commit by SHA, preferring the copy already held by the persister.
#
# The :commits collection is searched by 'sha' first. Only when nothing is
# stored is the API queried; the response is then stored and the value
# returned by the persister is recorded in the document under the @uniq key.
#
# repo - repository name.
# sha  - commit SHA (converted to a String for the lookup).
# user - login of the repository owner.
#
# Returns the stored or freshly retrieved commit document.
# Raises GHTorrentException when the API request returns nil.
def retrieve_commit(repo, sha, user)
  commit = @persister.find(:commits, {'sha' => sha.to_s})

  unless commit.empty?
    debug "Retriever: Already got commit #{repo} -> #{sha}"
    return commit.first
  end

  c = api_request(ghurl("repos/#{user}/#{repo}/commits/#{sha}"))

  # Must be `raise`: `throw` is catch/throw control flow and would surface
  # as an uncaught-throw error instead of a rescuable GHTorrentException.
  raise GHTorrentException.new("Cannot find commit #{user}/#{repo}/#{sha}") if c.nil?

  unq = @persister.store(:commits, c)
  info "Retriever: New commit #{repo} -> #{sha}"
  c[@uniq] = unq
  c
end
|
111
|
+
|
112
|
+
# Fetch a repository, preferring the copy already held by the persister.
#
# The :repos collection is searched by 'owner.login' and 'name' first. Only
# when nothing is stored is the API queried; the response is then stored
# and the value returned by the persister is recorded in the document under
# the @uniq key.
#
# user - login of the repository owner.
# repo - repository name.
#
# Returns the stored or freshly retrieved repository document.
# Raises GHTorrentException when the API request returns nil.
def retrieve_repo(user, repo)
  stored_repo = @persister.find(:repos, {'owner.login' => user,
                                         'name' => repo })

  unless stored_repo.empty?
    debug "Retriever: Already got repo #{user} -> #{repo}"
    return stored_repo.first
  end

  r = api_request(ghurl("repos/#{user}/#{repo}"))

  # Must be `raise`: `throw` is catch/throw control flow and would surface
  # as an uncaught-throw error instead of a rescuable GHTorrentException.
  raise GHTorrentException.new("Cannot find repo #{user}/#{repo}") if r.nil?

  unq = @persister.store(:repos, r)
  info "Retriever: New repo #{user} -> #{repo}"
  r[@uniq] = unq
  r
end
|
132
|
+
|
133
|
+
# Request the current Github events from the fixed public endpoint.
#
# Returns whatever api_request returns for that URL.
def get_events
  events_url = "https://api.github.com/events"
  api_request(events_url)
end
|
137
|
+
|
138
|
+
private
|
139
|
+
|
140
|
+
# Build a request URL by appending +path+ to the value configured under
# :mirror_urlbase.
def ghurl(path)
  base = config(:mirror_urlbase)
  base + path
end
|
143
|
+
|
144
|
+
# Build a request URL by appending +path+ to the value configured under
# :mirror_urlbase_v2.
def ghurl_v2(path)
  base = config(:mirror_urlbase_v2)
  base + path
end
|
147
|
+
end
|
148
|
+
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# Copyright 2012 Georgios Gousios <gousiosg@gmail.com>
|
2
|
+
#
|
3
|
+
# Redistribution and use in source and binary forms, with or
|
4
|
+
# without modification, are permitted provided that the following
|
5
|
+
# conditions are met:
|
6
|
+
#
|
7
|
+
# 1. Redistributions of source code must retain the above
|
8
|
+
# copyright notice, this list of conditions and the following
|
9
|
+
# disclaimer.
|
10
|
+
#
|
11
|
+
# 2. Redistributions in binary form must reproduce the above
|
12
|
+
# copyright notice, this list of conditions and the following
|
13
|
+
# disclaimer in the documentation and/or other materials
|
14
|
+
# provided with the distribution.
|
15
|
+
#
|
16
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
17
|
+
# AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
18
|
+
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
19
|
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
|
20
|
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
21
|
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
22
|
+
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
23
|
+
# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
24
|
+
# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
25
|
+
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
26
|
+
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
27
|
+
# POSSIBILITY OF SUCH DAMAGE.
|
28
|
+
|
29
|
+
require 'yaml'
|
30
|
+
|
31
|
+
module GHTorrent
|
32
|
+
module Settings
|
33
|
+
|
34
|
+
include GHTorrent::Utils
|
35
|
+
|
36
|
+
# Maps each configuration key symbol to the dotted path string passed to
# read_value. Every symbol is its path with the dots replaced by
# underscores, so the table is derived from the list of paths. The hash is
# left unfrozen because +merge+ adds entries to it.
CONFIGKEYS = Hash[
  %w(
    amqp.host
    amqp.port
    amqp.username
    amqp.password
    amqp.exchange
    sql.url
    mirror.urlbase
    mirror.urlbase_v2
    mirror.reqrate
    mirror.pollevery
    mirror.persister
    uniq_id
  ).map { |path| [path.tr('.', '_').to_sym, path] }
]
|
53
|
+
|
54
|
+
# Look up the setting registered under +key+: the key is translated to its
# path through CONFIGKEYS and that path is read from +settings+ with
# read_value. An unregistered +key+ is passed on as a nil path.
def config(key)
  path = CONFIGKEYS[key]
  read_value(settings, path)
end
|
57
|
+
|
58
|
+
# Register additional configuration keys by writing every pair of
# +more_keys+ into CONFIGKEYS, overwriting entries that already exist.
# Because CONFIGKEYS is a constant, the additions are visible to every
# user of it.
#
# Returns +more_keys+.
def merge(more_keys)
  more_keys.each do |name, path|
    CONFIGKEYS.store(name, path)
  end
end
|
61
|
+
|
62
|
+
end
|
63
|
+
end
|