ghtorrent 0.6 → 0.7.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +12 -0
- data/Gemfile +1 -11
- data/Gemfile.lock +27 -29
- data/README.md +10 -14
- data/bin/ght-mirror-events +0 -0
- data/bin/ght-process-event +0 -0
- data/bin/ght-retrieve-repo +0 -0
- data/bin/ght-retrieve-user +6 -0
- data/lib/ghtorrent.rb +1 -0
- data/lib/ghtorrent/adapters/base_adapter.rb +6 -0
- data/lib/ghtorrent/adapters/mongo_persister.rb +8 -0
- data/lib/ghtorrent/api_client.rb +8 -29
- data/lib/ghtorrent/command.rb +1 -3
- data/lib/ghtorrent/commands/ght_data_retrieval.rb +5 -10
- data/lib/ghtorrent/commands/ght_get_more_commits.rb +28 -17
- data/lib/ghtorrent/commands/ght_load.rb +2 -2
- data/lib/ghtorrent/commands/ght_retrieve_repo.rb +45 -15
- data/lib/ghtorrent/commands/ght_retrieve_user.rb +72 -0
- data/lib/ghtorrent/ghtorrent.rb +288 -209
- data/lib/ghtorrent/migrations/012_add_forks_to_projects.rb +31 -0
- data/lib/ghtorrent/migrations/013_add_merged_to_pullreqs.rb +39 -0
- data/lib/ghtorrent/migrations/014_add_deleted_to_projects.rb +21 -0
- data/lib/ghtorrent/retriever.rb +90 -25
- data/lib/ghtorrent/settings.rb +44 -6
- data/lib/version.rb +2 -2
- metadata +52 -84
- data/bin/ght-periodic-dump +0 -130
- data/bin/ght-torrent-index +0 -150
- data/test/callstack_test.rb +0 -67
data/CHANGELOG
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
= Version 0.7
|
2
|
+
* Full support for issues (comments, labels etc) and pull requests
|
3
|
+
* Cleaned up retrieval of pull request commits
|
4
|
+
* Cleaned up association of commits with repositories.
|
5
|
+
* Removed the forks table. Forks are now tracked by the forked_from field in projects
|
6
|
+
* Use Github's HTTP headers for request throttling
|
7
|
+
* Support for setting user agent header as per Github API requirements
|
8
|
+
* Support for marking projects as deleted (run fixes/update_deleted.rb)
|
9
|
+
* New tool (ght-retrieve-user) to retrieve all data for a single user
|
10
|
+
* Support for running without a config.yaml file
|
11
|
+
* Technical report in doc/
|
12
|
+
|
1
13
|
= Version 0.6
|
2
14
|
|
3
15
|
* Support retrieval of issues, issue events and issue history
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,40 +1,38 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
ghtorrent (0.6)
|
5
|
+
amqp (~> 1.0.0)
|
6
|
+
bson_ext (~> 1.8.0)
|
7
|
+
daemons (~> 1.1.0)
|
8
|
+
mongo (~> 1.8.0)
|
9
|
+
sequel (~> 3.47)
|
10
|
+
trollop (~> 2.0.0)
|
11
|
+
|
1
12
|
GEM
|
2
13
|
remote: https://rubygems.org/
|
3
14
|
specs:
|
4
|
-
amq-client (0.
|
5
|
-
amq-protocol (>=
|
15
|
+
amq-client (1.0.2)
|
16
|
+
amq-protocol (>= 1.2.0)
|
6
17
|
eventmachine
|
7
|
-
amq-protocol (
|
8
|
-
amqp (0.
|
9
|
-
amq-client (~> 0.
|
10
|
-
amq-protocol (>=
|
18
|
+
amq-protocol (1.4.0)
|
19
|
+
amqp (1.0.2)
|
20
|
+
amq-client (~> 1.0.2)
|
21
|
+
amq-protocol (>= 1.3.0)
|
11
22
|
eventmachine
|
12
|
-
bson (1.
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
mongo (1.6.4)
|
22
|
-
bson (~> 1.6.4)
|
23
|
-
sequel (3.37.0)
|
24
|
-
sqlite3 (1.3.6)
|
25
|
-
trollop (1.16.2)
|
23
|
+
bson (1.8.5)
|
24
|
+
bson_ext (1.8.5)
|
25
|
+
bson (~> 1.8.5)
|
26
|
+
daemons (1.1.9)
|
27
|
+
eventmachine (1.0.3)
|
28
|
+
mongo (1.8.5)
|
29
|
+
bson (~> 1.8.5)
|
30
|
+
sequel (3.47.0)
|
31
|
+
trollop (2.0)
|
26
32
|
|
27
33
|
PLATFORMS
|
28
|
-
java
|
29
34
|
ruby
|
30
35
|
|
31
36
|
DEPENDENCIES
|
32
|
-
|
33
|
-
bson_ext
|
34
|
-
daemons
|
37
|
+
ghtorrent!
|
35
38
|
jdbc-mysql
|
36
|
-
json
|
37
|
-
mongo
|
38
|
-
sequel
|
39
|
-
sqlite3
|
40
|
-
trollop
|
data/README.md
CHANGED
@@ -9,22 +9,21 @@ GHTorrent relies on the following software to work:
|
|
9
9
|
|
10
10
|
* MongoDB > 2.0
|
11
11
|
* RabbitMQ >= 2.7
|
12
|
-
*
|
13
|
-
|
12
|
+
* MySQL >= 5.5. GHTorrent is tested mainly with MySQL, but can theoretically be
|
13
|
+
used with any SQL database compatible with [Sequel](http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html). Your mileage may vary.
|
14
14
|
|
15
|
-
GHTorrent is written in Ruby (tested with 1.9
|
16
|
-
it as a Gem do:
|
15
|
+
GHTorrent is written in Ruby (tested with 1.9). To install it as a Gem do:
|
17
16
|
|
18
17
|
<code>
|
19
18
|
sudo gem install ghtorrent
|
20
19
|
</code>
|
21
20
|
|
22
21
|
Depending on which SQL database you want to use, install the appropriate
|
23
|
-
dependency gem. GHTorrent already installs the `
|
24
|
-
install the development package for `
|
22
|
+
dependency gem. GHTorrent already installs the `mysql2` gem (if it fails,
|
23
|
+
install the development package for `libmysql-dev` for your system).
|
25
24
|
|
26
25
|
<code>
|
27
|
-
sudo gem install mysql2 #or postgres
|
26
|
+
sudo gem install mysql2 #or sqlite3-ruby #or postgres
|
28
27
|
</code>
|
29
28
|
|
30
29
|
#### Configuring
|
@@ -95,12 +94,14 @@ and performance reasons. To catch up with Github's event stream, it is
|
|
95
94
|
usually enough to run `ght-mirror-events` on one host. To collect all data
|
96
95
|
pointed by each event, one instance of `ght-data-retrieval` is not enough.
|
97
96
|
Both scripts employ throttling mechanisms to keep API usage within the
|
98
|
-
limits imposed by Github (currently
|
97
|
+
limits imposed by Github (currently 60 reqs/hr/ip). If you want the full
|
98
|
+
5000 reqs/hr/ip, you will have to provide your Github login details
|
99
|
+
in the `config.yaml` file.
|
99
100
|
|
100
101
|
#### Data
|
101
102
|
|
102
103
|
You can find torrents for retrieving data on the
|
103
|
-
[Available Torrents](https://
|
104
|
+
[Available Torrents](https://ghtorrent.org/downloads.html) page. You need two sets of data:
|
104
105
|
|
105
106
|
* Raw events: Github's [event stream](https://api.github.com/events). These
|
106
107
|
are the roots for mirroring operations. The `ght-data-retrieval` crawler starts
|
@@ -108,11 +109,6 @@ from an event and goes deep into the rabbit hole.
|
|
108
109
|
* SQL dumps+Linked data: Data dumps from the SQL database and the corresponding
|
109
110
|
MongoDB entities.
|
110
111
|
|
111
|
-
|
112
|
-
*At the moment, GHTorrent is in the process of redesigning its data storage
|
113
|
-
schema. Consequently, it does not distribute SQL dumps or linked data raw data.
|
114
|
-
The distribution service will come back shortly.*
|
115
|
-
|
116
112
|
#### Reporting bugs
|
117
113
|
|
118
114
|
Please use the [Issue
|
data/bin/ght-mirror-events
CHANGED
File without changes
|
data/bin/ght-process-event
CHANGED
File without changes
|
data/bin/ght-retrieve-repo
CHANGED
File without changes
|
data/lib/ghtorrent.rb
CHANGED
@@ -56,5 +56,6 @@ require 'ghtorrent/commands/ght_get_more_commits'
|
|
56
56
|
require 'ghtorrent/commands/ght_rm_dupl'
|
57
57
|
require 'ghtorrent/commands/ght_load'
|
58
58
|
require 'ghtorrent/commands/ght_retrieve_repo'
|
59
|
+
require 'ghtorrent/commands/ght_retrieve_user'
|
59
60
|
|
60
61
|
# vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
|
@@ -70,6 +70,12 @@ module GHTorrent
|
|
70
70
|
end
|
71
71
|
end
|
72
72
|
|
73
|
+
def del(entity, query = {})
|
74
|
+
unless ENTITIES.include?(entity)
|
75
|
+
raise GHTorrentException.new("Perister: Entity #{entity} not known")
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
73
79
|
# Get a raw connection to the underlying data store. The connection is
|
74
80
|
# implementation dependent.
|
75
81
|
def get_underlying_connection
|
@@ -65,6 +65,12 @@ module GHTorrent
|
|
65
65
|
end
|
66
66
|
end
|
67
67
|
|
68
|
+
def del(entity, query)
|
69
|
+
super
|
70
|
+
raise Exception 'No filter was specifed. Cowardily refusing to remove all entries' if query == {}
|
71
|
+
get_entity(entity).remove(query)
|
72
|
+
end
|
73
|
+
|
68
74
|
def get_underlying_connection
|
69
75
|
mongo
|
70
76
|
end
|
@@ -163,7 +169,9 @@ module GHTorrent
|
|
163
169
|
ensure_index(:users, "login")
|
164
170
|
ensure_index(:commits, "sha")
|
165
171
|
ensure_index(:repos, "name")
|
172
|
+
ensure_index(:repos, "owner.login")
|
166
173
|
ensure_index(:followers, "follows")
|
174
|
+
ensure_index(:followers, "login")
|
167
175
|
ensure_index(:org_members, "org")
|
168
176
|
ensure_index(:commit_comments, "repo")
|
169
177
|
ensure_index(:commit_comments, "user")
|
data/lib/ghtorrent/api_client.rb
CHANGED
@@ -25,7 +25,8 @@ module GHTorrent
|
|
25
25
|
|
26
26
|
# A paged request. Used when the result can expand to more than one
|
27
27
|
# result pages.
|
28
|
-
def paged_api_request(url, pages =
|
28
|
+
def paged_api_request(url, pages = config(:mirror_history_pages_back),
|
29
|
+
cache = true, last = nil)
|
29
30
|
|
30
31
|
url = if not url.include?("per_page")
|
31
32
|
if url.include?("?")
|
@@ -139,24 +140,6 @@ module GHTorrent
|
|
139
140
|
|
140
141
|
# Do the actual request and return the result object
|
141
142
|
def api_request_raw(url, use_cache = false)
|
142
|
-
@num_api_calls ||= 0
|
143
|
-
@ts ||= Time.now.to_i
|
144
|
-
@started_min ||= Time.now.min
|
145
|
-
|
146
|
-
#Rate limiting to avoid error requests
|
147
|
-
if Time.now().tv_sec() - @ts < 60 then
|
148
|
-
if @num_api_calls >= @settings['mirror']['reqrate'].to_i
|
149
|
-
sleep = 60 - (Time.now.to_i - @ts)
|
150
|
-
debug "APIClient: Sleeping for #{sleep}"
|
151
|
-
sleep (sleep)
|
152
|
-
@num_api_calls = 0
|
153
|
-
@ts = Time.now.to_i
|
154
|
-
end
|
155
|
-
else
|
156
|
-
debug "APIClient: Tick, num_calls = #{@num_api_calls}, zeroing"
|
157
|
-
@num_api_calls = 0
|
158
|
-
@ts = Time.now.to_i
|
159
|
-
end
|
160
143
|
|
161
144
|
begin
|
162
145
|
start_time = Time.now
|
@@ -169,25 +152,21 @@ module GHTorrent
|
|
169
152
|
cached
|
170
153
|
else
|
171
154
|
tocache = Cachable.new(do_request(url))
|
172
|
-
@num_api_calls += 1
|
173
155
|
cache_put(url, tocache)
|
174
156
|
tocache
|
175
157
|
end
|
176
158
|
else
|
177
|
-
@num_api_calls += 1
|
178
159
|
do_request(url)
|
179
160
|
end
|
180
161
|
|
181
162
|
total = Time.now.to_ms - start_time.to_ms
|
182
|
-
debug "APIClient: Request: #{url}
|
163
|
+
debug "APIClient: Request: #{url} #{if from_cache then " from cache," else "(#{contents.meta['x-ratelimit-remaining']} remaining)," end} Total: #{total} ms"
|
183
164
|
|
184
165
|
if not from_cache and config(:respect_api_ratelimit) and
|
185
|
-
contents.meta['x-ratelimit-remaining'].to_i <
|
186
|
-
sleep =
|
166
|
+
contents.meta['x-ratelimit-remaining'].to_i < 20
|
167
|
+
sleep = 61 - Time.now.min
|
187
168
|
debug "APIClient: Request limit reached, sleeping for #{sleep} min"
|
188
169
|
sleep(sleep * 60)
|
189
|
-
@started_min = Time.now.min
|
190
|
-
@num_api_calls = 0
|
191
170
|
end
|
192
171
|
|
193
172
|
contents
|
@@ -199,10 +178,10 @@ module GHTorrent
|
|
199
178
|
403, # Forbidden
|
200
179
|
404, # Not found
|
201
180
|
422 then # Unprocessable entity
|
202
|
-
warn "#{url}: #{e.io.status[1]}"
|
181
|
+
warn "APIClient: #{url}: #{e.io.status[1]}"
|
203
182
|
return nil
|
204
183
|
else # Server error or HTTP conditions that Github does not report
|
205
|
-
warn "#{url}"
|
184
|
+
warn "APIClient: #{url}: #{e.io.status[1]}"
|
206
185
|
raise e
|
207
186
|
end
|
208
187
|
end
|
@@ -212,7 +191,7 @@ module GHTorrent
|
|
212
191
|
@attach_ip ||= config(:attach_ip)
|
213
192
|
@username ||= config(:github_username)
|
214
193
|
@passwd ||= config(:github_passwd)
|
215
|
-
@user_agent ||=
|
194
|
+
@user_agent ||= config(:user_agent)
|
216
195
|
|
217
196
|
@open_func ||= if @username.nil?
|
218
197
|
lambda {|url| open(url, 'User-Agent' => @user_agent)}
|
data/lib/ghtorrent/command.rb
CHANGED
@@ -35,8 +35,6 @@ module GHTorrent
|
|
35
35
|
command.process_options
|
36
36
|
command.validate
|
37
37
|
|
38
|
-
puts "GHTorrent version: #{GHTorrent::VERSION}"
|
39
|
-
|
40
38
|
command.settings = YAML::load_file command.options[:config]
|
41
39
|
|
42
40
|
unless command.options[:addr].nil?
|
@@ -165,7 +163,7 @@ Standard options:
|
|
165
163
|
end
|
166
164
|
|
167
165
|
def override_config(config_file, setting, new_value)
|
168
|
-
|
166
|
+
puts "Overriding configuration #{setting}=#{config(setting)} with cmd line #{new_value}"
|
169
167
|
merge_config_values({setting => new_value})
|
170
168
|
end
|
171
169
|
|
@@ -55,9 +55,8 @@ class GHTDataRetrieval < GHTorrent::Command
|
|
55
55
|
user = data['actor']['login']
|
56
56
|
repo = data['repo']['name'].split(/\//)[1]
|
57
57
|
id = data['payload']['comment']['id']
|
58
|
-
created_at = data['created_at']
|
59
58
|
|
60
|
-
ghtorrent.get_commit_comment(user, repo, id
|
59
|
+
ghtorrent.get_commit_comment(user, repo, id)
|
61
60
|
end
|
62
61
|
|
63
62
|
def PullRequestEvent(data)
|
@@ -74,9 +73,8 @@ class GHTDataRetrieval < GHTorrent::Command
|
|
74
73
|
owner = data['repo']['name'].split(/\//)[0]
|
75
74
|
repo = data['repo']['name'].split(/\//)[1]
|
76
75
|
fork_id = data['payload']['forkee']['id']
|
77
|
-
created_at = data['created_at']
|
78
76
|
|
79
|
-
ghtorrent.get_fork(owner, repo, fork_id
|
77
|
+
ghtorrent.get_fork(owner, repo, fork_id)
|
80
78
|
end
|
81
79
|
|
82
80
|
def PullRequestReviewCommentEvent(data)
|
@@ -84,18 +82,16 @@ class GHTDataRetrieval < GHTorrent::Command
|
|
84
82
|
repo = data['repo']['name'].split(/\//)[1]
|
85
83
|
comment_id = data['payload']['comment']['id']
|
86
84
|
pullreq_id = data['payload']['comment']['_links']['pull_request']['href'].split(/\//)[-1]
|
87
|
-
created_at = data['created_at']
|
88
85
|
|
89
|
-
ghtorrent.get_pullreq_comment(owner, repo, pullreq_id, comment_id
|
86
|
+
ghtorrent.get_pullreq_comment(owner, repo, pullreq_id, comment_id)
|
90
87
|
end
|
91
88
|
|
92
89
|
def IssuesEvent(data)
|
93
90
|
owner = data['repo']['name'].split(/\//)[0]
|
94
91
|
repo = data['repo']['name'].split(/\//)[1]
|
95
92
|
issue_id = data['payload']['issue']['number']
|
96
|
-
created_at = data['created_at']
|
97
93
|
|
98
|
-
ghtorrent.get_issue(owner, repo, issue_id
|
94
|
+
ghtorrent.get_issue(owner, repo, issue_id)
|
99
95
|
end
|
100
96
|
|
101
97
|
def IssueCommentEvent(data)
|
@@ -103,7 +99,6 @@ class GHTDataRetrieval < GHTorrent::Command
|
|
103
99
|
repo = data['repo']['name'].split(/\//)[1]
|
104
100
|
issue_id = data['payload']['issue']['number']
|
105
101
|
comment_id = data['payload']['comment']['id']
|
106
|
-
created_at = data['created_at']
|
107
102
|
|
108
103
|
ghtorrent.get_issue_comment(owner, repo, issue_id, comment_id)
|
109
104
|
end
|
@@ -112,7 +107,7 @@ class GHTDataRetrieval < GHTorrent::Command
|
|
112
107
|
%w(PushEvent WatchEvent FollowEvent MemberEvent
|
113
108
|
CommitCommentEvent PullRequestEvent ForkEvent
|
114
109
|
PullRequestReviewCommentEvent IssuesEvent IssueCommentEvent)
|
115
|
-
#%w(
|
110
|
+
#%w(PullRequestEvent)
|
116
111
|
end
|
117
112
|
|
118
113
|
def prepare_options(options)
|
@@ -23,17 +23,17 @@ Retrieves more commits for the provided repository
|
|
23
23
|
BANNER
|
24
24
|
|
25
25
|
options.opt :num, 'Number of commits to retrieve',
|
26
|
-
:short => 'n', :default =>
|
27
|
-
options.opt :full, 'Retrieve all commits,
|
28
|
-
|
26
|
+
:short => 'n', :default => 1024 * 1024 * 1024, :type => :int
|
27
|
+
options.opt :full, 'Retrieve all commits, starting from the latest available.
|
28
|
+
If not set, will start from latest stored commit',
|
29
|
+
:short => 'f', :default => false, :type => :boolean
|
30
|
+
options.opt :upto, 'Get all commits up to the provided timestamp',
|
31
|
+
:short => 't', :default => 0, :type => :int
|
29
32
|
end
|
30
33
|
|
31
34
|
def validate
|
32
35
|
super
|
33
36
|
Trollop::die "Two arguments are required" unless args[0] && !args[0].empty?
|
34
|
-
|
35
|
-
Trollop::die "-a and -n cannot be defined at the same time" \
|
36
|
-
if not options[:all].nil? and not options[:foo].nil?
|
37
37
|
end
|
38
38
|
|
39
39
|
def logger
|
@@ -68,37 +68,43 @@ Retrieves more commits for the provided repository
|
|
68
68
|
end
|
69
69
|
|
70
70
|
repo = repo_entry[:name]
|
71
|
-
num_pages = if options[:num] == -1 then 1024 * 1024 else options[:n]/30 end
|
72
|
-
num_pages = if options[:full] == -1 then num_pages else 1024 * 1024 end
|
73
|
-
page = 0
|
74
|
-
|
75
71
|
|
76
|
-
head =
|
72
|
+
head = if options[:full] == false
|
77
73
|
@ght.get_db.from(:commits).\
|
78
74
|
where(:commits__project_id => repo_entry[:id]).\
|
79
75
|
order(:created_at).\
|
80
|
-
first
|
81
|
-
select(:sha)
|
76
|
+
first[:sha]
|
82
77
|
else
|
83
78
|
"master"
|
84
79
|
end
|
85
80
|
|
86
81
|
total_commits = 0
|
87
|
-
|
82
|
+
old_head = nil
|
83
|
+
while (true)
|
88
84
|
begin
|
89
85
|
logger.debug("Retrieving more commits for #{user}/#{repo} from head: #{head}")
|
90
86
|
|
91
87
|
commits = retrieve_commits(repo, head, user, 1)
|
92
|
-
|
88
|
+
|
93
89
|
if commits.nil? or commits.empty? or commits.size == 1
|
94
|
-
page = num_pages # To break the loop
|
95
90
|
break
|
96
91
|
end
|
97
92
|
|
98
|
-
total_commits += commits.size
|
99
93
|
head = commits.last['sha']
|
100
94
|
|
101
95
|
commits.map do |c|
|
96
|
+
total_commits += 1
|
97
|
+
|
98
|
+
if options[:num] < total_commits
|
99
|
+
logger.info("Already retrieved #{total_commits} commits. Stopping.")
|
100
|
+
return
|
101
|
+
end
|
102
|
+
|
103
|
+
if Time.parse(c['commit']['author']['date']) < Time.at(options[:upto])
|
104
|
+
logger.info("Commit #{c['sha']} older than #{Time.at(options[:upto])}. Stopping.")
|
105
|
+
return
|
106
|
+
end
|
107
|
+
|
102
108
|
@ght.transaction do
|
103
109
|
@ght.ensure_commit(repo, c['sha'], user)
|
104
110
|
end
|
@@ -106,6 +112,11 @@ Retrieves more commits for the provided repository
|
|
106
112
|
rescue Exception => e
|
107
113
|
logger.warn("Error processing: #{e}")
|
108
114
|
logger.warn(e.backtrace.join("\n"))
|
115
|
+
if old_head == head
|
116
|
+
logger.info("Commit #{c['sha']} older than #{Time.at(options[:upto])}. Stopping.")
|
117
|
+
fail("Cannot retrieve commits from head: #{head}")
|
118
|
+
end
|
119
|
+
old_head = head
|
109
120
|
end
|
110
121
|
end
|
111
122
|
logger.debug("Processed #{total_commits} commits for #{user}/#{repo}")
|