ghtorrent 0.6 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +12 -0
- data/Gemfile +1 -11
- data/Gemfile.lock +27 -29
- data/README.md +10 -14
- data/bin/ght-mirror-events +0 -0
- data/bin/ght-process-event +0 -0
- data/bin/ght-retrieve-repo +0 -0
- data/bin/ght-retrieve-user +6 -0
- data/lib/ghtorrent.rb +1 -0
- data/lib/ghtorrent/adapters/base_adapter.rb +6 -0
- data/lib/ghtorrent/adapters/mongo_persister.rb +8 -0
- data/lib/ghtorrent/api_client.rb +8 -29
- data/lib/ghtorrent/command.rb +1 -3
- data/lib/ghtorrent/commands/ght_data_retrieval.rb +5 -10
- data/lib/ghtorrent/commands/ght_get_more_commits.rb +28 -17
- data/lib/ghtorrent/commands/ght_load.rb +2 -2
- data/lib/ghtorrent/commands/ght_retrieve_repo.rb +45 -15
- data/lib/ghtorrent/commands/ght_retrieve_user.rb +72 -0
- data/lib/ghtorrent/ghtorrent.rb +288 -209
- data/lib/ghtorrent/migrations/012_add_forks_to_projects.rb +31 -0
- data/lib/ghtorrent/migrations/013_add_merged_to_pullreqs.rb +39 -0
- data/lib/ghtorrent/migrations/014_add_deleted_to_projects.rb +21 -0
- data/lib/ghtorrent/retriever.rb +90 -25
- data/lib/ghtorrent/settings.rb +44 -6
- data/lib/version.rb +2 -2
- metadata +52 -84
- data/bin/ght-periodic-dump +0 -130
- data/bin/ght-torrent-index +0 -150
- data/test/callstack_test.rb +0 -67
data/CHANGELOG
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
= Version 0.7
|
2
|
+
* Full support for issues (comments, labels etc) and pull requests
|
3
|
+
* Cleaned up retrieval of pull request commits
|
4
|
+
* Cleaned up association of commits with repositories.
|
5
|
+
* Removed the forks table. Forks are now tracked by the forked_from field in projects
|
6
|
+
* Use Github's HTTP headers for request throttling
|
7
|
+
* Support for setting user agent header as per Github API requirements
|
8
|
+
* Support for marking projects as deleted (run fixes/update_deleted.rb)
|
9
|
+
* New tool (ght-retrieve-user) to retrieve all data for a single user
|
10
|
+
* Support for running without a config.yaml file
|
11
|
+
* Technical report in doc/
|
12
|
+
|
1
13
|
= Version 0.6
|
2
14
|
|
3
15
|
* Support retrieval of issues, issue events and issue history
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,40 +1,38 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
ghtorrent (0.6)
|
5
|
+
amqp (~> 1.0.0)
|
6
|
+
bson_ext (~> 1.8.0)
|
7
|
+
daemons (~> 1.1.0)
|
8
|
+
mongo (~> 1.8.0)
|
9
|
+
sequel (~> 3.47)
|
10
|
+
trollop (~> 2.0.0)
|
11
|
+
|
1
12
|
GEM
|
2
13
|
remote: https://rubygems.org/
|
3
14
|
specs:
|
4
|
-
amq-client (0.
|
5
|
-
amq-protocol (>=
|
15
|
+
amq-client (1.0.2)
|
16
|
+
amq-protocol (>= 1.2.0)
|
6
17
|
eventmachine
|
7
|
-
amq-protocol (
|
8
|
-
amqp (0.
|
9
|
-
amq-client (~> 0.
|
10
|
-
amq-protocol (>=
|
18
|
+
amq-protocol (1.4.0)
|
19
|
+
amqp (1.0.2)
|
20
|
+
amq-client (~> 1.0.2)
|
21
|
+
amq-protocol (>= 1.3.0)
|
11
22
|
eventmachine
|
12
|
-
bson (1.
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
mongo (1.6.4)
|
22
|
-
bson (~> 1.6.4)
|
23
|
-
sequel (3.37.0)
|
24
|
-
sqlite3 (1.3.6)
|
25
|
-
trollop (1.16.2)
|
23
|
+
bson (1.8.5)
|
24
|
+
bson_ext (1.8.5)
|
25
|
+
bson (~> 1.8.5)
|
26
|
+
daemons (1.1.9)
|
27
|
+
eventmachine (1.0.3)
|
28
|
+
mongo (1.8.5)
|
29
|
+
bson (~> 1.8.5)
|
30
|
+
sequel (3.47.0)
|
31
|
+
trollop (2.0)
|
26
32
|
|
27
33
|
PLATFORMS
|
28
|
-
java
|
29
34
|
ruby
|
30
35
|
|
31
36
|
DEPENDENCIES
|
32
|
-
|
33
|
-
bson_ext
|
34
|
-
daemons
|
37
|
+
ghtorrent!
|
35
38
|
jdbc-mysql
|
36
|
-
json
|
37
|
-
mongo
|
38
|
-
sequel
|
39
|
-
sqlite3
|
40
|
-
trollop
|
data/README.md
CHANGED
@@ -9,22 +9,21 @@ GHTorrent relies on the following software to work:
|
|
9
9
|
|
10
10
|
* MongoDB > 2.0
|
11
11
|
* RabbitMQ >= 2.7
|
12
|
-
*
|
13
|
-
|
12
|
+
* MySQL >= 5.5. GHTorrent is tested mainly with MySQL, but can theoretically be
|
13
|
+
used with any SQL database compatible with [Sequel](http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html). Your mileage may vary.
|
14
14
|
|
15
|
-
GHTorrent is written in Ruby (tested with 1.9
|
16
|
-
it as a Gem do:
|
15
|
+
GHTorrent is written in Ruby (tested with 1.9). To install it as a Gem do:
|
17
16
|
|
18
17
|
<code>
|
19
18
|
sudo gem install ghtorrent
|
20
19
|
</code>
|
21
20
|
|
22
21
|
Depending on which SQL database you want to use, install the appropriate
|
23
|
-
dependency gem. GHTorrent already installs the `
|
24
|
-
install the development package for `
|
22
|
+
dependency gem. GHTorrent already installs the `mysql2` gem (if it fails,
|
23
|
+
install the development package for `libmysql-dev` for your system).
|
25
24
|
|
26
25
|
<code>
|
27
|
-
sudo gem install mysql2 #or postgres
|
26
|
+
sudo gem install mysql2 #or sqlite3-ruby #or postgres
|
28
27
|
</code>
|
29
28
|
|
30
29
|
#### Configuring
|
@@ -95,12 +94,14 @@ and performance reasons. To catch up with Github's event stream, it is
|
|
95
94
|
usually enough to run `ght-mirror-events` on one host. To collect all data
|
96
95
|
pointed by each event, one instance of `ght-data-retrieval` is not enough.
|
97
96
|
Both scripts employ throttling mechanisms to keep API usage within the
|
98
|
-
limits imposed by Github (currently
|
97
|
+
limits imposed by Github (currently 60 reqs/hr/ip). If you want the full
|
98
|
+
5000 reqs/hr/ip, you will have to provide your Github login details
|
99
|
+
in the `config.yaml` file.
|
99
100
|
|
100
101
|
#### Data
|
101
102
|
|
102
103
|
You can find torrents for retrieving data on the
|
103
|
-
[Available Torrents](https://
|
104
|
+
[Available Torrents](https://ghtorrent.org/downloads.html) page. You need two sets of data:
|
104
105
|
|
105
106
|
* Raw events: Github's [event stream](https://api.github.com/events). These
|
106
107
|
are the roots for mirroring operations. The `ght-data-retrieval` crawler starts
|
@@ -108,11 +109,6 @@ from an event and goes deep into the rabbit hole.
|
|
108
109
|
* SQL dumps+Linked data: Data dumps from the SQL database and the corresponding
|
109
110
|
MongoDB entities.
|
110
111
|
|
111
|
-
|
112
|
-
*At the moment, GHTorrent is in the process of redesigning its data storage
|
113
|
-
schema. Consequently, it does not distribute SQL dumps or linked data raw data.
|
114
|
-
The distribution service will come back shortly.*
|
115
|
-
|
116
112
|
#### Reporting bugs
|
117
113
|
|
118
114
|
Please use the [Issue
|
data/bin/ght-mirror-events
CHANGED
File without changes
|
data/bin/ght-process-event
CHANGED
File without changes
|
data/bin/ght-retrieve-repo
CHANGED
File without changes
|
data/lib/ghtorrent.rb
CHANGED
@@ -56,5 +56,6 @@ require 'ghtorrent/commands/ght_get_more_commits'
|
|
56
56
|
require 'ghtorrent/commands/ght_rm_dupl'
|
57
57
|
require 'ghtorrent/commands/ght_load'
|
58
58
|
require 'ghtorrent/commands/ght_retrieve_repo'
|
59
|
+
require 'ghtorrent/commands/ght_retrieve_user'
|
59
60
|
|
60
61
|
# vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
|
@@ -70,6 +70,12 @@ module GHTorrent
|
|
70
70
|
end
|
71
71
|
end
|
72
72
|
|
73
|
+
def del(entity, query = {})
|
74
|
+
unless ENTITIES.include?(entity)
|
75
|
+
raise GHTorrentException.new("Perister: Entity #{entity} not known")
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
73
79
|
# Get a raw connection to the underlying data store. The connection is
|
74
80
|
# implementation dependent.
|
75
81
|
def get_underlying_connection
|
@@ -65,6 +65,12 @@ module GHTorrent
|
|
65
65
|
end
|
66
66
|
end
|
67
67
|
|
68
|
+
def del(entity, query)
|
69
|
+
super
|
70
|
+
raise Exception 'No filter was specifed. Cowardily refusing to remove all entries' if query == {}
|
71
|
+
get_entity(entity).remove(query)
|
72
|
+
end
|
73
|
+
|
68
74
|
def get_underlying_connection
|
69
75
|
mongo
|
70
76
|
end
|
@@ -163,7 +169,9 @@ module GHTorrent
|
|
163
169
|
ensure_index(:users, "login")
|
164
170
|
ensure_index(:commits, "sha")
|
165
171
|
ensure_index(:repos, "name")
|
172
|
+
ensure_index(:repos, "owner.login")
|
166
173
|
ensure_index(:followers, "follows")
|
174
|
+
ensure_index(:followers, "login")
|
167
175
|
ensure_index(:org_members, "org")
|
168
176
|
ensure_index(:commit_comments, "repo")
|
169
177
|
ensure_index(:commit_comments, "user")
|
data/lib/ghtorrent/api_client.rb
CHANGED
@@ -25,7 +25,8 @@ module GHTorrent
|
|
25
25
|
|
26
26
|
# A paged request. Used when the result can expand to more than one
|
27
27
|
# result pages.
|
28
|
-
def paged_api_request(url, pages =
|
28
|
+
def paged_api_request(url, pages = config(:mirror_history_pages_back),
|
29
|
+
cache = true, last = nil)
|
29
30
|
|
30
31
|
url = if not url.include?("per_page")
|
31
32
|
if url.include?("?")
|
@@ -139,24 +140,6 @@ module GHTorrent
|
|
139
140
|
|
140
141
|
# Do the actual request and return the result object
|
141
142
|
def api_request_raw(url, use_cache = false)
|
142
|
-
@num_api_calls ||= 0
|
143
|
-
@ts ||= Time.now.to_i
|
144
|
-
@started_min ||= Time.now.min
|
145
|
-
|
146
|
-
#Rate limiting to avoid error requests
|
147
|
-
if Time.now().tv_sec() - @ts < 60 then
|
148
|
-
if @num_api_calls >= @settings['mirror']['reqrate'].to_i
|
149
|
-
sleep = 60 - (Time.now.to_i - @ts)
|
150
|
-
debug "APIClient: Sleeping for #{sleep}"
|
151
|
-
sleep (sleep)
|
152
|
-
@num_api_calls = 0
|
153
|
-
@ts = Time.now.to_i
|
154
|
-
end
|
155
|
-
else
|
156
|
-
debug "APIClient: Tick, num_calls = #{@num_api_calls}, zeroing"
|
157
|
-
@num_api_calls = 0
|
158
|
-
@ts = Time.now.to_i
|
159
|
-
end
|
160
143
|
|
161
144
|
begin
|
162
145
|
start_time = Time.now
|
@@ -169,25 +152,21 @@ module GHTorrent
|
|
169
152
|
cached
|
170
153
|
else
|
171
154
|
tocache = Cachable.new(do_request(url))
|
172
|
-
@num_api_calls += 1
|
173
155
|
cache_put(url, tocache)
|
174
156
|
tocache
|
175
157
|
end
|
176
158
|
else
|
177
|
-
@num_api_calls += 1
|
178
159
|
do_request(url)
|
179
160
|
end
|
180
161
|
|
181
162
|
total = Time.now.to_ms - start_time.to_ms
|
182
|
-
debug "APIClient: Request: #{url}
|
163
|
+
debug "APIClient: Request: #{url} #{if from_cache then " from cache," else "(#{contents.meta['x-ratelimit-remaining']} remaining)," end} Total: #{total} ms"
|
183
164
|
|
184
165
|
if not from_cache and config(:respect_api_ratelimit) and
|
185
|
-
contents.meta['x-ratelimit-remaining'].to_i <
|
186
|
-
sleep =
|
166
|
+
contents.meta['x-ratelimit-remaining'].to_i < 20
|
167
|
+
sleep = 61 - Time.now.min
|
187
168
|
debug "APIClient: Request limit reached, sleeping for #{sleep} min"
|
188
169
|
sleep(sleep * 60)
|
189
|
-
@started_min = Time.now.min
|
190
|
-
@num_api_calls = 0
|
191
170
|
end
|
192
171
|
|
193
172
|
contents
|
@@ -199,10 +178,10 @@ module GHTorrent
|
|
199
178
|
403, # Forbidden
|
200
179
|
404, # Not found
|
201
180
|
422 then # Unprocessable entity
|
202
|
-
warn "#{url}: #{e.io.status[1]}"
|
181
|
+
warn "APIClient: #{url}: #{e.io.status[1]}"
|
203
182
|
return nil
|
204
183
|
else # Server error or HTTP conditions that Github does not report
|
205
|
-
warn "#{url}"
|
184
|
+
warn "APIClient: #{url}: #{e.io.status[1]}"
|
206
185
|
raise e
|
207
186
|
end
|
208
187
|
end
|
@@ -212,7 +191,7 @@ module GHTorrent
|
|
212
191
|
@attach_ip ||= config(:attach_ip)
|
213
192
|
@username ||= config(:github_username)
|
214
193
|
@passwd ||= config(:github_passwd)
|
215
|
-
@user_agent ||=
|
194
|
+
@user_agent ||= config(:user_agent)
|
216
195
|
|
217
196
|
@open_func ||= if @username.nil?
|
218
197
|
lambda {|url| open(url, 'User-Agent' => @user_agent)}
|
data/lib/ghtorrent/command.rb
CHANGED
@@ -35,8 +35,6 @@ module GHTorrent
|
|
35
35
|
command.process_options
|
36
36
|
command.validate
|
37
37
|
|
38
|
-
puts "GHTorrent version: #{GHTorrent::VERSION}"
|
39
|
-
|
40
38
|
command.settings = YAML::load_file command.options[:config]
|
41
39
|
|
42
40
|
unless command.options[:addr].nil?
|
@@ -165,7 +163,7 @@ Standard options:
|
|
165
163
|
end
|
166
164
|
|
167
165
|
def override_config(config_file, setting, new_value)
|
168
|
-
|
166
|
+
puts "Overriding configuration #{setting}=#{config(setting)} with cmd line #{new_value}"
|
169
167
|
merge_config_values({setting => new_value})
|
170
168
|
end
|
171
169
|
|
@@ -55,9 +55,8 @@ class GHTDataRetrieval < GHTorrent::Command
|
|
55
55
|
user = data['actor']['login']
|
56
56
|
repo = data['repo']['name'].split(/\//)[1]
|
57
57
|
id = data['payload']['comment']['id']
|
58
|
-
created_at = data['created_at']
|
59
58
|
|
60
|
-
ghtorrent.get_commit_comment(user, repo, id
|
59
|
+
ghtorrent.get_commit_comment(user, repo, id)
|
61
60
|
end
|
62
61
|
|
63
62
|
def PullRequestEvent(data)
|
@@ -74,9 +73,8 @@ class GHTDataRetrieval < GHTorrent::Command
|
|
74
73
|
owner = data['repo']['name'].split(/\//)[0]
|
75
74
|
repo = data['repo']['name'].split(/\//)[1]
|
76
75
|
fork_id = data['payload']['forkee']['id']
|
77
|
-
created_at = data['created_at']
|
78
76
|
|
79
|
-
ghtorrent.get_fork(owner, repo, fork_id
|
77
|
+
ghtorrent.get_fork(owner, repo, fork_id)
|
80
78
|
end
|
81
79
|
|
82
80
|
def PullRequestReviewCommentEvent(data)
|
@@ -84,18 +82,16 @@ class GHTDataRetrieval < GHTorrent::Command
|
|
84
82
|
repo = data['repo']['name'].split(/\//)[1]
|
85
83
|
comment_id = data['payload']['comment']['id']
|
86
84
|
pullreq_id = data['payload']['comment']['_links']['pull_request']['href'].split(/\//)[-1]
|
87
|
-
created_at = data['created_at']
|
88
85
|
|
89
|
-
ghtorrent.get_pullreq_comment(owner, repo, pullreq_id, comment_id
|
86
|
+
ghtorrent.get_pullreq_comment(owner, repo, pullreq_id, comment_id)
|
90
87
|
end
|
91
88
|
|
92
89
|
def IssuesEvent(data)
|
93
90
|
owner = data['repo']['name'].split(/\//)[0]
|
94
91
|
repo = data['repo']['name'].split(/\//)[1]
|
95
92
|
issue_id = data['payload']['issue']['number']
|
96
|
-
created_at = data['created_at']
|
97
93
|
|
98
|
-
ghtorrent.get_issue(owner, repo, issue_id
|
94
|
+
ghtorrent.get_issue(owner, repo, issue_id)
|
99
95
|
end
|
100
96
|
|
101
97
|
def IssueCommentEvent(data)
|
@@ -103,7 +99,6 @@ class GHTDataRetrieval < GHTorrent::Command
|
|
103
99
|
repo = data['repo']['name'].split(/\//)[1]
|
104
100
|
issue_id = data['payload']['issue']['number']
|
105
101
|
comment_id = data['payload']['comment']['id']
|
106
|
-
created_at = data['created_at']
|
107
102
|
|
108
103
|
ghtorrent.get_issue_comment(owner, repo, issue_id, comment_id)
|
109
104
|
end
|
@@ -112,7 +107,7 @@ class GHTDataRetrieval < GHTorrent::Command
|
|
112
107
|
%w(PushEvent WatchEvent FollowEvent MemberEvent
|
113
108
|
CommitCommentEvent PullRequestEvent ForkEvent
|
114
109
|
PullRequestReviewCommentEvent IssuesEvent IssueCommentEvent)
|
115
|
-
#%w(
|
110
|
+
#%w(PullRequestEvent)
|
116
111
|
end
|
117
112
|
|
118
113
|
def prepare_options(options)
|
@@ -23,17 +23,17 @@ Retrieves more commits for the provided repository
|
|
23
23
|
BANNER
|
24
24
|
|
25
25
|
options.opt :num, 'Number of commits to retrieve',
|
26
|
-
:short => 'n', :default =>
|
27
|
-
options.opt :full, 'Retrieve all commits,
|
28
|
-
|
26
|
+
:short => 'n', :default => 1024 * 1024 * 1024, :type => :int
|
27
|
+
options.opt :full, 'Retrieve all commits, starting from the latest available.
|
28
|
+
If not set, will start from latest stored commit',
|
29
|
+
:short => 'f', :default => false, :type => :boolean
|
30
|
+
options.opt :upto, 'Get all commits up to the provided timestamp',
|
31
|
+
:short => 't', :default => 0, :type => :int
|
29
32
|
end
|
30
33
|
|
31
34
|
def validate
|
32
35
|
super
|
33
36
|
Trollop::die "Two arguments are required" unless args[0] && !args[0].empty?
|
34
|
-
|
35
|
-
Trollop::die "-a and -n cannot be defined at the same time" \
|
36
|
-
if not options[:all].nil? and not options[:foo].nil?
|
37
37
|
end
|
38
38
|
|
39
39
|
def logger
|
@@ -68,37 +68,43 @@ Retrieves more commits for the provided repository
|
|
68
68
|
end
|
69
69
|
|
70
70
|
repo = repo_entry[:name]
|
71
|
-
num_pages = if options[:num] == -1 then 1024 * 1024 else options[:n]/30 end
|
72
|
-
num_pages = if options[:full] == -1 then num_pages else 1024 * 1024 end
|
73
|
-
page = 0
|
74
|
-
|
75
71
|
|
76
|
-
head =
|
72
|
+
head = if options[:full] == false
|
77
73
|
@ght.get_db.from(:commits).\
|
78
74
|
where(:commits__project_id => repo_entry[:id]).\
|
79
75
|
order(:created_at).\
|
80
|
-
first
|
81
|
-
select(:sha)
|
76
|
+
first[:sha]
|
82
77
|
else
|
83
78
|
"master"
|
84
79
|
end
|
85
80
|
|
86
81
|
total_commits = 0
|
87
|
-
|
82
|
+
old_head = nil
|
83
|
+
while (true)
|
88
84
|
begin
|
89
85
|
logger.debug("Retrieving more commits for #{user}/#{repo} from head: #{head}")
|
90
86
|
|
91
87
|
commits = retrieve_commits(repo, head, user, 1)
|
92
|
-
|
88
|
+
|
93
89
|
if commits.nil? or commits.empty? or commits.size == 1
|
94
|
-
page = num_pages # To break the loop
|
95
90
|
break
|
96
91
|
end
|
97
92
|
|
98
|
-
total_commits += commits.size
|
99
93
|
head = commits.last['sha']
|
100
94
|
|
101
95
|
commits.map do |c|
|
96
|
+
total_commits += 1
|
97
|
+
|
98
|
+
if options[:num] < total_commits
|
99
|
+
logger.info("Already retrieved #{total_commits} commits. Stopping.")
|
100
|
+
return
|
101
|
+
end
|
102
|
+
|
103
|
+
if Time.parse(c['commit']['author']['date']) < Time.at(options[:upto])
|
104
|
+
logger.info("Commit #{c['sha']} older than #{Time.at(options[:upto])}. Stopping.")
|
105
|
+
return
|
106
|
+
end
|
107
|
+
|
102
108
|
@ght.transaction do
|
103
109
|
@ght.ensure_commit(repo, c['sha'], user)
|
104
110
|
end
|
@@ -106,6 +112,11 @@ Retrieves more commits for the provided repository
|
|
106
112
|
rescue Exception => e
|
107
113
|
logger.warn("Error processing: #{e}")
|
108
114
|
logger.warn(e.backtrace.join("\n"))
|
115
|
+
if old_head == head
|
116
|
+
logger.info("Commit #{c['sha']} older than #{Time.at(options[:upto])}. Stopping.")
|
117
|
+
fail("Cannot retrieve commits from head: #{head}")
|
118
|
+
end
|
119
|
+
old_head = head
|
109
120
|
end
|
110
121
|
end
|
111
122
|
logger.debug("Processed #{total_commits} commits for #{user}/#{repo}")
|