ghtorrent 0.10 → 0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +16 -0
- data/Gemfile.lock +12 -27
- data/README.md +20 -33
- data/Rakefile +1 -9
- data/bin/ght-log-analyzer +11 -6
- data/bin/ght-log-influx +190 -0
- data/bin/ght-queue-grep.rb +55 -0
- data/bin/ght-retrieve-users +6 -0
- data/bin/{ght-rm-dupl → ght-update-repo} +1 -1
- data/lib/ghtorrent.rb +4 -4
- data/lib/ghtorrent/adapters/base_adapter.rb +4 -11
- data/lib/ghtorrent/adapters/mongo_persister.rb +5 -9
- data/lib/ghtorrent/adapters/noop_persister.rb +0 -5
- data/lib/ghtorrent/api_client.rb +45 -119
- data/lib/ghtorrent/command.rb +25 -8
- data/lib/ghtorrent/commands/full_user_retriever.rb +50 -0
- data/lib/ghtorrent/commands/ght_data_retrieval.rb +12 -98
- data/lib/ghtorrent/commands/ght_get_more_commits.rb +13 -17
- data/lib/ghtorrent/commands/ght_load.rb +1 -2
- data/lib/ghtorrent/commands/ght_mirror_events.rb +8 -12
- data/lib/ghtorrent/commands/ght_retrieve_dependents.rb +0 -5
- data/lib/ghtorrent/commands/ght_retrieve_one.rb +1 -6
- data/lib/ghtorrent/commands/ght_retrieve_repo.rb +56 -26
- data/lib/ghtorrent/commands/ght_retrieve_repos.rb +5 -15
- data/lib/ghtorrent/commands/ght_retrieve_user.rb +13 -54
- data/lib/ghtorrent/commands/ght_retrieve_users.rb +49 -0
- data/lib/ghtorrent/commands/ght_update_repo.rb +126 -0
- data/lib/ghtorrent/event_processing.rb +140 -0
- data/lib/ghtorrent/ghtorrent.rb +330 -396
- data/lib/ghtorrent/logging.rb +65 -12
- data/lib/ghtorrent/migrations/014_add_deleted_to_projects.rb +1 -1
- data/lib/ghtorrent/migrations/019_add_fake_to_users.rb +1 -1
- data/lib/ghtorrent/migrations/020_add_deleted_to_users.rb +19 -0
- data/lib/ghtorrent/migrations/021_remove_ext_ref_id.rb +42 -0
- data/lib/ghtorrent/migrations/022_add_project_languages.rb +24 -0
- data/lib/ghtorrent/multiprocess_queue_client.rb +25 -5
- data/lib/ghtorrent/retriever.rb +100 -57
- data/lib/ghtorrent/settings.rb +14 -17
- data/lib/ghtorrent/{transacted_ghtorrent.rb → transacted_gh_torrent.rb} +28 -5
- data/lib/version.rb +1 -1
- metadata +14 -46
- data/bin/ght-process-event +0 -35
- data/lib/ghtorrent/cache.rb +0 -97
- data/lib/ghtorrent/commands/ght_rm_dupl.rb +0 -132
- data/lib/ghtorrent/gh_torrent_exception.rb +0 -6
- data/spec/api_client_spec.rb +0 -42
- data/spec/spec_helper.rb +0 -21
data/lib/ghtorrent/command.rb
CHANGED
@@ -17,6 +17,7 @@ module GHTorrent
|
|
17
17
|
class Command
|
18
18
|
|
19
19
|
include GHTorrent::Settings
|
20
|
+
include GHTorrent::Logging
|
20
21
|
|
21
22
|
# Specify the run method for subclasses.
|
22
23
|
class << self
|
@@ -60,6 +61,19 @@ module GHTorrent
|
|
60
61
|
command.options[:token])
|
61
62
|
end
|
62
63
|
|
64
|
+
unless command.options[:req_limit].nil?
|
65
|
+
command.settings = command.override_config(command.settings,
|
66
|
+
:req_limit,
|
67
|
+
command.options[:req_limit])
|
68
|
+
end
|
69
|
+
|
70
|
+
unless command.options[:uniq].nil?
|
71
|
+
command.settings = command.override_config(command.settings,
|
72
|
+
:logging_uniq,
|
73
|
+
command.options[:uniq])
|
74
|
+
end
|
75
|
+
|
76
|
+
|
63
77
|
begin
|
64
78
|
command.go
|
65
79
|
rescue => e
|
@@ -88,12 +102,16 @@ Standard options:
|
|
88
102
|
opt :config, 'config.yaml file location', :short => 'c',
|
89
103
|
:default => 'config.yaml'
|
90
104
|
opt :verbose, 'verbose mode', :short => 'v'
|
91
|
-
opt :addr, '
|
105
|
+
opt :addr, 'IP address to use for performing requests', :short => 'a',
|
92
106
|
:type => String
|
93
107
|
opt :username, 'Username at Github', :short => 's', :type => String
|
94
108
|
opt :password, 'Password at Github', :type => String
|
95
109
|
opt :token, 'OAuth Github token (use instead of username/password)',
|
96
110
|
:type => String, :short => 't'
|
111
|
+
opt :req_limit, 'Request limit for provided account (in reqs/hour)',
|
112
|
+
:type => Integer, :short => 'l'
|
113
|
+
opt :uniq, 'Unique name for this command. Will appear in logs.',
|
114
|
+
:type => String, :short => 'u'
|
97
115
|
end
|
98
116
|
end
|
99
117
|
|
@@ -143,12 +161,11 @@ Standard options:
|
|
143
161
|
def go
|
144
162
|
end
|
145
163
|
|
146
|
-
# Specify a handler to incoming messages from a connection to
|
147
|
-
#
|
148
|
-
# [
|
149
|
-
# [
|
150
|
-
#
|
151
|
-
# [block]: A block with one argument (the message)
|
164
|
+
# Specify a handler to incoming messages from a connection to a queue.
|
165
|
+
#
|
166
|
+
# @param queue [String] the queue name to bind to
|
167
|
+
# @param ack [Symbol] when acks should be sent: :before or :after the block returns
|
168
|
+
# @param block [Block] a block accepting one argument (the message)
|
152
169
|
def queue_client(queue, ack = :after, block)
|
153
170
|
|
154
171
|
stopped = false
|
@@ -194,7 +211,7 @@ Standard options:
|
|
194
211
|
sleep(1)
|
195
212
|
rescue Interrupt => _
|
196
213
|
stopped = true
|
197
|
-
rescue
|
214
|
+
rescue StandardError => e
|
198
215
|
raise e
|
199
216
|
end
|
200
217
|
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module GHTorrent
|
2
|
+
module Commands
|
3
|
+
# Defines a process to download the full data available for a single user
|
4
|
+
module FullUserRetriever
|
5
|
+
|
6
|
+
def retrieve_user(login)
|
7
|
+
#self.settings = override_config(settings, :mirror_history_pages_back, -1)
|
8
|
+
|
9
|
+
user_entry = ght.transaction { ght.ensure_user(login, false, false) }
|
10
|
+
on_github = api_request(ghurl ("users/#{login}"))
|
11
|
+
|
12
|
+
if on_github.empty?
|
13
|
+
if user_entry.nil?
|
14
|
+
warn "User #{login} does not exist on GitHub"
|
15
|
+
exit
|
16
|
+
else
|
17
|
+
ght.transaction do
|
18
|
+
ght.get_db.from(:users).where(:login => login).update(:users__deleted => true)
|
19
|
+
end
|
20
|
+
warn "User #{login} marked as deleted"
|
21
|
+
return
|
22
|
+
end
|
23
|
+
else
|
24
|
+
if user_entry.nil?
|
25
|
+
warn "Error retrieving user #{login}"
|
26
|
+
exit
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
user = user_entry[:login]
|
31
|
+
|
32
|
+
def send_message(function, user)
|
33
|
+
begin
|
34
|
+
ght.send(function, user)
|
35
|
+
rescue StandardError => e
|
36
|
+
puts STDERR, e.message
|
37
|
+
puts STDERR, e.backtrace
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
functions = %w(ensure_user_following ensure_user_followers ensure_orgs ensure_org)
|
42
|
+
|
43
|
+
functions.each do |x|
|
44
|
+
send_message(x, user)
|
45
|
+
end
|
46
|
+
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
@@ -12,6 +12,7 @@ class GHTDataRetrieval < GHTorrent::Command
|
|
12
12
|
include GHTorrent::Settings
|
13
13
|
include GHTorrent::Logging
|
14
14
|
include GHTorrent::Persister
|
15
|
+
include GHTorrent::EventProcessing
|
15
16
|
|
16
17
|
def persister
|
17
18
|
@persister ||= connect(:mongo, settings)
|
@@ -22,99 +23,11 @@ class GHTDataRetrieval < GHTorrent::Command
|
|
22
23
|
JSON.parse(msg)
|
23
24
|
end
|
24
25
|
|
25
|
-
def PushEvent(data)
|
26
|
-
data['payload']['commits'].each do |c|
|
27
|
-
url = c['url'].split(/\//)
|
28
|
-
|
29
|
-
ghtorrent.get_commit url[4], url[5], url[7]
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
|
-
def WatchEvent(data)
|
34
|
-
owner = data['repo']['name'].split(/\//)[0]
|
35
|
-
repo = data['repo']['name'].split(/\//)[1]
|
36
|
-
watcher = data['actor']['login']
|
37
|
-
created_at = data['created_at']
|
38
|
-
|
39
|
-
ghtorrent.get_watcher owner, repo, watcher, created_at
|
40
|
-
end
|
41
|
-
|
42
|
-
def FollowEvent(data)
|
43
|
-
follower = data['actor']['login']
|
44
|
-
followed = data['payload']['target']['login']
|
45
|
-
created_at = data['created_at']
|
46
|
-
|
47
|
-
ghtorrent.get_follower(follower, followed, created_at)
|
48
|
-
end
|
49
|
-
|
50
|
-
def MemberEvent(data)
|
51
|
-
owner = data['actor']['login']
|
52
|
-
repo = data['repo']['name'].split(/\//)[1]
|
53
|
-
new_member = data['payload']['member']['login']
|
54
|
-
created_at = data['created_at']
|
55
|
-
|
56
|
-
ghtorrent.get_project_member(owner, repo, new_member, created_at)
|
57
|
-
end
|
58
|
-
|
59
|
-
def CommitCommentEvent(data)
|
60
|
-
user = data['repo']['name'].split(/\//)[0]
|
61
|
-
repo = data['repo']['name'].split(/\//)[1]
|
62
|
-
id = data['payload']['comment']['id']
|
63
|
-
sha = data['payload']['comment']['commit_id']
|
64
|
-
|
65
|
-
ghtorrent.get_commit_comment(user, repo, sha, id)
|
66
|
-
end
|
67
|
-
|
68
|
-
def PullRequestEvent(data)
|
69
|
-
owner = data['payload']['pull_request']['base']['repo']['owner']['login']
|
70
|
-
repo = data['payload']['pull_request']['base']['repo']['name']
|
71
|
-
pullreq_id = data['payload']['number']
|
72
|
-
action = data['payload']['action']
|
73
|
-
actor = data['actor']['login']
|
74
|
-
created_at = data['created_at']
|
75
|
-
|
76
|
-
ghtorrent.get_pull_request(owner, repo, pullreq_id, action, actor, created_at)
|
77
|
-
end
|
78
|
-
|
79
|
-
def ForkEvent(data)
|
80
|
-
owner = data['repo']['name'].split(/\//)[0]
|
81
|
-
repo = data['repo']['name'].split(/\//)[1]
|
82
|
-
fork_id = data['payload']['forkee']['id']
|
83
|
-
|
84
|
-
#ghtorrent.get_fork(owner, repo, fork_id)
|
85
|
-
end
|
86
|
-
|
87
|
-
def PullRequestReviewCommentEvent(data)
|
88
|
-
owner = data['repo']['name'].split(/\//)[0]
|
89
|
-
repo = data['repo']['name'].split(/\//)[1]
|
90
|
-
comment_id = data['payload']['comment']['id']
|
91
|
-
pullreq_id = data['payload']['comment']['_links']['pull_request']['href'].split(/\//)[-1]
|
92
|
-
|
93
|
-
ghtorrent.get_pullreq_comment(owner, repo, pullreq_id, comment_id)
|
94
|
-
end
|
95
|
-
|
96
|
-
def IssuesEvent(data)
|
97
|
-
owner = data['repo']['name'].split(/\//)[0]
|
98
|
-
repo = data['repo']['name'].split(/\//)[1]
|
99
|
-
issue_id = data['payload']['issue']['number']
|
100
|
-
|
101
|
-
ghtorrent.get_issue(owner, repo, issue_id)
|
102
|
-
end
|
103
|
-
|
104
|
-
def IssueCommentEvent(data)
|
105
|
-
owner = data['repo']['name'].split(/\//)[0]
|
106
|
-
repo = data['repo']['name'].split(/\//)[1]
|
107
|
-
issue_id = data['payload']['issue']['number']
|
108
|
-
comment_id = data['payload']['comment']['id']
|
109
|
-
|
110
|
-
ghtorrent.get_issue_comment(owner, repo, issue_id, comment_id)
|
111
|
-
end
|
112
|
-
|
113
26
|
def handlers
|
114
|
-
%w(PushEvent WatchEvent FollowEvent MemberEvent
|
27
|
+
%w(PushEvent WatchEvent FollowEvent MemberEvent CreateEvent
|
115
28
|
CommitCommentEvent PullRequestEvent ForkEvent
|
116
29
|
PullRequestReviewCommentEvent IssuesEvent IssueCommentEvent)
|
117
|
-
#%w(
|
30
|
+
#%w(ForkEvent)
|
118
31
|
end
|
119
32
|
|
120
33
|
def prepare_options(options)
|
@@ -135,7 +48,8 @@ If event_id is provided, only this event is processed.
|
|
135
48
|
end
|
136
49
|
|
137
50
|
def ghtorrent
|
138
|
-
|
51
|
+
#@gh ||= GHTorrent::Mirror.new(@settings)
|
52
|
+
@gh ||= TransactedGHTorrent.new(settings)
|
139
53
|
@gh
|
140
54
|
end
|
141
55
|
|
@@ -143,7 +57,7 @@ If event_id is provided, only this event is processed.
|
|
143
57
|
event = persister.get_underlying_connection[:events].find_one('id' => evt_id)
|
144
58
|
event.delete '_id'
|
145
59
|
data = parse(event.to_json)
|
146
|
-
|
60
|
+
debug "Processing event: #{data['type']}-#{data['id']}"
|
147
61
|
data
|
148
62
|
end
|
149
63
|
|
@@ -153,7 +67,7 @@ If event_id is provided, only this event is processed.
|
|
153
67
|
event = retrieve_event(ARGV[0])
|
154
68
|
|
155
69
|
if event.nil?
|
156
|
-
warn "
|
70
|
+
warn "No event with id: #{ARGV[0]}"
|
157
71
|
else
|
158
72
|
send(event['type'], event)
|
159
73
|
end
|
@@ -178,20 +92,20 @@ If event_id is provided, only this event is processed.
|
|
178
92
|
queue = channel.queue("#{h}s", {:durable => true})\
|
179
93
|
.bind(exchange, :routing_key => "evt.#{h}")
|
180
94
|
|
181
|
-
info "
|
95
|
+
info "Binding handler #{h} to routing key evt.#{h}"
|
182
96
|
|
183
97
|
queue.subscribe(:ack => true) do |headers, properties, msg|
|
98
|
+
start = Time.now
|
184
99
|
begin
|
185
|
-
|
186
100
|
data = retrieve_event(msg)
|
187
101
|
send(h, data)
|
188
102
|
|
189
103
|
channel.acknowledge(headers.delivery_tag, false)
|
190
|
-
info "
|
191
|
-
rescue
|
104
|
+
info "Success processing event. Type: #{data['type']}, ID: #{data['id']}, Time: #{Time.now.to_ms - start.to_ms} ms"
|
105
|
+
rescue StandardError => e
|
192
106
|
# Give a message a chance to be reprocessed
|
193
107
|
if headers.redelivered?
|
194
|
-
warn "
|
108
|
+
warn "Error processing event. Type: #{data['type']}, ID: #{data['id']}, Time: #{Time.now.to_ms - start.to_ms} ms"
|
195
109
|
channel.reject(headers.delivery_tag, false)
|
196
110
|
else
|
197
111
|
channel.reject(headers.delivery_tag, true)
|
@@ -12,6 +12,7 @@ class GHTMoreCommitsRetriever < GHTorrent::Command
|
|
12
12
|
include GHTorrent::Settings
|
13
13
|
include GHTorrent::Retriever
|
14
14
|
include GHTorrent::Persister
|
15
|
+
include GHTorrent::Logging
|
15
16
|
|
16
17
|
def prepare_options(options)
|
17
18
|
options.banner <<-BANNER
|
@@ -28,7 +29,7 @@ Retrieves more commits for the provided repository
|
|
28
29
|
If not set, will start from latest stored commit',
|
29
30
|
:short => 'f', :default => false, :type => :boolean
|
30
31
|
options.opt :upto, 'Get all commits up to the provided timestamp',
|
31
|
-
:short => '
|
32
|
+
:short => 'x', :default => 0, :type => :int
|
32
33
|
end
|
33
34
|
|
34
35
|
def validate
|
@@ -36,20 +37,15 @@ Retrieves more commits for the provided repository
|
|
36
37
|
Trollop::die "Two arguments are required" unless args[0] && !args[0].empty?
|
37
38
|
end
|
38
39
|
|
39
|
-
def logger
|
40
|
-
|
41
|
-
end
|
40
|
+
#def logger
|
41
|
+
# @ght.logger
|
42
|
+
#end
|
42
43
|
|
43
44
|
def persister
|
44
45
|
@persister ||= connect(:mongo, settings)
|
45
46
|
@persister
|
46
47
|
end
|
47
48
|
|
48
|
-
def ext_uniq
|
49
|
-
@ext_uniq ||= config(:uniq_id)
|
50
|
-
@ext_uniq
|
51
|
-
end
|
52
|
-
|
53
49
|
def go
|
54
50
|
|
55
51
|
@ght ||= GHTorrent::Mirror.new(settings)
|
@@ -82,7 +78,7 @@ Retrieves more commits for the provided repository
|
|
82
78
|
old_head = nil
|
83
79
|
while (true)
|
84
80
|
begin
|
85
|
-
|
81
|
+
debug("Retrieving more commits for #{user}/#{repo} from head: #{head}")
|
86
82
|
|
87
83
|
@settings = override_config(@settings, :mirror_history_pages_back, 1)
|
88
84
|
commits = retrieve_commits(repo, head, user)
|
@@ -97,12 +93,12 @@ Retrieves more commits for the provided repository
|
|
97
93
|
total_commits += 1
|
98
94
|
|
99
95
|
if options[:num] < total_commits
|
100
|
-
|
96
|
+
info("Already retrieved #{total_commits} commits. Stopping.")
|
101
97
|
return
|
102
98
|
end
|
103
99
|
|
104
100
|
if Time.parse(c['commit']['author']['date']) < Time.at(options[:upto])
|
105
|
-
|
101
|
+
info("Commit #{c['sha']} older than #{Time.at(options[:upto])}. Stopping.")
|
106
102
|
return
|
107
103
|
end
|
108
104
|
|
@@ -110,17 +106,17 @@ Retrieves more commits for the provided repository
|
|
110
106
|
@ght.ensure_commit(repo, c['sha'], user)
|
111
107
|
end
|
112
108
|
end
|
113
|
-
rescue
|
114
|
-
|
115
|
-
|
109
|
+
rescue StandardError => e
|
110
|
+
warn("Error processing: #{e}")
|
111
|
+
warn(e.backtrace.join("\n"))
|
116
112
|
if old_head == head
|
117
|
-
|
113
|
+
info("Commit #{c['sha']} older than #{Time.at(options[:upto])}. Stopping.")
|
118
114
|
fail("Cannot retrieve commits from head: #{head}")
|
119
115
|
end
|
120
116
|
old_head = head
|
121
117
|
end
|
122
118
|
end
|
123
|
-
|
119
|
+
debug("Processed #{total_commits} commits for #{user}/#{repo}")
|
124
120
|
end
|
125
121
|
end
|
126
122
|
|
@@ -4,7 +4,6 @@ require 'pp'
|
|
4
4
|
require 'bunny'
|
5
5
|
|
6
6
|
require 'ghtorrent/settings'
|
7
|
-
require 'ghtorrent/logging'
|
8
7
|
require 'ghtorrent/persister'
|
9
8
|
require 'ghtorrent/command'
|
10
9
|
require 'ghtorrent/bson_orderedhash'
|
@@ -31,7 +30,7 @@ Loads object ids from a collection to a queue for further processing.
|
|
31
30
|
options.opt :earliest, 'Seconds since epoch of earliest item to load',
|
32
31
|
:short => 'e', :default => 0, :type => :int
|
33
32
|
options.opt :latest, 'Seconds since epoch of latest item to load',
|
34
|
-
:short => '
|
33
|
+
:short => 'x', :default => Time.now.to_i + (60 * 60 * 24 * 360 * 20),
|
35
34
|
:type => :int
|
36
35
|
options.opt :number, 'Total number of items to load',
|
37
36
|
:short => 'n', :type => :int, :default => 2**48
|
@@ -15,10 +15,7 @@ class GHTMirrorEvents < GHTorrent::Command
|
|
15
15
|
include GHTorrent::Logging
|
16
16
|
include GHTorrent::Persister
|
17
17
|
include GHTorrent::APIClient
|
18
|
-
|
19
|
-
def logger
|
20
|
-
@logger
|
21
|
-
end
|
18
|
+
include GHTorrent::Logging
|
22
19
|
|
23
20
|
def store_count(events)
|
24
21
|
stored = Array.new
|
@@ -41,13 +38,13 @@ class GHTMirrorEvents < GHTorrent::Command
|
|
41
38
|
def retrieve(exchange)
|
42
39
|
begin
|
43
40
|
new = dupl = 0
|
44
|
-
events = api_request "https://api.github.com/events"
|
41
|
+
events = api_request "https://api.github.com/events?per_page=100"
|
45
42
|
(new, dupl, stored) = store_count events
|
46
43
|
|
47
|
-
# This means that first page
|
48
|
-
#
|
44
|
+
# This means that the first page does not contain all new events. Do
|
45
|
+
# a paged request and get everything on the queue
|
49
46
|
if dupl == 0
|
50
|
-
events = paged_api_request "https://api.github.com/events"
|
47
|
+
events = paged_api_request "https://api.github.com/events?per_page=100"
|
51
48
|
(new1, dupl1, stored1) = store_count events
|
52
49
|
stored = stored | stored1
|
53
50
|
new = new + new1
|
@@ -58,7 +55,7 @@ class GHTMirrorEvents < GHTorrent::Command
|
|
58
55
|
exchange.publish e['id'], :persistent => true, :routing_key => key
|
59
56
|
end
|
60
57
|
return new, dupl
|
61
|
-
rescue
|
58
|
+
rescue StandardError => e
|
62
59
|
STDERR.puts e.message
|
63
60
|
STDERR.puts e.backtrace
|
64
61
|
end
|
@@ -66,7 +63,6 @@ class GHTMirrorEvents < GHTorrent::Command
|
|
66
63
|
|
67
64
|
def go
|
68
65
|
@persister = connect(:mongo, @settings)
|
69
|
-
@logger = Logger.new(STDOUT)
|
70
66
|
|
71
67
|
conn = Bunny.new(:host => config(:amqp_host),
|
72
68
|
:port => config(:amqp_port),
|
@@ -75,7 +71,7 @@ class GHTMirrorEvents < GHTorrent::Command
|
|
75
71
|
conn.start
|
76
72
|
|
77
73
|
ch = conn.create_channel
|
78
|
-
|
74
|
+
debug "Connection to #{config(:amqp_host)} succeded"
|
79
75
|
|
80
76
|
exchange = ch.topic(config(:amqp_exchange), :durable => true,
|
81
77
|
:auto_delete => false)
|
@@ -97,7 +93,7 @@ class GHTMirrorEvents < GHTorrent::Command
|
|
97
93
|
end
|
98
94
|
rescue Interrupt
|
99
95
|
stopped = true
|
100
|
-
rescue
|
96
|
+
rescue StandardError => e
|
101
97
|
@logger.error e
|
102
98
|
end
|
103
99
|
end
|