ghtorrent 0.10 → 0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +16 -0
- data/Gemfile.lock +12 -27
- data/README.md +20 -33
- data/Rakefile +1 -9
- data/bin/ght-log-analyzer +11 -6
- data/bin/ght-log-influx +190 -0
- data/bin/ght-queue-grep.rb +55 -0
- data/bin/ght-retrieve-users +6 -0
- data/bin/{ght-rm-dupl → ght-update-repo} +1 -1
- data/lib/ghtorrent.rb +4 -4
- data/lib/ghtorrent/adapters/base_adapter.rb +4 -11
- data/lib/ghtorrent/adapters/mongo_persister.rb +5 -9
- data/lib/ghtorrent/adapters/noop_persister.rb +0 -5
- data/lib/ghtorrent/api_client.rb +45 -119
- data/lib/ghtorrent/command.rb +25 -8
- data/lib/ghtorrent/commands/full_user_retriever.rb +50 -0
- data/lib/ghtorrent/commands/ght_data_retrieval.rb +12 -98
- data/lib/ghtorrent/commands/ght_get_more_commits.rb +13 -17
- data/lib/ghtorrent/commands/ght_load.rb +1 -2
- data/lib/ghtorrent/commands/ght_mirror_events.rb +8 -12
- data/lib/ghtorrent/commands/ght_retrieve_dependents.rb +0 -5
- data/lib/ghtorrent/commands/ght_retrieve_one.rb +1 -6
- data/lib/ghtorrent/commands/ght_retrieve_repo.rb +56 -26
- data/lib/ghtorrent/commands/ght_retrieve_repos.rb +5 -15
- data/lib/ghtorrent/commands/ght_retrieve_user.rb +13 -54
- data/lib/ghtorrent/commands/ght_retrieve_users.rb +49 -0
- data/lib/ghtorrent/commands/ght_update_repo.rb +126 -0
- data/lib/ghtorrent/event_processing.rb +140 -0
- data/lib/ghtorrent/ghtorrent.rb +330 -396
- data/lib/ghtorrent/logging.rb +65 -12
- data/lib/ghtorrent/migrations/014_add_deleted_to_projects.rb +1 -1
- data/lib/ghtorrent/migrations/019_add_fake_to_users.rb +1 -1
- data/lib/ghtorrent/migrations/020_add_deleted_to_users.rb +19 -0
- data/lib/ghtorrent/migrations/021_remove_ext_ref_id.rb +42 -0
- data/lib/ghtorrent/migrations/022_add_project_languages.rb +24 -0
- data/lib/ghtorrent/multiprocess_queue_client.rb +25 -5
- data/lib/ghtorrent/retriever.rb +100 -57
- data/lib/ghtorrent/settings.rb +14 -17
- data/lib/ghtorrent/{transacted_ghtorrent.rb → transacted_gh_torrent.rb} +28 -5
- data/lib/version.rb +1 -1
- metadata +14 -46
- data/bin/ght-process-event +0 -35
- data/lib/ghtorrent/cache.rb +0 -97
- data/lib/ghtorrent/commands/ght_rm_dupl.rb +0 -132
- data/lib/ghtorrent/gh_torrent_exception.rb +0 -6
- data/spec/api_client_spec.rb +0 -42
- data/spec/spec_helper.rb +0 -21
@@ -4,7 +4,6 @@ require 'pp'
|
|
4
4
|
|
5
5
|
require 'ghtorrent/ghtorrent'
|
6
6
|
require 'ghtorrent/settings'
|
7
|
-
require 'ghtorrent/logging'
|
8
7
|
require 'ghtorrent/command'
|
9
8
|
|
10
9
|
class GHTRetrieveDependents < GHTorrent::Command
|
@@ -33,10 +32,6 @@ pull_request (owner repo pullreq_id)
|
|
33
32
|
:pull_request => 3
|
34
33
|
}
|
35
34
|
|
36
|
-
def logger
|
37
|
-
ghtorrent.logger
|
38
|
-
end
|
39
|
-
|
40
35
|
def persister
|
41
36
|
@persister ||= connect(:mongo, settings)
|
42
37
|
@persister
|
@@ -2,7 +2,6 @@ require 'rubygems'
|
|
2
2
|
|
3
3
|
require 'ghtorrent/ghtorrent'
|
4
4
|
require 'ghtorrent/settings'
|
5
|
-
require 'ghtorrent/logging'
|
6
5
|
require 'ghtorrent/command'
|
7
6
|
require 'ghtorrent/retriever'
|
8
7
|
|
@@ -11,6 +10,7 @@ class GHTRetrieveRepo < GHTorrent::Command
|
|
11
10
|
include GHTorrent::Settings
|
12
11
|
include GHTorrent::Retriever
|
13
12
|
include GHTorrent::Persister
|
13
|
+
include GHTorrent::EventProcessing
|
14
14
|
|
15
15
|
def prepare_options(options)
|
16
16
|
options.banner <<-BANNER
|
@@ -19,35 +19,54 @@ An efficient way to get all data for a single repo
|
|
19
19
|
#{command_name} [options] owner repo
|
20
20
|
|
21
21
|
BANNER
|
22
|
+
options.opt :no_events, 'Skip retrieving events', :default => false
|
23
|
+
options.opt :no_entities, 'Skip retrieving entities', :default => false
|
24
|
+
|
25
|
+
options.opt :only_stage, "Only do the provided stage of entity retrieval (one of: #{stages.join(',')})",
|
26
|
+
:type => String
|
27
|
+
options.opt :exclude_events, 'Comma separated list of event types to exclude from processing',
|
28
|
+
:type => String
|
29
|
+
options.opt :events_after, 'Process all events later than the provided event id',
|
30
|
+
:type => Integer
|
31
|
+
options.opt :events_before, 'Process all events earlier than the provided event id',
|
32
|
+
:type => Integer
|
22
33
|
end
|
23
34
|
|
24
35
|
def validate
|
25
36
|
super
|
26
|
-
Trollop::die
|
37
|
+
Trollop::die 'Two arguments are required' unless args[0] && !args[0].empty?
|
38
|
+
|
39
|
+
unless options[:exclude_events].nil?
|
40
|
+
@exclude_event_types = options[:exclude_events].split(/,/)
|
41
|
+
else
|
42
|
+
@exclude_event_types = []
|
43
|
+
end
|
44
|
+
|
45
|
+
unless options[:only_stage].nil?
|
46
|
+
Trollop::die("Not a valid function: #{options[:only_stage]}") unless stages.include? options[:only_stage]
|
47
|
+
end
|
48
|
+
|
27
49
|
end
|
28
50
|
|
29
|
-
def
|
30
|
-
|
51
|
+
def stages
|
52
|
+
%w(ensure_commits ensure_forks ensure_pull_requests
|
53
|
+
ensure_issues ensure_watchers ensure_labels ensure_languages) #ensure_project_members
|
31
54
|
end
|
32
55
|
|
56
|
+
|
33
57
|
def persister
|
34
58
|
@persister ||= connect(:mongo, settings)
|
35
59
|
@persister
|
36
60
|
end
|
37
61
|
|
38
|
-
def
|
39
|
-
@
|
40
|
-
@
|
41
|
-
end
|
42
|
-
|
43
|
-
def ght
|
44
|
-
@ght ||= TransactedGhtorrent.new(settings)
|
45
|
-
@ght
|
62
|
+
def ghtorrent
|
63
|
+
@ghtorrent ||= TransactedGHTorrent.new(settings)
|
64
|
+
@ghtorrent
|
46
65
|
end
|
47
66
|
|
48
67
|
def go
|
49
68
|
self.settings = override_config(settings, :mirror_history_pages_back, 1000)
|
50
|
-
user_entry =
|
69
|
+
user_entry = ghtorrent.transaction{ghtorrent.ensure_user(ARGV[0], false, false)}
|
51
70
|
|
52
71
|
if user_entry.nil?
|
53
72
|
Trollop::die "Cannot find user #{ARGV[0]}"
|
@@ -55,7 +74,7 @@ An efficient way to get all data for a single repo
|
|
55
74
|
|
56
75
|
user = user_entry[:login]
|
57
76
|
|
58
|
-
repo_entry =
|
77
|
+
repo_entry = ghtorrent.transaction{ghtorrent.ensure_repo(ARGV[0], ARGV[1])}
|
59
78
|
|
60
79
|
if repo_entry.nil?
|
61
80
|
Trollop::die "Cannot find repository #{ARGV[0]}/#{ARGV[1]}"
|
@@ -63,21 +82,32 @@ An efficient way to get all data for a single repo
|
|
63
82
|
|
64
83
|
repo = repo_entry[:name]
|
65
84
|
|
66
|
-
|
67
|
-
|
85
|
+
unless options[:no_entities_given]
|
86
|
+
if options[:only_stage].nil?
|
87
|
+
stages.each do |x|
|
88
|
+
ghtorrent.send(x, user, repo)
|
89
|
+
end
|
90
|
+
else
|
91
|
+
ghtorrent.send(options[:only_stage], user, repo)
|
92
|
+
end
|
68
93
|
end
|
69
94
|
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
95
|
+
# Process repo events
|
96
|
+
unless options[:no_events_given]
|
97
|
+
events = get_repo_events(ARGV[0], ARGV[1]).sort{|e| e['id'].to_i}
|
98
|
+
events.each do |event|
|
99
|
+
begin
|
100
|
+
next if @exclude_event_types.include? event['type']
|
101
|
+
next if options[:events_after_given] and event['id'].to_i <= options[:events_after]
|
102
|
+
next if options[:events_before_given] and event['id'].to_i >= options[:events_before]
|
103
|
+
|
104
|
+
send(event['type'], event)
|
105
|
+
puts "Processed event #{event['type']}-#{event['id']}"
|
106
|
+
rescue StandardError => e
|
107
|
+
puts "Could not process event #{event['type']}-#{event['id']}: #{e.message}"
|
108
|
+
end
|
76
109
|
end
|
77
|
-
else
|
78
|
-
Trollop::die("Not a valid function: #{ARGV[2]}") unless functions.include? ARGV[2]
|
79
|
-
send_message(ARGV[2], user, repo)
|
80
110
|
end
|
111
|
+
|
81
112
|
end
|
82
113
|
end
|
83
|
-
|
@@ -28,13 +28,8 @@ class GHTRepoRetriever
|
|
28
28
|
ght.logger
|
29
29
|
end
|
30
30
|
|
31
|
-
def ext_uniq
|
32
|
-
@ext_uniq ||= config(:uniq_id)
|
33
|
-
@ext_uniq
|
34
|
-
end
|
35
|
-
|
36
31
|
def ght
|
37
|
-
@ght ||=
|
32
|
+
@ght ||= TransactedGHTorrent.new(@config)
|
38
33
|
@ght
|
39
34
|
end
|
40
35
|
|
@@ -58,7 +53,7 @@ class GHTRepoRetriever
|
|
58
53
|
warn("Trying to get user #{owner}, attempt #{i}")
|
59
54
|
begin
|
60
55
|
user_entry = ght.transaction { ght.ensure_user(owner, false, false) }
|
61
|
-
rescue
|
56
|
+
rescue StandardError => e
|
62
57
|
warn e.message
|
63
58
|
end
|
64
59
|
end
|
@@ -69,11 +64,7 @@ class GHTRepoRetriever
|
|
69
64
|
end
|
70
65
|
|
71
66
|
repo_entry = ght.transaction { ght.ensure_repo(owner, repo,
|
72
|
-
|
73
|
-
project_members = false,
|
74
|
-
watchers = false,
|
75
|
-
forks = false,
|
76
|
-
labels = false) }
|
67
|
+
recursive = false) }
|
77
68
|
|
78
69
|
if repo_entry.nil?
|
79
70
|
warn("Cannot find repository #{owner}/#{repo}")
|
@@ -83,8 +74,7 @@ class GHTRepoRetriever
|
|
83
74
|
debug("Retrieving repo #{owner}/#{repo}")
|
84
75
|
|
85
76
|
retrieval_stages = %w(ensure_commits ensure_forks ensure_pull_requests
|
86
|
-
ensure_issues ensure_project_members
|
87
|
-
ensure_watchers ensure_labels)
|
77
|
+
ensure_issues ensure_watchers ensure_labels) # ensure_project_members
|
88
78
|
|
89
79
|
retrieval_stages.each do |x|
|
90
80
|
run_retrieval_stage(ght, owner, repo, x)
|
@@ -114,7 +104,7 @@ class GHTRepoRetriever
|
|
114
104
|
else
|
115
105
|
ght.send(function, owner, repo)
|
116
106
|
end
|
117
|
-
rescue
|
107
|
+
rescue StandardError => e
|
118
108
|
warn("Error processing #{function} for #{owner}/#{repo}")
|
119
109
|
warn("Exception message #{$!}")
|
120
110
|
warn("Exception trace #{e.backtrace.join("\n")}")
|
@@ -2,19 +2,21 @@ require 'rubygems'
|
|
2
2
|
|
3
3
|
require 'ghtorrent/ghtorrent'
|
4
4
|
require 'ghtorrent/settings'
|
5
|
-
require 'ghtorrent/logging'
|
6
5
|
require 'ghtorrent/command'
|
7
6
|
require 'ghtorrent/retriever'
|
8
|
-
require 'ghtorrent/
|
9
|
-
require 'ghtorrent/commands/
|
7
|
+
require 'ghtorrent/transacted_gh_torrent'
|
8
|
+
require 'ghtorrent/commands/full_user_retriever'
|
10
9
|
|
11
|
-
class GHTRetrieveUser <
|
10
|
+
class GHTRetrieveUser < GHTorrent::Command
|
11
|
+
|
12
|
+
include GHTorrent::Retriever
|
13
|
+
include GHTorrent::Commands::FullUserRetriever
|
12
14
|
|
13
15
|
def prepare_options(options)
|
14
16
|
options.banner <<-BANNER
|
15
17
|
An efficient way to get all data for a single user
|
16
18
|
|
17
|
-
#{command_name} [options]
|
19
|
+
#{command_name} [options] login
|
18
20
|
|
19
21
|
BANNER
|
20
22
|
end
|
@@ -24,57 +26,14 @@ An efficient way to get all data for a single user
|
|
24
26
|
Trollop::die "One argument is required" unless args[0] && !args[0].empty?
|
25
27
|
end
|
26
28
|
|
27
|
-
def
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
if user_entry.nil?
|
32
|
-
Trollop::die "Cannot find user #{ARGV[0]}"
|
33
|
-
end
|
34
|
-
|
35
|
-
user = user_entry[:login]
|
36
|
-
|
37
|
-
def send_message(function, user)
|
38
|
-
begin
|
39
|
-
ght.send(function, user)
|
40
|
-
rescue Exception => e
|
41
|
-
puts STDERR, e.message
|
42
|
-
puts STDERR, e.backtrace
|
43
|
-
end
|
44
|
-
end
|
45
|
-
|
46
|
-
functions = %w(ensure_user_followers ensure_orgs ensure_org)
|
47
|
-
|
48
|
-
if ARGV[1].nil?
|
49
|
-
functions.each do |x|
|
50
|
-
send_message(x, user)
|
51
|
-
end
|
52
|
-
else
|
53
|
-
Trollop::die("Not a valid function: #{ARGV[1]}") unless functions.include? ARGV[1]
|
54
|
-
send_message(ARGV[1], user)
|
55
|
-
end
|
56
|
-
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
class TransactedGhtorrent
|
61
|
-
|
62
|
-
def ensure_user_followers(user)
|
63
|
-
check_transaction do
|
64
|
-
super(user)
|
65
|
-
end
|
29
|
+
def ght
|
30
|
+
@ght ||= TransactedGHTorrent.new(settings)
|
31
|
+
@ght
|
66
32
|
end
|
67
33
|
|
68
|
-
def
|
69
|
-
|
70
|
-
|
71
|
-
end
|
72
|
-
end
|
73
|
-
|
74
|
-
def ensure_org(user, members = true)
|
75
|
-
check_transaction do
|
76
|
-
super(user, members)
|
77
|
-
end
|
34
|
+
def go
|
35
|
+
login = ARGV[0]
|
36
|
+
retrieve_user(login)
|
78
37
|
end
|
79
38
|
|
80
39
|
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'ghtorrent/retriever'
|
2
|
+
require 'ghtorrent/transacted_gh_torrent'
|
3
|
+
require 'ghtorrent/commands/full_user_retriever'
|
4
|
+
|
5
|
+
# Retrieve user information en masse
|
6
|
+
class GHTRetrieveUsers < MultiprocessQueueClient
|
7
|
+
|
8
|
+
def clazz
|
9
|
+
GHTUserRetriever
|
10
|
+
end
|
11
|
+
|
12
|
+
end
|
13
|
+
|
14
|
+
# Initialize a user retrieval process
|
15
|
+
class GHTUserRetriever
|
16
|
+
|
17
|
+
include GHTorrent::Retriever
|
18
|
+
include GHTorrent::Commands::FullUserRetriever
|
19
|
+
|
20
|
+
attr_accessor :ght
|
21
|
+
|
22
|
+
def initialize(config, queue)
|
23
|
+
@config = config
|
24
|
+
@queue = queue
|
25
|
+
end
|
26
|
+
|
27
|
+
def settings
|
28
|
+
@config
|
29
|
+
end
|
30
|
+
|
31
|
+
def run(command)
|
32
|
+
|
33
|
+
processor = Proc.new do |user|
|
34
|
+
@ght ||= TransactedGHTorrent.new(@config)
|
35
|
+
|
36
|
+
retrieve_user(user)
|
37
|
+
end
|
38
|
+
|
39
|
+
command.queue_client(@queue, :after, processor)
|
40
|
+
|
41
|
+
end
|
42
|
+
|
43
|
+
def stop
|
44
|
+
warn('Stop flag set, waiting for operations to finish')
|
45
|
+
@stop = true
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
49
|
+
|
@@ -0,0 +1,126 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'ghtorrent'
|
4
|
+
|
5
|
+
class GHTUpdateRepo < GHTorrent::Command
|
6
|
+
|
7
|
+
include GHTorrent::Settings
|
8
|
+
include GHTorrent::Retriever
|
9
|
+
include GHTorrent::Persister
|
10
|
+
include GHTorrent::Logging
|
11
|
+
|
12
|
+
def prepare_options(options)
|
13
|
+
options.banner <<-BANNER
|
14
|
+
Updates the deleted field in the project table with current data
|
15
|
+
|
16
|
+
#{command_name} owner repo
|
17
|
+
|
18
|
+
BANNER
|
19
|
+
end
|
20
|
+
|
21
|
+
def validate
|
22
|
+
super
|
23
|
+
Trollop::die "Takes two arguments" if ARGV.size == 1
|
24
|
+
end
|
25
|
+
|
26
|
+
def persister
|
27
|
+
@persister ||= connect(:mongo, settings)
|
28
|
+
@persister
|
29
|
+
end
|
30
|
+
|
31
|
+
def db
|
32
|
+
@db ||= @ght.get_db
|
33
|
+
end
|
34
|
+
|
35
|
+
def date(arg)
|
36
|
+
if arg.class != Time
|
37
|
+
Time.parse(arg)#.to_i
|
38
|
+
else
|
39
|
+
arg
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
def set_deleted(owner, repo)
|
44
|
+
db.from(:projects, :users).\
|
45
|
+
where(:projects__owner_id => :users__id).\
|
46
|
+
where(:users__login => owner).\
|
47
|
+
where(:projects__name => repo).\
|
48
|
+
update(:projects__deleted => true)
|
49
|
+
info("Project #{owner}/#{repo} marked as deleted")
|
50
|
+
end
|
51
|
+
|
52
|
+
def update_mysql(owner, repo, retrieved)
|
53
|
+
|
54
|
+
parent = unless retrieved['parent'].nil?
|
55
|
+
@ght.ensure_repo(retrieved['parent']['owner']['login'],
|
56
|
+
retrieved['parent']['name'])
|
57
|
+
end
|
58
|
+
|
59
|
+
db.from(:projects, :users).\
|
60
|
+
where(:projects__owner_id => :users__id).\
|
61
|
+
where(:users__login => owner).\
|
62
|
+
where(:projects__name => repo).\
|
63
|
+
update(
|
64
|
+
:projects__url => retrieved['url'],
|
65
|
+
:projects__description => retrieved['description'],
|
66
|
+
:projects__language => retrieved['language'],
|
67
|
+
:projects__created_at => date(retrieved['created_at']),
|
68
|
+
:projects__forked_from => unless parent.nil? then parent[:id] end)
|
69
|
+
debug("Repo #{owner}/#{repo} updated")
|
70
|
+
|
71
|
+
@ght.ensure_languages(owner, repo)
|
72
|
+
end
|
73
|
+
|
74
|
+
def process_project(owner, name)
|
75
|
+
@ght.transaction do
|
76
|
+
|
77
|
+
in_mongo = persister.find(:repos, {'owner.login' => owner, 'name' => name })
|
78
|
+
on_github = api_request(ghurl ("repos/#{owner}/#{name}"))
|
79
|
+
|
80
|
+
unless in_mongo.empty? and on_github.empty?
|
81
|
+
in_mysql = retrieve_repo(owner, name)
|
82
|
+
end
|
83
|
+
|
84
|
+
if in_mongo.empty?
|
85
|
+
if on_github.empty?
|
86
|
+
if in_mysql.nil?
|
87
|
+
# Project does not exist anywhere
|
88
|
+
warn "Repo #{owner}/#{name} does not exist in MySQL"
|
89
|
+
else
|
90
|
+
# Project exists in MySQL but not on Github or Mongo
|
91
|
+
# Mark it as deleted
|
92
|
+
set_deleted(owner, name)
|
93
|
+
end
|
94
|
+
else
|
95
|
+
# Project does not exist in Mongo, but exists in Github
|
96
|
+
if in_mysql.nil?
|
97
|
+
warn "Repo #{owner}/#{name} does not exist in MySQL"
|
98
|
+
else
|
99
|
+
# The retrieval process already added it to Mongo, so update MySQL
|
100
|
+
update_mysql(owner, name, in_mysql)
|
101
|
+
end
|
102
|
+
end
|
103
|
+
else
|
104
|
+
if on_github.empty?
|
105
|
+
# Project was deleted on Github. Mark it as deleted.
|
106
|
+
set_deleted(owner, name)
|
107
|
+
else
|
108
|
+
update_mysql(owner, name, in_mysql)
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
def go
|
115
|
+
|
116
|
+
@ght ||= GHTorrent::Mirror.new(settings)
|
117
|
+
|
118
|
+
unless ARGV[1].nil?
|
119
|
+
process_project(ARGV[0], ARGV[1])
|
120
|
+
exit(0)
|
121
|
+
end
|
122
|
+
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
GHTUpdateRepo.run
|