ghtorrent 0.10 → 0.11

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +16 -0
  3. data/Gemfile.lock +12 -27
  4. data/README.md +20 -33
  5. data/Rakefile +1 -9
  6. data/bin/ght-log-analyzer +11 -6
  7. data/bin/ght-log-influx +190 -0
  8. data/bin/ght-queue-grep.rb +55 -0
  9. data/bin/ght-retrieve-users +6 -0
  10. data/bin/{ght-rm-dupl → ght-update-repo} +1 -1
  11. data/lib/ghtorrent.rb +4 -4
  12. data/lib/ghtorrent/adapters/base_adapter.rb +4 -11
  13. data/lib/ghtorrent/adapters/mongo_persister.rb +5 -9
  14. data/lib/ghtorrent/adapters/noop_persister.rb +0 -5
  15. data/lib/ghtorrent/api_client.rb +45 -119
  16. data/lib/ghtorrent/command.rb +25 -8
  17. data/lib/ghtorrent/commands/full_user_retriever.rb +50 -0
  18. data/lib/ghtorrent/commands/ght_data_retrieval.rb +12 -98
  19. data/lib/ghtorrent/commands/ght_get_more_commits.rb +13 -17
  20. data/lib/ghtorrent/commands/ght_load.rb +1 -2
  21. data/lib/ghtorrent/commands/ght_mirror_events.rb +8 -12
  22. data/lib/ghtorrent/commands/ght_retrieve_dependents.rb +0 -5
  23. data/lib/ghtorrent/commands/ght_retrieve_one.rb +1 -6
  24. data/lib/ghtorrent/commands/ght_retrieve_repo.rb +56 -26
  25. data/lib/ghtorrent/commands/ght_retrieve_repos.rb +5 -15
  26. data/lib/ghtorrent/commands/ght_retrieve_user.rb +13 -54
  27. data/lib/ghtorrent/commands/ght_retrieve_users.rb +49 -0
  28. data/lib/ghtorrent/commands/ght_update_repo.rb +126 -0
  29. data/lib/ghtorrent/event_processing.rb +140 -0
  30. data/lib/ghtorrent/ghtorrent.rb +330 -396
  31. data/lib/ghtorrent/logging.rb +65 -12
  32. data/lib/ghtorrent/migrations/014_add_deleted_to_projects.rb +1 -1
  33. data/lib/ghtorrent/migrations/019_add_fake_to_users.rb +1 -1
  34. data/lib/ghtorrent/migrations/020_add_deleted_to_users.rb +19 -0
  35. data/lib/ghtorrent/migrations/021_remove_ext_ref_id.rb +42 -0
  36. data/lib/ghtorrent/migrations/022_add_project_languages.rb +24 -0
  37. data/lib/ghtorrent/multiprocess_queue_client.rb +25 -5
  38. data/lib/ghtorrent/retriever.rb +100 -57
  39. data/lib/ghtorrent/settings.rb +14 -17
  40. data/lib/ghtorrent/{transacted_ghtorrent.rb → transacted_gh_torrent.rb} +28 -5
  41. data/lib/version.rb +1 -1
  42. metadata +14 -46
  43. data/bin/ght-process-event +0 -35
  44. data/lib/ghtorrent/cache.rb +0 -97
  45. data/lib/ghtorrent/commands/ght_rm_dupl.rb +0 -132
  46. data/lib/ghtorrent/gh_torrent_exception.rb +0 -6
  47. data/spec/api_client_spec.rb +0 -42
  48. data/spec/spec_helper.rb +0 -21
@@ -4,7 +4,6 @@ require 'pp'
4
4
 
5
5
  require 'ghtorrent/ghtorrent'
6
6
  require 'ghtorrent/settings'
7
- require 'ghtorrent/logging'
8
7
  require 'ghtorrent/command'
9
8
 
10
9
  class GHTRetrieveDependents < GHTorrent::Command
@@ -33,10 +32,6 @@ pull_request (owner repo pullreq_id)
33
32
  :pull_request => 3
34
33
  }
35
34
 
36
- def logger
37
- ghtorrent.logger
38
- end
39
-
40
35
  def persister
41
36
  @persister ||= connect(:mongo, settings)
42
37
  @persister
@@ -38,13 +38,8 @@ Retrieve just one item
38
38
  @persister
39
39
  end
40
40
 
41
- def ext_uniq
42
- @ext_uniq ||= config(:uniq_id)
43
- @ext_uniq
44
- end
45
-
46
41
  def ght
47
- @ght ||= TransactedGhtorrent.new(settings)
42
+ @ght ||= TransactedGHTorrent.new(settings)
48
43
  @ght
49
44
  end
50
45
 
@@ -2,7 +2,6 @@ require 'rubygems'
2
2
 
3
3
  require 'ghtorrent/ghtorrent'
4
4
  require 'ghtorrent/settings'
5
- require 'ghtorrent/logging'
6
5
  require 'ghtorrent/command'
7
6
  require 'ghtorrent/retriever'
8
7
 
@@ -11,6 +10,7 @@ class GHTRetrieveRepo < GHTorrent::Command
11
10
  include GHTorrent::Settings
12
11
  include GHTorrent::Retriever
13
12
  include GHTorrent::Persister
13
+ include GHTorrent::EventProcessing
14
14
 
15
15
  def prepare_options(options)
16
16
  options.banner <<-BANNER
@@ -19,35 +19,54 @@ An efficient way to get all data for a single repo
19
19
  #{command_name} [options] owner repo
20
20
 
21
21
  BANNER
22
+ options.opt :no_events, 'Skip retrieving events', :default => false
23
+ options.opt :no_entities, 'Skip retrieving entities', :default => false
24
+
25
+ options.opt :only_stage, "Only do the provided stage of entity retrieval (one of: #{stages.join(',')})",
26
+ :type => String
27
+ options.opt :exclude_events, 'Comma separated list of event types to exclude from processing',
28
+ :type => String
29
+ options.opt :events_after, 'Process all events later than the provided event id',
30
+ :type => Integer
31
+ options.opt :events_before, 'Process all events earlier than the provided event id',
32
+ :type => Integer
22
33
  end
23
34
 
24
35
  def validate
25
36
  super
26
- Trollop::die "Two arguments are required" unless args[0] && !args[0].empty?
37
+ Trollop::die 'Two arguments are required' unless args[0] && !args[0].empty?
38
+
39
+ unless options[:exclude_events].nil?
40
+ @exclude_event_types = options[:exclude_events].split(/,/)
41
+ else
42
+ @exclude_event_types = []
43
+ end
44
+
45
+ unless options[:only_stage].nil?
46
+ Trollop::die("Not a valid function: #{options[:only_stage]}") unless stages.include? options[:only_stage]
47
+ end
48
+
27
49
  end
28
50
 
29
- def logger
30
- ght.logger
51
+ def stages
52
+ %w(ensure_commits ensure_forks ensure_pull_requests
53
+ ensure_issues ensure_watchers ensure_labels ensure_languages) #ensure_project_members
31
54
  end
32
55
 
56
+
33
57
  def persister
34
58
  @persister ||= connect(:mongo, settings)
35
59
  @persister
36
60
  end
37
61
 
38
- def ext_uniq
39
- @ext_uniq ||= config(:uniq_id)
40
- @ext_uniq
41
- end
42
-
43
- def ght
44
- @ght ||= TransactedGhtorrent.new(settings)
45
- @ght
62
+ def ghtorrent
63
+ @ghtorrent ||= TransactedGHTorrent.new(settings)
64
+ @ghtorrent
46
65
  end
47
66
 
48
67
  def go
49
68
  self.settings = override_config(settings, :mirror_history_pages_back, 1000)
50
- user_entry = ght.transaction{ght.ensure_user(ARGV[0], false, false)}
69
+ user_entry = ghtorrent.transaction{ghtorrent.ensure_user(ARGV[0], false, false)}
51
70
 
52
71
  if user_entry.nil?
53
72
  Trollop::die "Cannot find user #{ARGV[0]}"
@@ -55,7 +74,7 @@ An efficient way to get all data for a single repo
55
74
 
56
75
  user = user_entry[:login]
57
76
 
58
- repo_entry = ght.transaction{ght.ensure_repo(ARGV[0], ARGV[1])}
77
+ repo_entry = ghtorrent.transaction{ghtorrent.ensure_repo(ARGV[0], ARGV[1])}
59
78
 
60
79
  if repo_entry.nil?
61
80
  Trollop::die "Cannot find repository #{ARGV[0]}/#{ARGV[1]}"
@@ -63,21 +82,32 @@ An efficient way to get all data for a single repo
63
82
 
64
83
  repo = repo_entry[:name]
65
84
 
66
- def send_message(function, user, repo)
67
- ght.send(function, user, repo)
85
+ unless options[:no_entities_given]
86
+ if options[:only_stage].nil?
87
+ stages.each do |x|
88
+ ghtorrent.send(x, user, repo)
89
+ end
90
+ else
91
+ ghtorrent.send(options[:only_stage], user, repo)
92
+ end
68
93
  end
69
94
 
70
- functions = %w(ensure_commits ensure_forks ensure_pull_requests
71
- ensure_issues ensure_project_members ensure_watchers ensure_labels)
72
-
73
- if ARGV[2].nil?
74
- functions.each do |x|
75
- send_message(x, user, repo)
95
+ # Process repo events
96
+ unless options[:no_events_given]
97
+ events = get_repo_events(ARGV[0], ARGV[1]).sort{|e| e['id'].to_i}
98
+ events.each do |event|
99
+ begin
100
+ next if @exclude_event_types.include? event['type']
101
+ next if options[:events_after_given] and event['id'].to_i <= options[:events_after]
102
+ next if options[:events_before_given] and event['id'].to_i >= options[:events_before]
103
+
104
+ send(event['type'], event)
105
+ puts "Processed event #{event['type']}-#{event['id']}"
106
+ rescue StandardError => e
107
+ puts "Could not process event #{event['type']}-#{event['id']}: #{e.message}"
108
+ end
76
109
  end
77
- else
78
- Trollop::die("Not a valid function: #{ARGV[2]}") unless functions.include? ARGV[2]
79
- send_message(ARGV[2], user, repo)
80
110
  end
111
+
81
112
  end
82
113
  end
83
-
@@ -28,13 +28,8 @@ class GHTRepoRetriever
28
28
  ght.logger
29
29
  end
30
30
 
31
- def ext_uniq
32
- @ext_uniq ||= config(:uniq_id)
33
- @ext_uniq
34
- end
35
-
36
31
  def ght
37
- @ght ||= TransactedGhtorrent.new(@config)
32
+ @ght ||= TransactedGHTorrent.new(@config)
38
33
  @ght
39
34
  end
40
35
 
@@ -58,7 +53,7 @@ class GHTRepoRetriever
58
53
  warn("Trying to get user #{owner}, attempt #{i}")
59
54
  begin
60
55
  user_entry = ght.transaction { ght.ensure_user(owner, false, false) }
61
- rescue Exception => e
56
+ rescue StandardError => e
62
57
  warn e.message
63
58
  end
64
59
  end
@@ -69,11 +64,7 @@ class GHTRepoRetriever
69
64
  end
70
65
 
71
66
  repo_entry = ght.transaction { ght.ensure_repo(owner, repo,
72
- commits = false,
73
- project_members = false,
74
- watchers = false,
75
- forks = false,
76
- labels = false) }
67
+ recursive = false) }
77
68
 
78
69
  if repo_entry.nil?
79
70
  warn("Cannot find repository #{owner}/#{repo}")
@@ -83,8 +74,7 @@ class GHTRepoRetriever
83
74
  debug("Retrieving repo #{owner}/#{repo}")
84
75
 
85
76
  retrieval_stages = %w(ensure_commits ensure_forks ensure_pull_requests
86
- ensure_issues ensure_project_members
87
- ensure_watchers ensure_labels)
77
+ ensure_issues ensure_watchers ensure_labels) # ensure_project_members
88
78
 
89
79
  retrieval_stages.each do |x|
90
80
  run_retrieval_stage(ght, owner, repo, x)
@@ -114,7 +104,7 @@ class GHTRepoRetriever
114
104
  else
115
105
  ght.send(function, owner, repo)
116
106
  end
117
- rescue Exception => e
107
+ rescue StandardError => e
118
108
  warn("Error processing #{function} for #{owner}/#{repo}")
119
109
  warn("Exception message #{$!}")
120
110
  warn("Exception trace #{e.backtrace.join("\n")}")
@@ -2,19 +2,21 @@ require 'rubygems'
2
2
 
3
3
  require 'ghtorrent/ghtorrent'
4
4
  require 'ghtorrent/settings'
5
- require 'ghtorrent/logging'
6
5
  require 'ghtorrent/command'
7
6
  require 'ghtorrent/retriever'
8
- require 'ghtorrent/transacted_ghtorrent'
9
- require 'ghtorrent/commands/ght_retrieve_repo'
7
+ require 'ghtorrent/transacted_gh_torrent'
8
+ require 'ghtorrent/commands/full_user_retriever'
10
9
 
11
- class GHTRetrieveUser < GHTRetrieveRepo
10
+ class GHTRetrieveUser < GHTorrent::Command
11
+
12
+ include GHTorrent::Retriever
13
+ include GHTorrent::Commands::FullUserRetriever
12
14
 
13
15
  def prepare_options(options)
14
16
  options.banner <<-BANNER
15
17
  An efficient way to get all data for a single user
16
18
 
17
- #{command_name} [options] user
19
+ #{command_name} [options] login
18
20
 
19
21
  BANNER
20
22
  end
@@ -24,57 +26,14 @@ An efficient way to get all data for a single user
24
26
  Trollop::die "One argument is required" unless args[0] && !args[0].empty?
25
27
  end
26
28
 
27
- def go
28
- self.settings = override_config(settings, :mirror_history_pages_back, -1)
29
- user_entry = ght.transaction{ght.ensure_user(ARGV[0], false, false)}
30
-
31
- if user_entry.nil?
32
- Trollop::die "Cannot find user #{ARGV[0]}"
33
- end
34
-
35
- user = user_entry[:login]
36
-
37
- def send_message(function, user)
38
- begin
39
- ght.send(function, user)
40
- rescue Exception => e
41
- puts STDERR, e.message
42
- puts STDERR, e.backtrace
43
- end
44
- end
45
-
46
- functions = %w(ensure_user_followers ensure_orgs ensure_org)
47
-
48
- if ARGV[1].nil?
49
- functions.each do |x|
50
- send_message(x, user)
51
- end
52
- else
53
- Trollop::die("Not a valid function: #{ARGV[1]}") unless functions.include? ARGV[1]
54
- send_message(ARGV[1], user)
55
- end
56
-
57
- end
58
- end
59
-
60
- class TransactedGhtorrent
61
-
62
- def ensure_user_followers(user)
63
- check_transaction do
64
- super(user)
65
- end
29
+ def ght
30
+ @ght ||= TransactedGHTorrent.new(settings)
31
+ @ght
66
32
  end
67
33
 
68
- def ensure_orgs(user)
69
- check_transaction do
70
- super(user)
71
- end
72
- end
73
-
74
- def ensure_org(user, members = true)
75
- check_transaction do
76
- super(user, members)
77
- end
34
+ def go
35
+ login = ARGV[0]
36
+ retrieve_user(login)
78
37
  end
79
38
 
80
39
  end
@@ -0,0 +1,49 @@
1
+ require 'ghtorrent/retriever'
2
+ require 'ghtorrent/transacted_gh_torrent'
3
+ require 'ghtorrent/commands/full_user_retriever'
4
+
5
+ # Retrieve user information en masse
6
+ class GHTRetrieveUsers < MultiprocessQueueClient
7
+
8
+ def clazz
9
+ GHTUserRetriever
10
+ end
11
+
12
+ end
13
+
14
+ # Initialize a user retrieval process
15
+ class GHTUserRetriever
16
+
17
+ include GHTorrent::Retriever
18
+ include GHTorrent::Commands::FullUserRetriever
19
+
20
+ attr_accessor :ght
21
+
22
+ def initialize(config, queue)
23
+ @config = config
24
+ @queue = queue
25
+ end
26
+
27
+ def settings
28
+ @config
29
+ end
30
+
31
+ def run(command)
32
+
33
+ processor = Proc.new do |user|
34
+ @ght ||= TransactedGHTorrent.new(@config)
35
+
36
+ retrieve_user(user)
37
+ end
38
+
39
+ command.queue_client(@queue, :after, processor)
40
+
41
+ end
42
+
43
+ def stop
44
+ warn('Stop flag set, waiting for operations to finish')
45
+ @stop = true
46
+ end
47
+
48
+ end
49
+
@@ -0,0 +1,126 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'ghtorrent'
4
+
5
+ class GHTUpdateRepo < GHTorrent::Command
6
+
7
+ include GHTorrent::Settings
8
+ include GHTorrent::Retriever
9
+ include GHTorrent::Persister
10
+ include GHTorrent::Logging
11
+
12
+ def prepare_options(options)
13
+ options.banner <<-BANNER
14
+ Updates the deleted field in the project table with current data
15
+
16
+ #{command_name} owner repo
17
+
18
+ BANNER
19
+ end
20
+
21
+ def validate
22
+ super
23
+ Trollop::die "Takes two arguments" if ARGV.size == 1
24
+ end
25
+
26
+ def persister
27
+ @persister ||= connect(:mongo, settings)
28
+ @persister
29
+ end
30
+
31
+ def db
32
+ @db ||= @ght.get_db
33
+ end
34
+
35
+ def date(arg)
36
+ if arg.class != Time
37
+ Time.parse(arg)#.to_i
38
+ else
39
+ arg
40
+ end
41
+ end
42
+
43
+ def set_deleted(owner, repo)
44
+ db.from(:projects, :users).\
45
+ where(:projects__owner_id => :users__id).\
46
+ where(:users__login => owner).\
47
+ where(:projects__name => repo).\
48
+ update(:projects__deleted => true)
49
+ info("Project #{owner}/#{repo} marked as deleted")
50
+ end
51
+
52
+ def update_mysql(owner, repo, retrieved)
53
+
54
+ parent = unless retrieved['parent'].nil?
55
+ @ght.ensure_repo(retrieved['parent']['owner']['login'],
56
+ retrieved['parent']['name'])
57
+ end
58
+
59
+ db.from(:projects, :users).\
60
+ where(:projects__owner_id => :users__id).\
61
+ where(:users__login => owner).\
62
+ where(:projects__name => repo).\
63
+ update(
64
+ :projects__url => retrieved['url'],
65
+ :projects__description => retrieved['description'],
66
+ :projects__language => retrieved['language'],
67
+ :projects__created_at => date(retrieved['created_at']),
68
+ :projects__forked_from => unless parent.nil? then parent[:id] end)
69
+ debug("Repo #{owner}/#{repo} updated")
70
+
71
+ @ght.ensure_languages(owner, repo)
72
+ end
73
+
74
+ def process_project(owner, name)
75
+ @ght.transaction do
76
+
77
+ in_mongo = persister.find(:repos, {'owner.login' => owner, 'name' => name })
78
+ on_github = api_request(ghurl ("repos/#{owner}/#{name}"))
79
+
80
+ unless in_mongo.empty? and on_github.empty?
81
+ in_mysql = retrieve_repo(owner, name)
82
+ end
83
+
84
+ if in_mongo.empty?
85
+ if on_github.empty?
86
+ if in_mysql.nil?
87
+ # Project does not exist anywhere
88
+ warn "Repo #{owner}/#{name} does not exist in MySQL"
89
+ else
90
+ # Project exists in MySQL but not on Github or Mongo
91
+ # Mark it as deleted
92
+ set_deleted(owner, name)
93
+ end
94
+ else
95
+ # Project does not exist in Mongo, but exists in Github
96
+ if in_mysql.nil?
97
+ warn "Repo #{owner}/#{name} does not exist in MySQL"
98
+ else
99
+ # The retrieval process already added it to Mongo, so update MySQL
100
+ update_mysql(owner, name, in_mysql)
101
+ end
102
+ end
103
+ else
104
+ if on_github.empty?
105
+ # Project was deleted on Github. Mark it as deleted.
106
+ set_deleted(owner, name)
107
+ else
108
+ update_mysql(owner, name, in_mysql)
109
+ end
110
+ end
111
+ end
112
+ end
113
+
114
+ def go
115
+
116
+ @ght ||= GHTorrent::Mirror.new(settings)
117
+
118
+ unless ARGV[1].nil?
119
+ process_project(ARGV[0], ARGV[1])
120
+ exit(0)
121
+ end
122
+
123
+ end
124
+ end
125
+
126
+ GHTUpdateRepo.run