ghtorrent 0.4 → 0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38)
  1. data/CHANGELOG +24 -0
  2. data/Gemfile +17 -0
  3. data/Gemfile.lock +40 -0
  4. data/README.md +23 -22
  5. data/bin/ght-data-retrieval +66 -24
  6. data/bin/ght-load +41 -19
  7. data/bin/ght-mirror-events +13 -16
  8. data/bin/ght-rm-dupl +119 -77
  9. data/lib/ghtorrent.rb +14 -4
  10. data/lib/ghtorrent/adapters/base_adapter.rb +17 -5
  11. data/lib/ghtorrent/adapters/mongo_persister.rb +122 -56
  12. data/lib/ghtorrent/api_client.rb +151 -16
  13. data/lib/ghtorrent/bson_orderedhash.rb +23 -0
  14. data/lib/ghtorrent/cache.rb +97 -0
  15. data/lib/ghtorrent/command.rb +43 -25
  16. data/lib/ghtorrent/gh_torrent_exception.rb +6 -0
  17. data/lib/ghtorrent/ghtorrent.rb +615 -164
  18. data/lib/ghtorrent/hash.rb +11 -0
  19. data/lib/ghtorrent/logging.rb +11 -7
  20. data/lib/ghtorrent/migrations/001_init_schema.rb +3 -3
  21. data/lib/ghtorrent/migrations/002_add_external_ref_ids.rb +2 -0
  22. data/lib/ghtorrent/migrations/003_add_orgs.rb +4 -1
  23. data/lib/ghtorrent/migrations/004_add_commit_comments.rb +4 -2
  24. data/lib/ghtorrent/migrations/005_add_repo_collaborators.rb +2 -0
  25. data/lib/ghtorrent/migrations/006_add_watchers.rb +2 -0
  26. data/lib/ghtorrent/migrations/007_add_pull_requests.rb +64 -0
  27. data/lib/ghtorrent/migrations/008_add_project_unq.rb +23 -0
  28. data/lib/ghtorrent/migrations/009_add_project_commit.rb +27 -0
  29. data/lib/ghtorrent/migrations/010_add_forks.rb +28 -0
  30. data/lib/ghtorrent/migrations/mysql_defaults.rb +6 -0
  31. data/lib/ghtorrent/persister.rb +3 -0
  32. data/lib/ghtorrent/retriever.rb +298 -102
  33. data/lib/ghtorrent/settings.rb +20 -1
  34. data/lib/ghtorrent/time.rb +5 -0
  35. data/lib/ghtorrent/utils.rb +22 -4
  36. data/lib/version.rb +5 -0
  37. metadata +173 -145
  38. data/lib/ghtorrent/call_stack.rb +0 -91
@@ -1,11 +1,18 @@
1
+ #!/usr/bin/env ruby
2
+
1
3
  require 'rubygems'
2
4
  require 'yaml'
3
5
  require 'amqp'
4
6
  require 'eventmachine'
5
- require 'ghtorrent'
6
7
  require 'json'
7
8
  require 'logger'
8
9
 
10
+ require 'ghtorrent/api_client'
11
+ require 'ghtorrent/settings'
12
+ require 'ghtorrent/logging'
13
+ require 'ghtorrent/persister'
14
+ require 'ghtorrent/command'
15
+
9
16
  class GHTMirrorEvents < GHTorrent::Command
10
17
 
11
18
  include GHTorrent::Settings
@@ -13,12 +20,8 @@ class GHTMirrorEvents < GHTorrent::Command
13
20
  include GHTorrent::Persister
14
21
  include GHTorrent::APIClient
15
22
 
16
- attr_reader :settings
17
-
18
- def initialize(args)
19
- super(args)
20
- @args = args
21
- @name = self.class.name
23
+ def logger
24
+ @logger
22
25
  end
23
26
 
24
27
  def store_count(events)
@@ -42,7 +45,7 @@ class GHTMirrorEvents < GHTorrent::Command
42
45
  def retrieve(exchange)
43
46
  begin
44
47
  new = dupl = 0
45
- events = api_request "https://api.github.com/events"
48
+ events = api_request "https://api.github.com/events", false
46
49
  (new, dupl, stored) = store_count events
47
50
 
48
51
  # This means that first page cannot contain all new events. Go
@@ -67,23 +70,17 @@ class GHTMirrorEvents < GHTorrent::Command
67
70
  end
68
71
  end
69
72
 
70
- def prepare_options(options)
71
- @name = "ght-mirror-events"
72
- end
73
-
74
73
  def go
75
- @gh = GHTorrent::Mirror.new(options[:config])
76
- @settings = @gh.settings
77
74
  @persister = connect(:mongo, @settings)
78
75
  @logger = Logger.new(STDOUT)
79
76
 
80
77
  # Graceful exit
81
78
  Signal.trap('INT') {
82
- info ("Received SIGINT, exiting")
79
+ info "Received SIGINT, exiting"
83
80
  AMQP.stop { EM.stop }
84
81
  }
85
82
  Signal.trap('TERM') {
86
- info ("Received SIGTERM, exiting")
83
+ info "Received SIGTERM, exiting"
87
84
  AMQP.stop { EM.stop }
88
85
  }
89
86
 
data/bin/ght-rm-dupl CHANGED
@@ -1,92 +1,134 @@
1
+ #!/usr/bin/env ruby
2
+
1
3
  require 'rubygems'
2
4
  require 'mongo'
3
- require 'ghtorrent-old'
4
-
5
- GH = Mirror.new
6
- GH.init("config.yaml")
7
-
8
- # Unique keys per known collection
9
- per_col = {
10
- :commits => {
11
- :payload => "commit.id",
12
- :col => GH.commits_col,
13
- },
14
- :events => {
15
- :payload => "id",
16
- :col => GH.events_col,
5
+
6
+ require 'ghtorrent/settings'
7
+ require 'ghtorrent/logging'
8
+ require 'ghtorrent/command'
9
+ require 'ghtorrent/persister'
10
+
11
+ class GHRMDupl < GHTorrent::Command
12
+
13
+ include GHTorrent::Settings
14
+ include GHTorrent::Persister
15
+
16
+ def col_info()
17
+ {
18
+ :commits => {
19
+ :unq => "sha",
20
+ :col => persister.get_underlying_connection.collection(:commits.to_s),
21
+ },
22
+ :events => {
23
+ :unq => "id",
24
+ :col => persister.get_underlying_connection.collection(:events.to_s),
25
+ }
17
26
  }
18
- }
19
-
20
- # Print MongoDB remove statements that
21
- # remove all but one entries for each commit.
22
- def remove_duplicates(data, col)
23
- removed = 0
24
- data.select { |k, v| v.size > 1 }.each do |k, v|
25
- v.slice(0..(v.size - 2)).map do |x|
26
- removed += 1 if delete_by_id col, x
27
- end
28
27
  end
29
- removed
30
- end
31
28
 
32
- def delete_by_id(col, id)
33
- begin
34
- col.remove({'_id' => id})
35
- true
36
- rescue Mongo::OperationFailure
37
- puts "Cannot remove record with id #{id} from #{col.name}"
38
- false
29
+ def persister
30
+ @persister ||= connect(:mongo, @settings)
31
+ @persister
39
32
  end
40
- end
41
33
 
42
- which = case ARGV[0]
43
- when "commits" then :commits
44
- when "events" then :events
45
- else puts "Not a known collection name: #{ARGV[0]}\n"
46
- end
47
-
48
- from = case ARGV[1]
49
- when nil then {}
50
- else
51
- t = Time.at(ARGV[1].to_i)
52
- STDERR.puts "Searching for duplicates after #{t}"
53
- {'_id' => {'$gte' => BSON::ObjectId.from_time(t)}}
54
- end
55
-
56
- # Various counters to report stats
57
- processed = total_processed = removed = 0
58
-
59
- data = Hash.new
60
-
61
- # The following code needs to save intermediate results to cope
62
- # with large datasets
63
- per_col[which][:col].find(from, :fields => per_col[which][:payload]).each do |r|
64
- _id = r["_id"]
65
- commit = GH.read_value(r, per_col[which][:payload])
66
-
67
- # If entries cannot be parsed, remove them
68
- if commit.empty?
69
- puts "Deleting unknown entry #{_id}"
70
- removed += 1 if delete_by_id per_col[which][:col], _id
71
- else
72
- data[commit] = [] if data[commit].nil?
73
- data[commit] << _id
34
+ def prepare_options(options)
35
+ options.banner <<-BANNER
36
+ Removes duplicate entries from collections
37
+
38
+ #{command_name} [options] collection
39
+
40
+ #{command_name} options:
41
+ BANNER
42
+
43
+ options.opt :earliest, 'Seconds since epoch of earliest item to load',
44
+ :short => 'e', :default => 0, :type => :int
45
+ options.opt :snapshot, 'Perform clean up every x records',
46
+ :short => 's', :default => -1, :type => :int
74
47
  end
75
48
 
76
- processed += 1
77
- total_processed += 1
49
+ def validate
50
+ super
51
+ Trollop::die "no collection specified" unless args[0] && !args[0].empty?
52
+ end
78
53
 
79
- print "\rProcessed #{processed} records"
54
+ # Print MongoDB remove statements that
55
+ # remove all but one entries for each commit.
56
+ def remove_duplicates(data, col)
57
+ removed = 0
58
+ data.select { |k, v| v.size > 1 }.each do |k, v|
59
+ v.slice(0..(v.size - 2)).map do |x|
60
+ removed += 1 if delete_by_id col, x
61
+ end
62
+ end
63
+ removed
64
+ end
65
+
66
+ def delete_by_id(col, id)
67
+ begin
68
+ col.remove({'_id' => id})
69
+ true
70
+ rescue Mongo::OperationFailure
71
+ puts "Cannot remove record with id #{id} from #{col.name}"
72
+ false
73
+ end
74
+ end
75
+
76
+ def go
77
+ collection = case ARGV[0]
78
+ when "commits" then
79
+ :commits
80
+ when "events" then
81
+ :events
82
+ else
83
+ puts "Not a known collection name: #{ARGV[0]}\n"
84
+ end
85
+
86
+ from = {'_id' => {'$gte' => BSON::ObjectId.from_time(Time.at(options[:earliest]))}}
87
+
88
+ snapshot = options[:snapshot]
89
+
90
+ puts "Deleting duplicates from collection #{collection}"
91
+ puts "Deleting duplicates after #{Time.at(options[:earliest])}"
92
+ puts "Perform clean up every #{snapshot} records"
93
+
94
+ # Various counters to report stats
95
+ processed = total_processed = removed = 0
80
96
 
81
- # Calculate duplicates, save intermediate result
82
- if processed > 500000
83
- puts "\nLoaded #{data.size} values, cleaning"
84
- removed += remove_duplicates data, per_col[which][:col]
85
97
  data = Hash.new
86
- processed = 0
98
+
99
+ # The following code needs to save intermediate results to cope
100
+ # with large datasets
101
+ col_info[collection][:col].find(from, :fields => col_info[collection][:unq]).each do |r|
102
+ _id = r["_id"]
103
+ commit = read_value(r, col_info[collection][:unq])
104
+
105
+ # If entries cannot be parsed, remove them
106
+ if commit.empty?
107
+ puts "Deleting unknown entry #{_id}"
108
+ removed += 1 if delete_by_id col_info[collection][:col], _id
109
+ else
110
+ data[commit] = [] if data[commit].nil?
111
+ data[commit] << _id
112
+ end
113
+
114
+ processed += 1
115
+ total_processed += 1
116
+
117
+ print "\rProcessed #{processed} records"
118
+
119
+ # Calculate duplicates, save intermediate result
120
+ if snapshot > 0 and processed > snapshot
121
+ puts "\nLoaded #{data.size} values, cleaning"
122
+ removed += remove_duplicates data, col_info[collection][:col]
123
+ data = Hash.new
124
+ processed = 0
125
+ end
126
+ end
127
+
128
+ removed += remove_duplicates data, col_info[collection][:col]
129
+
130
+ puts "\nProcessed #{total_processed}, deleted #{removed} duplicates"
87
131
  end
88
132
  end
89
133
 
90
- removed += remove_duplicates data, per_col[which][:col]
91
-
92
- puts "Processed #{total_processed}, deleted #{removed} duplicates"
134
+ GHRMDupl.run
data/lib/ghtorrent.rb CHANGED
@@ -1,6 +1,4 @@
1
1
  module GHTorrent
2
- VERSION = '0.4'
3
-
4
2
  # Route keys used for setting up queues for events, using GHTorrent
5
3
  ROUTEKEY_CREATE = "evt.CreateEvent"
6
4
  ROUTEKEY_DELETE = "evt.DeleteEvent"
@@ -22,19 +20,31 @@ module GHTorrent
22
20
 
23
21
  end
24
22
 
25
- require 'ghtorrent/command'
23
+ # Shared extensions to library methods
24
+ require 'ghtorrent/hash'
25
+ require 'ghtorrent/time'
26
+ require 'ghtorrent/bson_orderedhash'
26
27
 
28
+ # Basic utility modules
29
+ require 'version'
30
+ require 'ghtorrent/gh_torrent_exception'
27
31
  require 'ghtorrent/utils'
28
32
  require 'ghtorrent/logging'
29
33
  require 'ghtorrent/settings'
34
+ require 'ghtorrent/cache'
30
35
  require 'ghtorrent/api_client'
31
- require 'ghtorrent/call_stack'
32
36
 
37
+ # Support for command line utilities offered by this gem
38
+ require 'ghtorrent/command'
39
+
40
+ # Configuration and drivers for caching retrieved data
33
41
  require 'ghtorrent/adapters/base_adapter'
34
42
  require 'ghtorrent/adapters/mongo_persister'
35
43
  require 'ghtorrent/adapters/noop_persister'
36
44
 
45
+ # Support for retrieving and saving intermediate results
37
46
  require 'ghtorrent/persister'
38
47
  require 'ghtorrent/retriever'
39
48
 
49
+ # SQL database fillup methods
40
50
  require 'ghtorrent/ghtorrent'
@@ -3,13 +3,14 @@ module GHTorrent
3
3
  class BaseAdapter
4
4
 
5
5
  ENTITIES = [:users, :commits, :followers, :repos, :events, :org_members,
6
- :commit_comments, :repo_collaborators, :watchers
6
+ :commit_comments, :repo_collaborators, :watchers, :pull_requests,
7
+ :forks, :pull_request_comments, :issue_comments, :issues
7
8
  ]
8
9
 
9
10
  # Stores +data+ into +entity+. Returns a unique key for the stored entry.
10
11
  def store(entity, data = {})
11
12
  unless ENTITIES.include?(entity)
12
- throw GHTorrentException.new("Perister: Entity #{entity} not known")
13
+ raise GHTorrentException.new("Perister: Entity #{entity} not known")
13
14
  end
14
15
  end
15
16
 
@@ -50,14 +51,14 @@ module GHTorrent
50
51
  # matching JSON object.
51
52
  def find(entity, query = {})
52
53
  unless ENTITIES.include?(entity)
53
- throw GHTorrentException.new("Perister: Entity #{entity} not known")
54
+ raise GHTorrentException.new("Perister: Entity #{entity} not known")
54
55
  end
55
56
  end
56
57
 
57
58
  # Find the record identified by +id+ in +entity+
58
59
  def find_by_ext_ref_id(entity, id)
59
60
  unless ENTITIES.include?(entity)
60
- throw GHTorrentException.new("Perister: Entity #{entity} not known")
61
+ raise GHTorrentException.new("Perister: Entity #{entity} not known")
61
62
  end
62
63
  end
63
64
 
@@ -65,8 +66,19 @@ module GHTorrent
65
66
  # The +query+ can be any query supported by +find+.
66
67
  def count(entity, query = {})
67
68
  unless ENTITIES.include?(entity)
68
- throw GHTorrentException.new("Perister: Entity #{entity} not known")
69
+ raise GHTorrentException.new("Perister: Entity #{entity} not known")
69
70
  end
70
71
  end
72
+
73
+ # Get a raw connection to the underlying data store. The connection is
74
+ # implementation dependent.
75
+ def get_underlying_connection
76
+ raise "Unimplemented"
77
+ end
78
+
79
+ # Close the current connection and release any held resources
80
+ def close
81
+ raise "Unimplemented"
82
+ end
71
83
  end
72
84
  end
@@ -1,4 +1,6 @@
1
1
  require 'mongo'
2
+ require 'ghtorrent/adapters/base_adapter'
3
+ require 'ghtorrent/bson_orderedhash'
2
4
 
3
5
  module GHTorrent
4
6
 
@@ -14,7 +16,8 @@ module GHTorrent
14
16
  :mongo_port => "mongo.port",
15
17
  :mongo_db => "mongo.db",
16
18
  :mongo_username => "mongo.username",
17
- :mongo_passwd => "mongo.password"
19
+ :mongo_passwd => "mongo.password",
20
+ :mongo_replicas => "mongo.replicas"
18
21
  }
19
22
 
20
23
  attr_reader :settings
@@ -27,47 +30,21 @@ module GHTorrent
27
30
 
28
31
  @settings = set
29
32
  @uniq = config(:uniq_id)
30
- @mongo = Mongo::Connection.new(config(:mongo_host),
31
- config(:mongo_port))\
32
- .db(config(:mongo_db))
33
- @enttodb = {
34
- :users => get_collection("users"),
35
- :commits => get_collection("commits"),
36
- :repos => get_collection("repos"),
37
- :followers => get_collection("followers"),
38
- :events => get_collection("events"),
39
- :org_members => get_collection("org_members"),
40
- :commit_comments => get_collection("commit_comments"),
41
- :repo_collaborators => get_collection("repo_collaborators"),
42
- :watchers => get_collection("watchers")
43
- }
44
-
45
- # Ensure that the necessary indexes exist
46
- ensure_index(:events, "id")
47
- ensure_index(:users, "login")
48
- ensure_index(:commits, "sha")
49
- ensure_index(:repos, "name")
50
- ensure_index(:followers, "follows")
51
- ensure_index(:org_members, "org")
52
- ensure_index(:commit_comments, "repo")
53
- ensure_index(:commit_comments, "user")
54
- ensure_index(:commit_comments, "commit_id")
55
- ensure_index(:repo_collaborators, "repo")
56
- ensure_index(:repo_collaborators, "owner")
57
- ensure_index(:repo_collaborators, "login")
58
- ensure_index(:watchers, "repo")
59
- ensure_index(:watchers, "owner")
60
- ensure_index(:watchers, "login")
61
33
  end
62
34
 
63
35
  def store(entity, data = {})
64
36
  super
65
- get_entity(entity).insert(data).to_s
37
+ rescue_connection_failure do
38
+ get_entity(entity).insert(data).to_s
39
+ end
66
40
  end
67
41
 
68
42
  def find(entity, query = {})
69
43
  super
70
- result = get_entity(entity).find(query)
44
+ result = rescue_connection_failure do
45
+ get_entity(entity).find(query)
46
+ end
47
+
71
48
  result.to_a.map { |r|
72
49
  r[@uniq] = r['_id'].to_s;
73
50
  r.to_h
@@ -83,27 +60,86 @@ module GHTorrent
83
60
  # Count the number of items returned by +query+
84
61
  def count(entity, query)
85
62
  super
86
- get_entity(entity).count(:query => query)
63
+ rescue_connection_failure do
64
+ get_entity(entity).count(:query => query)
65
+ end
66
+ end
67
+
68
+ def get_underlying_connection
69
+ mongo
70
+ end
71
+
72
+ def close
73
+ unless @mongo.nil?
74
+ @mongo.close if @mongo.class == Mongo::ReplSetConnection
75
+ @mongo.connection.close if @mongo.class == Mongo::Connection
76
+
77
+ @mongo = nil
78
+ end
87
79
  end
88
80
 
89
81
  private
90
82
 
91
83
  def get_collection(col)
92
- @mongo.collection(col.to_s)
84
+ mongo.collection(col.to_s)
93
85
  end
94
86
 
95
87
  def get_entity(entity)
96
- col = @enttodb[entity]
88
+ case entity
89
+ when :users
90
+ get_collection("users")
91
+ when :commits
92
+ get_collection("commits")
93
+ when :repos
94
+ get_collection("repos")
95
+ when :followers
96
+ get_collection("followers")
97
+ when :org_members
98
+ get_collection("org_members")
99
+ when :events
100
+ get_collection("events")
101
+ when :commit_comments
102
+ get_collection("commit_comments")
103
+ when :repo_collaborators
104
+ get_collection("repo_collaborators")
105
+ when :watchers
106
+ get_collection("watchers")
107
+ when :pull_requests
108
+ get_collection("pull_requests")
109
+ when :forks
110
+ get_collection("forks")
111
+ when :pull_request_comments
112
+ get_collection("pull_request_comments")
113
+ when :issue_comments
114
+ get_collection("issue_comments")
115
+ end
116
+ end
97
117
 
98
- if col.nil?
99
- raise GHTorrentException.new("Mongo: Entity #{entity} not supported")
118
+ def mongo
119
+ if @mongo.nil?
120
+
121
+ replicas = config(:mongo_replicas)
122
+
123
+ @mongo = if replicas.nil?
124
+ Mongo::Connection.new(config(:mongo_host),
125
+ config(:mongo_port))\
126
+ .db(config(:mongo_db))
127
+ else
128
+ repl_arr = replicas.strip.split(/ /).map{|x| "#{x}:#{config(:mongo_port)}"}
129
+ repl_arr << "#{config(:mongo_host)}:#{config(:mongo_port)}"
130
+ Mongo::ReplSetConnection.new(repl_arr, :read => :secondary)\
131
+ .db(config(:mongo_db))
132
+ end
133
+ init_db(@mongo) if @mongo.collections.size <= 0
134
+ @mongo
135
+ else
136
+ @mongo
100
137
  end
101
- col
102
138
  end
103
139
 
104
140
  # Declare an index on +field+ for +collection+ if it does not exist
105
141
  def ensure_index(collection, field)
106
- col = @enttodb[collection]
142
+ col = get_entity(collection)
107
143
 
108
144
  exists = col.index_information.find {|k,v|
109
145
  k == "#{field}_1"
@@ -115,21 +151,51 @@ module GHTorrent
115
151
  end
116
152
  end
117
153
 
118
- end
119
- end
154
+ def init_db(mongo)
155
+ ENTITIES.each {|x| mongo.collection(x.to_s)}
156
+
157
+ # Ensure that the necessary indexes exist
158
+ ensure_index(:events, "id")
159
+ ensure_index(:users, "login")
160
+ ensure_index(:commits, "sha")
161
+ ensure_index(:repos, "name")
162
+ ensure_index(:followers, "follows")
163
+ ensure_index(:org_members, "org")
164
+ ensure_index(:commit_comments, "repo")
165
+ ensure_index(:commit_comments, "user")
166
+ ensure_index(:commit_comments, "commit_id")
167
+ ensure_index(:repo_collaborators, "repo")
168
+ ensure_index(:repo_collaborators, "owner")
169
+ ensure_index(:repo_collaborators, "login")
170
+ ensure_index(:watchers, "repo")
171
+ ensure_index(:watchers, "owner")
172
+ ensure_index(:watchers, "login")
173
+ ensure_index(:pull_requests, "repo")
174
+ ensure_index(:pull_requests, "owner")
175
+ ensure_index(:forks, "repo")
176
+ ensure_index(:forks, "owner")
177
+ ensure_index(:forks, "id")
178
+ ensure_index(:issue_comments, "repo")
179
+ ensure_index(:issue_comments, "owner")
180
+ ensure_index(:issue_comments, "issue_id")
181
+ ensure_index(:issue_comments, "id")
182
+ ensure_index(:pull_request_comments, "repo")
183
+ ensure_index(:pull_request_comments, "owner")
184
+ ensure_index(:pull_request_comments, "pullreq_id")
185
+ ensure_index(:pull_request_comments, "id")
186
+ end
120
187
 
121
- class BSON::OrderedHash
122
-
123
- # Convert a BSON result to a +Hash+
124
- def to_h
125
- inject({}) do |acc, element|
126
- k, v = element;
127
- acc[k] = if v.class == BSON::OrderedHash then
128
- v.to_h
129
- else
130
- v
131
- end;
132
- acc
188
+ def rescue_connection_failure(max_retries=60)
189
+ retries = 0
190
+ begin
191
+ yield
192
+ rescue Mongo::ConnectionFailure => ex
193
+ retries += 1
194
+ raise ex if retries > max_retries
195
+ sleep(0.5)
196
+ @mongo.refresh if @mongo.class == Mongo::ReplSetConnection
197
+ retry
198
+ end
133
199
  end
134
200
  end
135
- end
201
+ end