ghtorrent 0.4 → 0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. data/CHANGELOG +24 -0
  2. data/Gemfile +17 -0
  3. data/Gemfile.lock +40 -0
  4. data/README.md +23 -22
  5. data/bin/ght-data-retrieval +66 -24
  6. data/bin/ght-load +41 -19
  7. data/bin/ght-mirror-events +13 -16
  8. data/bin/ght-rm-dupl +119 -77
  9. data/lib/ghtorrent.rb +14 -4
  10. data/lib/ghtorrent/adapters/base_adapter.rb +17 -5
  11. data/lib/ghtorrent/adapters/mongo_persister.rb +122 -56
  12. data/lib/ghtorrent/api_client.rb +151 -16
  13. data/lib/ghtorrent/bson_orderedhash.rb +23 -0
  14. data/lib/ghtorrent/cache.rb +97 -0
  15. data/lib/ghtorrent/command.rb +43 -25
  16. data/lib/ghtorrent/gh_torrent_exception.rb +6 -0
  17. data/lib/ghtorrent/ghtorrent.rb +615 -164
  18. data/lib/ghtorrent/hash.rb +11 -0
  19. data/lib/ghtorrent/logging.rb +11 -7
  20. data/lib/ghtorrent/migrations/001_init_schema.rb +3 -3
  21. data/lib/ghtorrent/migrations/002_add_external_ref_ids.rb +2 -0
  22. data/lib/ghtorrent/migrations/003_add_orgs.rb +4 -1
  23. data/lib/ghtorrent/migrations/004_add_commit_comments.rb +4 -2
  24. data/lib/ghtorrent/migrations/005_add_repo_collaborators.rb +2 -0
  25. data/lib/ghtorrent/migrations/006_add_watchers.rb +2 -0
  26. data/lib/ghtorrent/migrations/007_add_pull_requests.rb +64 -0
  27. data/lib/ghtorrent/migrations/008_add_project_unq.rb +23 -0
  28. data/lib/ghtorrent/migrations/009_add_project_commit.rb +27 -0
  29. data/lib/ghtorrent/migrations/010_add_forks.rb +28 -0
  30. data/lib/ghtorrent/migrations/mysql_defaults.rb +6 -0
  31. data/lib/ghtorrent/persister.rb +3 -0
  32. data/lib/ghtorrent/retriever.rb +298 -102
  33. data/lib/ghtorrent/settings.rb +20 -1
  34. data/lib/ghtorrent/time.rb +5 -0
  35. data/lib/ghtorrent/utils.rb +22 -4
  36. data/lib/version.rb +5 -0
  37. metadata +173 -145
  38. data/lib/ghtorrent/call_stack.rb +0 -91
@@ -1,11 +1,18 @@
1
+ #!/usr/bin/env ruby
2
+
1
3
  require 'rubygems'
2
4
  require 'yaml'
3
5
  require 'amqp'
4
6
  require 'eventmachine'
5
- require 'ghtorrent'
6
7
  require 'json'
7
8
  require 'logger'
8
9
 
10
+ require 'ghtorrent/api_client'
11
+ require 'ghtorrent/settings'
12
+ require 'ghtorrent/logging'
13
+ require 'ghtorrent/persister'
14
+ require 'ghtorrent/command'
15
+
9
16
  class GHTMirrorEvents < GHTorrent::Command
10
17
 
11
18
  include GHTorrent::Settings
@@ -13,12 +20,8 @@ class GHTMirrorEvents < GHTorrent::Command
13
20
  include GHTorrent::Persister
14
21
  include GHTorrent::APIClient
15
22
 
16
- attr_reader :settings
17
-
18
- def initialize(args)
19
- super(args)
20
- @args = args
21
- @name = self.class.name
23
+ def logger
24
+ @logger
22
25
  end
23
26
 
24
27
  def store_count(events)
@@ -42,7 +45,7 @@ class GHTMirrorEvents < GHTorrent::Command
42
45
  def retrieve(exchange)
43
46
  begin
44
47
  new = dupl = 0
45
- events = api_request "https://api.github.com/events"
48
+ events = api_request "https://api.github.com/events", false
46
49
  (new, dupl, stored) = store_count events
47
50
 
48
51
  # This means that first page cannot contain all new events. Go
@@ -67,23 +70,17 @@ class GHTMirrorEvents < GHTorrent::Command
67
70
  end
68
71
  end
69
72
 
70
- def prepare_options(options)
71
- @name = "ght-mirror-events"
72
- end
73
-
74
73
  def go
75
- @gh = GHTorrent::Mirror.new(options[:config])
76
- @settings = @gh.settings
77
74
  @persister = connect(:mongo, @settings)
78
75
  @logger = Logger.new(STDOUT)
79
76
 
80
77
  # Graceful exit
81
78
  Signal.trap('INT') {
82
- info ("Received SIGINT, exiting")
79
+ info "Received SIGINT, exiting"
83
80
  AMQP.stop { EM.stop }
84
81
  }
85
82
  Signal.trap('TERM') {
86
- info ("Received SIGTERM, exiting")
83
+ info "Received SIGTERM, exiting"
87
84
  AMQP.stop { EM.stop }
88
85
  }
89
86
 
data/bin/ght-rm-dupl CHANGED
@@ -1,92 +1,134 @@
1
+ #!/usr/bin/env ruby
2
+
1
3
  require 'rubygems'
2
4
  require 'mongo'
3
- require 'ghtorrent-old'
4
-
5
- GH = Mirror.new
6
- GH.init("config.yaml")
7
-
8
- # Unique keys per known collection
9
- per_col = {
10
- :commits => {
11
- :payload => "commit.id",
12
- :col => GH.commits_col,
13
- },
14
- :events => {
15
- :payload => "id",
16
- :col => GH.events_col,
5
+
6
+ require 'ghtorrent/settings'
7
+ require 'ghtorrent/logging'
8
+ require 'ghtorrent/command'
9
+ require 'ghtorrent/persister'
10
+
11
+ class GHRMDupl < GHTorrent::Command
12
+
13
+ include GHTorrent::Settings
14
+ include GHTorrent::Persister
15
+
16
+ def col_info()
17
+ {
18
+ :commits => {
19
+ :unq => "sha",
20
+ :col => persister.get_underlying_connection.collection(:commits.to_s),
21
+ },
22
+ :events => {
23
+ :unq => "id",
24
+ :col => persister.get_underlying_connection.collection(:events.to_s),
25
+ }
17
26
  }
18
- }
19
-
20
- # Print MongoDB remove statements that
21
- # remove all but one entries for each commit.
22
- def remove_duplicates(data, col)
23
- removed = 0
24
- data.select { |k, v| v.size > 1 }.each do |k, v|
25
- v.slice(0..(v.size - 2)).map do |x|
26
- removed += 1 if delete_by_id col, x
27
- end
28
27
  end
29
- removed
30
- end
31
28
 
32
- def delete_by_id(col, id)
33
- begin
34
- col.remove({'_id' => id})
35
- true
36
- rescue Mongo::OperationFailure
37
- puts "Cannot remove record with id #{id} from #{col.name}"
38
- false
29
+ def persister
30
+ @persister ||= connect(:mongo, @settings)
31
+ @persister
39
32
  end
40
- end
41
33
 
42
- which = case ARGV[0]
43
- when "commits" then :commits
44
- when "events" then :events
45
- else puts "Not a known collection name: #{ARGV[0]}\n"
46
- end
47
-
48
- from = case ARGV[1]
49
- when nil then {}
50
- else
51
- t = Time.at(ARGV[1].to_i)
52
- STDERR.puts "Searching for duplicates after #{t}"
53
- {'_id' => {'$gte' => BSON::ObjectId.from_time(t)}}
54
- end
55
-
56
- # Various counters to report stats
57
- processed = total_processed = removed = 0
58
-
59
- data = Hash.new
60
-
61
- # The following code needs to save intermediate results to cope
62
- # with large datasets
63
- per_col[which][:col].find(from, :fields => per_col[which][:payload]).each do |r|
64
- _id = r["_id"]
65
- commit = GH.read_value(r, per_col[which][:payload])
66
-
67
- # If entries cannot be parsed, remove them
68
- if commit.empty?
69
- puts "Deleting unknown entry #{_id}"
70
- removed += 1 if delete_by_id per_col[which][:col], _id
71
- else
72
- data[commit] = [] if data[commit].nil?
73
- data[commit] << _id
34
+ def prepare_options(options)
35
+ options.banner <<-BANNER
36
+ Removes duplicate entries from collections
37
+
38
+ #{command_name} [options] collection
39
+
40
+ #{command_name} options:
41
+ BANNER
42
+
43
+ options.opt :earliest, 'Seconds since epoch of earliest item to load',
44
+ :short => 'e', :default => 0, :type => :int
45
+ options.opt :snapshot, 'Perform clean up every x records',
46
+ :short => 's', :default => -1, :type => :int
74
47
  end
75
48
 
76
- processed += 1
77
- total_processed += 1
49
+ def validate
50
+ super
51
+ Trollop::die "no collection specified" unless args[0] && !args[0].empty?
52
+ end
78
53
 
79
- print "\rProcessed #{processed} records"
54
+ # Print MongoDB remove statements that
55
+ # remove all but one entries for each commit.
56
+ def remove_duplicates(data, col)
57
+ removed = 0
58
+ data.select { |k, v| v.size > 1 }.each do |k, v|
59
+ v.slice(0..(v.size - 2)).map do |x|
60
+ removed += 1 if delete_by_id col, x
61
+ end
62
+ end
63
+ removed
64
+ end
65
+
66
+ def delete_by_id(col, id)
67
+ begin
68
+ col.remove({'_id' => id})
69
+ true
70
+ rescue Mongo::OperationFailure
71
+ puts "Cannot remove record with id #{id} from #{col.name}"
72
+ false
73
+ end
74
+ end
75
+
76
+ def go
77
+ collection = case ARGV[0]
78
+ when "commits" then
79
+ :commits
80
+ when "events" then
81
+ :events
82
+ else
83
+ puts "Not a known collection name: #{ARGV[0]}\n"
84
+ end
85
+
86
+ from = {'_id' => {'$gte' => BSON::ObjectId.from_time(Time.at(options[:earliest]))}}
87
+
88
+ snapshot = options[:snapshot]
89
+
90
+ puts "Deleting duplicates from collection #{collection}"
91
+ puts "Deleting duplicates after #{Time.at(options[:earliest])}"
92
+ puts "Perform clean up every #{snapshot} records"
93
+
94
+ # Various counters to report stats
95
+ processed = total_processed = removed = 0
80
96
 
81
- # Calculate duplicates, save intermediate result
82
- if processed > 500000
83
- puts "\nLoaded #{data.size} values, cleaning"
84
- removed += remove_duplicates data, per_col[which][:col]
85
97
  data = Hash.new
86
- processed = 0
98
+
99
+ # The following code needs to save intermediate results to cope
100
+ # with large datasets
101
+ col_info[collection][:col].find(from, :fields => col_info[collection][:unq]).each do |r|
102
+ _id = r["_id"]
103
+ commit = read_value(r, col_info[collection][:unq])
104
+
105
+ # If entries cannot be parsed, remove them
106
+ if commit.empty?
107
+ puts "Deleting unknown entry #{_id}"
108
+ removed += 1 if delete_by_id col_info[collection][:col], _id
109
+ else
110
+ data[commit] = [] if data[commit].nil?
111
+ data[commit] << _id
112
+ end
113
+
114
+ processed += 1
115
+ total_processed += 1
116
+
117
+ print "\rProcessed #{processed} records"
118
+
119
+ # Calculate duplicates, save intermediate result
120
+ if snapshot > 0 and processed > snapshot
121
+ puts "\nLoaded #{data.size} values, cleaning"
122
+ removed += remove_duplicates data, col_info[collection][:col]
123
+ data = Hash.new
124
+ processed = 0
125
+ end
126
+ end
127
+
128
+ removed += remove_duplicates data, col_info[collection][:col]
129
+
130
+ puts "\nProcessed #{total_processed}, deleted #{removed} duplicates"
87
131
  end
88
132
  end
89
133
 
90
- removed += remove_duplicates data, per_col[which][:col]
91
-
92
- puts "Processed #{total_processed}, deleted #{removed} duplicates"
134
+ GHRMDupl.run
data/lib/ghtorrent.rb CHANGED
@@ -1,6 +1,4 @@
1
1
  module GHTorrent
2
- VERSION = '0.4'
3
-
4
2
  # Route keys used for setting up queues for events, using GHTorrent
5
3
  ROUTEKEY_CREATE = "evt.CreateEvent"
6
4
  ROUTEKEY_DELETE = "evt.DeleteEvent"
@@ -22,19 +20,31 @@ module GHTorrent
22
20
 
23
21
  end
24
22
 
25
- require 'ghtorrent/command'
23
+ # Shared extensions to library methods
24
+ require 'ghtorrent/hash'
25
+ require 'ghtorrent/time'
26
+ require 'ghtorrent/bson_orderedhash'
26
27
 
28
+ # Basic utility modules
29
+ require 'version'
30
+ require 'ghtorrent/gh_torrent_exception'
27
31
  require 'ghtorrent/utils'
28
32
  require 'ghtorrent/logging'
29
33
  require 'ghtorrent/settings'
34
+ require 'ghtorrent/cache'
30
35
  require 'ghtorrent/api_client'
31
- require 'ghtorrent/call_stack'
32
36
 
37
+ # Support for command line utilities offered by this gem
38
+ require 'ghtorrent/command'
39
+
40
+ # Configuration and drivers for caching retrieved data
33
41
  require 'ghtorrent/adapters/base_adapter'
34
42
  require 'ghtorrent/adapters/mongo_persister'
35
43
  require 'ghtorrent/adapters/noop_persister'
36
44
 
45
+ # Support for retrieving and saving intermediate results
37
46
  require 'ghtorrent/persister'
38
47
  require 'ghtorrent/retriever'
39
48
 
49
+ # SQL database fillup methods
40
50
  require 'ghtorrent/ghtorrent'
@@ -3,13 +3,14 @@ module GHTorrent
3
3
  class BaseAdapter
4
4
 
5
5
  ENTITIES = [:users, :commits, :followers, :repos, :events, :org_members,
6
- :commit_comments, :repo_collaborators, :watchers
6
+ :commit_comments, :repo_collaborators, :watchers, :pull_requests,
7
+ :forks, :pull_request_comments, :issue_comments, :issues
7
8
  ]
8
9
 
9
10
  # Stores +data+ into +entity+. Returns a unique key for the stored entry.
10
11
  def store(entity, data = {})
11
12
  unless ENTITIES.include?(entity)
12
- throw GHTorrentException.new("Perister: Entity #{entity} not known")
13
+ raise GHTorrentException.new("Perister: Entity #{entity} not known")
13
14
  end
14
15
  end
15
16
 
@@ -50,14 +51,14 @@ module GHTorrent
50
51
  # matching JSON object.
51
52
  def find(entity, query = {})
52
53
  unless ENTITIES.include?(entity)
53
- throw GHTorrentException.new("Perister: Entity #{entity} not known")
54
+ raise GHTorrentException.new("Perister: Entity #{entity} not known")
54
55
  end
55
56
  end
56
57
 
57
58
  # Find the record identified by +id+ in +entity+
58
59
  def find_by_ext_ref_id(entity, id)
59
60
  unless ENTITIES.include?(entity)
60
- throw GHTorrentException.new("Perister: Entity #{entity} not known")
61
+ raise GHTorrentException.new("Perister: Entity #{entity} not known")
61
62
  end
62
63
  end
63
64
 
@@ -65,8 +66,19 @@ module GHTorrent
65
66
  # The +query+ can be any query supported by +find+.
66
67
  def count(entity, query = {})
67
68
  unless ENTITIES.include?(entity)
68
- throw GHTorrentException.new("Perister: Entity #{entity} not known")
69
+ raise GHTorrentException.new("Perister: Entity #{entity} not known")
69
70
  end
70
71
  end
72
+
73
+ # Get a raw connection to the underlying data store. The connection is
74
+ # implementaiton dependent.
75
+ def get_underlying_connection
76
+ raise "Unimplemented"
77
+ end
78
+
79
+ # Close the current connection and release any held resources
80
+ def close
81
+ raise "Unimplemented"
82
+ end
71
83
  end
72
84
  end
@@ -1,4 +1,6 @@
1
1
  require 'mongo'
2
+ require 'ghtorrent/adapters/base_adapter'
3
+ require 'ghtorrent/bson_orderedhash'
2
4
 
3
5
  module GHTorrent
4
6
 
@@ -14,7 +16,8 @@ module GHTorrent
14
16
  :mongo_port => "mongo.port",
15
17
  :mongo_db => "mongo.db",
16
18
  :mongo_username => "mongo.username",
17
- :mongo_passwd => "mongo.password"
19
+ :mongo_passwd => "mongo.password",
20
+ :mongo_replicas => "mongo.replicas"
18
21
  }
19
22
 
20
23
  attr_reader :settings
@@ -27,47 +30,21 @@ module GHTorrent
27
30
 
28
31
  @settings = set
29
32
  @uniq = config(:uniq_id)
30
- @mongo = Mongo::Connection.new(config(:mongo_host),
31
- config(:mongo_port))\
32
- .db(config(:mongo_db))
33
- @enttodb = {
34
- :users => get_collection("users"),
35
- :commits => get_collection("commits"),
36
- :repos => get_collection("repos"),
37
- :followers => get_collection("followers"),
38
- :events => get_collection("events"),
39
- :org_members => get_collection("org_members"),
40
- :commit_comments => get_collection("commit_comments"),
41
- :repo_collaborators => get_collection("repo_collaborators"),
42
- :watchers => get_collection("watchers")
43
- }
44
-
45
- # Ensure that the necessary indexes exist
46
- ensure_index(:events, "id")
47
- ensure_index(:users, "login")
48
- ensure_index(:commits, "sha")
49
- ensure_index(:repos, "name")
50
- ensure_index(:followers, "follows")
51
- ensure_index(:org_members, "org")
52
- ensure_index(:commit_comments, "repo")
53
- ensure_index(:commit_comments, "user")
54
- ensure_index(:commit_comments, "commit_id")
55
- ensure_index(:repo_collaborators, "repo")
56
- ensure_index(:repo_collaborators, "owner")
57
- ensure_index(:repo_collaborators, "login")
58
- ensure_index(:watchers, "repo")
59
- ensure_index(:watchers, "owner")
60
- ensure_index(:watchers, "login")
61
33
  end
62
34
 
63
35
  def store(entity, data = {})
64
36
  super
65
- get_entity(entity).insert(data).to_s
37
+ rescue_connection_failure do
38
+ get_entity(entity).insert(data).to_s
39
+ end
66
40
  end
67
41
 
68
42
  def find(entity, query = {})
69
43
  super
70
- result = get_entity(entity).find(query)
44
+ result = rescue_connection_failure do
45
+ get_entity(entity).find(query)
46
+ end
47
+
71
48
  result.to_a.map { |r|
72
49
  r[@uniq] = r['_id'].to_s;
73
50
  r.to_h
@@ -83,27 +60,86 @@ module GHTorrent
83
60
  # Count the number of items returned by +query+
84
61
  def count(entity, query)
85
62
  super
86
- get_entity(entity).count(:query => query)
63
+ rescue_connection_failure do
64
+ get_entity(entity).count(:query => query)
65
+ end
66
+ end
67
+
68
+ def get_underlying_connection
69
+ mongo
70
+ end
71
+
72
+ def close
73
+ unless @mongo.nil?
74
+ @mongo.close if @mongo.class == Mongo::ReplSetConnection
75
+ @mongo.connection.close if @mongo.class == Mongo::Connection
76
+
77
+ @mongo = nil
78
+ end
87
79
  end
88
80
 
89
81
  private
90
82
 
91
83
  def get_collection(col)
92
- @mongo.collection(col.to_s)
84
+ mongo.collection(col.to_s)
93
85
  end
94
86
 
95
87
  def get_entity(entity)
96
- col = @enttodb[entity]
88
+ case entity
89
+ when :users
90
+ get_collection("users")
91
+ when :commits
92
+ get_collection("commits")
93
+ when :repos
94
+ get_collection("repos")
95
+ when :followers
96
+ get_collection("followers")
97
+ when :org_members
98
+ get_collection("org_members")
99
+ when :events
100
+ get_collection("events")
101
+ when :commit_comments
102
+ get_collection("commit_comments")
103
+ when :repo_collaborators
104
+ get_collection("repo_collaborators")
105
+ when :watchers
106
+ get_collection("watchers")
107
+ when :pull_requests
108
+ get_collection("pull_requests")
109
+ when :forks
110
+ get_collection("forks")
111
+ when :pull_request_comments
112
+ get_collection("pull_request_comments")
113
+ when :issue_comments
114
+ get_collection("issue_comments")
115
+ end
116
+ end
97
117
 
98
- if col.nil?
99
- raise GHTorrentException.new("Mongo: Entity #{entity} not supported")
118
+ def mongo
119
+ if @mongo.nil?
120
+
121
+ replicas = config(:mongo_replicas)
122
+
123
+ @mongo = if replicas.nil?
124
+ Mongo::Connection.new(config(:mongo_host),
125
+ config(:mongo_port))\
126
+ .db(config(:mongo_db))
127
+ else
128
+ repl_arr = replicas.strip.split(/ /).map{|x| "#{x}:#{config(:mongo_port)}"}
129
+ repl_arr << "#{config(:mongo_host)}:#{config(:mongo_port)}"
130
+ Mongo::ReplSetConnection.new(repl_arr, :read => :secondary)\
131
+ .db(config(:mongo_db))
132
+ end
133
+ init_db(@mongo) if @mongo.collections.size <= 0
134
+ @mongo
135
+ else
136
+ @mongo
100
137
  end
101
- col
102
138
  end
103
139
 
104
140
  # Declare an index on +field+ for +collection+ if it does not exist
105
141
  def ensure_index(collection, field)
106
- col = @enttodb[collection]
142
+ col = get_entity(collection)
107
143
 
108
144
  exists = col.index_information.find {|k,v|
109
145
  k == "#{field}_1"
@@ -115,21 +151,51 @@ module GHTorrent
115
151
  end
116
152
  end
117
153
 
118
- end
119
- end
154
+ def init_db(mongo)
155
+ ENTITIES.each {|x| mongo.collection(x.to_s)}
156
+
157
+ # Ensure that the necessary indexes exist
158
+ ensure_index(:events, "id")
159
+ ensure_index(:users, "login")
160
+ ensure_index(:commits, "sha")
161
+ ensure_index(:repos, "name")
162
+ ensure_index(:followers, "follows")
163
+ ensure_index(:org_members, "org")
164
+ ensure_index(:commit_comments, "repo")
165
+ ensure_index(:commit_comments, "user")
166
+ ensure_index(:commit_comments, "commit_id")
167
+ ensure_index(:repo_collaborators, "repo")
168
+ ensure_index(:repo_collaborators, "owner")
169
+ ensure_index(:repo_collaborators, "login")
170
+ ensure_index(:watchers, "repo")
171
+ ensure_index(:watchers, "owner")
172
+ ensure_index(:watchers, "login")
173
+ ensure_index(:pull_requests, "repo")
174
+ ensure_index(:pull_requests, "owner")
175
+ ensure_index(:forks, "repo")
176
+ ensure_index(:forks, "owner")
177
+ ensure_index(:forks, "id")
178
+ ensure_index(:issue_comments, "repo")
179
+ ensure_index(:issue_comments, "owner")
180
+ ensure_index(:issue_comments, "issue_id")
181
+ ensure_index(:issue_comments, "id")
182
+ ensure_index(:pull_request_comments, "repo")
183
+ ensure_index(:pull_request_comments, "owner")
184
+ ensure_index(:pull_request_comments, "pullreq_id")
185
+ ensure_index(:pull_request_comments, "id")
186
+ end
120
187
 
121
- class BSON::OrderedHash
122
-
123
- # Convert a BSON result to a +Hash+
124
- def to_h
125
- inject({}) do |acc, element|
126
- k, v = element;
127
- acc[k] = if v.class == BSON::OrderedHash then
128
- v.to_h
129
- else
130
- v
131
- end;
132
- acc
188
+ def rescue_connection_failure(max_retries=60)
189
+ retries = 0
190
+ begin
191
+ yield
192
+ rescue Mongo::ConnectionFailure => ex
193
+ retries += 1
194
+ raise ex if retries > max_retries
195
+ sleep(0.5)
196
+ @mongo.refresh if @mongo.class == Mongo::ReplSetConnection
197
+ retry
198
+ end
133
199
  end
134
200
  end
135
- end
201
+ end