ghtorrent 0.4 → 0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +24 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +40 -0
- data/README.md +23 -22
- data/bin/ght-data-retrieval +66 -24
- data/bin/ght-load +41 -19
- data/bin/ght-mirror-events +13 -16
- data/bin/ght-rm-dupl +119 -77
- data/lib/ghtorrent.rb +14 -4
- data/lib/ghtorrent/adapters/base_adapter.rb +17 -5
- data/lib/ghtorrent/adapters/mongo_persister.rb +122 -56
- data/lib/ghtorrent/api_client.rb +151 -16
- data/lib/ghtorrent/bson_orderedhash.rb +23 -0
- data/lib/ghtorrent/cache.rb +97 -0
- data/lib/ghtorrent/command.rb +43 -25
- data/lib/ghtorrent/gh_torrent_exception.rb +6 -0
- data/lib/ghtorrent/ghtorrent.rb +615 -164
- data/lib/ghtorrent/hash.rb +11 -0
- data/lib/ghtorrent/logging.rb +11 -7
- data/lib/ghtorrent/migrations/001_init_schema.rb +3 -3
- data/lib/ghtorrent/migrations/002_add_external_ref_ids.rb +2 -0
- data/lib/ghtorrent/migrations/003_add_orgs.rb +4 -1
- data/lib/ghtorrent/migrations/004_add_commit_comments.rb +4 -2
- data/lib/ghtorrent/migrations/005_add_repo_collaborators.rb +2 -0
- data/lib/ghtorrent/migrations/006_add_watchers.rb +2 -0
- data/lib/ghtorrent/migrations/007_add_pull_requests.rb +64 -0
- data/lib/ghtorrent/migrations/008_add_project_unq.rb +23 -0
- data/lib/ghtorrent/migrations/009_add_project_commit.rb +27 -0
- data/lib/ghtorrent/migrations/010_add_forks.rb +28 -0
- data/lib/ghtorrent/migrations/mysql_defaults.rb +6 -0
- data/lib/ghtorrent/persister.rb +3 -0
- data/lib/ghtorrent/retriever.rb +298 -102
- data/lib/ghtorrent/settings.rb +20 -1
- data/lib/ghtorrent/time.rb +5 -0
- data/lib/ghtorrent/utils.rb +22 -4
- data/lib/version.rb +5 -0
- metadata +173 -145
- data/lib/ghtorrent/call_stack.rb +0 -91
data/bin/ght-mirror-events
CHANGED
@@ -1,11 +1,18 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
1
3
|
require 'rubygems'
|
2
4
|
require 'yaml'
|
3
5
|
require 'amqp'
|
4
6
|
require 'eventmachine'
|
5
|
-
require 'ghtorrent'
|
6
7
|
require 'json'
|
7
8
|
require 'logger'
|
8
9
|
|
10
|
+
require 'ghtorrent/api_client'
|
11
|
+
require 'ghtorrent/settings'
|
12
|
+
require 'ghtorrent/logging'
|
13
|
+
require 'ghtorrent/persister'
|
14
|
+
require 'ghtorrent/command'
|
15
|
+
|
9
16
|
class GHTMirrorEvents < GHTorrent::Command
|
10
17
|
|
11
18
|
include GHTorrent::Settings
|
@@ -13,12 +20,8 @@ class GHTMirrorEvents < GHTorrent::Command
|
|
13
20
|
include GHTorrent::Persister
|
14
21
|
include GHTorrent::APIClient
|
15
22
|
|
16
|
-
|
17
|
-
|
18
|
-
def initialize(args)
|
19
|
-
super(args)
|
20
|
-
@args = args
|
21
|
-
@name = self.class.name
|
23
|
+
def logger
|
24
|
+
@logger
|
22
25
|
end
|
23
26
|
|
24
27
|
def store_count(events)
|
@@ -42,7 +45,7 @@ class GHTMirrorEvents < GHTorrent::Command
|
|
42
45
|
def retrieve(exchange)
|
43
46
|
begin
|
44
47
|
new = dupl = 0
|
45
|
-
events = api_request "https://api.github.com/events"
|
48
|
+
events = api_request "https://api.github.com/events", false
|
46
49
|
(new, dupl, stored) = store_count events
|
47
50
|
|
48
51
|
# This means that first page cannot contain all new events. Go
|
@@ -67,23 +70,17 @@ class GHTMirrorEvents < GHTorrent::Command
|
|
67
70
|
end
|
68
71
|
end
|
69
72
|
|
70
|
-
def prepare_options(options)
|
71
|
-
@name = "ght-mirror-events"
|
72
|
-
end
|
73
|
-
|
74
73
|
def go
|
75
|
-
@gh = GHTorrent::Mirror.new(options[:config])
|
76
|
-
@settings = @gh.settings
|
77
74
|
@persister = connect(:mongo, @settings)
|
78
75
|
@logger = Logger.new(STDOUT)
|
79
76
|
|
80
77
|
# Graceful exit
|
81
78
|
Signal.trap('INT') {
|
82
|
-
info
|
79
|
+
info "Received SIGINT, exiting"
|
83
80
|
AMQP.stop { EM.stop }
|
84
81
|
}
|
85
82
|
Signal.trap('TERM') {
|
86
|
-
info
|
83
|
+
info "Received SIGTERM, exiting"
|
87
84
|
AMQP.stop { EM.stop }
|
88
85
|
}
|
89
86
|
|
data/bin/ght-rm-dupl
CHANGED
@@ -1,92 +1,134 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
1
3
|
require 'rubygems'
|
2
4
|
require 'mongo'
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
:
|
5
|
+
|
6
|
+
require 'ghtorrent/settings'
|
7
|
+
require 'ghtorrent/logging'
|
8
|
+
require 'ghtorrent/command'
|
9
|
+
require 'ghtorrent/persister'
|
10
|
+
|
11
|
+
class GHRMDupl < GHTorrent::Command
|
12
|
+
|
13
|
+
include GHTorrent::Settings
|
14
|
+
include GHTorrent::Persister
|
15
|
+
|
16
|
+
def col_info()
|
17
|
+
{
|
18
|
+
:commits => {
|
19
|
+
:unq => "sha",
|
20
|
+
:col => persister.get_underlying_connection.collection(:commits.to_s),
|
21
|
+
},
|
22
|
+
:events => {
|
23
|
+
:unq => "id",
|
24
|
+
:col => persister.get_underlying_connection.collection(:events.to_s),
|
25
|
+
}
|
17
26
|
}
|
18
|
-
}
|
19
|
-
|
20
|
-
# Print MongoDB remove statements that
|
21
|
-
# remove all but one entries for each commit.
|
22
|
-
def remove_duplicates(data, col)
|
23
|
-
removed = 0
|
24
|
-
data.select { |k, v| v.size > 1 }.each do |k, v|
|
25
|
-
v.slice(0..(v.size - 2)).map do |x|
|
26
|
-
removed += 1 if delete_by_id col, x
|
27
|
-
end
|
28
27
|
end
|
29
|
-
removed
|
30
|
-
end
|
31
28
|
|
32
|
-
def
|
33
|
-
|
34
|
-
|
35
|
-
true
|
36
|
-
rescue Mongo::OperationFailure
|
37
|
-
puts "Cannot remove record with id #{id} from #{col.name}"
|
38
|
-
false
|
29
|
+
def persister
|
30
|
+
@persister ||= connect(:mongo, @settings)
|
31
|
+
@persister
|
39
32
|
end
|
40
|
-
end
|
41
33
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
# Various counters to report stats
|
57
|
-
processed = total_processed = removed = 0
|
58
|
-
|
59
|
-
data = Hash.new
|
60
|
-
|
61
|
-
# The following code needs to save intermediate results to cope
|
62
|
-
# with large datasets
|
63
|
-
per_col[which][:col].find(from, :fields => per_col[which][:payload]).each do |r|
|
64
|
-
_id = r["_id"]
|
65
|
-
commit = GH.read_value(r, per_col[which][:payload])
|
66
|
-
|
67
|
-
# If entries cannot be parsed, remove them
|
68
|
-
if commit.empty?
|
69
|
-
puts "Deleting unknown entry #{_id}"
|
70
|
-
removed += 1 if delete_by_id per_col[which][:col], _id
|
71
|
-
else
|
72
|
-
data[commit] = [] if data[commit].nil?
|
73
|
-
data[commit] << _id
|
34
|
+
def prepare_options(options)
|
35
|
+
options.banner <<-BANNER
|
36
|
+
Removes duplicate entries from collections
|
37
|
+
|
38
|
+
#{command_name} [options] collection
|
39
|
+
|
40
|
+
#{command_name} options:
|
41
|
+
BANNER
|
42
|
+
|
43
|
+
options.opt :earliest, 'Seconds since epoch of earliest item to load',
|
44
|
+
:short => 'e', :default => 0, :type => :int
|
45
|
+
options.opt :snapshot, 'Perform clean up every x records',
|
46
|
+
:short => 's', :default => -1, :type => :int
|
74
47
|
end
|
75
48
|
|
76
|
-
|
77
|
-
|
49
|
+
def validate
|
50
|
+
super
|
51
|
+
Trollop::die "no collection specified" unless args[0] && !args[0].empty?
|
52
|
+
end
|
78
53
|
|
79
|
-
|
54
|
+
# Print MongoDB remove statements that
|
55
|
+
# remove all but one entries for each commit.
|
56
|
+
def remove_duplicates(data, col)
|
57
|
+
removed = 0
|
58
|
+
data.select { |k, v| v.size > 1 }.each do |k, v|
|
59
|
+
v.slice(0..(v.size - 2)).map do |x|
|
60
|
+
removed += 1 if delete_by_id col, x
|
61
|
+
end
|
62
|
+
end
|
63
|
+
removed
|
64
|
+
end
|
65
|
+
|
66
|
+
def delete_by_id(col, id)
|
67
|
+
begin
|
68
|
+
col.remove({'_id' => id})
|
69
|
+
true
|
70
|
+
rescue Mongo::OperationFailure
|
71
|
+
puts "Cannot remove record with id #{id} from #{col.name}"
|
72
|
+
false
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
def go
|
77
|
+
collection = case ARGV[0]
|
78
|
+
when "commits" then
|
79
|
+
:commits
|
80
|
+
when "events" then
|
81
|
+
:events
|
82
|
+
else
|
83
|
+
puts "Not a known collection name: #{ARGV[0]}\n"
|
84
|
+
end
|
85
|
+
|
86
|
+
from = {'_id' => {'$gte' => BSON::ObjectId.from_time(Time.at(options[:earliest]))}}
|
87
|
+
|
88
|
+
snapshot = options[:snapshot]
|
89
|
+
|
90
|
+
puts "Deleting duplicates from collection #{collection}"
|
91
|
+
puts "Deleting duplicates after #{Time.at(options[:earliest])}"
|
92
|
+
puts "Perform clean up every #{snapshot} records"
|
93
|
+
|
94
|
+
# Various counters to report stats
|
95
|
+
processed = total_processed = removed = 0
|
80
96
|
|
81
|
-
# Calculate duplicates, save intermediate result
|
82
|
-
if processed > 500000
|
83
|
-
puts "\nLoaded #{data.size} values, cleaning"
|
84
|
-
removed += remove_duplicates data, per_col[which][:col]
|
85
97
|
data = Hash.new
|
86
|
-
|
98
|
+
|
99
|
+
# The following code needs to save intermediate results to cope
|
100
|
+
# with large datasets
|
101
|
+
col_info[collection][:col].find(from, :fields => col_info[collection][:unq]).each do |r|
|
102
|
+
_id = r["_id"]
|
103
|
+
commit = read_value(r, col_info[collection][:unq])
|
104
|
+
|
105
|
+
# If entries cannot be parsed, remove them
|
106
|
+
if commit.empty?
|
107
|
+
puts "Deleting unknown entry #{_id}"
|
108
|
+
removed += 1 if delete_by_id col_info[collection][:col], _id
|
109
|
+
else
|
110
|
+
data[commit] = [] if data[commit].nil?
|
111
|
+
data[commit] << _id
|
112
|
+
end
|
113
|
+
|
114
|
+
processed += 1
|
115
|
+
total_processed += 1
|
116
|
+
|
117
|
+
print "\rProcessed #{processed} records"
|
118
|
+
|
119
|
+
# Calculate duplicates, save intermediate result
|
120
|
+
if snapshot > 0 and processed > snapshot
|
121
|
+
puts "\nLoaded #{data.size} values, cleaning"
|
122
|
+
removed += remove_duplicates data, col_info[collection][:col]
|
123
|
+
data = Hash.new
|
124
|
+
processed = 0
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
removed += remove_duplicates data, col_info[collection][:col]
|
129
|
+
|
130
|
+
puts "\nProcessed #{total_processed}, deleted #{removed} duplicates"
|
87
131
|
end
|
88
132
|
end
|
89
133
|
|
90
|
-
|
91
|
-
|
92
|
-
puts "Processed #{total_processed}, deleted #{removed} duplicates"
|
134
|
+
GHRMDupl.run
|
data/lib/ghtorrent.rb
CHANGED
@@ -1,6 +1,4 @@
|
|
1
1
|
module GHTorrent
|
2
|
-
VERSION = '0.4'
|
3
|
-
|
4
2
|
# Route keys used for setting up queues for events, using GHTorrent
|
5
3
|
ROUTEKEY_CREATE = "evt.CreateEvent"
|
6
4
|
ROUTEKEY_DELETE = "evt.DeleteEvent"
|
@@ -22,19 +20,31 @@ module GHTorrent
|
|
22
20
|
|
23
21
|
end
|
24
22
|
|
25
|
-
|
23
|
+
# Shared extensions to library methods
|
24
|
+
require 'ghtorrent/hash'
|
25
|
+
require 'ghtorrent/time'
|
26
|
+
require 'ghtorrent/bson_orderedhash'
|
26
27
|
|
28
|
+
# Basic utility modules
|
29
|
+
require 'version'
|
30
|
+
require 'ghtorrent/gh_torrent_exception'
|
27
31
|
require 'ghtorrent/utils'
|
28
32
|
require 'ghtorrent/logging'
|
29
33
|
require 'ghtorrent/settings'
|
34
|
+
require 'ghtorrent/cache'
|
30
35
|
require 'ghtorrent/api_client'
|
31
|
-
require 'ghtorrent/call_stack'
|
32
36
|
|
37
|
+
# Support for command line utilities offered by this gem
|
38
|
+
require 'ghtorrent/command'
|
39
|
+
|
40
|
+
# Configuration and drivers for caching retrieved data
|
33
41
|
require 'ghtorrent/adapters/base_adapter'
|
34
42
|
require 'ghtorrent/adapters/mongo_persister'
|
35
43
|
require 'ghtorrent/adapters/noop_persister'
|
36
44
|
|
45
|
+
# Support for retrieving and saving intermediate results
|
37
46
|
require 'ghtorrent/persister'
|
38
47
|
require 'ghtorrent/retriever'
|
39
48
|
|
49
|
+
# SQL database fillup methods
|
40
50
|
require 'ghtorrent/ghtorrent'
|
@@ -3,13 +3,14 @@ module GHTorrent
|
|
3
3
|
class BaseAdapter
|
4
4
|
|
5
5
|
ENTITIES = [:users, :commits, :followers, :repos, :events, :org_members,
|
6
|
-
:commit_comments, :repo_collaborators, :watchers
|
6
|
+
:commit_comments, :repo_collaborators, :watchers, :pull_requests,
|
7
|
+
:forks, :pull_request_comments, :issue_comments, :issues
|
7
8
|
]
|
8
9
|
|
9
10
|
# Stores +data+ into +entity+. Returns a unique key for the stored entry.
|
10
11
|
def store(entity, data = {})
|
11
12
|
unless ENTITIES.include?(entity)
|
12
|
-
|
13
|
+
raise GHTorrentException.new("Perister: Entity #{entity} not known")
|
13
14
|
end
|
14
15
|
end
|
15
16
|
|
@@ -50,14 +51,14 @@ module GHTorrent
|
|
50
51
|
# matching JSON object.
|
51
52
|
def find(entity, query = {})
|
52
53
|
unless ENTITIES.include?(entity)
|
53
|
-
|
54
|
+
raise GHTorrentException.new("Perister: Entity #{entity} not known")
|
54
55
|
end
|
55
56
|
end
|
56
57
|
|
57
58
|
# Find the record identified by +id+ in +entity+
|
58
59
|
def find_by_ext_ref_id(entity, id)
|
59
60
|
unless ENTITIES.include?(entity)
|
60
|
-
|
61
|
+
raise GHTorrentException.new("Perister: Entity #{entity} not known")
|
61
62
|
end
|
62
63
|
end
|
63
64
|
|
@@ -65,8 +66,19 @@ module GHTorrent
|
|
65
66
|
# The +query+ can be any query supported by +find+.
|
66
67
|
def count(entity, query = {})
|
67
68
|
unless ENTITIES.include?(entity)
|
68
|
-
|
69
|
+
raise GHTorrentException.new("Perister: Entity #{entity} not known")
|
69
70
|
end
|
70
71
|
end
|
72
|
+
|
73
|
+
# Get a raw connection to the underlying data store. The connection is
|
74
|
+
# implementation dependent.
|
75
|
+
def get_underlying_connection
|
76
|
+
raise "Unimplemented"
|
77
|
+
end
|
78
|
+
|
79
|
+
# Close the current connection and release any held resources
|
80
|
+
def close
|
81
|
+
raise "Unimplemented"
|
82
|
+
end
|
71
83
|
end
|
72
84
|
end
|
@@ -1,4 +1,6 @@
|
|
1
1
|
require 'mongo'
|
2
|
+
require 'ghtorrent/adapters/base_adapter'
|
3
|
+
require 'ghtorrent/bson_orderedhash'
|
2
4
|
|
3
5
|
module GHTorrent
|
4
6
|
|
@@ -14,7 +16,8 @@ module GHTorrent
|
|
14
16
|
:mongo_port => "mongo.port",
|
15
17
|
:mongo_db => "mongo.db",
|
16
18
|
:mongo_username => "mongo.username",
|
17
|
-
:mongo_passwd => "mongo.password"
|
19
|
+
:mongo_passwd => "mongo.password",
|
20
|
+
:mongo_replicas => "mongo.replicas"
|
18
21
|
}
|
19
22
|
|
20
23
|
attr_reader :settings
|
@@ -27,47 +30,21 @@ module GHTorrent
|
|
27
30
|
|
28
31
|
@settings = set
|
29
32
|
@uniq = config(:uniq_id)
|
30
|
-
@mongo = Mongo::Connection.new(config(:mongo_host),
|
31
|
-
config(:mongo_port))\
|
32
|
-
.db(config(:mongo_db))
|
33
|
-
@enttodb = {
|
34
|
-
:users => get_collection("users"),
|
35
|
-
:commits => get_collection("commits"),
|
36
|
-
:repos => get_collection("repos"),
|
37
|
-
:followers => get_collection("followers"),
|
38
|
-
:events => get_collection("events"),
|
39
|
-
:org_members => get_collection("org_members"),
|
40
|
-
:commit_comments => get_collection("commit_comments"),
|
41
|
-
:repo_collaborators => get_collection("repo_collaborators"),
|
42
|
-
:watchers => get_collection("watchers")
|
43
|
-
}
|
44
|
-
|
45
|
-
# Ensure that the necessary indexes exist
|
46
|
-
ensure_index(:events, "id")
|
47
|
-
ensure_index(:users, "login")
|
48
|
-
ensure_index(:commits, "sha")
|
49
|
-
ensure_index(:repos, "name")
|
50
|
-
ensure_index(:followers, "follows")
|
51
|
-
ensure_index(:org_members, "org")
|
52
|
-
ensure_index(:commit_comments, "repo")
|
53
|
-
ensure_index(:commit_comments, "user")
|
54
|
-
ensure_index(:commit_comments, "commit_id")
|
55
|
-
ensure_index(:repo_collaborators, "repo")
|
56
|
-
ensure_index(:repo_collaborators, "owner")
|
57
|
-
ensure_index(:repo_collaborators, "login")
|
58
|
-
ensure_index(:watchers, "repo")
|
59
|
-
ensure_index(:watchers, "owner")
|
60
|
-
ensure_index(:watchers, "login")
|
61
33
|
end
|
62
34
|
|
63
35
|
def store(entity, data = {})
|
64
36
|
super
|
65
|
-
|
37
|
+
rescue_connection_failure do
|
38
|
+
get_entity(entity).insert(data).to_s
|
39
|
+
end
|
66
40
|
end
|
67
41
|
|
68
42
|
def find(entity, query = {})
|
69
43
|
super
|
70
|
-
result =
|
44
|
+
result = rescue_connection_failure do
|
45
|
+
get_entity(entity).find(query)
|
46
|
+
end
|
47
|
+
|
71
48
|
result.to_a.map { |r|
|
72
49
|
r[@uniq] = r['_id'].to_s;
|
73
50
|
r.to_h
|
@@ -83,27 +60,86 @@ module GHTorrent
|
|
83
60
|
# Count the number of items returned by +query+
|
84
61
|
def count(entity, query)
|
85
62
|
super
|
86
|
-
|
63
|
+
rescue_connection_failure do
|
64
|
+
get_entity(entity).count(:query => query)
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
def get_underlying_connection
|
69
|
+
mongo
|
70
|
+
end
|
71
|
+
|
72
|
+
def close
|
73
|
+
unless @mongo.nil?
|
74
|
+
@mongo.close if @mongo.class == Mongo::ReplSetConnection
|
75
|
+
@mongo.connection.close if @mongo.class == Mongo::Connection
|
76
|
+
|
77
|
+
@mongo = nil
|
78
|
+
end
|
87
79
|
end
|
88
80
|
|
89
81
|
private
|
90
82
|
|
91
83
|
def get_collection(col)
|
92
|
-
|
84
|
+
mongo.collection(col.to_s)
|
93
85
|
end
|
94
86
|
|
95
87
|
def get_entity(entity)
|
96
|
-
|
88
|
+
case entity
|
89
|
+
when :users
|
90
|
+
get_collection("users")
|
91
|
+
when :commits
|
92
|
+
get_collection("commits")
|
93
|
+
when :repos
|
94
|
+
get_collection("repos")
|
95
|
+
when :followers
|
96
|
+
get_collection("followers")
|
97
|
+
when :org_members
|
98
|
+
get_collection("org_members")
|
99
|
+
when :events
|
100
|
+
get_collection("events")
|
101
|
+
when :commit_comments
|
102
|
+
get_collection("commit_comments")
|
103
|
+
when :repo_collaborators
|
104
|
+
get_collection("repo_collaborators")
|
105
|
+
when :watchers
|
106
|
+
get_collection("watchers")
|
107
|
+
when :pull_requests
|
108
|
+
get_collection("pull_requests")
|
109
|
+
when :forks
|
110
|
+
get_collection("forks")
|
111
|
+
when :pull_request_comments
|
112
|
+
get_collection("pull_request_comments")
|
113
|
+
when :issue_comments
|
114
|
+
get_collection("issue_comments")
|
115
|
+
end
|
116
|
+
end
|
97
117
|
|
98
|
-
|
99
|
-
|
118
|
+
def mongo
|
119
|
+
if @mongo.nil?
|
120
|
+
|
121
|
+
replicas = config(:mongo_replicas)
|
122
|
+
|
123
|
+
@mongo = if replicas.nil?
|
124
|
+
Mongo::Connection.new(config(:mongo_host),
|
125
|
+
config(:mongo_port))\
|
126
|
+
.db(config(:mongo_db))
|
127
|
+
else
|
128
|
+
repl_arr = replicas.strip.split(/ /).map{|x| "#{x}:#{config(:mongo_port)}"}
|
129
|
+
repl_arr << "#{config(:mongo_host)}:#{config(:mongo_port)}"
|
130
|
+
Mongo::ReplSetConnection.new(repl_arr, :read => :secondary)\
|
131
|
+
.db(config(:mongo_db))
|
132
|
+
end
|
133
|
+
init_db(@mongo) if @mongo.collections.size <= 0
|
134
|
+
@mongo
|
135
|
+
else
|
136
|
+
@mongo
|
100
137
|
end
|
101
|
-
col
|
102
138
|
end
|
103
139
|
|
104
140
|
# Declare an index on +field+ for +collection+ if it does not exist
|
105
141
|
def ensure_index(collection, field)
|
106
|
-
col =
|
142
|
+
col = get_entity(collection)
|
107
143
|
|
108
144
|
exists = col.index_information.find {|k,v|
|
109
145
|
k == "#{field}_1"
|
@@ -115,21 +151,51 @@ module GHTorrent
|
|
115
151
|
end
|
116
152
|
end
|
117
153
|
|
118
|
-
|
119
|
-
|
154
|
+
def init_db(mongo)
|
155
|
+
ENTITIES.each {|x| mongo.collection(x.to_s)}
|
156
|
+
|
157
|
+
# Ensure that the necessary indexes exist
|
158
|
+
ensure_index(:events, "id")
|
159
|
+
ensure_index(:users, "login")
|
160
|
+
ensure_index(:commits, "sha")
|
161
|
+
ensure_index(:repos, "name")
|
162
|
+
ensure_index(:followers, "follows")
|
163
|
+
ensure_index(:org_members, "org")
|
164
|
+
ensure_index(:commit_comments, "repo")
|
165
|
+
ensure_index(:commit_comments, "user")
|
166
|
+
ensure_index(:commit_comments, "commit_id")
|
167
|
+
ensure_index(:repo_collaborators, "repo")
|
168
|
+
ensure_index(:repo_collaborators, "owner")
|
169
|
+
ensure_index(:repo_collaborators, "login")
|
170
|
+
ensure_index(:watchers, "repo")
|
171
|
+
ensure_index(:watchers, "owner")
|
172
|
+
ensure_index(:watchers, "login")
|
173
|
+
ensure_index(:pull_requests, "repo")
|
174
|
+
ensure_index(:pull_requests, "owner")
|
175
|
+
ensure_index(:forks, "repo")
|
176
|
+
ensure_index(:forks, "owner")
|
177
|
+
ensure_index(:forks, "id")
|
178
|
+
ensure_index(:issue_comments, "repo")
|
179
|
+
ensure_index(:issue_comments, "owner")
|
180
|
+
ensure_index(:issue_comments, "issue_id")
|
181
|
+
ensure_index(:issue_comments, "id")
|
182
|
+
ensure_index(:pull_request_comments, "repo")
|
183
|
+
ensure_index(:pull_request_comments, "owner")
|
184
|
+
ensure_index(:pull_request_comments, "pullreq_id")
|
185
|
+
ensure_index(:pull_request_comments, "id")
|
186
|
+
end
|
120
187
|
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
acc
|
188
|
+
def rescue_connection_failure(max_retries=60)
|
189
|
+
retries = 0
|
190
|
+
begin
|
191
|
+
yield
|
192
|
+
rescue Mongo::ConnectionFailure => ex
|
193
|
+
retries += 1
|
194
|
+
raise ex if retries > max_retries
|
195
|
+
sleep(0.5)
|
196
|
+
@mongo.refresh if @mongo.class == Mongo::ReplSetConnection
|
197
|
+
retry
|
198
|
+
end
|
133
199
|
end
|
134
200
|
end
|
135
|
-
end
|
201
|
+
end
|