ghtorrent 0.4 → 0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +24 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +40 -0
- data/README.md +23 -22
- data/bin/ght-data-retrieval +66 -24
- data/bin/ght-load +41 -19
- data/bin/ght-mirror-events +13 -16
- data/bin/ght-rm-dupl +119 -77
- data/lib/ghtorrent.rb +14 -4
- data/lib/ghtorrent/adapters/base_adapter.rb +17 -5
- data/lib/ghtorrent/adapters/mongo_persister.rb +122 -56
- data/lib/ghtorrent/api_client.rb +151 -16
- data/lib/ghtorrent/bson_orderedhash.rb +23 -0
- data/lib/ghtorrent/cache.rb +97 -0
- data/lib/ghtorrent/command.rb +43 -25
- data/lib/ghtorrent/gh_torrent_exception.rb +6 -0
- data/lib/ghtorrent/ghtorrent.rb +615 -164
- data/lib/ghtorrent/hash.rb +11 -0
- data/lib/ghtorrent/logging.rb +11 -7
- data/lib/ghtorrent/migrations/001_init_schema.rb +3 -3
- data/lib/ghtorrent/migrations/002_add_external_ref_ids.rb +2 -0
- data/lib/ghtorrent/migrations/003_add_orgs.rb +4 -1
- data/lib/ghtorrent/migrations/004_add_commit_comments.rb +4 -2
- data/lib/ghtorrent/migrations/005_add_repo_collaborators.rb +2 -0
- data/lib/ghtorrent/migrations/006_add_watchers.rb +2 -0
- data/lib/ghtorrent/migrations/007_add_pull_requests.rb +64 -0
- data/lib/ghtorrent/migrations/008_add_project_unq.rb +23 -0
- data/lib/ghtorrent/migrations/009_add_project_commit.rb +27 -0
- data/lib/ghtorrent/migrations/010_add_forks.rb +28 -0
- data/lib/ghtorrent/migrations/mysql_defaults.rb +6 -0
- data/lib/ghtorrent/persister.rb +3 -0
- data/lib/ghtorrent/retriever.rb +298 -102
- data/lib/ghtorrent/settings.rb +20 -1
- data/lib/ghtorrent/time.rb +5 -0
- data/lib/ghtorrent/utils.rb +22 -4
- data/lib/version.rb +5 -0
- metadata +173 -145
- data/lib/ghtorrent/call_stack.rb +0 -91
data/bin/ght-mirror-events
CHANGED
@@ -1,11 +1,18 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
1
3
|
require 'rubygems'
|
2
4
|
require 'yaml'
|
3
5
|
require 'amqp'
|
4
6
|
require 'eventmachine'
|
5
|
-
require 'ghtorrent'
|
6
7
|
require 'json'
|
7
8
|
require 'logger'
|
8
9
|
|
10
|
+
require 'ghtorrent/api_client'
|
11
|
+
require 'ghtorrent/settings'
|
12
|
+
require 'ghtorrent/logging'
|
13
|
+
require 'ghtorrent/persister'
|
14
|
+
require 'ghtorrent/command'
|
15
|
+
|
9
16
|
class GHTMirrorEvents < GHTorrent::Command
|
10
17
|
|
11
18
|
include GHTorrent::Settings
|
@@ -13,12 +20,8 @@ class GHTMirrorEvents < GHTorrent::Command
|
|
13
20
|
include GHTorrent::Persister
|
14
21
|
include GHTorrent::APIClient
|
15
22
|
|
16
|
-
|
17
|
-
|
18
|
-
def initialize(args)
|
19
|
-
super(args)
|
20
|
-
@args = args
|
21
|
-
@name = self.class.name
|
23
|
+
# Returns the Logger instance held in @logger (assigned in #go).
def logger
  return @logger
end
|
23
26
|
|
24
27
|
def store_count(events)
|
@@ -42,7 +45,7 @@ class GHTMirrorEvents < GHTorrent::Command
|
|
42
45
|
def retrieve(exchange)
|
43
46
|
begin
|
44
47
|
new = dupl = 0
|
45
|
-
events = api_request "https://api.github.com/events"
|
48
|
+
events = api_request "https://api.github.com/events", false
|
46
49
|
(new, dupl, stored) = store_count events
|
47
50
|
|
48
51
|
# This means that first page cannot contain all new events. Go
|
@@ -67,23 +70,17 @@ class GHTMirrorEvents < GHTorrent::Command
|
|
67
70
|
end
|
68
71
|
end
|
69
72
|
|
70
|
-
def prepare_options(options)
|
71
|
-
@name = "ght-mirror-events"
|
72
|
-
end
|
73
|
-
|
74
73
|
def go
|
75
|
-
@gh = GHTorrent::Mirror.new(options[:config])
|
76
|
-
@settings = @gh.settings
|
77
74
|
@persister = connect(:mongo, @settings)
|
78
75
|
@logger = Logger.new(STDOUT)
|
79
76
|
|
80
77
|
# Graceful exit
|
81
78
|
Signal.trap('INT') {
|
82
|
-
info
|
79
|
+
info "Received SIGINT, exiting"
|
83
80
|
AMQP.stop { EM.stop }
|
84
81
|
}
|
85
82
|
Signal.trap('TERM') {
|
86
|
-
info
|
83
|
+
info "Received SIGTERM, exiting"
|
87
84
|
AMQP.stop { EM.stop }
|
88
85
|
}
|
89
86
|
|
data/bin/ght-rm-dupl
CHANGED
@@ -1,92 +1,134 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
1
3
|
require 'rubygems'
|
2
4
|
require 'mongo'
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
:
|
5
|
+
|
6
|
+
require 'ghtorrent/settings'
|
7
|
+
require 'ghtorrent/logging'
|
8
|
+
require 'ghtorrent/command'
|
9
|
+
require 'ghtorrent/persister'
|
10
|
+
|
11
|
+
class GHRMDupl < GHTorrent::Command
|
12
|
+
|
13
|
+
include GHTorrent::Settings
|
14
|
+
include GHTorrent::Persister
|
15
|
+
|
16
|
+
# Describes the collections this tool can clean up. Each entry holds the
# name of the field whose value identifies a record (:unq) and the handle
# of the underlying collection (:col).
def col_info
  db = persister.get_underlying_connection
  {
    :commits => { :unq => "sha", :col => db.collection("commits") },
    :events  => { :unq => "id",  :col => db.collection("events") }
  }
end
|
29
|
-
removed
|
30
|
-
end
|
31
28
|
|
32
|
-
def
|
33
|
-
|
34
|
-
|
35
|
-
true
|
36
|
-
rescue Mongo::OperationFailure
|
37
|
-
puts "Cannot remove record with id #{id} from #{col.name}"
|
38
|
-
false
|
29
|
+
# Lazily opens the :mongo persister through #connect, using @settings, and
# reuses the same object on later calls.
def persister
  return @persister if @persister
  @persister = connect(:mongo, @settings)
end
|
40
|
-
end
|
41
33
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
# Various counters to report stats
|
57
|
-
processed = total_processed = removed = 0
|
58
|
-
|
59
|
-
data = Hash.new
|
60
|
-
|
61
|
-
# The following code needs to save intermediate results to cope
|
62
|
-
# with large datasets
|
63
|
-
per_col[which][:col].find(from, :fields => per_col[which][:payload]).each do |r|
|
64
|
-
_id = r["_id"]
|
65
|
-
commit = GH.read_value(r, per_col[which][:payload])
|
66
|
-
|
67
|
-
# If entries cannot be parsed, remove them
|
68
|
-
if commit.empty?
|
69
|
-
puts "Deleting unknown entry #{_id}"
|
70
|
-
removed += 1 if delete_by_id per_col[which][:col], _id
|
71
|
-
else
|
72
|
-
data[commit] = [] if data[commit].nil?
|
73
|
-
data[commit] << _id
|
34
|
+
# Sets the usage banner on +options+ and declares two integer options:
# :earliest (short 'e', default 0) and :snapshot (short 's', default -1).
def prepare_options(options)
|
35
|
+
options.banner <<-BANNER
|
36
|
+
Removes duplicate entries from collections
|
37
|
+
|
38
|
+
#{command_name} [options] collection
|
39
|
+
|
40
|
+
#{command_name} options:
|
41
|
+
BANNER
|
42
|
+
|
43
|
+
options.opt :earliest, 'Seconds since epoch of earliest item to load',
|
44
|
+
:short => 'e', :default => 0, :type => :int
|
45
|
+
options.opt :snapshot, 'Perform clean up every x records',
|
46
|
+
:short => 's', :default => -1, :type => :int
|
74
47
|
end
|
75
48
|
|
76
|
-
|
77
|
-
|
49
|
+
# Runs the inherited validation, then calls Trollop::die when the first
# positional argument (the collection to clean) is missing or empty.
def validate
  super
  if !args[0] || args[0].empty?
    Trollop::die "no collection specified"
  end
end
|
78
53
|
|
79
|
-
|
54
|
+
# Deletes duplicate records from +col+. +data+ maps a unique value to the
# list of record ids carrying it; for every list with more than one id, all
# ids except the last are deleted through #delete_by_id.
# Returns the number of records that were actually deleted.
def remove_duplicates(data, col)
  removed = 0
  data.each do |_key, ids|
    next if ids.size <= 1
    ids[0...-1].each do |dup_id|
      removed += 1 if delete_by_id(col, dup_id)
    end
  end
  removed
end
|
65
|
+
|
66
|
+
# Removes the record whose '_id' equals +id+ from +col+.
# Returns true on success; on Mongo::OperationFailure a message is printed
# and false is returned.
def delete_by_id(col, id)
  selector = {'_id' => id}
  col.remove(selector)
  true
rescue Mongo::OperationFailure
  puts "Cannot remove record with id #{id} from #{col.name}"
  false
end
|
75
|
+
|
76
|
+
# Entry point of the command. Removes duplicate records from the collection
# named by the first command line argument ("commits" or "events").
#
# Records whose _id is greater than or equal to the ObjectId derived from
# options[:earliest] (seconds since epoch) are scanned and grouped by the
# collection's unique field (see #col_info). Records for which no unique
# value can be read are deleted immediately; for every group only the last
# id is kept (see #remove_duplicates). When options[:snapshot] is positive,
# the accumulated groups are cleaned and discarded each time more than that
# many records have been processed, to bound memory on large collections.
def go
  collection = case ARGV[0]
               when "commits" then :commits
               when "events" then :events
               end

  if collection.nil?
    # Stop here: carrying on with a nil collection would only fail further
    # down with a NoMethodError on col_info[nil].
    puts "Not a known collection name: #{ARGV[0]}\n"
    exit 1
  end

  earliest = Time.at(options[:earliest])
  from = {'_id' => {'$gte' => BSON::ObjectId.from_time(earliest)}}

  snapshot = options[:snapshot]

  puts "Deleting duplicates from collection #{collection}"
  puts "Deleting duplicates after #{earliest}"
  puts "Perform clean up every #{snapshot} records"

  # Resolve the collection handle and unique field once; col_info builds
  # fresh collection handles on every call.
  info = col_info[collection]
  col  = info[:col]
  unq  = info[:unq]

  # Various counters to report stats
  processed = total_processed = removed = 0

  data = Hash.new

  # The following code needs to save intermediate results to cope
  # with large datasets
  col.find(from, :fields => unq).each do |r|
    _id = r["_id"]
    commit = read_value(r, unq)

    # If entries cannot be parsed, remove them
    if commit.empty?
      puts "Deleting unknown entry #{_id}"
      removed += 1 if delete_by_id col, _id
    else
      (data[commit] ||= []) << _id
    end

    processed += 1
    total_processed += 1

    print "\rProcessed #{processed} records"

    # Calculate duplicates, save intermediate result
    if snapshot > 0 && processed > snapshot
      puts "\nLoaded #{data.size} values, cleaning"
      removed += remove_duplicates data, col
      data = Hash.new
      processed = 0
    end
  end

  # Clean whatever is left after the last snapshot
  removed += remove_duplicates data, col

  puts "\nProcessed #{total_processed}, deleted #{removed} duplicates"
end
|
88
132
|
end
|
89
133
|
|
90
|
-
|
91
|
-
|
92
|
-
puts "Processed #{total_processed}, deleted #{removed} duplicates"
|
134
|
+
GHRMDupl.run
|
data/lib/ghtorrent.rb
CHANGED
@@ -1,6 +1,4 @@
|
|
1
1
|
module GHTorrent
|
2
|
-
VERSION = '0.4'
|
3
|
-
|
4
2
|
# Route keys used for setting up queues for events, using GHTorrent
|
5
3
|
ROUTEKEY_CREATE = "evt.CreateEvent"
|
6
4
|
ROUTEKEY_DELETE = "evt.DeleteEvent"
|
@@ -22,19 +20,31 @@ module GHTorrent
|
|
22
20
|
|
23
21
|
end
|
24
22
|
|
25
|
-
|
23
|
+
# Shared extensions to library methods
|
24
|
+
require 'ghtorrent/hash'
|
25
|
+
require 'ghtorrent/time'
|
26
|
+
require 'ghtorrent/bson_orderedhash'
|
26
27
|
|
28
|
+
# Basic utility modules
|
29
|
+
require 'version'
|
30
|
+
require 'ghtorrent/gh_torrent_exception'
|
27
31
|
require 'ghtorrent/utils'
|
28
32
|
require 'ghtorrent/logging'
|
29
33
|
require 'ghtorrent/settings'
|
34
|
+
require 'ghtorrent/cache'
|
30
35
|
require 'ghtorrent/api_client'
|
31
|
-
require 'ghtorrent/call_stack'
|
32
36
|
|
37
|
+
# Support for command line utilities offered by this gem
|
38
|
+
require 'ghtorrent/command'
|
39
|
+
|
40
|
+
# Configuration and drivers for caching retrieved data
|
33
41
|
require 'ghtorrent/adapters/base_adapter'
|
34
42
|
require 'ghtorrent/adapters/mongo_persister'
|
35
43
|
require 'ghtorrent/adapters/noop_persister'
|
36
44
|
|
45
|
+
# Support for retrieving and saving intermediate results
|
37
46
|
require 'ghtorrent/persister'
|
38
47
|
require 'ghtorrent/retriever'
|
39
48
|
|
49
|
+
# SQL database fillup methods
|
40
50
|
require 'ghtorrent/ghtorrent'
|
@@ -3,13 +3,14 @@ module GHTorrent
|
|
3
3
|
class BaseAdapter
|
4
4
|
|
5
5
|
ENTITIES = [:users, :commits, :followers, :repos, :events, :org_members,
|
6
|
-
:commit_comments, :repo_collaborators, :watchers
|
6
|
+
:commit_comments, :repo_collaborators, :watchers, :pull_requests,
|
7
|
+
:forks, :pull_request_comments, :issue_comments, :issues
|
7
8
|
]
|
8
9
|
|
9
10
|
# Stores +data+ into +entity+. Returns a unique key for the stored entry.
# This base implementation only checks that +entity+ is listed in ENTITIES
# and raises GHTorrentException otherwise; it does not store anything.
def store(entity, data = {})
  unless ENTITIES.include?(entity)
    raise GHTorrentException.new("Persister: Entity #{entity} not known")
  end
end
|
15
16
|
|
@@ -50,14 +51,14 @@ module GHTorrent
|
|
50
51
|
# matching JSON object.
|
51
52
|
def find(entity, query = {})
  # The base implementation only validates +entity+ against ENTITIES;
  # it performs no lookup itself.
  unless ENTITIES.include?(entity)
    raise GHTorrentException.new("Persister: Entity #{entity} not known")
  end
end
|
56
57
|
|
57
58
|
# Find the record identified by +id+ in +entity+.
# This base implementation only checks that +entity+ is listed in ENTITIES
# and raises GHTorrentException otherwise.
def find_by_ext_ref_id(entity, id)
  unless ENTITIES.include?(entity)
    raise GHTorrentException.new("Persister: Entity #{entity} not known")
  end
end
|
63
64
|
|
@@ -65,8 +66,19 @@ module GHTorrent
|
|
65
66
|
# The +query+ can be any query supported by +find+.
|
66
67
|
def count(entity, query = {})
  # The base implementation only validates +entity+ against ENTITIES;
  # it does not count anything itself.
  unless ENTITIES.include?(entity)
    raise GHTorrentException.new("Persister: Entity #{entity} not known")
  end
end
|
72
|
+
|
73
|
+
# Get a raw connection to the underlying data store. The connection is
# implementation dependent. The base class has no data store, so it always
# raises a RuntimeError with the message "Unimplemented".
def get_underlying_connection
  raise RuntimeError, "Unimplemented"
end
|
78
|
+
|
79
|
+
# Close the current connection and release any held resources. The base
# class holds no connection, so it always raises a RuntimeError with the
# message "Unimplemented".
def close
  raise RuntimeError, "Unimplemented"
end
|
71
83
|
end
|
72
84
|
end
|
@@ -1,4 +1,6 @@
|
|
1
1
|
require 'mongo'
|
2
|
+
require 'ghtorrent/adapters/base_adapter'
|
3
|
+
require 'ghtorrent/bson_orderedhash'
|
2
4
|
|
3
5
|
module GHTorrent
|
4
6
|
|
@@ -14,7 +16,8 @@ module GHTorrent
|
|
14
16
|
:mongo_port => "mongo.port",
|
15
17
|
:mongo_db => "mongo.db",
|
16
18
|
:mongo_username => "mongo.username",
|
17
|
-
:mongo_passwd => "mongo.password"
|
19
|
+
:mongo_passwd => "mongo.password",
|
20
|
+
:mongo_replicas => "mongo.replicas"
|
18
21
|
}
|
19
22
|
|
20
23
|
attr_reader :settings
|
@@ -27,47 +30,21 @@ module GHTorrent
|
|
27
30
|
|
28
31
|
@settings = set
|
29
32
|
@uniq = config(:uniq_id)
|
30
|
-
@mongo = Mongo::Connection.new(config(:mongo_host),
|
31
|
-
config(:mongo_port))\
|
32
|
-
.db(config(:mongo_db))
|
33
|
-
@enttodb = {
|
34
|
-
:users => get_collection("users"),
|
35
|
-
:commits => get_collection("commits"),
|
36
|
-
:repos => get_collection("repos"),
|
37
|
-
:followers => get_collection("followers"),
|
38
|
-
:events => get_collection("events"),
|
39
|
-
:org_members => get_collection("org_members"),
|
40
|
-
:commit_comments => get_collection("commit_comments"),
|
41
|
-
:repo_collaborators => get_collection("repo_collaborators"),
|
42
|
-
:watchers => get_collection("watchers")
|
43
|
-
}
|
44
|
-
|
45
|
-
# Ensure that the necessary indexes exist
|
46
|
-
ensure_index(:events, "id")
|
47
|
-
ensure_index(:users, "login")
|
48
|
-
ensure_index(:commits, "sha")
|
49
|
-
ensure_index(:repos, "name")
|
50
|
-
ensure_index(:followers, "follows")
|
51
|
-
ensure_index(:org_members, "org")
|
52
|
-
ensure_index(:commit_comments, "repo")
|
53
|
-
ensure_index(:commit_comments, "user")
|
54
|
-
ensure_index(:commit_comments, "commit_id")
|
55
|
-
ensure_index(:repo_collaborators, "repo")
|
56
|
-
ensure_index(:repo_collaborators, "owner")
|
57
|
-
ensure_index(:repo_collaborators, "login")
|
58
|
-
ensure_index(:watchers, "repo")
|
59
|
-
ensure_index(:watchers, "owner")
|
60
|
-
ensure_index(:watchers, "login")
|
61
33
|
end
|
62
34
|
|
63
35
|
# Inserts +data+ into the collection backing +entity+ and returns the
# insert result converted to a String. +super+ runs first; the insert itself
# is wrapped in #rescue_connection_failure.
def store(entity, data = {})
  super
  rescue_connection_failure { get_entity(entity).insert(data).to_s }
end
|
67
41
|
|
68
42
|
def find(entity, query = {})
|
69
43
|
super
|
70
|
-
result =
|
44
|
+
result = rescue_connection_failure do
|
45
|
+
get_entity(entity).find(query)
|
46
|
+
end
|
47
|
+
|
71
48
|
result.to_a.map { |r|
|
72
49
|
r[@uniq] = r['_id'].to_s;
|
73
50
|
r.to_h
|
@@ -83,27 +60,86 @@ module GHTorrent
|
|
83
60
|
# Count the number of items returned by +query+ in the collection backing
# +entity+. +query+ defaults to an empty hash, matching the signature
# declared by the base adapter. +super+ runs first; the count itself is
# wrapped in #rescue_connection_failure.
def count(entity, query = {})
  super
  rescue_connection_failure do
    get_entity(entity).count(:query => query)
  end
end
|
67
|
+
|
68
|
+
# Returns the handle produced by #mongo.
def get_underlying_connection
  return mongo
end
|
71
|
+
|
72
|
+
# Closes the connection behind the current database handle, if one is open,
# and forgets the handle so that #mongo reconnects on next use.
#
# @mongo holds the object returned by +.db(...)+ (see #mongo), not the
# connection itself, so the connection is reached through
# +@mongo.connection+.
def close
  return if @mongo.nil?

  begin
    @mongo.connection.close
  ensure
    # Drop the handle even if closing failed, so a stale one is not reused.
    @mongo = nil
  end
end
|
88
80
|
|
89
81
|
private
|
90
82
|
|
91
83
|
# Looks up the collection named +col+ (converted with to_s) on the handle
# returned by #mongo.
def get_collection(col)
  name = col.to_s
  mongo.collection(name)
end
|
94
86
|
|
95
87
|
# Maps an entity symbol to the collection of the same name.
# Returns nil for entities that are not listed here.
#
# :issues is included because it is declared in ENTITIES; without it a
# call such as store(:issues, ...) would receive nil and fail.
def get_entity(entity)
  case entity
  when :users, :commits, :repos, :followers, :org_members, :events,
       :commit_comments, :repo_collaborators, :watchers, :pull_requests,
       :forks, :pull_request_comments, :issue_comments, :issues
    get_collection(entity.to_s)
  end
end
|
97
117
|
|
98
|
-
|
99
|
-
|
118
|
+
# Returns the database handle, connecting on first use.
#
# Without a :mongo_replicas setting a single Mongo::Connection to
# :mongo_host/:mongo_port is opened. Otherwise :mongo_replicas is read as a
# whitespace separated list of host names; each is paired with :mongo_port,
# the configured :mongo_host is appended, and a Mongo::ReplSetConnection is
# opened with :read => :secondary. If the selected database has no
# collections yet, #init_db is run on it.
def mongo
  return @mongo unless @mongo.nil?

  replicas = config(:mongo_replicas)

  connection =
    if replicas.nil?
      Mongo::Connection.new(config(:mongo_host), config(:mongo_port))
    else
      # String#split without arguments ignores leading/trailing whitespace
      # and collapses runs of it, so stray spaces cannot yield empty hosts.
      seeds = replicas.split.map { |host| "#{host}:#{config(:mongo_port)}" }
      seeds << "#{config(:mongo_host)}:#{config(:mongo_port)}"
      Mongo::ReplSetConnection.new(seeds, :read => :secondary)
    end

  @mongo = connection.db(config(:mongo_db))
  init_db(@mongo) if @mongo.collections.size <= 0
  @mongo
end
|
103
139
|
|
104
140
|
# Declare an index on +field+ for +collection+ if it does not exist
|
105
141
|
def ensure_index(collection, field)
|
106
|
-
col =
|
142
|
+
col = get_entity(collection)
|
107
143
|
|
108
144
|
exists = col.index_information.find {|k,v|
|
109
145
|
k == "#{field}_1"
|
@@ -115,21 +151,51 @@ module GHTorrent
|
|
115
151
|
end
|
116
152
|
end
|
117
153
|
|
118
|
-
|
119
|
-
|
154
|
+
# Prepares a database: asks +mongo+ for the collection of every entity in
# ENTITIES, then calls #ensure_index for each (entity, field) pair below, in
# the listed order. Returns the result of the last #ensure_index call.
def init_db(mongo)
  ENTITIES.each { |entity| mongo.collection(entity.to_s) }

  # Ensure that the necessary indexes exist
  [
    [:events, "id"],
    [:users, "login"],
    [:commits, "sha"],
    [:repos, "name"],
    [:followers, "follows"],
    [:org_members, "org"],
    [:commit_comments, "repo"],
    [:commit_comments, "user"],
    [:commit_comments, "commit_id"],
    [:repo_collaborators, "repo"],
    [:repo_collaborators, "owner"],
    [:repo_collaborators, "login"],
    [:watchers, "repo"],
    [:watchers, "owner"],
    [:watchers, "login"],
    [:pull_requests, "repo"],
    [:pull_requests, "owner"],
    [:forks, "repo"],
    [:forks, "owner"],
    [:forks, "id"],
    [:issue_comments, "repo"],
    [:issue_comments, "owner"],
    [:issue_comments, "issue_id"],
    [:issue_comments, "id"],
    [:pull_request_comments, "repo"],
    [:pull_request_comments, "owner"],
    [:pull_request_comments, "pullreq_id"],
    [:pull_request_comments, "id"]
  ].inject(nil) { |_last, (entity, field)| ensure_index(entity, field) }
end
|
120
187
|
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
acc
|
188
|
+
# Runs the given block and returns its value, retrying when it raises
# Mongo::ConnectionFailure. After each failure the method sleeps for half a
# second; once more than +max_retries+ failures have occurred the last
# exception is re-raised.
#
# Before retrying, a replica set connection is asked to refresh. @mongo
# holds the object returned by +.db(...)+ (see #mongo), so the connection is
# reached through +@mongo.connection+ rather than tested on @mongo itself.
def rescue_connection_failure(max_retries=60)
  retries = 0
  begin
    yield
  rescue Mongo::ConnectionFailure => ex
    retries += 1
    raise ex if retries > max_retries
    sleep(0.5)
    # @mongo may still be nil if the very first connection attempt failed.
    connection = @mongo && @mongo.connection
    connection.refresh if connection.is_a?(Mongo::ReplSetConnection)
    retry
  end
end
|
134
200
|
end
|
135
|
-
end
|
201
|
+
end
|