ghtorrent 0.2 → 0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +12 -4
- data/bin/ght-data-retrieval +13 -3
- data/bin/ght-load +1 -1
- data/bin/ght-mirror-events +47 -17
- data/bin/ght-periodic-dump +51 -13
- data/lib/ghtorrent.rb +1 -1
- data/lib/ghtorrent/adapters/base_adapter.rb +11 -2
- data/lib/ghtorrent/adapters/mongo_persister.rb +45 -16
- data/lib/ghtorrent/api_client.rb +51 -17
- data/lib/ghtorrent/command.rb +43 -2
- data/lib/ghtorrent/ghtorrent.rb +265 -71
- data/lib/ghtorrent/migrations/001_init_schema.rb +5 -3
- data/lib/ghtorrent/migrations/{003_add_external_ref_ids.rb → 002_add_external_ref_ids.rb} +0 -0
- data/lib/ghtorrent/migrations/003_add_orgs.rb +37 -0
- data/lib/ghtorrent/migrations/004_add_commit_comments.rb +27 -0
- data/lib/ghtorrent/retriever.rb +146 -8
- data/lib/ghtorrent/settings.rb +1 -0
- data/lib/ghtorrent/utils.rb +13 -0
- data/test/callstack_test.rb +1 -1
- metadata +38 -5
- data/lib/ghtorrent/migrations/002_add_followers_created_at.rb +0 -15
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
ghtorrent: Mirror and process the Github event stream
|
2
2
|
=========================================================
|
3
3
|
|
4
4
|
A collection of scripts used to mirror the Github event stream, for
|
@@ -9,8 +9,8 @@ GHTorrent relies on the following software to work:
|
|
9
9
|
|
10
10
|
* MongoDB > 2.0
|
11
11
|
* RabbitMQ >= 2.7
|
12
|
-
* An SQL database compatible with [Sequel](http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html).
|
13
|
-
so your mileage may vary if you are using other databases.
|
12
|
+
* An SQL database compatible with [Sequel](http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html).
|
13
|
+
GHTorrent is tested with SQLite and MySQL, so your mileage may vary if you are using other databases.
|
14
14
|
|
15
15
|
GHTorrent is written in Ruby (tested with 1.8 and JRuby). To install
|
16
16
|
it as a Gem do:
|
@@ -19,6 +19,14 @@ it as a Gem do:
|
|
19
19
|
sudo gem install ghtorrent
|
20
20
|
</code>
|
21
21
|
|
22
|
+
Depending on which SQL database you want to use, install the appropriate dependency gem.
|
23
|
+
GHTorrent already installs the `sqlite3` gem (if it fails, install the development
|
24
|
+
package for `sqlite3` for your system).
|
25
|
+
|
26
|
+
<code>
|
27
|
+
sudo gem install mysql2 #or postgres
|
28
|
+
</code>
|
29
|
+
|
22
30
|
#### Configuring
|
23
31
|
|
24
32
|
Copy the contents of the
|
@@ -124,7 +132,7 @@ please consider citing the following paper:
|
|
124
132
|
|
125
133
|
Georgios Gousios <gousiosg@gmail.com>
|
126
134
|
|
127
|
-
Diomidis Spinellis
|
135
|
+
[Diomidis Spinellis](http://www.dmst.aueb.gr/dds) <dds@aueb.gr>
|
128
136
|
|
129
137
|
#### License
|
130
138
|
|
data/bin/ght-data-retrieval
CHANGED
@@ -38,7 +38,7 @@ class GHTDataRetrieval < GHTorrent::Command
|
|
38
38
|
|
39
39
|
include GHTorrent::Settings
|
40
40
|
|
41
|
-
attr_reader :settings
|
41
|
+
attr_reader :settings, :name
|
42
42
|
|
43
43
|
def parse(msg)
|
44
44
|
JSON.parse(msg)
|
@@ -71,13 +71,23 @@ class GHTDataRetrieval < GHTorrent::Command
|
|
71
71
|
%w(PushEvent WatchEvent FollowEvent)
|
72
72
|
end
|
73
73
|
|
74
|
+
def prepare_options(options)
|
75
|
+
@name = "ght-data-retrieval"
|
76
|
+
end
|
77
|
+
|
74
78
|
def go
|
75
79
|
@gh = GHTorrent::Mirror.new(options[:config])
|
76
80
|
@settings = @gh.settings
|
77
81
|
|
78
82
|
# Graceful exit
|
79
|
-
Signal.trap('INT') {
|
80
|
-
|
83
|
+
Signal.trap('INT') {
|
84
|
+
info ("Received SIGINT, exiting")
|
85
|
+
AMQP.stop { EM.stop }
|
86
|
+
}
|
87
|
+
Signal.trap('TERM') {
|
88
|
+
info ("Received SIGTERM, exiting")
|
89
|
+
AMQP.stop { EM.stop }
|
90
|
+
}
|
81
91
|
|
82
92
|
AMQP.start(:host => config(:amqp_host),
|
83
93
|
:port => config(:amqp_port),
|
data/bin/ght-load
CHANGED
@@ -113,7 +113,7 @@ Loads object ids from a collection to a queue for further processing.
|
|
113
113
|
:commits
|
114
114
|
end
|
115
115
|
|
116
|
-
puts "Loading
|
116
|
+
puts "Loading from collection #{collection}"
|
117
117
|
puts "Loading items after #{Time.at(options[:earliest])}" if options[:verbose]
|
118
118
|
|
119
119
|
what = case
|
data/bin/ght-mirror-events
CHANGED
@@ -41,26 +41,51 @@ class GHTMirrorEvents < GHTorrent::Command
|
|
41
41
|
include GHTorrent::Settings
|
42
42
|
include GHTorrent::Logging
|
43
43
|
include GHTorrent::Persister
|
44
|
+
include GHTorrent::APIClient
|
44
45
|
|
45
46
|
attr_reader :settings
|
46
47
|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
events.each do |e|
|
54
|
-
unless @persister.find(:events, {'id' => e['id']}).empty?
|
55
|
-
info "Already got #{e['id']}"
|
56
|
-
dupl += 1
|
57
|
-
next
|
58
|
-
end
|
48
|
+
def initialize(args)
|
49
|
+
super(args)
|
50
|
+
@args = args
|
51
|
+
@name = self.class.name
|
52
|
+
end
|
59
53
|
|
54
|
+
def store_count(events)
|
55
|
+
stored = Array.new
|
56
|
+
new = dupl = 0
|
57
|
+
events.each do |e|
|
58
|
+
if @persister.find(:events, {'id' => e['id']}).empty?
|
59
|
+
stored << e
|
60
60
|
new += 1
|
61
61
|
@persister.store(:events, e)
|
62
62
|
info "Added #{e['id']}"
|
63
|
+
else
|
64
|
+
info "Already got #{e['id']}"
|
65
|
+
dupl += 1
|
66
|
+
end
|
67
|
+
end
|
68
|
+
return new, dupl, stored
|
69
|
+
end
|
63
70
|
|
71
|
+
# Retrieve events from Github, store them in the DB
|
72
|
+
def retrieve(exchange)
|
73
|
+
begin
|
74
|
+
new = dupl = 0
|
75
|
+
events = api_request "https://api.github.com/events"
|
76
|
+
(new, dupl, stored) = store_count events
|
77
|
+
|
78
|
+
# No duplicates means that the first page may not contain all new events. Go
|
79
|
+
# up to 10 pages back to find all new events not contained in first page.
|
80
|
+
if dupl == 0
|
81
|
+
events = paged_api_request "https://api.github.com/events", 10
|
82
|
+
(new1, dupl1, stored1) = store_count events
|
83
|
+
stored = stored | stored1
|
84
|
+
new = new + new1
|
85
|
+
new
|
86
|
+
end
|
87
|
+
|
88
|
+
stored.each do |e|
|
64
89
|
msg = JSON.dump(e)
|
65
90
|
key = "evt.%s" % e['type']
|
66
91
|
exchange.publish msg, :persistent => true, :routing_key => key
|
@@ -73,19 +98,24 @@ class GHTMirrorEvents < GHTorrent::Command
|
|
73
98
|
end
|
74
99
|
|
75
100
|
def prepare_options(options)
|
76
|
-
|
101
|
+
@name = "ght-mirror-events"
|
77
102
|
end
|
78
103
|
|
79
104
|
def go
|
80
|
-
|
81
105
|
@gh = GHTorrent::Mirror.new(options[:config])
|
82
106
|
@settings = @gh.settings
|
83
107
|
@persister = connect(:mongo, @settings)
|
84
108
|
@logger = Logger.new(STDOUT)
|
85
109
|
|
86
110
|
# Graceful exit
|
87
|
-
Signal.trap('INT') {
|
88
|
-
|
111
|
+
Signal.trap('INT') {
|
112
|
+
info ("Received SIGINT, exiting")
|
113
|
+
AMQP.stop { EM.stop }
|
114
|
+
}
|
115
|
+
Signal.trap('TERM') {
|
116
|
+
info ("Received SIGTERM, exiting")
|
117
|
+
AMQP.stop { EM.stop }
|
118
|
+
}
|
89
119
|
|
90
120
|
# The event loop
|
91
121
|
AMQP.start(:host => config(:amqp_host),
|
@@ -105,7 +135,7 @@ class GHTMirrorEvents < GHTorrent::Command
|
|
105
135
|
# Initial delay for the retrieve event loop
|
106
136
|
retrieval_delay = config(:mirror_pollevery)
|
107
137
|
|
108
|
-
# Retrieve
|
138
|
+
# Retrieve events
|
109
139
|
retriever = EventMachine.add_periodic_timer(retrieval_delay) do
|
110
140
|
(new, dupl) = retrieve exchange
|
111
141
|
dupl_msgs += dupl
|
data/bin/ght-periodic-dump
CHANGED
@@ -6,23 +6,53 @@
|
|
6
6
|
# Directory to place compressed files and torrents
|
7
7
|
OUTDIR=/home/data/github-mirror/dumps
|
8
8
|
|
9
|
-
# Base URL for HTTP dir containing torrents and data
|
9
|
+
# Base URL for HTTP dir containing torrents and data
|
10
10
|
WEBSEED=http://ikaria.dmst.aueb.gr/ghtorrent/
|
11
11
|
|
12
|
+
usage()
|
13
|
+
{
|
14
|
+
echo "Usage: $0 [-f 'yyyy-mm-dd hh:mm'] [-t 'yyyy-mm-dd hh:mm']"
|
15
|
+
echo " [-c collection_to_dump]"
|
16
|
+
echo "Dump the database. -f earliest record timestamp"
|
17
|
+
echo " -t latest record timestamp"
|
18
|
+
echo " -c collection to dump (default: all)"
|
19
|
+
}
|
20
|
+
|
21
|
+
if [ -z $1 ]
|
22
|
+
then
|
23
|
+
usage
|
24
|
+
exit 1
|
25
|
+
fi
|
26
|
+
|
27
|
+
while getopts "f:t:c:" o
|
28
|
+
do
|
29
|
+
case $o in
|
30
|
+
f) timeStart=`date -d "$OPTARG" +%s` ;;
|
31
|
+
t) timeEnd=`date -d "$OPTARG" +%s` ;;
|
32
|
+
c) collection=$OPTARG ;;
|
33
|
+
\?) echo "Invalid option: -$OPTARG" >&2
|
34
|
+
usage
|
35
|
+
exit 1
|
36
|
+
;;
|
37
|
+
esac
|
38
|
+
done
|
39
|
+
|
40
|
+
|
12
41
|
# Time to start dumping from
|
13
|
-
if [ -
|
42
|
+
if [ -z $timeStart ]
|
14
43
|
then
|
15
|
-
|
16
|
-
|
17
|
-
|
44
|
+
if [ -r lastrun ]
|
45
|
+
then
|
46
|
+
timeStart=`cat lastrun`
|
47
|
+
else
|
48
|
+
timeStart=0
|
49
|
+
fi
|
18
50
|
fi
|
19
51
|
|
20
52
|
# Time to end dumping
|
21
|
-
if [
|
53
|
+
if [ -z $timeEnd ]
|
22
54
|
then
|
23
55
|
timeEnd=`date +%s`
|
24
|
-
else
|
25
|
-
timeEnd=`date -d "$1" +%s` || exit 1
|
26
56
|
fi
|
27
57
|
|
28
58
|
# Name used for the files
|
@@ -37,11 +67,19 @@ dateName=`date -d @$timeEnd -u +'%Y-%m-%d'`
|
|
37
67
|
endId=`printf '%08x0000000000000000' $timeEnd`
|
38
68
|
startId=`printf '%08x0000000000000000' $timeStart`
|
39
69
|
|
70
|
+
|
71
|
+
if [ -z $collection ]
|
72
|
+
then
|
73
|
+
collections=`echo "show collections"|mongo --quiet github|egrep -v "system|bye"`
|
74
|
+
else
|
75
|
+
collections=$collection
|
76
|
+
fi
|
77
|
+
|
40
78
|
echo "Dumping database from `date -d @$timeStart` to `date -d @$timeEnd`"
|
41
79
|
|
42
|
-
collections=`echo "show collections"|mongo --quiet github|egrep -v "system|bye"`
|
43
|
-
|
44
80
|
rm -rf dump
|
81
|
+
mkdir -p dump/github
|
82
|
+
|
45
83
|
for col in $collections; do
|
46
84
|
|
47
85
|
echo "Dumping $col"
|
@@ -62,8 +100,8 @@ for col in $collections; do
|
|
62
100
|
(
|
63
101
|
echo "Start date: `date -u -d @$timeStart +'%Y-%m-%dT%H:%M:%SZ'`"
|
64
102
|
echo "End date: `date -u -d @$timeEnd +'%Y-%m-%dT%H:%M:%SZ'`"
|
65
|
-
meta $col
|
66
|
-
)
|
103
|
+
meta $col
|
104
|
+
)
|
67
105
|
done |
|
68
106
|
tee README.$dateName.txt >dump/github/README.txt || exit 1
|
69
107
|
|
@@ -88,5 +126,5 @@ done
|
|
88
126
|
echo $timeEnd >lastrun || exit 1
|
89
127
|
|
90
128
|
# Clean up
|
91
|
-
rm -rf dump
|
129
|
+
rm -rf dump
|
92
130
|
|
data/lib/ghtorrent.rb
CHANGED
@@ -30,8 +30,9 @@ module GHTorrent
|
|
30
30
|
|
31
31
|
class BaseAdapter
|
32
32
|
|
33
|
-
ENTITIES = [:users, :commits, :followers, :repos, :events
|
34
|
-
|
33
|
+
ENTITIES = [:users, :commits, :followers, :repos, :events, :org_members,
|
34
|
+
:commit_comments
|
35
|
+
]
|
35
36
|
|
36
37
|
# Stores +data+ into +entity+. Returns a unique key for the stored entry.
|
37
38
|
def store(entity, data = {})
|
@@ -87,5 +88,13 @@ module GHTorrent
|
|
87
88
|
throw GHTorrentException.new("Perister: Entity #{entity} not known")
|
88
89
|
end
|
89
90
|
end
|
91
|
+
|
92
|
+
# Count the number of entries returned by +query+ without retrieving them.
|
93
|
+
# The +query+ can be any query supported by +find+.
|
94
|
+
def count(entity, query = {})
|
95
|
+
unless ENTITIES.include?(entity)
|
96
|
+
throw GHTorrentException.new("Perister: Entity #{entity} not known")
|
97
|
+
end
|
98
|
+
end
|
90
99
|
end
|
91
100
|
end
|
@@ -49,6 +49,7 @@ module GHTorrent
|
|
49
49
|
|
50
50
|
# Creates a new instance of the MongoDB persistence adapter.
|
51
51
|
# Expects a parsed YAML settings document as input.
|
52
|
+
# Will create indexes on fields most frequently used in queries.
|
52
53
|
def initialize(set)
|
53
54
|
merge LOCALCONFIG
|
54
55
|
|
@@ -62,32 +63,31 @@ module GHTorrent
|
|
62
63
|
:commits => get_collection("commits"),
|
63
64
|
:repos => get_collection("repos"),
|
64
65
|
:followers => get_collection("followers"),
|
65
|
-
:events => get_collection("events")
|
66
|
+
:events => get_collection("events"),
|
67
|
+
:org_members => get_collection("org_members"),
|
68
|
+
:commit_comments => get_collection("commit_comments")
|
66
69
|
}
|
70
|
+
|
71
|
+
# Ensure that the necessary indexes exist
|
72
|
+
ensure_index(:users, "login")
|
73
|
+
ensure_index(:commits, "sha")
|
74
|
+
ensure_index(:repos, "name")
|
75
|
+
ensure_index(:followers, "follows")
|
76
|
+
ensure_index(:org_members, "org")
|
77
|
+
ensure_index(:commit_comments, "repo")
|
78
|
+
ensure_index(:commit_comments, "user")
|
79
|
+
ensure_index(:commit_comments, "commit_id")
|
67
80
|
end
|
68
81
|
|
69
82
|
|
70
83
|
def store(entity, data = {})
|
71
84
|
super
|
72
|
-
|
73
|
-
|
74
|
-
if col.nil?
|
75
|
-
raise GHTorrentException.new("Mongo: Entity #{entity} not supported")
|
76
|
-
end
|
77
|
-
|
78
|
-
col.insert(data).to_s
|
85
|
+
get_entity(entity).insert(data).to_s
|
79
86
|
end
|
80
87
|
|
81
88
|
def find(entity, query = {})
|
82
89
|
super
|
83
|
-
|
84
|
-
col = @enttodb[entity]
|
85
|
-
|
86
|
-
if col.nil?
|
87
|
-
raise GHTorrentException.new("Mongo: Entity #{entity} not supported")
|
88
|
-
end
|
89
|
-
|
90
|
-
result = col.find(query)
|
90
|
+
result = get_entity(entity).find(query)
|
91
91
|
result.to_a.map { |r|
|
92
92
|
r[@uniq] = r['_id'].to_s;
|
93
93
|
r.to_h
|
@@ -100,12 +100,41 @@ module GHTorrent
|
|
100
100
|
raise NotImplementedError
|
101
101
|
end
|
102
102
|
|
103
|
+
# Count the number of items returned by +query+
|
104
|
+
def count(entity, query)
|
105
|
+
super
|
106
|
+
get_entity(entity).count(:query => query)
|
107
|
+
end
|
108
|
+
|
103
109
|
private
|
104
110
|
|
105
111
|
def get_collection(col)
|
106
112
|
@mongo.collection(col.to_s)
|
107
113
|
end
|
108
114
|
|
115
|
+
def get_entity(entity)
|
116
|
+
col = @enttodb[entity]
|
117
|
+
|
118
|
+
if col.nil?
|
119
|
+
raise GHTorrentException.new("Mongo: Entity #{entity} not supported")
|
120
|
+
end
|
121
|
+
col
|
122
|
+
end
|
123
|
+
|
124
|
+
# Declare an index on +field+ for +collection+ if it does not exist
|
125
|
+
def ensure_index(collection, field)
|
126
|
+
col = @enttodb[collection]
|
127
|
+
|
128
|
+
exists = col.index_information.find {|k,v|
|
129
|
+
k == "#{field}_1"
|
130
|
+
}
|
131
|
+
|
132
|
+
if exists.nil?
|
133
|
+
col.create_index(field, :background => true)
|
134
|
+
STDERR.puts "Creating index on #{collection}(#{field})"
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
109
138
|
end
|
110
139
|
end
|
111
140
|
|
data/lib/ghtorrent/api_client.rb
CHANGED
@@ -40,33 +40,67 @@ module GHTorrent
|
|
40
40
|
@num_api_calls = 0
|
41
41
|
@ts = Time.now().tv_sec()
|
42
42
|
end
|
43
|
-
|
43
|
+
|
44
|
+
# A paged request. Used when the result can expand to more than one
|
45
|
+
# page of results.
|
44
46
|
def paged_api_request(url, pages = -1)
|
45
47
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
48
|
+
data = api_request_raw(url)
|
49
|
+
|
50
|
+
return [] if data.nil?
|
51
|
+
|
52
|
+
unless data.meta['link'].nil?
|
53
|
+
links = parse_links(data.meta['link'])
|
54
|
+
|
55
|
+
if pages > 0
|
56
|
+
pages = pages - 1
|
57
|
+
if pages == 0
|
58
|
+
return parse_request_result(data)
|
59
|
+
end
|
60
|
+
end
|
52
61
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
62
|
+
if links['next'].nil?
|
63
|
+
parse_request_result(data)
|
64
|
+
else
|
65
|
+
parse_request_result(data) | paged_api_request(links['next'], pages)
|
66
|
+
end
|
67
|
+
else
|
68
|
+
parse_request_result(data)
|
69
|
+
end
|
59
70
|
end
|
60
71
|
|
72
|
+
# A normal request. Returns a hash or an array of hashes representing the
|
73
|
+
# parsed JSON result.
|
61
74
|
def api_request(url)
|
62
|
-
|
75
|
+
parse_request_result api_request_raw(url)
|
76
|
+
end
|
77
|
+
|
78
|
+
private
|
79
|
+
|
80
|
+
# Parse a Github link header
|
81
|
+
def parse_links(links)
|
82
|
+
links.split(/,/).reduce({}) do |acc, x|
|
83
|
+
matches = x.strip.match(/<(.*)>; rel=\"(.*)\"/)
|
84
|
+
acc[matches[2]] = matches[1]
|
85
|
+
acc
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
# Parse the JSON result array
|
90
|
+
def parse_request_result(result)
|
63
91
|
if result.nil?
|
64
|
-
|
92
|
+
[]
|
65
93
|
else
|
66
|
-
|
94
|
+
json = result.read
|
95
|
+
if json.nil?
|
96
|
+
[]
|
97
|
+
else
|
98
|
+
JSON.parse(json)
|
99
|
+
end
|
67
100
|
end
|
68
101
|
end
|
69
102
|
|
103
|
+
# Do the actual request and return the result object
|
70
104
|
def api_request_raw(url)
|
71
105
|
#Rate limiting to avoid error requests
|
72
106
|
if Time.now().tv_sec() - @ts < 60 then
|
@@ -86,7 +120,7 @@ module GHTorrent
|
|
86
120
|
@num_api_calls += 1
|
87
121
|
debug "APIClient: Request: #{url} (num_calls = #{@num_api_calls})"
|
88
122
|
begin
|
89
|
-
open(url)
|
123
|
+
open(url)
|
90
124
|
rescue OpenURI::HTTPError => e
|
91
125
|
case e.io.status[0].to_i
|
92
126
|
# The following indicate valid Github return codes
|