ghtorrent 0.2 → 0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +12 -4
- data/bin/ght-data-retrieval +13 -3
- data/bin/ght-load +1 -1
- data/bin/ght-mirror-events +47 -17
- data/bin/ght-periodic-dump +51 -13
- data/lib/ghtorrent.rb +1 -1
- data/lib/ghtorrent/adapters/base_adapter.rb +11 -2
- data/lib/ghtorrent/adapters/mongo_persister.rb +45 -16
- data/lib/ghtorrent/api_client.rb +51 -17
- data/lib/ghtorrent/command.rb +43 -2
- data/lib/ghtorrent/ghtorrent.rb +265 -71
- data/lib/ghtorrent/migrations/001_init_schema.rb +5 -3
- data/lib/ghtorrent/migrations/{003_add_external_ref_ids.rb → 002_add_external_ref_ids.rb} +0 -0
- data/lib/ghtorrent/migrations/003_add_orgs.rb +37 -0
- data/lib/ghtorrent/migrations/004_add_commit_comments.rb +27 -0
- data/lib/ghtorrent/retriever.rb +146 -8
- data/lib/ghtorrent/settings.rb +1 -0
- data/lib/ghtorrent/utils.rb +13 -0
- data/test/callstack_test.rb +1 -1
- metadata +38 -5
- data/lib/ghtorrent/migrations/002_add_followers_created_at.rb +0 -15
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
ghtorrent: Mirror and process the Github event stream
|
2
2
|
=========================================================
|
3
3
|
|
4
4
|
A collection of scripts used to mirror the Github event stream, for
|
@@ -9,8 +9,8 @@ GHTorrent relies on the following software to work:
|
|
9
9
|
|
10
10
|
* MongoDB > 2.0
|
11
11
|
* RabbitMQ >= 2.7
|
12
|
-
* An SQL database compatible with [Sequel](http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html).
|
13
|
-
so your mileage may vary if you are using other databases.
|
12
|
+
* An SQL database compatible with [Sequel](http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html).
|
13
|
+
GHTorrent is tested with SQLite and MySQL, so your mileage may vary if you are using other databases.
|
14
14
|
|
15
15
|
GHTorrent is written in Ruby (tested with 1.8 and JRuby). To install
|
16
16
|
it as a Gem do:
|
@@ -19,6 +19,14 @@ it as a Gem do:
|
|
19
19
|
sudo gem install ghtorrent
|
20
20
|
</code>
|
21
21
|
|
22
|
+
Depending on which SQL database you want to use, install the appropriate dependency gem.
|
23
|
+
GHTorrent already installs the `sqlite3` gem (if it fails, install the development
|
24
|
+
package for `sqlite3` for your system).
|
25
|
+
|
26
|
+
<code>
|
27
|
+
sudo gem install mysql2 #or postgres
|
28
|
+
</code>
|
29
|
+
|
22
30
|
#### Configuring
|
23
31
|
|
24
32
|
Copy the contents of the
|
@@ -124,7 +132,7 @@ please consider citing the following paper:
|
|
124
132
|
|
125
133
|
Georgios Gousios <gousiosg@gmail.com>
|
126
134
|
|
127
|
-
Diomidis Spinellis
|
135
|
+
[Diomidis Spinellis](http://www.dmst.aueb.gr/dds) <dds@aueb.gr>
|
128
136
|
|
129
137
|
#### License
|
130
138
|
|
data/bin/ght-data-retrieval
CHANGED
@@ -38,7 +38,7 @@ class GHTDataRetrieval < GHTorrent::Command
|
|
38
38
|
|
39
39
|
include GHTorrent::Settings
|
40
40
|
|
41
|
-
attr_reader :settings
|
41
|
+
attr_reader :settings, :name
|
42
42
|
|
43
43
|
def parse(msg)
|
44
44
|
JSON.parse(msg)
|
@@ -71,13 +71,23 @@ class GHTDataRetrieval < GHTorrent::Command
|
|
71
71
|
%w(PushEvent WatchEvent FollowEvent)
|
72
72
|
end
|
73
73
|
|
74
|
+
def prepare_options(options)
|
75
|
+
@name = "ght-data-retrieval"
|
76
|
+
end
|
77
|
+
|
74
78
|
def go
|
75
79
|
@gh = GHTorrent::Mirror.new(options[:config])
|
76
80
|
@settings = @gh.settings
|
77
81
|
|
78
82
|
# Graceful exit
|
79
|
-
Signal.trap('INT') {
|
80
|
-
|
83
|
+
Signal.trap('INT') {
|
84
|
+
info ("Received SIGINT, exiting")
|
85
|
+
AMQP.stop { EM.stop }
|
86
|
+
}
|
87
|
+
Signal.trap('TERM') {
|
88
|
+
info ("Received SIGTERM, exiting")
|
89
|
+
AMQP.stop { EM.stop }
|
90
|
+
}
|
81
91
|
|
82
92
|
AMQP.start(:host => config(:amqp_host),
|
83
93
|
:port => config(:amqp_port),
|
data/bin/ght-load
CHANGED
@@ -113,7 +113,7 @@ Loads object ids from a collection to a queue for further processing.
|
|
113
113
|
:commits
|
114
114
|
end
|
115
115
|
|
116
|
-
puts "Loading
|
116
|
+
puts "Loading from collection #{collection}"
|
117
117
|
puts "Loading items after #{Time.at(options[:earliest])}" if options[:verbose]
|
118
118
|
|
119
119
|
what = case
|
data/bin/ght-mirror-events
CHANGED
@@ -41,26 +41,51 @@ class GHTMirrorEvents < GHTorrent::Command
|
|
41
41
|
include GHTorrent::Settings
|
42
42
|
include GHTorrent::Logging
|
43
43
|
include GHTorrent::Persister
|
44
|
+
include GHTorrent::APIClient
|
44
45
|
|
45
46
|
attr_reader :settings
|
46
47
|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
events.each do |e|
|
54
|
-
unless @persister.find(:events, {'id' => e['id']}).empty?
|
55
|
-
info "Already got #{e['id']}"
|
56
|
-
dupl += 1
|
57
|
-
next
|
58
|
-
end
|
48
|
+
def initialize(args)
|
49
|
+
super(args)
|
50
|
+
@args = args
|
51
|
+
@name = self.class.name
|
52
|
+
end
|
59
53
|
|
54
|
+
def store_count(events)
|
55
|
+
stored = Array.new
|
56
|
+
new = dupl = 0
|
57
|
+
events.each do |e|
|
58
|
+
if @persister.find(:events, {'id' => e['id']}).empty?
|
59
|
+
stored << e
|
60
60
|
new += 1
|
61
61
|
@persister.store(:events, e)
|
62
62
|
info "Added #{e['id']}"
|
63
|
+
else
|
64
|
+
info "Already got #{e['id']}"
|
65
|
+
dupl += 1
|
66
|
+
end
|
67
|
+
end
|
68
|
+
return new, dupl, stored
|
69
|
+
end
|
63
70
|
|
71
|
+
# Retrieve events from Github, store them in the DB
|
72
|
+
def retrieve(exchange)
|
73
|
+
begin
|
74
|
+
new = dupl = 0
|
75
|
+
events = api_request "https://api.github.com/events"
|
76
|
+
(new, dupl, stored) = store_count events
|
77
|
+
|
78
|
+
# No duplicates means that the first page cannot contain all new events. Go
|
79
|
+
# up to 10 pages back to find all new events not contained in the first page.
|
80
|
+
if dupl == 0
|
81
|
+
events = paged_api_request "https://api.github.com/events", 10
|
82
|
+
(new1, dupl1, stored1) = store_count events
|
83
|
+
stored = stored | stored1
|
84
|
+
new = new + new1
|
85
|
+
new
|
86
|
+
end
|
87
|
+
|
88
|
+
stored.each do |e|
|
64
89
|
msg = JSON.dump(e)
|
65
90
|
key = "evt.%s" % e['type']
|
66
91
|
exchange.publish msg, :persistent => true, :routing_key => key
|
@@ -73,19 +98,24 @@ class GHTMirrorEvents < GHTorrent::Command
|
|
73
98
|
end
|
74
99
|
|
75
100
|
def prepare_options(options)
|
76
|
-
|
101
|
+
@name = "ght-mirror-events"
|
77
102
|
end
|
78
103
|
|
79
104
|
def go
|
80
|
-
|
81
105
|
@gh = GHTorrent::Mirror.new(options[:config])
|
82
106
|
@settings = @gh.settings
|
83
107
|
@persister = connect(:mongo, @settings)
|
84
108
|
@logger = Logger.new(STDOUT)
|
85
109
|
|
86
110
|
# Graceful exit
|
87
|
-
Signal.trap('INT') {
|
88
|
-
|
111
|
+
Signal.trap('INT') {
|
112
|
+
info ("Received SIGINT, exiting")
|
113
|
+
AMQP.stop { EM.stop }
|
114
|
+
}
|
115
|
+
Signal.trap('TERM') {
|
116
|
+
info ("Received SIGTERM, exiting")
|
117
|
+
AMQP.stop { EM.stop }
|
118
|
+
}
|
89
119
|
|
90
120
|
# The event loop
|
91
121
|
AMQP.start(:host => config(:amqp_host),
|
@@ -105,7 +135,7 @@ class GHTMirrorEvents < GHTorrent::Command
|
|
105
135
|
# Initial delay for the retrieve event loop
|
106
136
|
retrieval_delay = config(:mirror_pollevery)
|
107
137
|
|
108
|
-
# Retrieve
|
138
|
+
# Retrieve events
|
109
139
|
retriever = EventMachine.add_periodic_timer(retrieval_delay) do
|
110
140
|
(new, dupl) = retrieve exchange
|
111
141
|
dupl_msgs += dupl
|
data/bin/ght-periodic-dump
CHANGED
@@ -6,23 +6,53 @@
|
|
6
6
|
# Directory to place compressed files and torrents
|
7
7
|
OUTDIR=/home/data/github-mirror/dumps
|
8
8
|
|
9
|
-
# Base URL for HTTP dir containing torrents and data
|
9
|
+
# Base URL for HTTP dir containing torrents and data
|
10
10
|
WEBSEED=http://ikaria.dmst.aueb.gr/ghtorrent/
|
11
11
|
|
12
|
+
usage()
|
13
|
+
{
|
14
|
+
echo "Usage: $0 [-f 'yyyy-mm-dd hh:mm'] [-t 'yyyy-mm-dd hh:mm']"
|
15
|
+
echo " [-c collection_to_dump]"
|
16
|
+
echo "Dump the database. -f earliest record timestamp"
|
17
|
+
echo " -t latest record timestamp"
|
18
|
+
echo " -c collection to dump (default: all)"
|
19
|
+
}
|
20
|
+
|
21
|
+
if [ -z $1 ]
|
22
|
+
then
|
23
|
+
usage
|
24
|
+
exit 1
|
25
|
+
fi
|
26
|
+
|
27
|
+
while getopts "f:t:c:" o
|
28
|
+
do
|
29
|
+
case $o in
|
30
|
+
f) timeStart=`date -d "$OPTARG" +%s` ;;
|
31
|
+
t) timeEnd=`date -d "$OPTARG" +%s` ;;
|
32
|
+
c) collection=$OPTARG ;;
|
33
|
+
\?) echo "Invalid option: -$OPTARG" >&2
|
34
|
+
usage
|
35
|
+
exit 1
|
36
|
+
;;
|
37
|
+
esac
|
38
|
+
done
|
39
|
+
|
40
|
+
|
12
41
|
# Time to start dumping from
|
13
|
-
if [ -
|
42
|
+
if [ -z $timeStart ]
|
14
43
|
then
|
15
|
-
|
16
|
-
|
17
|
-
|
44
|
+
if [ -r lastrun ]
|
45
|
+
then
|
46
|
+
timeStart=`cat lastrun`
|
47
|
+
else
|
48
|
+
timeStart=0
|
49
|
+
fi
|
18
50
|
fi
|
19
51
|
|
20
52
|
# Time to end dumping
|
21
|
-
if [
|
53
|
+
if [ -z $timeEnd ]
|
22
54
|
then
|
23
55
|
timeEnd=`date +%s`
|
24
|
-
else
|
25
|
-
timeEnd=`date -d "$1" +%s` || exit 1
|
26
56
|
fi
|
27
57
|
|
28
58
|
# Name used for the files
|
@@ -37,11 +67,19 @@ dateName=`date -d @$timeEnd -u +'%Y-%m-%d'`
|
|
37
67
|
endId=`printf '%08x0000000000000000' $timeEnd`
|
38
68
|
startId=`printf '%08x0000000000000000' $timeStart`
|
39
69
|
|
70
|
+
|
71
|
+
if [ -z $collection ]
|
72
|
+
then
|
73
|
+
collections=`echo "show collections"|mongo --quiet github|egrep -v "system|bye"`
|
74
|
+
else
|
75
|
+
collections=$collection
|
76
|
+
fi
|
77
|
+
|
40
78
|
echo "Dumping database from `date -d @$timeStart` to `date -d @$timeEnd`"
|
41
79
|
|
42
|
-
collections=`echo "show collections"|mongo --quiet github|egrep -v "system|bye"`
|
43
|
-
|
44
80
|
rm -rf dump
|
81
|
+
mkdir -p dump/github
|
82
|
+
|
45
83
|
for col in $collections; do
|
46
84
|
|
47
85
|
echo "Dumping $col"
|
@@ -62,8 +100,8 @@ for col in $collections; do
|
|
62
100
|
(
|
63
101
|
echo "Start date: `date -u -d @$timeStart +'%Y-%m-%dT%H:%M:%SZ'`"
|
64
102
|
echo "End date: `date -u -d @$timeEnd +'%Y-%m-%dT%H:%M:%SZ'`"
|
65
|
-
meta $col
|
66
|
-
)
|
103
|
+
meta $col
|
104
|
+
)
|
67
105
|
done |
|
68
106
|
tee README.$dateName.txt >dump/github/README.txt || exit 1
|
69
107
|
|
@@ -88,5 +126,5 @@ done
|
|
88
126
|
echo $timeEnd >lastrun || exit 1
|
89
127
|
|
90
128
|
# Clean up
|
91
|
-
rm -rf dump
|
129
|
+
rm -rf dump
|
92
130
|
|
data/lib/ghtorrent.rb
CHANGED
@@ -30,8 +30,9 @@ module GHTorrent
|
|
30
30
|
|
31
31
|
class BaseAdapter
|
32
32
|
|
33
|
-
ENTITIES = [:users, :commits, :followers, :repos, :events
|
34
|
-
|
33
|
+
ENTITIES = [:users, :commits, :followers, :repos, :events, :org_members,
|
34
|
+
:commit_comments
|
35
|
+
]
|
35
36
|
|
36
37
|
# Stores +data+ into +entity+. Returns a unique key for the stored entry.
|
37
38
|
def store(entity, data = {})
|
@@ -87,5 +88,13 @@ module GHTorrent
|
|
87
88
|
throw GHTorrentException.new("Perister: Entity #{entity} not known")
|
88
89
|
end
|
89
90
|
end
|
91
|
+
|
92
|
+
# Count the number of entries returned by +query+ without retrieving them.
|
93
|
+
# The +query+ can be any query supported by +find+.
|
94
|
+
def count(entity, query = {})
|
95
|
+
unless ENTITIES.include?(entity)
|
96
|
+
throw GHTorrentException.new("Perister: Entity #{entity} not known")
|
97
|
+
end
|
98
|
+
end
|
90
99
|
end
|
91
100
|
end
|
@@ -49,6 +49,7 @@ module GHTorrent
|
|
49
49
|
|
50
50
|
# Creates a new instance of the MongoDB persistence adapter.
|
51
51
|
# Expects a parsed YAML settings document as input.
|
52
|
+
# Will create indexes on fields most frequently used in queries.
|
52
53
|
def initialize(set)
|
53
54
|
merge LOCALCONFIG
|
54
55
|
|
@@ -62,32 +63,31 @@ module GHTorrent
|
|
62
63
|
:commits => get_collection("commits"),
|
63
64
|
:repos => get_collection("repos"),
|
64
65
|
:followers => get_collection("followers"),
|
65
|
-
:events => get_collection("events")
|
66
|
+
:events => get_collection("events"),
|
67
|
+
:org_members => get_collection("org_members"),
|
68
|
+
:commit_comments => get_collection("commit_comments")
|
66
69
|
}
|
70
|
+
|
71
|
+
# Ensure that the necessary indexes exist
|
72
|
+
ensure_index(:users, "login")
|
73
|
+
ensure_index(:commits, "sha")
|
74
|
+
ensure_index(:repos, "name")
|
75
|
+
ensure_index(:followers, "follows")
|
76
|
+
ensure_index(:org_members, "org")
|
77
|
+
ensure_index(:commit_comments, "repo")
|
78
|
+
ensure_index(:commit_comments, "user")
|
79
|
+
ensure_index(:commit_comments, "commit_id")
|
67
80
|
end
|
68
81
|
|
69
82
|
|
70
83
|
def store(entity, data = {})
|
71
84
|
super
|
72
|
-
|
73
|
-
|
74
|
-
if col.nil?
|
75
|
-
raise GHTorrentException.new("Mongo: Entity #{entity} not supported")
|
76
|
-
end
|
77
|
-
|
78
|
-
col.insert(data).to_s
|
85
|
+
get_entity(entity).insert(data).to_s
|
79
86
|
end
|
80
87
|
|
81
88
|
def find(entity, query = {})
|
82
89
|
super
|
83
|
-
|
84
|
-
col = @enttodb[entity]
|
85
|
-
|
86
|
-
if col.nil?
|
87
|
-
raise GHTorrentException.new("Mongo: Entity #{entity} not supported")
|
88
|
-
end
|
89
|
-
|
90
|
-
result = col.find(query)
|
90
|
+
result = get_entity(entity).find(query)
|
91
91
|
result.to_a.map { |r|
|
92
92
|
r[@uniq] = r['_id'].to_s;
|
93
93
|
r.to_h
|
@@ -100,12 +100,41 @@ module GHTorrent
|
|
100
100
|
raise NotImplementedError
|
101
101
|
end
|
102
102
|
|
103
|
+
# Count the number of items returned by +query+
|
104
|
+
def count(entity, query)
|
105
|
+
super
|
106
|
+
get_entity(entity).count(:query => query)
|
107
|
+
end
|
108
|
+
|
103
109
|
private
|
104
110
|
|
105
111
|
def get_collection(col)
|
106
112
|
@mongo.collection(col.to_s)
|
107
113
|
end
|
108
114
|
|
115
|
+
def get_entity(entity)
|
116
|
+
col = @enttodb[entity]
|
117
|
+
|
118
|
+
if col.nil?
|
119
|
+
raise GHTorrentException.new("Mongo: Entity #{entity} not supported")
|
120
|
+
end
|
121
|
+
col
|
122
|
+
end
|
123
|
+
|
124
|
+
# Declare an index on +field+ for +collection+ if it does not exist
|
125
|
+
def ensure_index(collection, field)
|
126
|
+
col = @enttodb[collection]
|
127
|
+
|
128
|
+
exists = col.index_information.find {|k,v|
|
129
|
+
k == "#{field}_1"
|
130
|
+
}
|
131
|
+
|
132
|
+
if exists.nil?
|
133
|
+
col.create_index(field, :background => true)
|
134
|
+
STDERR.puts "Creating index on #{collection}(#{field})"
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
109
138
|
end
|
110
139
|
end
|
111
140
|
|
data/lib/ghtorrent/api_client.rb
CHANGED
@@ -40,33 +40,67 @@ module GHTorrent
|
|
40
40
|
@num_api_calls = 0
|
41
41
|
@ts = Time.now().tv_sec()
|
42
42
|
end
|
43
|
-
|
43
|
+
|
44
|
+
# A paged request. Used when the result can expand to more than one
|
45
|
+
# result page.
|
44
46
|
def paged_api_request(url, pages = -1)
|
45
47
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
48
|
+
data = api_request_raw(url)
|
49
|
+
|
50
|
+
return [] if data.nil?
|
51
|
+
|
52
|
+
unless data.meta['link'].nil?
|
53
|
+
links = parse_links(data.meta['link'])
|
54
|
+
|
55
|
+
if pages > 0
|
56
|
+
pages = pages - 1
|
57
|
+
if pages == 0
|
58
|
+
return parse_request_result(data)
|
59
|
+
end
|
60
|
+
end
|
52
61
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
62
|
+
if links['next'].nil?
|
63
|
+
parse_request_result(data)
|
64
|
+
else
|
65
|
+
parse_request_result(data) | paged_api_request(links['next'], pages)
|
66
|
+
end
|
67
|
+
else
|
68
|
+
parse_request_result(data)
|
69
|
+
end
|
59
70
|
end
|
60
71
|
|
72
|
+
# A normal request. Returns a hash or an array of hashes representing the
|
73
|
+
# parsed JSON result.
|
61
74
|
def api_request(url)
|
62
|
-
|
75
|
+
parse_request_result api_request_raw(url)
|
76
|
+
end
|
77
|
+
|
78
|
+
private
|
79
|
+
|
80
|
+
# Parse a Github link header
|
81
|
+
def parse_links(links)
|
82
|
+
links.split(/,/).reduce({}) do |acc, x|
|
83
|
+
matches = x.strip.match(/<(.*)>; rel=\"(.*)\"/)
|
84
|
+
acc[matches[2]] = matches[1]
|
85
|
+
acc
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
# Parse the JSON result array
|
90
|
+
def parse_request_result(result)
|
63
91
|
if result.nil?
|
64
|
-
|
92
|
+
[]
|
65
93
|
else
|
66
|
-
|
94
|
+
json = result.read
|
95
|
+
if json.nil?
|
96
|
+
[]
|
97
|
+
else
|
98
|
+
JSON.parse(json)
|
99
|
+
end
|
67
100
|
end
|
68
101
|
end
|
69
102
|
|
103
|
+
# Do the actual request and return the result object
|
70
104
|
def api_request_raw(url)
|
71
105
|
#Rate limiting to avoid error requests
|
72
106
|
if Time.now().tv_sec() - @ts < 60 then
|
@@ -86,7 +120,7 @@ module GHTorrent
|
|
86
120
|
@num_api_calls += 1
|
87
121
|
debug "APIClient: Request: #{url} (num_calls = #{@num_api_calls})"
|
88
122
|
begin
|
89
|
-
open(url)
|
123
|
+
open(url)
|
90
124
|
rescue OpenURI::HTTPError => e
|
91
125
|
case e.io.status[0].to_i
|
92
126
|
# The following indicate valid Github return codes
|