ghtorrent 0.2 → 0.3

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- github-mirror: Mirror and process the Github event steam
1
+ ghtorrent: Mirror and process the Github event stream
2
2
  =========================================================
3
3
 
4
4
  A collection of scripts used to mirror the Github event stream, for
@@ -9,8 +9,8 @@ GHTorrent relies on the following software to work:
9
9
 
10
10
  * MongoDB > 2.0
11
11
  * RabbitMQ >= 2.7
12
- * An SQL database compatible with [Sequel](http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html). GHTorrent is tested with SQLite and MySQL,
13
- so your mileage may vary if you are using other databases.
12
+ * An SQL database compatible with [Sequel](http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html).
13
+ GHTorrent is tested with SQLite and MySQL, so your mileage may vary if you are using other databases.
14
14
 
15
15
  GHTorrent is written in Ruby (tested with 1.8 and JRuby). To install
16
16
  it as a Gem do:
@@ -19,6 +19,14 @@ it as a Gem do:
19
19
  sudo gem install ghtorrent
20
20
  </code>
21
21
 
22
+ Depending on which SQL database you want to use, install the appropriate dependency gem.
23
+ GHTorrent already installs the `sqlite3` gem (if it fails, install the development
24
+ package for `sqlite3` for your system).
25
+
26
+ <code>
27
+ sudo gem install mysql2 #or postgres
28
+ </code>
29
+
22
30
  #### Configuring
23
31
 
24
32
  Copy the contents of the
@@ -124,7 +132,7 @@ please consider citing the following paper:
124
132
 
125
133
  Georgios Gousios <gousiosg@gmail.com>
126
134
 
127
- Diomidis Spinellis
135
+ [Diomidis Spinellis](http://www.dmst.aueb.gr/dds) <dds@aueb.gr>
128
136
 
129
137
  #### License
130
138
 
@@ -38,7 +38,7 @@ class GHTDataRetrieval < GHTorrent::Command
38
38
 
39
39
  include GHTorrent::Settings
40
40
 
41
- attr_reader :settings
41
+ attr_reader :settings, :name
42
42
 
43
43
  def parse(msg)
44
44
  JSON.parse(msg)
@@ -71,13 +71,23 @@ class GHTDataRetrieval < GHTorrent::Command
71
71
  %w(PushEvent WatchEvent FollowEvent)
72
72
  end
73
73
 
74
+ def prepare_options(options)
75
+ @name = "ght-data-retrieval"
76
+ end
77
+
74
78
  def go
75
79
  @gh = GHTorrent::Mirror.new(options[:config])
76
80
  @settings = @gh.settings
77
81
 
78
82
  # Graceful exit
79
- Signal.trap('INT') { AMQP.stop { EM.stop } }
80
- Signal.trap('TERM') { AMQP.stop { EM.stop } }
83
+ Signal.trap('INT') {
84
+ info ("Received SIGINT, exiting")
85
+ AMQP.stop { EM.stop }
86
+ }
87
+ Signal.trap('TERM') {
88
+ info ("Received SIGTERM, exiting")
89
+ AMQP.stop { EM.stop }
90
+ }
81
91
 
82
92
  AMQP.start(:host => config(:amqp_host),
83
93
  :port => config(:amqp_port),
data/bin/ght-load CHANGED
@@ -113,7 +113,7 @@ Loads object ids from a collection to a queue for further processing.
113
113
  :commits
114
114
  end
115
115
 
116
- puts "Loading form collection #{collection}"
116
+ puts "Loading from collection #{collection}"
117
117
  puts "Loading items after #{Time.at(options[:earliest])}" if options[:verbose]
118
118
 
119
119
  what = case
@@ -41,26 +41,51 @@ class GHTMirrorEvents < GHTorrent::Command
41
41
  include GHTorrent::Settings
42
42
  include GHTorrent::Logging
43
43
  include GHTorrent::Persister
44
+ include GHTorrent::APIClient
44
45
 
45
46
  attr_reader :settings
46
47
 
47
- # Retrieve events from Github, store them in the DB
48
- def retrieve(exchange)
49
- begin
50
- new = dupl = 0
51
- events = @gh.get_events
52
-
53
- events.each do |e|
54
- unless @persister.find(:events, {'id' => e['id']}).empty?
55
- info "Already got #{e['id']}"
56
- dupl += 1
57
- next
58
- end
48
+ def initialize(args)
49
+ super(args)
50
+ @args = args
51
+ @name = self.class.name
52
+ end
59
53
 
54
+ def store_count(events)
55
+ stored = Array.new
56
+ new = dupl = 0
57
+ events.each do |e|
58
+ if @persister.find(:events, {'id' => e['id']}).empty?
59
+ stored << e
60
60
  new += 1
61
61
  @persister.store(:events, e)
62
62
  info "Added #{e['id']}"
63
+ else
64
+ info "Already got #{e['id']}"
65
+ dupl += 1
66
+ end
67
+ end
68
+ return new, dupl, stored
69
+ end
63
70
 
71
+ # Retrieve events from Github, store them in the DB
72
+ def retrieve(exchange)
73
+ begin
74
+ new = dupl = 0
75
+ events = api_request "https://api.github.com/events"
76
+ (new, dupl, stored) = store_count events
77
+
78
+ # No duplicates on the first page means it may not hold all new events. Go
79
+ # up to 10 pages back to find all new events not contained in first page.
80
+ if dupl == 0
81
+ events = paged_api_request "https://api.github.com/events", 10
82
+ (new1, dupl1, stored1) = store_count events
83
+ stored = stored | stored1
84
+ new = new + new1
85
+ new
86
+ end
87
+
88
+ stored.each do |e|
64
89
  msg = JSON.dump(e)
65
90
  key = "evt.%s" % e['type']
66
91
  exchange.publish msg, :persistent => true, :routing_key => key
@@ -73,19 +98,24 @@ class GHTMirrorEvents < GHTorrent::Command
73
98
  end
74
99
 
75
100
  def prepare_options(options)
76
-
101
+ @name = "ght-mirror-events"
77
102
  end
78
103
 
79
104
  def go
80
-
81
105
  @gh = GHTorrent::Mirror.new(options[:config])
82
106
  @settings = @gh.settings
83
107
  @persister = connect(:mongo, @settings)
84
108
  @logger = Logger.new(STDOUT)
85
109
 
86
110
  # Graceful exit
87
- Signal.trap('INT') { AMQP.stop { EM.stop } }
88
- Signal.trap('TERM') { AMQP.stop { EM.stop } }
111
+ Signal.trap('INT') {
112
+ info ("Received SIGINT, exiting")
113
+ AMQP.stop { EM.stop }
114
+ }
115
+ Signal.trap('TERM') {
116
+ info ("Received SIGTERM, exiting")
117
+ AMQP.stop { EM.stop }
118
+ }
89
119
 
90
120
  # The event loop
91
121
  AMQP.start(:host => config(:amqp_host),
@@ -105,7 +135,7 @@ class GHTMirrorEvents < GHTorrent::Command
105
135
  # Initial delay for the retrieve event loop
106
136
  retrieval_delay = config(:mirror_pollevery)
107
137
 
108
- # Retrieve commits.
138
+ # Retrieve events
109
139
  retriever = EventMachine.add_periodic_timer(retrieval_delay) do
110
140
  (new, dupl) = retrieve exchange
111
141
  dupl_msgs += dupl
@@ -6,23 +6,53 @@
6
6
  # Directory to place compressed files and torrents
7
7
  OUTDIR=/home/data/github-mirror/dumps
8
8
 
9
- # Base URL for HTTP dir containing torrents and data
9
+ # Base URL for HTTP dir containing torrents and data
10
10
  WEBSEED=http://ikaria.dmst.aueb.gr/ghtorrent/
11
11
 
12
+ usage()
13
+ {
14
+ echo "Usage: $0 [-f 'yyyy-mm-dd hh:mm'] [-t 'yyyy-mm-dd hh:mm']"
15
+ echo " [-c collection_to_dump]"
16
+ echo "Dump the database. -f earliest record timestamp"
17
+ echo " -t latest record timestamp"
18
+ echo " -c collection to dump (default: all)"
19
+ }
20
+
21
+ if [ -z $1 ]
22
+ then
23
+ usage
24
+ exit 1
25
+ fi
26
+
27
+ while getopts "f:t:c:" o
28
+ do
29
+ case $o in
30
+ f) timeStart=`date -d "$OPTARG" +%s` ;;
31
+ t) timeEnd=`date -d "$OPTARG" +%s` ;;
32
+ c) collection=$OPTARG ;;
33
+ \?) echo "Invalid option: -$OPTARG" >&2
34
+ usage
35
+ exit 1
36
+ ;;
37
+ esac
38
+ done
39
+
40
+
12
41
  # Time to start dumping from
13
- if [ -r lastrun ]
42
+ if [ -z $timeStart ]
14
43
  then
15
- timeStart=`cat lastrun`
16
- else
17
- timeStart=0
44
+ if [ -r lastrun ]
45
+ then
46
+ timeStart=`cat lastrun`
47
+ else
48
+ timeStart=0
49
+ fi
18
50
  fi
19
51
 
20
52
  # Time to end dumping
21
- if [ "$1" = "" ]
53
+ if [ -z $timeEnd ]
22
54
  then
23
55
  timeEnd=`date +%s`
24
- else
25
- timeEnd=`date -d "$1" +%s` || exit 1
26
56
  fi
27
57
 
28
58
  # Name used for the files
@@ -37,11 +67,19 @@ dateName=`date -d @$timeEnd -u +'%Y-%m-%d'`
37
67
  endId=`printf '%08x0000000000000000' $timeEnd`
38
68
  startId=`printf '%08x0000000000000000' $timeStart`
39
69
 
70
+
71
+ if [ -z $collection ]
72
+ then
73
+ collections=`echo "show collections"|mongo --quiet github|egrep -v "system|bye"`
74
+ else
75
+ collections=$collection
76
+ fi
77
+
40
78
  echo "Dumping database from `date -d @$timeStart` to `date -d @$timeEnd`"
41
79
 
42
- collections=`echo "show collections"|mongo --quiet github|egrep -v "system|bye"`
43
-
44
80
  rm -rf dump
81
+ mkdir -p dump/github
82
+
45
83
  for col in $collections; do
46
84
 
47
85
  echo "Dumping $col"
@@ -62,8 +100,8 @@ for col in $collections; do
62
100
  (
63
101
  echo "Start date: `date -u -d @$timeStart +'%Y-%m-%dT%H:%M:%SZ'`"
64
102
  echo "End date: `date -u -d @$timeEnd +'%Y-%m-%dT%H:%M:%SZ'`"
65
- meta $col
66
- )
103
+ meta $col
104
+ )
67
105
  done |
68
106
  tee README.$dateName.txt >dump/github/README.txt || exit 1
69
107
 
@@ -88,5 +126,5 @@ done
88
126
  echo $timeEnd >lastrun || exit 1
89
127
 
90
128
  # Clean up
91
- rm -rf dump
129
+ rm -rf dump
92
130
 
data/lib/ghtorrent.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  #require 'ghtorrent-old/ghtorrent-old'
2
2
 
3
3
  module GHTorrent
4
- VERSION = 0.2
4
+ VERSION = 0.3
5
5
  end
6
6
 
7
7
  require 'ghtorrent/command'
@@ -30,8 +30,9 @@ module GHTorrent
30
30
 
31
31
  class BaseAdapter
32
32
 
33
- ENTITIES = [:users, :commits, :followers, :repos, :events]
34
-
33
+ ENTITIES = [:users, :commits, :followers, :repos, :events, :org_members,
34
+ :commit_comments
35
+ ]
35
36
 
36
37
  # Stores +data+ into +entity+. Returns a unique key for the stored entry.
37
38
  def store(entity, data = {})
@@ -87,5 +88,13 @@ module GHTorrent
87
88
  throw GHTorrentException.new("Perister: Entity #{entity} not known")
88
89
  end
89
90
  end
91
+
92
+ # Count the number of entries returned by +query+ without retrieving them.
93
+ # The +query+ can be any query supported by +find+.
94
+ def count(entity, query = {})
95
+ unless ENTITIES.include?(entity)
96
+ throw GHTorrentException.new("Perister: Entity #{entity} not known")
97
+ end
98
+ end
90
99
  end
91
100
  end
@@ -49,6 +49,7 @@ module GHTorrent
49
49
 
50
50
  # Creates a new instance of the MongoDB persistence adapter.
51
51
  # Expects a parsed YAML settings document as input.
52
+ # Will create indexes on fields most frequently used in queries.
52
53
  def initialize(set)
53
54
  merge LOCALCONFIG
54
55
 
@@ -62,32 +63,31 @@ module GHTorrent
62
63
  :commits => get_collection("commits"),
63
64
  :repos => get_collection("repos"),
64
65
  :followers => get_collection("followers"),
65
- :events => get_collection("events")
66
+ :events => get_collection("events"),
67
+ :org_members => get_collection("org_members"),
68
+ :commit_comments => get_collection("commit_comments")
66
69
  }
70
+
71
+ # Ensure that the necessary indexes exist
72
+ ensure_index(:users, "login")
73
+ ensure_index(:commits, "sha")
74
+ ensure_index(:repos, "name")
75
+ ensure_index(:followers, "follows")
76
+ ensure_index(:org_members, "org")
77
+ ensure_index(:commit_comments, "repo")
78
+ ensure_index(:commit_comments, "user")
79
+ ensure_index(:commit_comments, "commit_id")
67
80
  end
68
81
 
69
82
 
70
83
  def store(entity, data = {})
71
84
  super
72
- col = @enttodb[entity]
73
-
74
- if col.nil?
75
- raise GHTorrentException.new("Mongo: Entity #{entity} not supported")
76
- end
77
-
78
- col.insert(data).to_s
85
+ get_entity(entity).insert(data).to_s
79
86
  end
80
87
 
81
88
  def find(entity, query = {})
82
89
  super
83
-
84
- col = @enttodb[entity]
85
-
86
- if col.nil?
87
- raise GHTorrentException.new("Mongo: Entity #{entity} not supported")
88
- end
89
-
90
- result = col.find(query)
90
+ result = get_entity(entity).find(query)
91
91
  result.to_a.map { |r|
92
92
  r[@uniq] = r['_id'].to_s;
93
93
  r.to_h
@@ -100,12 +100,41 @@ module GHTorrent
100
100
  raise NotImplementedError
101
101
  end
102
102
 
103
+ # Count the number of items returned by +query+
104
+ def count(entity, query)
105
+ super
106
+ get_entity(entity).count(:query => query)
107
+ end
108
+
103
109
  private
104
110
 
105
111
  def get_collection(col)
106
112
  @mongo.collection(col.to_s)
107
113
  end
108
114
 
115
+ def get_entity(entity)
116
+ col = @enttodb[entity]
117
+
118
+ if col.nil?
119
+ raise GHTorrentException.new("Mongo: Entity #{entity} not supported")
120
+ end
121
+ col
122
+ end
123
+
124
+ # Declare an index on +field+ for +collection+ if it does not exist
125
+ def ensure_index(collection, field)
126
+ col = @enttodb[collection]
127
+
128
+ exists = col.index_information.find {|k,v|
129
+ k == "#{field}_1"
130
+ }
131
+
132
+ if exists.nil?
133
+ col.create_index(field, :background => true)
134
+ STDERR.puts "Creating index on #{collection}(#{field})"
135
+ end
136
+ end
137
+
109
138
  end
110
139
  end
111
140
 
@@ -40,33 +40,67 @@ module GHTorrent
40
40
  @num_api_calls = 0
41
41
  @ts = Time.now().tv_sec()
42
42
  end
43
-
43
+
44
+ # A paged request. Used when the result can expand to more than one
45
+ # result page.
44
46
  def paged_api_request(url, pages = -1)
45
47
 
46
- pg = if pages == -1 then
47
- 1000000
48
- else
49
- pages
50
- end
51
- result = Array.new
48
+ data = api_request_raw(url)
49
+
50
+ return [] if data.nil?
51
+
52
+ unless data.meta['link'].nil?
53
+ links = parse_links(data.meta['link'])
54
+
55
+ if pages > 0
56
+ pages = pages - 1
57
+ if pages == 0
58
+ return parse_request_result(data)
59
+ end
60
+ end
52
61
 
53
- (1..pg).each { |x|
54
- data = api_request("#{url}?page=#{x}")
55
- result += data
56
- break if data.empty?
57
- }
58
- result
62
+ if links['next'].nil?
63
+ parse_request_result(data)
64
+ else
65
+ parse_request_result(data) | paged_api_request(links['next'], pages)
66
+ end
67
+ else
68
+ parse_request_result(data)
69
+ end
59
70
  end
60
71
 
72
+ # A normal request. Returns a hash or an array of hashes representing the
73
+ # parsed JSON result.
61
74
  def api_request(url)
62
- result = api_request_raw(url)
75
+ parse_request_result api_request_raw(url)
76
+ end
77
+
78
+ private
79
+
80
+ # Parse a Github link header
81
+ def parse_links(links)
82
+ links.split(/,/).reduce({}) do |acc, x|
83
+ matches = x.strip.match(/<(.*)>; rel=\"(.*)\"/)
84
+ acc[matches[2]] = matches[1]
85
+ acc
86
+ end
87
+ end
88
+
89
+ # Parse the JSON result array
90
+ def parse_request_result(result)
63
91
  if result.nil?
64
- nil
92
+ []
65
93
  else
66
- JSON.parse(result)
94
+ json = result.read
95
+ if json.nil?
96
+ []
97
+ else
98
+ JSON.parse(json)
99
+ end
67
100
  end
68
101
  end
69
102
 
103
+ # Do the actual request and return the result object
70
104
  def api_request_raw(url)
71
105
  #Rate limiting to avoid error requests
72
106
  if Time.now().tv_sec() - @ts < 60 then
@@ -86,7 +120,7 @@ module GHTorrent
86
120
  @num_api_calls += 1
87
121
  debug "APIClient: Request: #{url} (num_calls = #{@num_api_calls})"
88
122
  begin
89
- open(url).read
123
+ open(url)
90
124
  rescue OpenURI::HTTPError => e
91
125
  case e.io.status[0].to_i
92
126
  # The following indicate valid Github return codes