ghtorrent 0.4 → 0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. data/CHANGELOG +24 -0
  2. data/Gemfile +17 -0
  3. data/Gemfile.lock +40 -0
  4. data/README.md +23 -22
  5. data/bin/ght-data-retrieval +66 -24
  6. data/bin/ght-load +41 -19
  7. data/bin/ght-mirror-events +13 -16
  8. data/bin/ght-rm-dupl +119 -77
  9. data/lib/ghtorrent.rb +14 -4
  10. data/lib/ghtorrent/adapters/base_adapter.rb +17 -5
  11. data/lib/ghtorrent/adapters/mongo_persister.rb +122 -56
  12. data/lib/ghtorrent/api_client.rb +151 -16
  13. data/lib/ghtorrent/bson_orderedhash.rb +23 -0
  14. data/lib/ghtorrent/cache.rb +97 -0
  15. data/lib/ghtorrent/command.rb +43 -25
  16. data/lib/ghtorrent/gh_torrent_exception.rb +6 -0
  17. data/lib/ghtorrent/ghtorrent.rb +615 -164
  18. data/lib/ghtorrent/hash.rb +11 -0
  19. data/lib/ghtorrent/logging.rb +11 -7
  20. data/lib/ghtorrent/migrations/001_init_schema.rb +3 -3
  21. data/lib/ghtorrent/migrations/002_add_external_ref_ids.rb +2 -0
  22. data/lib/ghtorrent/migrations/003_add_orgs.rb +4 -1
  23. data/lib/ghtorrent/migrations/004_add_commit_comments.rb +4 -2
  24. data/lib/ghtorrent/migrations/005_add_repo_collaborators.rb +2 -0
  25. data/lib/ghtorrent/migrations/006_add_watchers.rb +2 -0
  26. data/lib/ghtorrent/migrations/007_add_pull_requests.rb +64 -0
  27. data/lib/ghtorrent/migrations/008_add_project_unq.rb +23 -0
  28. data/lib/ghtorrent/migrations/009_add_project_commit.rb +27 -0
  29. data/lib/ghtorrent/migrations/010_add_forks.rb +28 -0
  30. data/lib/ghtorrent/migrations/mysql_defaults.rb +6 -0
  31. data/lib/ghtorrent/persister.rb +3 -0
  32. data/lib/ghtorrent/retriever.rb +298 -102
  33. data/lib/ghtorrent/settings.rb +20 -1
  34. data/lib/ghtorrent/time.rb +5 -0
  35. data/lib/ghtorrent/utils.rb +22 -4
  36. data/lib/version.rb +5 -0
  37. metadata +173 -145
  38. data/lib/ghtorrent/call_stack.rb +0 -91
data/CHANGELOG ADDED
@@ -0,0 +1,24 @@
1
+ = Version 0.5
2
+
3
+ * Generic methods for retrieving items that are bound to repositories
4
+ * Processing of pull requests with commits, comments and history
5
+ * Processing of project forks
6
+ * New tool (ght-load) to filter and load events to the queue
7
+ * New tool (ght-rm-dupl) to delete duplicate entries from collections (events & commits supported)
8
+ * Project wide requesting result caching for multi-page requests
9
+ * Better logging in various places
10
+ * Better defaults for MySQL (UTF8 + InnoDB tables)
11
+ * Commits are now separated from projects. Project forks can share commits.
12
+ * Support for setting the IP address to use for retrieval on multi-homed hosts
13
+ * Compatibility with Ruby 1.9 (now default) and JRuby
14
+ * Proper modularization, following the cake design pattern
15
+ * Never retrieve arrays of results from MongoDB
16
+
17
+ = Version 0.4
18
+
19
+ * Implement support for retrieving watches and project members
20
+ * Support for processing FollowEvents, WatchEvents, CommitCommentEvents, MemberEvents
21
+ * Projects are exclusively associated to users
22
+ * Remove dependence on Github API v2
23
+ * Remove license headers
24
+
data/Gemfile ADDED
@@ -0,0 +1,17 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gem "amqp"
4
+ gem "mongo"
5
+ gem "trollop"
6
+ gem "sequel"
7
+ gem "daemons"
8
+ gem "json"
9
+
10
+ platforms :ruby do
11
+ gem "sqlite3"
12
+ gem "bson_ext"
13
+ end
14
+
15
+ platforms :jruby do
16
+ gem "jdbc-mysql"
17
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,40 @@
1
+ GEM
2
+ remote: https://rubygems.org/
3
+ specs:
4
+ amq-client (0.9.4)
5
+ amq-protocol (>= 0.9.4)
6
+ eventmachine
7
+ amq-protocol (0.9.4)
8
+ amqp (0.9.7)
9
+ amq-client (~> 0.9.4)
10
+ amq-protocol (>= 0.9.4)
11
+ eventmachine
12
+ bson (1.6.4)
13
+ bson (1.6.4-java)
14
+ bson_ext (1.6.4)
15
+ bson (~> 1.6.4)
16
+ daemons (1.1.8)
17
+ eventmachine (0.12.10)
18
+ eventmachine (0.12.10-java)
19
+ jdbc-mysql (5.1.13)
20
+ json (1.7.3)
21
+ mongo (1.6.4)
22
+ bson (~> 1.6.4)
23
+ sequel (3.37.0)
24
+ sqlite3 (1.3.6)
25
+ trollop (1.16.2)
26
+
27
+ PLATFORMS
28
+ java
29
+ ruby
30
+
31
+ DEPENDENCIES
32
+ amqp
33
+ bson_ext
34
+ daemons
35
+ jdbc-mysql
36
+ json
37
+ mongo
38
+ sequel
39
+ sqlite3
40
+ trollop
data/README.md CHANGED
@@ -10,18 +10,18 @@ GHTorrent relies on the following software to work:
10
10
  * MongoDB > 2.0
11
11
  * RabbitMQ >= 2.7
12
12
  * An SQL database compatible with [Sequel](http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html).
13
- GHTorrent is tested with SQLite and MySQL, so your mileage may vary if you are using other databases.
13
+ GHTorrent is tested mainly with MySQL, so your mileage may vary if you are using other databases.
14
14
 
15
- GHTorrent is written in Ruby (tested with 1.8 and JRuby). To install
15
+ GHTorrent is written in Ruby (tested with 1.9 and JRuby). To install
16
16
  it as a Gem do:
17
17
 
18
18
  <code>
19
19
  sudo gem install ghtorrent
20
20
  </code>
21
21
 
22
- Depending on which SQL database you want to use, install the appropriate dependency gem.
23
- GHTorrent already installs the `sqlite3` gem (if it fails, install the development
24
- package for `sqlite3` for your system).
22
+ Depending on which SQL database you want to use, install the appropriate
23
+ dependency gem. GHTorrent already installs the `sqlite3` gem (if it fails,
24
+ install the development package for `sqlite3` for your system).
25
25
 
26
26
  <code>
27
27
  sudo gem install mysql2 #or postgres
@@ -35,10 +35,10 @@ file to a file in your home directory. All provided scripts accept the `-c`
35
35
  option, which you can use to pass the location of the configuration file as
36
36
  a parameter.
37
37
 
38
- Edit the MongoDB and AMQP
39
- configuration options accordingly. The scripts require accounts with permissions
40
- to create queues and exchanges in the AMQP queue, collections
41
- in MongoDB and tables in the selected SQL database, respectively.
38
+ Edit the MongoDB and AMQP configuration options accordingly. The scripts
39
+ require accounts with permissions to create queues and exchanges in the AMQP
40
+ queue, collections in MongoDB and tables in the selected SQL database,
41
+ respectively.
42
42
 
43
43
  To prepare MongoDB:
44
44
 
@@ -76,25 +76,26 @@ to retrieve data in parallel on the [Wiki](https://github.com/gousiosg/github-mi
76
76
 
77
77
  ### Running
78
78
 
79
- To retrieve data with GHTorrent
79
+ To retrieve data with GHTorrent:
80
80
 
81
81
  * `ght-mirror-events.rb` periodically polls Github's event
82
- queue (`https://api.github.com/events`), stores all new events in the `events`
83
- collection in MongoDB and posts them to the `github` exchange in RabbitMQ.
82
+ queue (`https://api.github.com/events`), stores all new events in the
83
+ `events` collection in MongoDB and posts them to the `github` exchange in
84
+ RabbitMQ.
84
85
 
85
86
  * `ght-data_retrieval.rb` creates queues that route posted events to processor
86
87
  functions, which in turn use the appropriate Github API call to retrieve the
87
88
  linked contents, extract metadata to store in the SQL database and store the
88
- retrieved data in the appropriate collection in Mongo, to avoid further
89
- API calls. Data in the SQL database contain pointers (the MongoDB key)
90
- to the "raw" data in MongoDB.
91
-
92
- Both scripts can be run concurrently on more than one hosts, for resilience and
93
- performance reasons. To catch up with Github's event stream, it is enough to
94
- run `mirror_events.rb` on one host. To collect all data pointed by each event,
95
- one instance of `data_retrieval.rb` is not enough. Both scripts employ
96
- throttling mechanisms to keep API usage whithin the limits imposed by Github
97
- (currently 5000 reqs/hr).
89
+ retrieved data in the appropriate collection in Mongo, to avoid further API
90
+ calls. Data in the SQL database contain pointers (the MongoDB key) to the
91
+ "raw" data in MongoDB.
92
+
93
+ Both scripts can be run concurrently on more than one host, for resilience
94
+ and performance reasons. To catch up with Github's event stream, it is
95
+ usually enough to run `ght-mirror-events` on one host. To collect all data
96
+ pointed by each event, one instance of `ght-data-retrieval` is not enough.
97
+ Both scripts employ throttling mechanisms to keep API usage within the
98
+ limits imposed by Github (currently 5000 reqs/hr).
98
99
 
99
100
  #### Data
100
101
 
@@ -1,30 +1,33 @@
1
+ #!/usr/bin/env ruby
2
+
1
3
  require 'rubygems'
2
4
  require 'amqp'
3
5
  require 'json'
4
- require 'ghtorrent'
5
6
  require 'pp'
6
7
 
8
+ require 'ghtorrent/ghtorrent'
9
+ require 'ghtorrent/settings'
10
+ require 'ghtorrent/logging'
11
+ require 'ghtorrent/command'
12
+
7
13
  class GHTDataRetrieval < GHTorrent::Command
8
14
 
9
15
  include GHTorrent::Settings
10
16
  include GHTorrent::Logging
11
17
 
12
- attr_reader :settings, :name
13
-
14
18
  def parse(msg)
15
19
  JSON.parse(msg)
16
20
  end
17
21
 
18
- def PushEvent(evt)
19
- data = parse evt
22
+ def PushEvent(data)
20
23
  data['payload']['commits'].each do |c|
21
24
  url = c['url'].split(/\//)
25
+
22
26
  @gh.get_commit url[4], url[5], url[7]
23
27
  end
24
28
  end
25
29
 
26
- def WatchEvent(evt)
27
- data = parse evt
30
+ def WatchEvent(data)
28
31
  owner = data['repo']['name'].split(/\//)[0]
29
32
  repo = data['repo']['name'].split(/\//)[1]
30
33
  watcher = data['actor']['login']
@@ -33,8 +36,7 @@ class GHTDataRetrieval < GHTorrent::Command
33
36
  @gh.get_watcher owner, repo, watcher, created_at
34
37
  end
35
38
 
36
- def FollowEvent(evt)
37
- data = parse evt
39
+ def FollowEvent(data)
38
40
  follower = data['actor']['login']
39
41
  followed = data['payload']['target']['login']
40
42
  created_at = data['created_at']
@@ -42,8 +44,7 @@ class GHTDataRetrieval < GHTorrent::Command
42
44
  @gh.get_follower(follower, followed, created_at)
43
45
  end
44
46
 
45
- def MemberEvent(evt)
46
- data = parse evt
47
+ def MemberEvent(data)
47
48
  owner = data['actor']['login']
48
49
  repo = data['repo']['name'].split(/\//)[1]
49
50
  new_member = data['payload']['member']['login']
@@ -52,8 +53,7 @@ class GHTDataRetrieval < GHTorrent::Command
52
53
  @gh.get_project_member(owner, repo, new_member, created_at)
53
54
  end
54
55
 
55
- def CommitCommentEvent(evt)
56
- data = parse evt
56
+ def CommitCommentEvent(data)
57
57
  user = data['actor']['login']
58
58
  repo = data['repo']['name'].split(/\//)[1]
59
59
  id = data['payload']['comment']['id']
@@ -62,25 +62,63 @@ class GHTDataRetrieval < GHTorrent::Command
62
62
  @gh.get_commit_comment(user, repo, id, created_at)
63
63
  end
64
64
 
65
+ def PullRequestEvent(data)
66
+ owner = data['payload']['pull_request']['base']['repo']['owner']['login']
67
+ repo = data['payload']['pull_request']['base']['repo']['name']
68
+ pullreq_id = data['payload']['number']
69
+ action = data['payload']['action']
70
+ created_at = data['created_at']
71
+
72
+ @gh.get_pull_request(owner, repo, pullreq_id, action, created_at)
73
+ end
74
+
75
+ def ForkEvent(data)
76
+ owner = data['repo']['name'].split(/\//)[0]
77
+ repo = data['repo']['name'].split(/\//)[1]
78
+ fork_id = data['payload']['forkee']['id']
79
+ created_at = data['created_at']
80
+
81
+ @gh.get_fork(owner, repo, fork_id, created_at)
82
+ end
83
+
84
+ def PullRequestReviewCommentEvent(data)
85
+ owner = data['repo']['name'].split(/\//)[0]
86
+ repo = data['repo']['name'].split(/\//)[1]
87
+ comment_id = data['payload']['comment']['id']
88
+ pullreq_id = data['payload']['comment']['_links']['pull_request']['href'].split(/\//)[-1]
89
+ created_at = data['created_at']
90
+
91
+ @gh.get_pullreq_comment(owner, repo, pullreq_id, comment_id, created_at)
92
+ end
93
+
94
+ def IssueCommentEvent(data)
95
+ owner = data['repo']['name'].split(/\//)[0]
96
+ repo = data['repo']['name'].split(/\//)[1]
97
+ pullreq_id = data['payload']['forkee']['id']
98
+ created_at = data['created_at']
99
+
100
+ @gh.get_issue_comment(owner, repo, issue_id, comment_id, created_at)
101
+ end
102
+
65
103
  def handlers
66
- %w(PushEvent WatchEvent FollowEvent MemberEvent CommitCommentEvent)
104
+ %w(PushEvent WatchEvent FollowEvent MemberEvent CommitCommentEvent PullRequestEvent ForkEvent PullRequestReviewCommentEvent)
105
+ #%w(PullRequestReviewCommentEvent)
67
106
  end
68
107
 
69
- def prepare_options(options)
70
- @name = "ght-data-retrieval"
108
+ def logger
109
+ @gh.logger
71
110
  end
72
111
 
73
112
  def go
74
- @gh = GHTorrent::Mirror.new(options[:config])
75
- @settings = @gh.settings
113
+ @gh = GHTorrent::Mirror.new(@settings)
76
114
 
77
115
  # Graceful exit
78
116
  Signal.trap('INT') {
79
- info "Received SIGINT, exiting"
117
+ info "GHTDataRetrieval: Received SIGINT, exiting"
80
118
  AMQP.stop { EM.stop }
81
119
  }
82
120
  Signal.trap('TERM') {
83
- info "Received SIGTERM, exiting"
121
+ info "GHTDataRetrieval: Received SIGTERM, exiting"
84
122
  AMQP.stop { EM.stop }
85
123
  }
86
124
 
@@ -89,7 +127,7 @@ class GHTDataRetrieval < GHTorrent::Command
89
127
  :username => config(:amqp_username),
90
128
  :password => config(:amqp_password)) do |connection|
91
129
 
92
- channel = AMQP::Channel.new(connection, :prefetch => 5)
130
+ channel = AMQP::Channel.new(connection, :prefetch => config(:amqp_prefetch))
93
131
  exchange = channel.topic(config(:amqp_exchange), :durable => true,
94
132
  :auto_delete => false)
95
133
 
@@ -97,21 +135,25 @@ class GHTDataRetrieval < GHTorrent::Command
97
135
  queue = channel.queue("#{h}s", {:durable => true})\
98
136
  .bind(exchange, :routing_key => "evt.#{h}")
99
137
 
100
- info "Binding handler #{h} to routing key evt.#{h}"
138
+ info "GHTDataRetrieval: Binding handler #{h} to routing key evt.#{h}"
101
139
 
102
140
  queue.subscribe(:ack => true) do |headers, msg|
103
141
  begin
104
- send(h, msg)
142
+ data = parse(msg)
143
+ info "GHTDataRetrieval: Processing event: #{data['type']}-#{data['id']}"
144
+ send(h, data)
105
145
  headers.ack
146
+ info "GHTDataRetrieval: Processed event: #{data['type']}-#{data['id']}"
106
147
  rescue Exception => e
107
148
  # Give a message a chance to be reprocessed
108
149
  if headers.redelivered?
150
+ data = parse(msg)
151
+ warn "GHTDataRetrieval: Could not process event: #{data['type']}-#{data['id']}"
109
152
  headers.reject(:requeue => false)
110
153
  else
111
154
  headers.reject(:requeue => true)
112
155
  end
113
156
 
114
- #pp JSON.parse(msg)
115
157
  STDERR.puts e
116
158
  STDERR.puts e.backtrace.join("\n")
117
159
  end
data/bin/ght-load CHANGED
@@ -1,35 +1,48 @@
1
+ #!/usr/bin/env ruby
2
+
1
3
  require 'rubygems'
2
- require 'ghtorrent-old'
3
4
  require 'mongo'
4
5
  require 'amqp'
5
6
  require 'set'
6
7
  require 'eventmachine'
7
- require 'optparse'
8
- require 'ostruct'
9
8
  require 'pp'
10
9
  require "amqp/extensions/rabbitmq"
11
10
 
11
+ require 'ghtorrent/settings'
12
+ require 'ghtorrent/logging'
13
+ require 'ghtorrent/persister'
14
+ require 'ghtorrent/command'
15
+ require 'ghtorrent/bson_orderedhash'
16
+
12
17
  class GHTLoad < GHTorrent::Command
13
18
 
19
+ include GHTorrent::Settings
20
+ include GHTorrent::Persister
21
+
14
22
  def col_info()
15
23
  {
16
24
  :commits => {
17
25
  :name => "commits",
18
26
  :payload => "commit.id",
19
27
  :unq => "commit.id",
20
- :col => GH.commits_col,
28
+ :col => persister.get_underlying_connection.collection(:commits.to_s),
21
29
  :routekey => "commit.%s"
22
30
  },
23
31
  :events => {
24
32
  :name => "events",
25
33
  :payload => "",
26
34
  :unq => "type",
27
- :col => GH.events_col,
35
+ :col => persister.get_underlying_connection.collection(:events.to_s),
28
36
  :routekey => "evt.%s"
29
37
  }
30
38
  }
31
39
  end
32
40
 
41
+ def persister
42
+ @persister ||= connect(:mongo, @settings)
43
+ @persister
44
+ end
45
+
33
46
  def prepare_options(options)
34
47
  options.banner <<-BANNER
35
48
  Loads object ids from a collection to a queue for further processing.
@@ -41,6 +54,8 @@ Loads object ids from a collection to a queue for further processing.
41
54
 
42
55
  options.opt :earliest, 'Seconds since epoch of earliest item to load',
43
56
  :short => 'e', :default => 0, :type => :int
57
+ options.opt :number, 'Number of items to load (-1 means all)',
58
+ :short => 'n', :type => :int, :default => -1
44
59
  options.opt :filter,
45
60
  'Filter items by regexp on item attributes: item.attr=regexp',
46
61
  :short => 'f', :type => String, :multi => true
@@ -63,10 +78,6 @@ Loads object ids from a collection to a queue for further processing.
63
78
  end
64
79
 
65
80
  def go
66
- @gh = GHTorrent::Mirror.new(options[:config])
67
- @settings = @gh.settings
68
-
69
- GH.init(options[:config])
70
81
  # Message tags await publisher ack
71
82
  awaiting_ack = SortedSet.new
72
83
 
@@ -82,6 +93,7 @@ Loads object ids from a collection to a queue for further processing.
82
93
 
83
94
  puts "Loading from collection #{collection}"
84
95
  puts "Loading items after #{Time.at(options[:earliest])}" if options[:verbose]
96
+ puts "Loading #{options[:number]} items" if options[:verbose] && options[:number] != -1
85
97
 
86
98
  what = case
87
99
  when options[:filter].is_a?(Array)
@@ -98,13 +110,13 @@ Loads object ids from a collection to a queue for further processing.
98
110
 
99
111
  (puts "Mongo filter:"; pp what.merge(from)) if options[:verbose]
100
112
 
101
- AMQP.start(:host => GH.settings['amqp']['host'],
102
- :port => GH.settings['amqp']['port'],
103
- :username => GH.settings['amqp']['username'],
104
- :password => GH.settings['amqp']['password']) do |connection|
113
+ AMQP.start(:host => config(:amqp_host),
114
+ :port => config(:amqp_port),
115
+ :username => config(:amqp_username),
116
+ :password => config(:amqp_password)) do |connection|
105
117
 
106
118
  channel = AMQP::Channel.new(connection)
107
- exchange = channel.topic(GH.settings['amqp']['exchange'],
119
+ exchange = channel.topic(config(:amqp_exchange),
108
120
  :durable => true, :auto_delete => false)
109
121
 
110
122
  # What to do when the user hits Ctrl+c
@@ -115,18 +127,28 @@ Loads object ids from a collection to a queue for further processing.
115
127
  # Read next 1000 items and queue them
116
128
  read_and_publish = Proc.new {
117
129
 
130
+ to_read = if options.number == -1
131
+ 1000
132
+ else
133
+ if options.number - num_read - 1 <= 0
134
+ -1
135
+ else
136
+ options.number - num_read - 1
137
+ end
138
+ end
139
+
118
140
  read = 0
119
141
  col_info[collection][:col].find(what.merge(from),
120
142
  :skip => num_read,
121
- :limit => 1000).each do |e|
143
+ :limit => to_read).each do |e|
122
144
 
123
- payload = GH.read_value(e, col_info[collection][:payload])
145
+ payload = read_value(e, col_info[collection][:payload])
124
146
  payload = if payload.class == BSON::OrderedHash
125
147
  payload.delete "_id" # Inserted by MongoDB on event insert
126
148
  payload.to_json
127
149
  end
128
150
  read += 1
129
- unq = GH.read_value(e, col_info[collection][:unq])
151
+ unq = read_value(e, col_info[collection][:unq])
130
152
  if unq.class != String or unq.nil? then
131
153
  throw Exception("Unique value can only be a String")
132
154
  end
@@ -136,12 +158,12 @@ Loads object ids from a collection to a queue for further processing.
136
158
  exchange.publish payload, :persistent => true, :routing_key => key
137
159
 
138
160
  num_read += 1
139
- puts("Publish id = #{unq} (#{num_read} total)") if options.verbose
161
+ puts("Publish id = #{payload[unq]} (#{num_read} total)") if options.verbose
140
162
  awaiting_ack << num_read
141
163
  end
142
164
 
143
165
  # Nothing new in the DB and no msgs waiting ack
144
- if read == 0 and awaiting_ack.size == 0
166
+ if (read == 0 and awaiting_ack.size == 0) or to_read == -1
145
167
  puts("Finished reading, exiting")
146
168
  show_stopper.call
147
169
  end