ghtorrent 0.10 → 0.11

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +16 -0
  3. data/Gemfile.lock +12 -27
  4. data/README.md +20 -33
  5. data/Rakefile +1 -9
  6. data/bin/ght-log-analyzer +11 -6
  7. data/bin/ght-log-influx +190 -0
  8. data/bin/ght-queue-grep.rb +55 -0
  9. data/bin/ght-retrieve-users +6 -0
  10. data/bin/{ght-rm-dupl → ght-update-repo} +1 -1
  11. data/lib/ghtorrent.rb +4 -4
  12. data/lib/ghtorrent/adapters/base_adapter.rb +4 -11
  13. data/lib/ghtorrent/adapters/mongo_persister.rb +5 -9
  14. data/lib/ghtorrent/adapters/noop_persister.rb +0 -5
  15. data/lib/ghtorrent/api_client.rb +45 -119
  16. data/lib/ghtorrent/command.rb +25 -8
  17. data/lib/ghtorrent/commands/full_user_retriever.rb +50 -0
  18. data/lib/ghtorrent/commands/ght_data_retrieval.rb +12 -98
  19. data/lib/ghtorrent/commands/ght_get_more_commits.rb +13 -17
  20. data/lib/ghtorrent/commands/ght_load.rb +1 -2
  21. data/lib/ghtorrent/commands/ght_mirror_events.rb +8 -12
  22. data/lib/ghtorrent/commands/ght_retrieve_dependents.rb +0 -5
  23. data/lib/ghtorrent/commands/ght_retrieve_one.rb +1 -6
  24. data/lib/ghtorrent/commands/ght_retrieve_repo.rb +56 -26
  25. data/lib/ghtorrent/commands/ght_retrieve_repos.rb +5 -15
  26. data/lib/ghtorrent/commands/ght_retrieve_user.rb +13 -54
  27. data/lib/ghtorrent/commands/ght_retrieve_users.rb +49 -0
  28. data/lib/ghtorrent/commands/ght_update_repo.rb +126 -0
  29. data/lib/ghtorrent/event_processing.rb +140 -0
  30. data/lib/ghtorrent/ghtorrent.rb +330 -396
  31. data/lib/ghtorrent/logging.rb +65 -12
  32. data/lib/ghtorrent/migrations/014_add_deleted_to_projects.rb +1 -1
  33. data/lib/ghtorrent/migrations/019_add_fake_to_users.rb +1 -1
  34. data/lib/ghtorrent/migrations/020_add_deleted_to_users.rb +19 -0
  35. data/lib/ghtorrent/migrations/021_remove_ext_ref_id.rb +42 -0
  36. data/lib/ghtorrent/migrations/022_add_project_languages.rb +24 -0
  37. data/lib/ghtorrent/multiprocess_queue_client.rb +25 -5
  38. data/lib/ghtorrent/retriever.rb +100 -57
  39. data/lib/ghtorrent/settings.rb +14 -17
  40. data/lib/ghtorrent/{transacted_ghtorrent.rb → transacted_gh_torrent.rb} +28 -5
  41. data/lib/version.rb +1 -1
  42. metadata +14 -46
  43. data/bin/ght-process-event +0 -35
  44. data/lib/ghtorrent/cache.rb +0 -97
  45. data/lib/ghtorrent/commands/ght_rm_dupl.rb +0 -132
  46. data/lib/ghtorrent/gh_torrent_exception.rb +0 -6
  47. data/spec/api_client_spec.rb +0 -42
  48. data/spec/spec_helper.rb +0 -21
@@ -24,19 +24,18 @@ module GHTorrent
24
24
  :uniq_id => 'mirror.uniq_id',
25
25
  :user_agent => 'mirror.user_agent',
26
26
 
27
- :cache_mode => 'mirror.cache_mode',
28
- :cache_dir => 'mirror.cache_dir',
29
- :cache_stale_age => 'mirror.cache_stale_age',
30
-
31
27
  :github_username => 'mirror.username',
32
28
  :github_passwd => 'mirror.passwd',
33
29
  :github_token => 'mirror.token',
34
30
 
35
- :respect_api_ratelimit => 'mirror.respect_api_ratelimit',
36
-
37
31
  :attach_ip => 'mirror.attach_ip',
38
32
 
39
- :rescue_loops => 'mirror.rescue_loops'
33
+ :rescue_loops => 'mirror.rescue_loops',
34
+ :req_limit => 'mirror.req_limit',
35
+
36
+ :logging_level => 'logging.level',
37
+ :logging_uniq => 'logging.uniq',
38
+ :logging_file => 'logging.file'
40
39
  }
41
40
 
42
41
  DEFAULTS = {
@@ -52,22 +51,20 @@ module GHTorrent
52
51
  :mirror_urlbase => 'https://api.github.com/',
53
52
  :mirror_persister => 'noop',
54
53
  :mirror_history_pages_back => 1,
55
- :uniq_id => 'ext_ref_id',
56
54
  :user_agent => 'ghtorrent',
57
55
 
58
- :cache_mode => 'dev',
59
- :cache_dir => Dir::tmpdir + File::SEPARATOR + 'ghtorrent',
60
- :cache_stale_age => 604800,
61
-
62
56
  :github_username => 'foo',
63
57
  :github_passwd => 'bar',
64
58
  :github_token => '',
65
59
 
66
- :respect_api_ratelimit => 'true',
67
-
68
60
  :attach_ip => '0.0.0.0',
69
61
 
70
- :rescue_loops => 'true'
62
+ :rescue_loops => 'true',
63
+ :req_limit => 4998,
64
+
65
+ :logging_level => 'info',
66
+ :logging_uniq => '',
67
+ :logging_file => 'stdout'
71
68
  }
72
69
 
73
70
  def config(key, use_default = true)
@@ -78,7 +75,7 @@ module GHTorrent
78
75
  else
79
76
  a
80
77
  end
81
- rescue Exception => e
78
+ rescue StandardError => e
82
79
  if use_default
83
80
  DEFAULTS[key]
84
81
  else
@@ -102,7 +99,7 @@ module GHTorrent
102
99
  end
103
100
 
104
101
  def settings
105
- raise Exception.new('Unimplemented')
102
+ raise StandardError.new('Unimplemented')
106
103
  end
107
104
 
108
105
  end
@@ -3,12 +3,11 @@ require 'ghtorrent/ghtorrent'
3
3
 
4
4
  # A version of the GHTorrent class that creates a transaction per processed
5
5
  # item
6
- class TransactedGhtorrent < GHTorrent::Mirror
6
+ class TransactedGHTorrent < GHTorrent::Mirror
7
7
 
8
- def ensure_repo(owner, repo, commits = false, project_members = false,
9
- watchers = false, forks = false, labels = false)
8
+ def ensure_repo(owner, repo, recursive = false)
10
9
  check_transaction do
11
- super(owner, repo, commits, project_members, watchers, forks, labels)
10
+ super(owner, repo, recursive)
12
11
  end
13
12
  end
14
13
 
@@ -30,6 +29,12 @@ class TransactedGhtorrent < GHTorrent::Mirror
30
29
  end
31
30
  end
32
31
 
32
+ def ensure_fork_commits(owner, repo, parent_owner, parent_repo)
33
+ check_transaction do
34
+ super(owner, repo, parent_owner, parent_repo)
35
+ end
36
+ end
37
+
33
38
  def ensure_pull_request(owner, repo, pullreq_id,
34
39
  comments = true, commits = true, history = true,
35
40
  state = nil, actor = nil, created_at = nil)
@@ -86,8 +91,26 @@ class TransactedGhtorrent < GHTorrent::Mirror
86
91
  end
87
92
  end
88
93
 
94
+ def ensure_user_followers(user)
95
+ check_transaction do
96
+ super(user)
97
+ end
98
+ end
99
+
100
+ def ensure_orgs(user)
101
+ check_transaction do
102
+ super(user)
103
+ end
104
+ end
105
+
106
+ def ensure_org(user, members = true)
107
+ check_transaction do
108
+ super(user, members)
109
+ end
110
+ end
111
+
89
112
  def check_transaction(&block)
90
- if @db.in_transaction?
113
+ if get_db.in_transaction?
91
114
  yield block
92
115
  else
93
116
  transaction do
@@ -1,5 +1,5 @@
1
1
  module GHTorrent
2
2
 
3
- VERSION = '0.10'
3
+ VERSION = '0.11'
4
4
 
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ghtorrent
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.10'
4
+ version: '0.11'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Georgios Gousios
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-10-03 00:00:00.000000000 Z
12
+ date: 2015-09-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mongo
@@ -111,40 +111,6 @@ dependencies:
111
111
  - - '>='
112
112
  - !ruby/object:Gem::Version
113
113
  version: 1.0.0
114
- - !ruby/object:Gem::Dependency
115
- name: rspec
116
- requirement: !ruby/object:Gem::Requirement
117
- requirements:
118
- - - ~>
119
- - !ruby/object:Gem::Version
120
- version: '2.14'
121
- - - '>='
122
- - !ruby/object:Gem::Version
123
- version: 2.14.0
124
- type: :development
125
- prerelease: false
126
- version_requirements: !ruby/object:Gem::Requirement
127
- requirements:
128
- - - ~>
129
- - !ruby/object:Gem::Version
130
- version: '2.14'
131
- - - '>='
132
- - !ruby/object:Gem::Version
133
- version: 2.14.0
134
- - !ruby/object:Gem::Dependency
135
- name: webmock
136
- requirement: !ruby/object:Gem::Requirement
137
- requirements:
138
- - - ~>
139
- - !ruby/object:Gem::Version
140
- version: '1.16'
141
- type: :development
142
- prerelease: false
143
- version_requirements: !ruby/object:Gem::Requirement
144
- requirements:
145
- - - ~>
146
- - !ruby/object:Gem::Version
147
- version: '1.16'
148
114
  description: |-
149
115
  A library and a collection of associated programs
150
116
  to mirror and process Github data
@@ -153,8 +119,6 @@ executables:
153
119
  - ght-data-retrieval
154
120
  - ght-mirror-events
155
121
  - ght-load
156
- - ght-rm-dupl
157
- - ght-process-event
158
122
  - ght-get-more-commits
159
123
  - ght-retrieve-repo
160
124
  - ght-retrieve-user
@@ -167,8 +131,8 @@ files:
167
131
  - lib/ghtorrent/adapters/noop_persister.rb
168
132
  - lib/ghtorrent/api_client.rb
169
133
  - lib/ghtorrent/bson_orderedhash.rb
170
- - lib/ghtorrent/cache.rb
171
134
  - lib/ghtorrent/command.rb
135
+ - lib/ghtorrent/commands/full_user_retriever.rb
172
136
  - lib/ghtorrent/commands/ght_data_retrieval.rb
173
137
  - lib/ghtorrent/commands/ght_get_more_commits.rb
174
138
  - lib/ghtorrent/commands/ght_load.rb
@@ -178,8 +142,9 @@ files:
178
142
  - lib/ghtorrent/commands/ght_retrieve_repo.rb
179
143
  - lib/ghtorrent/commands/ght_retrieve_repos.rb
180
144
  - lib/ghtorrent/commands/ght_retrieve_user.rb
181
- - lib/ghtorrent/commands/ght_rm_dupl.rb
182
- - lib/ghtorrent/gh_torrent_exception.rb
145
+ - lib/ghtorrent/commands/ght_retrieve_users.rb
146
+ - lib/ghtorrent/commands/ght_update_repo.rb
147
+ - lib/ghtorrent/event_processing.rb
183
148
  - lib/ghtorrent/ghtorrent.rb
184
149
  - lib/ghtorrent/hash.rb
185
150
  - lib/ghtorrent/logging.rb
@@ -202,35 +167,38 @@ files:
202
167
  - lib/ghtorrent/migrations/017_drop_forks_table.rb
203
168
  - lib/ghtorrent/migrations/018_drop_merged_user_from_pull_requests.rb
204
169
  - lib/ghtorrent/migrations/019_add_fake_to_users.rb
170
+ - lib/ghtorrent/migrations/020_add_deleted_to_users.rb
171
+ - lib/ghtorrent/migrations/021_remove_ext_ref_id.rb
172
+ - lib/ghtorrent/migrations/022_add_project_languages.rb
205
173
  - lib/ghtorrent/migrations/mysql_defaults.rb
206
174
  - lib/ghtorrent/multiprocess_queue_client.rb
207
175
  - lib/ghtorrent/persister.rb
208
176
  - lib/ghtorrent/retriever.rb
209
177
  - lib/ghtorrent/settings.rb
210
178
  - lib/ghtorrent/time.rb
211
- - lib/ghtorrent/transacted_ghtorrent.rb
179
+ - lib/ghtorrent/transacted_gh_torrent.rb
212
180
  - lib/ghtorrent/utils.rb
213
181
  - lib/version.rb
214
182
  - bin/ght-data-retrieval
215
183
  - bin/ght-get-more-commits
216
184
  - bin/ght-load
217
185
  - bin/ght-log-analyzer
186
+ - bin/ght-log-influx
218
187
  - bin/ght-mirror-events
219
- - bin/ght-process-event
188
+ - bin/ght-queue-grep.rb
220
189
  - bin/ght-retrieve-dependents
221
190
  - bin/ght-retrieve-one
222
191
  - bin/ght-retrieve-repo
223
192
  - bin/ght-retrieve-repos
224
193
  - bin/ght-retrieve-user
225
- - bin/ght-rm-dupl
194
+ - bin/ght-retrieve-users
195
+ - bin/ght-update-repo
226
196
  - CHANGELOG
227
197
  - Gemfile
228
198
  - Gemfile.lock
229
199
  - LICENSE
230
200
  - README.md
231
201
  - Rakefile
232
- - spec/api_client_spec.rb
233
- - spec/spec_helper.rb
234
202
  homepage: https://github.com/gousiosg/github-mirror
235
203
  licenses:
236
204
  - BSD-2-Clause
@@ -1,35 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require 'rubygems'
4
- require 'ghtorrent'
5
-
6
-
7
- class GHTProcessEvent < GHTDataRetrieval
8
-
9
- def prepare_options(options)
10
- options.banner <<-BANNER
11
- Process one or more event ids
12
- #{command_name} [options] eventid [...]
13
- BANNER
14
-
15
- end
16
-
17
- def go
18
- ARGV.each do |a|
19
- data = ghtorrent.get_event(a)
20
- unless data.empty?
21
- event = data[0]
22
- begin
23
- send(event['type'], event)
24
- rescue Exception => e
25
- puts e
26
- puts e.backtrace
27
- end
28
- else
29
- warn "GHTProcessEvent: No event with id #{a}"
30
- end
31
- end
32
- end
33
- end
34
-
35
- GHTProcessEvent.run
@@ -1,97 +0,0 @@
1
- require 'digest/sha1'
2
- require 'fileutils'
3
-
4
- require 'ghtorrent/logging'
5
- require 'ghtorrent/settings'
6
-
7
- module GHTorrent
8
- module Cache
9
- include GHTorrent::Logging
10
- include GHTorrent::Settings
11
-
12
- # Root dir for cached objects.
13
- def cache_dir
14
- @cache_dir ||= config(:cache_dir)
15
- @cache_dir
16
- end
17
-
18
- # The maximum time an item can be cached before being considered stale
19
- def max_life
20
- @max_life ||= config(:cache_stale_age)
21
- @max_life
22
- end
23
-
24
- # Put an object to the cache
25
- def cache_put(key, object)
26
- file = cache_location(key)
27
- FileUtils.mkdir_p(File.dirname (file))
28
-
29
- begin
30
- File.open(file, 'w') do |f|
31
- f.flocked? do
32
- YAML::dump object, f
33
- end
34
- end
35
- rescue
36
- warn "Cache: Could not cache object #{file} for key #{key}"
37
- end
38
- end
39
-
40
- # Get the object indexed by +key+ from the cache. Returns nil if the
41
- # key is not found or the object is too old.
42
- def cache_get(key)
43
- file = cache_location(key)
44
-
45
- unless File.exist?(file)
46
- return nil
47
- end
48
-
49
- unless (Time.now() - File.mtime(file)) < max_life
50
- debug "Cache: Cached object for key #{key} too old"
51
- return nil
52
- end
53
-
54
- begin
55
- File.open(file, 'r') do |f|
56
- f.flocked? do
57
- YAML::load(f)
58
- end
59
- end
60
- rescue
61
- warn "Cache: Could not read object from cache location #{file}"
62
- File.delete(file)
63
- end
64
- end
65
-
66
- private
67
-
68
- def cache_location(key)
69
- hash = hashkey(key)
70
- start = hash[0,2]
71
- File.join(cache_dir, start, hash)
72
- end
73
-
74
- def hashkey(key)
75
- Digest::SHA1.hexdigest key
76
- end
77
-
78
- end
79
- end
80
-
81
- class File
82
- def flocked? &block
83
- status = flock LOCK_EX
84
- case status
85
- when false
86
- return true
87
- when 0
88
- begin
89
- block ? block.call : false
90
- ensure
91
- flock LOCK_UN
92
- end
93
- else
94
- raise SystemCallError, status
95
- end
96
- end
97
- end
@@ -1,132 +0,0 @@
1
- require 'rubygems'
2
- require 'mongo'
3
-
4
- require 'ghtorrent/settings'
5
- require 'ghtorrent/logging'
6
- require 'ghtorrent/command'
7
- require 'ghtorrent/persister'
8
-
9
- class GHRMDupl < GHTorrent::Command
10
-
11
- include GHTorrent::Settings
12
- include GHTorrent::Persister
13
-
14
- def col_info()
15
- {
16
- :commits => {
17
- :unq => "sha",
18
- :col => persister.get_underlying_connection.collection(:commits.to_s),
19
- },
20
- :events => {
21
- :unq => "id",
22
- :col => persister.get_underlying_connection.collection(:events.to_s),
23
- }
24
- }
25
- end
26
-
27
- def persister
28
- @persister ||= connect(:mongo, @settings)
29
- @persister
30
- end
31
-
32
- def prepare_options(options)
33
- options.banner <<-BANNER
34
- Removes duplicate entries from collections (currently, commits and events)
35
-
36
- #{command_name} [options] collection
37
-
38
- #{command_name} options:
39
- BANNER
40
-
41
- options.opt :earliest, 'Seconds since epoch of earliest item to load',
42
- :short => 'e', :default => 0, :type => :int
43
- options.opt :snapshot, 'Perform clean up every x records',
44
- :short => 's', :default => -1, :type => :int
45
- end
46
-
47
- def validate
48
- super
49
- Trollop::die "no collection specified" unless args[0] && !args[0].empty?
50
- end
51
-
52
- # Print MongoDB remove statements that
53
- # remove all but one entries for each commit.
54
- def remove_duplicates(data, col)
55
- removed = 0
56
- data.select { |k, v| v.size > 1 }.each do |k, v|
57
- v.slice(0..(v.size - 2)).map do |x|
58
- removed += 1 if delete_by_id col, x
59
- end
60
- end
61
- removed
62
- end
63
-
64
- def delete_by_id(col, id)
65
- begin
66
- col.remove({'_id' => id})
67
- true
68
- rescue Mongo::OperationFailure
69
- puts "Cannot remove record with id #{id} from #{col.name}"
70
- false
71
- end
72
- end
73
-
74
- def go
75
- collection = case ARGV[0]
76
- when "commits" then
77
- :commits
78
- when "events" then
79
- :events
80
- else
81
- puts "Not a known collection name: #{ARGV[0]}\n"
82
- end
83
-
84
- from = {'_id' => {'$gte' => BSON::ObjectId.from_time(Time.at(options[:earliest]))}}
85
-
86
- snapshot = options[:snapshot]
87
-
88
- puts "Deleting duplicates from collection #{collection}"
89
- puts "Deleting duplicates after #{Time.at(options[:earliest])}"
90
- puts "Perform clean up every #{snapshot} records"
91
-
92
- # Various counters to report stats
93
- processed = total_processed = removed = 0
94
-
95
- data = Hash.new
96
-
97
- # The following code needs to save intermediate results to cope
98
- # with large datasets
99
- col_info[collection][:col].find(from, :fields => col_info[collection][:unq]).each do |r|
100
- _id = r["_id"]
101
- commit = read_value(r, col_info[collection][:unq])
102
-
103
- # If entries cannot be parsed, remove them
104
- if commit.empty?
105
- puts "Deleting unknown entry #{_id}"
106
- removed += 1 if delete_by_id col_info[collection][:col], _id
107
- else
108
- data[commit] = [] if data[commit].nil?
109
- data[commit] << _id
110
- end
111
-
112
- processed += 1
113
- total_processed += 1
114
-
115
- print "\rProcessed #{processed} records"
116
-
117
- # Calculate duplicates, save intermediate result
118
- if snapshot > 0 and processed > snapshot
119
- puts "\nLoaded #{data.size} values, cleaning"
120
- removed += remove_duplicates data, col_info[collection][:col]
121
- data = Hash.new
122
- processed = 0
123
- end
124
- end
125
-
126
- removed += remove_duplicates data, col_info[collection][:col]
127
-
128
- puts "\nProcessed #{total_processed}, deleted #{removed} duplicates"
129
- end
130
- end
131
-
132
- # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :