ghtorrent 0.10 → 0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +16 -0
- data/Gemfile.lock +12 -27
- data/README.md +20 -33
- data/Rakefile +1 -9
- data/bin/ght-log-analyzer +11 -6
- data/bin/ght-log-influx +190 -0
- data/bin/ght-queue-grep.rb +55 -0
- data/bin/ght-retrieve-users +6 -0
- data/bin/{ght-rm-dupl → ght-update-repo} +1 -1
- data/lib/ghtorrent.rb +4 -4
- data/lib/ghtorrent/adapters/base_adapter.rb +4 -11
- data/lib/ghtorrent/adapters/mongo_persister.rb +5 -9
- data/lib/ghtorrent/adapters/noop_persister.rb +0 -5
- data/lib/ghtorrent/api_client.rb +45 -119
- data/lib/ghtorrent/command.rb +25 -8
- data/lib/ghtorrent/commands/full_user_retriever.rb +50 -0
- data/lib/ghtorrent/commands/ght_data_retrieval.rb +12 -98
- data/lib/ghtorrent/commands/ght_get_more_commits.rb +13 -17
- data/lib/ghtorrent/commands/ght_load.rb +1 -2
- data/lib/ghtorrent/commands/ght_mirror_events.rb +8 -12
- data/lib/ghtorrent/commands/ght_retrieve_dependents.rb +0 -5
- data/lib/ghtorrent/commands/ght_retrieve_one.rb +1 -6
- data/lib/ghtorrent/commands/ght_retrieve_repo.rb +56 -26
- data/lib/ghtorrent/commands/ght_retrieve_repos.rb +5 -15
- data/lib/ghtorrent/commands/ght_retrieve_user.rb +13 -54
- data/lib/ghtorrent/commands/ght_retrieve_users.rb +49 -0
- data/lib/ghtorrent/commands/ght_update_repo.rb +126 -0
- data/lib/ghtorrent/event_processing.rb +140 -0
- data/lib/ghtorrent/ghtorrent.rb +330 -396
- data/lib/ghtorrent/logging.rb +65 -12
- data/lib/ghtorrent/migrations/014_add_deleted_to_projects.rb +1 -1
- data/lib/ghtorrent/migrations/019_add_fake_to_users.rb +1 -1
- data/lib/ghtorrent/migrations/020_add_deleted_to_users.rb +19 -0
- data/lib/ghtorrent/migrations/021_remove_ext_ref_id.rb +42 -0
- data/lib/ghtorrent/migrations/022_add_project_languages.rb +24 -0
- data/lib/ghtorrent/multiprocess_queue_client.rb +25 -5
- data/lib/ghtorrent/retriever.rb +100 -57
- data/lib/ghtorrent/settings.rb +14 -17
- data/lib/ghtorrent/{transacted_ghtorrent.rb → transacted_gh_torrent.rb} +28 -5
- data/lib/version.rb +1 -1
- metadata +14 -46
- data/bin/ght-process-event +0 -35
- data/lib/ghtorrent/cache.rb +0 -97
- data/lib/ghtorrent/commands/ght_rm_dupl.rb +0 -132
- data/lib/ghtorrent/gh_torrent_exception.rb +0 -6
- data/spec/api_client_spec.rb +0 -42
- data/spec/spec_helper.rb +0 -21
data/lib/ghtorrent/settings.rb
CHANGED
@@ -24,19 +24,18 @@ module GHTorrent
|
|
24
24
|
:uniq_id => 'mirror.uniq_id',
|
25
25
|
:user_agent => 'mirror.user_agent',
|
26
26
|
|
27
|
-
:cache_mode => 'mirror.cache_mode',
|
28
|
-
:cache_dir => 'mirror.cache_dir',
|
29
|
-
:cache_stale_age => 'mirror.cache_stale_age',
|
30
|
-
|
31
27
|
:github_username => 'mirror.username',
|
32
28
|
:github_passwd => 'mirror.passwd',
|
33
29
|
:github_token => 'mirror.token',
|
34
30
|
|
35
|
-
:respect_api_ratelimit => 'mirror.respect_api_ratelimit',
|
36
|
-
|
37
31
|
:attach_ip => 'mirror.attach_ip',
|
38
32
|
|
39
|
-
:rescue_loops => 'mirror.rescue_loops'
|
33
|
+
:rescue_loops => 'mirror.rescue_loops',
|
34
|
+
:req_limit => 'mirror.req_limit',
|
35
|
+
|
36
|
+
:logging_level => 'logging.level',
|
37
|
+
:logging_uniq => 'logging.uniq',
|
38
|
+
:logging_file => 'logging.file'
|
40
39
|
}
|
41
40
|
|
42
41
|
DEFAULTS = {
|
@@ -52,22 +51,20 @@ module GHTorrent
|
|
52
51
|
:mirror_urlbase => 'https://api.github.com/',
|
53
52
|
:mirror_persister => 'noop',
|
54
53
|
:mirror_history_pages_back => 1,
|
55
|
-
:uniq_id => 'ext_ref_id',
|
56
54
|
:user_agent => 'ghtorrent',
|
57
55
|
|
58
|
-
:cache_mode => 'dev',
|
59
|
-
:cache_dir => Dir::tmpdir + File::SEPARATOR + 'ghtorrent',
|
60
|
-
:cache_stale_age => 604800,
|
61
|
-
|
62
56
|
:github_username => 'foo',
|
63
57
|
:github_passwd => 'bar',
|
64
58
|
:github_token => '',
|
65
59
|
|
66
|
-
:respect_api_ratelimit => 'true',
|
67
|
-
|
68
60
|
:attach_ip => '0.0.0.0',
|
69
61
|
|
70
|
-
:rescue_loops => 'true'
|
62
|
+
:rescue_loops => 'true',
|
63
|
+
:req_limit => 4998,
|
64
|
+
|
65
|
+
:logging_level => 'info',
|
66
|
+
:logging_uniq => '',
|
67
|
+
:logging_file => 'stdout'
|
71
68
|
}
|
72
69
|
|
73
70
|
def config(key, use_default = true)
|
@@ -78,7 +75,7 @@ module GHTorrent
|
|
78
75
|
else
|
79
76
|
a
|
80
77
|
end
|
81
|
-
rescue
|
78
|
+
rescue StandardError => e
|
82
79
|
if use_default
|
83
80
|
DEFAULTS[key]
|
84
81
|
else
|
@@ -102,7 +99,7 @@ module GHTorrent
|
|
102
99
|
end
|
103
100
|
|
104
101
|
def settings
|
105
|
-
raise
|
102
|
+
raise StandardError.new('Unimplemented')
|
106
103
|
end
|
107
104
|
|
108
105
|
end
|
data/lib/ghtorrent/{transacted_ghtorrent.rb → transacted_gh_torrent.rb}
RENAMED
@@ -3,12 +3,11 @@ require 'ghtorrent/ghtorrent'
|
|
3
3
|
|
4
4
|
# A version of the GHTorrent class that creates a transaction per processed
|
5
5
|
# item
|
6
|
-
class TransactedGhtorrent < GHTorrent::Mirror
|
6
|
+
class TransactedGHTorrent < GHTorrent::Mirror
|
7
7
|
|
8
|
-
def ensure_repo(owner, repo,
|
9
|
-
watchers = false, forks = false, labels = false)
|
8
|
+
def ensure_repo(owner, repo, recursive = false)
|
10
9
|
check_transaction do
|
11
|
-
super(owner, repo,
|
10
|
+
super(owner, repo, recursive)
|
12
11
|
end
|
13
12
|
end
|
14
13
|
|
@@ -30,6 +29,12 @@ class TransactedGhtorrent < GHTorrent::Mirror
|
|
30
29
|
end
|
31
30
|
end
|
32
31
|
|
32
|
+
def ensure_fork_commits(owner, repo, parent_owner, parent_repo)
|
33
|
+
check_transaction do
|
34
|
+
super(owner, repo, parent_owner, parent_repo)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
33
38
|
def ensure_pull_request(owner, repo, pullreq_id,
|
34
39
|
comments = true, commits = true, history = true,
|
35
40
|
state = nil, actor = nil, created_at = nil)
|
@@ -86,8 +91,26 @@ class TransactedGhtorrent < GHTorrent::Mirror
|
|
86
91
|
end
|
87
92
|
end
|
88
93
|
|
94
|
+
def ensure_user_followers(user)
|
95
|
+
check_transaction do
|
96
|
+
super(user)
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
def ensure_orgs(user)
|
101
|
+
check_transaction do
|
102
|
+
super(user)
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
def ensure_org(user, members = true)
|
107
|
+
check_transaction do
|
108
|
+
super(user, members)
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
89
112
|
def check_transaction(&block)
|
90
|
-
if
|
113
|
+
if get_db.in_transaction?
|
91
114
|
yield block
|
92
115
|
else
|
93
116
|
transaction do
|
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ghtorrent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.10'
|
4
|
+
version: '0.11'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Georgios Gousios
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2015-09-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mongo
|
@@ -111,40 +111,6 @@ dependencies:
|
|
111
111
|
- - '>='
|
112
112
|
- !ruby/object:Gem::Version
|
113
113
|
version: 1.0.0
|
114
|
-
- !ruby/object:Gem::Dependency
|
115
|
-
name: rspec
|
116
|
-
requirement: !ruby/object:Gem::Requirement
|
117
|
-
requirements:
|
118
|
-
- - ~>
|
119
|
-
- !ruby/object:Gem::Version
|
120
|
-
version: '2.14'
|
121
|
-
- - '>='
|
122
|
-
- !ruby/object:Gem::Version
|
123
|
-
version: 2.14.0
|
124
|
-
type: :development
|
125
|
-
prerelease: false
|
126
|
-
version_requirements: !ruby/object:Gem::Requirement
|
127
|
-
requirements:
|
128
|
-
- - ~>
|
129
|
-
- !ruby/object:Gem::Version
|
130
|
-
version: '2.14'
|
131
|
-
- - '>='
|
132
|
-
- !ruby/object:Gem::Version
|
133
|
-
version: 2.14.0
|
134
|
-
- !ruby/object:Gem::Dependency
|
135
|
-
name: webmock
|
136
|
-
requirement: !ruby/object:Gem::Requirement
|
137
|
-
requirements:
|
138
|
-
- - ~>
|
139
|
-
- !ruby/object:Gem::Version
|
140
|
-
version: '1.16'
|
141
|
-
type: :development
|
142
|
-
prerelease: false
|
143
|
-
version_requirements: !ruby/object:Gem::Requirement
|
144
|
-
requirements:
|
145
|
-
- - ~>
|
146
|
-
- !ruby/object:Gem::Version
|
147
|
-
version: '1.16'
|
148
114
|
description: |-
|
149
115
|
A library and a collection of associated programs
|
150
116
|
to mirror and process Github data
|
@@ -153,8 +119,6 @@ executables:
|
|
153
119
|
- ght-data-retrieval
|
154
120
|
- ght-mirror-events
|
155
121
|
- ght-load
|
156
|
-
- ght-rm-dupl
|
157
|
-
- ght-process-event
|
158
122
|
- ght-get-more-commits
|
159
123
|
- ght-retrieve-repo
|
160
124
|
- ght-retrieve-user
|
@@ -167,8 +131,8 @@ files:
|
|
167
131
|
- lib/ghtorrent/adapters/noop_persister.rb
|
168
132
|
- lib/ghtorrent/api_client.rb
|
169
133
|
- lib/ghtorrent/bson_orderedhash.rb
|
170
|
-
- lib/ghtorrent/cache.rb
|
171
134
|
- lib/ghtorrent/command.rb
|
135
|
+
- lib/ghtorrent/commands/full_user_retriever.rb
|
172
136
|
- lib/ghtorrent/commands/ght_data_retrieval.rb
|
173
137
|
- lib/ghtorrent/commands/ght_get_more_commits.rb
|
174
138
|
- lib/ghtorrent/commands/ght_load.rb
|
@@ -178,8 +142,9 @@ files:
|
|
178
142
|
- lib/ghtorrent/commands/ght_retrieve_repo.rb
|
179
143
|
- lib/ghtorrent/commands/ght_retrieve_repos.rb
|
180
144
|
- lib/ghtorrent/commands/ght_retrieve_user.rb
|
181
|
-
- lib/ghtorrent/commands/ght_rm_dupl.rb
|
182
|
-
- lib/ghtorrent/gh_torrent_exception.rb
|
145
|
+
- lib/ghtorrent/commands/ght_retrieve_users.rb
|
146
|
+
- lib/ghtorrent/commands/ght_update_repo.rb
|
147
|
+
- lib/ghtorrent/event_processing.rb
|
183
148
|
- lib/ghtorrent/ghtorrent.rb
|
184
149
|
- lib/ghtorrent/hash.rb
|
185
150
|
- lib/ghtorrent/logging.rb
|
@@ -202,35 +167,38 @@ files:
|
|
202
167
|
- lib/ghtorrent/migrations/017_drop_forks_table.rb
|
203
168
|
- lib/ghtorrent/migrations/018_drop_merged_user_from_pull_requests.rb
|
204
169
|
- lib/ghtorrent/migrations/019_add_fake_to_users.rb
|
170
|
+
- lib/ghtorrent/migrations/020_add_deleted_to_users.rb
|
171
|
+
- lib/ghtorrent/migrations/021_remove_ext_ref_id.rb
|
172
|
+
- lib/ghtorrent/migrations/022_add_project_languages.rb
|
205
173
|
- lib/ghtorrent/migrations/mysql_defaults.rb
|
206
174
|
- lib/ghtorrent/multiprocess_queue_client.rb
|
207
175
|
- lib/ghtorrent/persister.rb
|
208
176
|
- lib/ghtorrent/retriever.rb
|
209
177
|
- lib/ghtorrent/settings.rb
|
210
178
|
- lib/ghtorrent/time.rb
|
211
|
-
- lib/ghtorrent/transacted_ghtorrent.rb
|
179
|
+
- lib/ghtorrent/transacted_gh_torrent.rb
|
212
180
|
- lib/ghtorrent/utils.rb
|
213
181
|
- lib/version.rb
|
214
182
|
- bin/ght-data-retrieval
|
215
183
|
- bin/ght-get-more-commits
|
216
184
|
- bin/ght-load
|
217
185
|
- bin/ght-log-analyzer
|
186
|
+
- bin/ght-log-influx
|
218
187
|
- bin/ght-mirror-events
|
219
|
-
- bin/ght-process-event
|
188
|
+
- bin/ght-queue-grep.rb
|
220
189
|
- bin/ght-retrieve-dependents
|
221
190
|
- bin/ght-retrieve-one
|
222
191
|
- bin/ght-retrieve-repo
|
223
192
|
- bin/ght-retrieve-repos
|
224
193
|
- bin/ght-retrieve-user
|
225
|
-
- bin/ght-rm-dupl
|
194
|
+
- bin/ght-retrieve-users
|
195
|
+
- bin/ght-update-repo
|
226
196
|
- CHANGELOG
|
227
197
|
- Gemfile
|
228
198
|
- Gemfile.lock
|
229
199
|
- LICENSE
|
230
200
|
- README.md
|
231
201
|
- Rakefile
|
232
|
-
- spec/api_client_spec.rb
|
233
|
-
- spec/spec_helper.rb
|
234
202
|
homepage: https://github.com/gousiosg/github-mirror
|
235
203
|
licenses:
|
236
204
|
- BSD-2-Clause
|
data/bin/ght-process-event
DELETED
@@ -1,35 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'rubygems'
|
4
|
-
require 'ghtorrent'
|
5
|
-
|
6
|
-
|
7
|
-
class GHTProcessEvent < GHTDataRetrieval
|
8
|
-
|
9
|
-
def prepare_options(options)
|
10
|
-
options.banner <<-BANNER
|
11
|
-
Process one or more event ids
|
12
|
-
#{command_name} [options] eventid [...]
|
13
|
-
BANNER
|
14
|
-
|
15
|
-
end
|
16
|
-
|
17
|
-
def go
|
18
|
-
ARGV.each do |a|
|
19
|
-
data = ghtorrent.get_event(a)
|
20
|
-
unless data.empty?
|
21
|
-
event = data[0]
|
22
|
-
begin
|
23
|
-
send(event['type'], event)
|
24
|
-
rescue Exception => e
|
25
|
-
puts e
|
26
|
-
puts e.backtrace
|
27
|
-
end
|
28
|
-
else
|
29
|
-
warn "GHTProcessEvent: No event with id #{a}"
|
30
|
-
end
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
GHTProcessEvent.run
|
data/lib/ghtorrent/cache.rb
DELETED
@@ -1,97 +0,0 @@
|
|
1
|
-
require 'digest/sha1'
|
2
|
-
require 'fileutils'
|
3
|
-
|
4
|
-
require 'ghtorrent/logging'
|
5
|
-
require 'ghtorrent/settings'
|
6
|
-
|
7
|
-
module GHTorrent
|
8
|
-
module Cache
|
9
|
-
include GHTorrent::Logging
|
10
|
-
include GHTorrent::Settings
|
11
|
-
|
12
|
-
# Root dir for cached objects.
|
13
|
-
def cache_dir
|
14
|
-
@cache_dir ||= config(:cache_dir)
|
15
|
-
@cache_dir
|
16
|
-
end
|
17
|
-
|
18
|
-
# The maximum time an item can be cached before being considered stale
|
19
|
-
def max_life
|
20
|
-
@max_life ||= config(:cache_stale_age)
|
21
|
-
@max_life
|
22
|
-
end
|
23
|
-
|
24
|
-
# Put an object to the cache
|
25
|
-
def cache_put(key, object)
|
26
|
-
file = cache_location(key)
|
27
|
-
FileUtils.mkdir_p(File.dirname (file))
|
28
|
-
|
29
|
-
begin
|
30
|
-
File.open(file, 'w') do |f|
|
31
|
-
f.flocked? do
|
32
|
-
YAML::dump object, f
|
33
|
-
end
|
34
|
-
end
|
35
|
-
rescue
|
36
|
-
warn "Cache: Could not cache object #{file} for key #{key}"
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
# Get the object indexed by +key+ from the cache. Returns nil if the
|
41
|
-
# key is not found or the object is too old.
|
42
|
-
def cache_get(key)
|
43
|
-
file = cache_location(key)
|
44
|
-
|
45
|
-
unless File.exist?(file)
|
46
|
-
return nil
|
47
|
-
end
|
48
|
-
|
49
|
-
unless (Time.now() - File.mtime(file)) < max_life
|
50
|
-
debug "Cache: Cached object for key #{key} too old"
|
51
|
-
return nil
|
52
|
-
end
|
53
|
-
|
54
|
-
begin
|
55
|
-
File.open(file, 'r') do |f|
|
56
|
-
f.flocked? do
|
57
|
-
YAML::load(f)
|
58
|
-
end
|
59
|
-
end
|
60
|
-
rescue
|
61
|
-
warn "Cache: Could not read object from cache location #{file}"
|
62
|
-
File.delete(file)
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
private
|
67
|
-
|
68
|
-
def cache_location(key)
|
69
|
-
hash = hashkey(key)
|
70
|
-
start = hash[0,2]
|
71
|
-
File.join(cache_dir, start, hash)
|
72
|
-
end
|
73
|
-
|
74
|
-
def hashkey(key)
|
75
|
-
Digest::SHA1.hexdigest key
|
76
|
-
end
|
77
|
-
|
78
|
-
end
|
79
|
-
end
|
80
|
-
|
81
|
-
class File
|
82
|
-
def flocked? &block
|
83
|
-
status = flock LOCK_EX
|
84
|
-
case status
|
85
|
-
when false
|
86
|
-
return true
|
87
|
-
when 0
|
88
|
-
begin
|
89
|
-
block ? block.call : false
|
90
|
-
ensure
|
91
|
-
flock LOCK_UN
|
92
|
-
end
|
93
|
-
else
|
94
|
-
raise SystemCallError, status
|
95
|
-
end
|
96
|
-
end
|
97
|
-
end
|
data/lib/ghtorrent/commands/ght_rm_dupl.rb
DELETED
@@ -1,132 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'mongo'
|
3
|
-
|
4
|
-
require 'ghtorrent/settings'
|
5
|
-
require 'ghtorrent/logging'
|
6
|
-
require 'ghtorrent/command'
|
7
|
-
require 'ghtorrent/persister'
|
8
|
-
|
9
|
-
class GHRMDupl < GHTorrent::Command
|
10
|
-
|
11
|
-
include GHTorrent::Settings
|
12
|
-
include GHTorrent::Persister
|
13
|
-
|
14
|
-
def col_info()
|
15
|
-
{
|
16
|
-
:commits => {
|
17
|
-
:unq => "sha",
|
18
|
-
:col => persister.get_underlying_connection.collection(:commits.to_s),
|
19
|
-
},
|
20
|
-
:events => {
|
21
|
-
:unq => "id",
|
22
|
-
:col => persister.get_underlying_connection.collection(:events.to_s),
|
23
|
-
}
|
24
|
-
}
|
25
|
-
end
|
26
|
-
|
27
|
-
def persister
|
28
|
-
@persister ||= connect(:mongo, @settings)
|
29
|
-
@persister
|
30
|
-
end
|
31
|
-
|
32
|
-
def prepare_options(options)
|
33
|
-
options.banner <<-BANNER
|
34
|
-
Removes duplicate entries from collections (currently, commits and events)
|
35
|
-
|
36
|
-
#{command_name} [options] collection
|
37
|
-
|
38
|
-
#{command_name} options:
|
39
|
-
BANNER
|
40
|
-
|
41
|
-
options.opt :earliest, 'Seconds since epoch of earliest item to load',
|
42
|
-
:short => 'e', :default => 0, :type => :int
|
43
|
-
options.opt :snapshot, 'Perform clean up every x records',
|
44
|
-
:short => 's', :default => -1, :type => :int
|
45
|
-
end
|
46
|
-
|
47
|
-
def validate
|
48
|
-
super
|
49
|
-
Trollop::die "no collection specified" unless args[0] && !args[0].empty?
|
50
|
-
end
|
51
|
-
|
52
|
-
# Print MongoDB remove statements that
|
53
|
-
# remove all but one entries for each commit.
|
54
|
-
def remove_duplicates(data, col)
|
55
|
-
removed = 0
|
56
|
-
data.select { |k, v| v.size > 1 }.each do |k, v|
|
57
|
-
v.slice(0..(v.size - 2)).map do |x|
|
58
|
-
removed += 1 if delete_by_id col, x
|
59
|
-
end
|
60
|
-
end
|
61
|
-
removed
|
62
|
-
end
|
63
|
-
|
64
|
-
def delete_by_id(col, id)
|
65
|
-
begin
|
66
|
-
col.remove({'_id' => id})
|
67
|
-
true
|
68
|
-
rescue Mongo::OperationFailure
|
69
|
-
puts "Cannot remove record with id #{id} from #{col.name}"
|
70
|
-
false
|
71
|
-
end
|
72
|
-
end
|
73
|
-
|
74
|
-
def go
|
75
|
-
collection = case ARGV[0]
|
76
|
-
when "commits" then
|
77
|
-
:commits
|
78
|
-
when "events" then
|
79
|
-
:events
|
80
|
-
else
|
81
|
-
puts "Not a known collection name: #{ARGV[0]}\n"
|
82
|
-
end
|
83
|
-
|
84
|
-
from = {'_id' => {'$gte' => BSON::ObjectId.from_time(Time.at(options[:earliest]))}}
|
85
|
-
|
86
|
-
snapshot = options[:snapshot]
|
87
|
-
|
88
|
-
puts "Deleting duplicates from collection #{collection}"
|
89
|
-
puts "Deleting duplicates after #{Time.at(options[:earliest])}"
|
90
|
-
puts "Perform clean up every #{snapshot} records"
|
91
|
-
|
92
|
-
# Various counters to report stats
|
93
|
-
processed = total_processed = removed = 0
|
94
|
-
|
95
|
-
data = Hash.new
|
96
|
-
|
97
|
-
# The following code needs to save intermediate results to cope
|
98
|
-
# with large datasets
|
99
|
-
col_info[collection][:col].find(from, :fields => col_info[collection][:unq]).each do |r|
|
100
|
-
_id = r["_id"]
|
101
|
-
commit = read_value(r, col_info[collection][:unq])
|
102
|
-
|
103
|
-
# If entries cannot be parsed, remove them
|
104
|
-
if commit.empty?
|
105
|
-
puts "Deleting unknown entry #{_id}"
|
106
|
-
removed += 1 if delete_by_id col_info[collection][:col], _id
|
107
|
-
else
|
108
|
-
data[commit] = [] if data[commit].nil?
|
109
|
-
data[commit] << _id
|
110
|
-
end
|
111
|
-
|
112
|
-
processed += 1
|
113
|
-
total_processed += 1
|
114
|
-
|
115
|
-
print "\rProcessed #{processed} records"
|
116
|
-
|
117
|
-
# Calculate duplicates, save intermediate result
|
118
|
-
if snapshot > 0 and processed > snapshot
|
119
|
-
puts "\nLoaded #{data.size} values, cleaning"
|
120
|
-
removed += remove_duplicates data, col_info[collection][:col]
|
121
|
-
data = Hash.new
|
122
|
-
processed = 0
|
123
|
-
end
|
124
|
-
end
|
125
|
-
|
126
|
-
removed += remove_duplicates data, col_info[collection][:col]
|
127
|
-
|
128
|
-
puts "\nProcessed #{total_processed}, deleted #{removed} duplicates"
|
129
|
-
end
|
130
|
-
end
|
131
|
-
|
132
|
-
# vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
|