data_janitor 0.3.7 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +18 -0
- data/lib/data_janitor.rb +2 -1
- data/lib/data_janitor/data_janitor.rb +80 -0
- data/lib/data_janitor/universal_validator.rb +11 -2
- data/lib/data_janitor/version.rb +1 -1
- data/lib/tasks/data_janitor.rake +16 -73
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8354b3129d9d3776972692c4173df69c43df65c7
|
4
|
+
data.tar.gz: dac7277b118a995afe2b6adb021e3e558d0cb8dc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0a12f79726392bb25ef7953ab47772da7fb5a900cf1aef7c0ccea56b745de6ab6e8c06a4676fd722bc1d57a13472647749d380d86b0a4709dd21bdd71ed589fb
|
7
|
+
data.tar.gz: 36574fa58940279106682e87c1a04662c811c5a5da2294e86f43843607292f03d07aec8e8aaa5ad0c3a8e59e22c301fdc7d004ce4dcb0630371066c34cf250bd
|
data/README.md
CHANGED
@@ -81,6 +81,24 @@ rake data_janitor:cleanse[SomeModel]
|
|
81
81
|
|
82
82
|
This will apply all the fixes that do not require semantic analysis of the data (e.g. replace `nil` values with `""` for strings)
|
83
83
|
|
84
|
+
Data Janitor has an experimental ability to perform some built-in type checks. Only a small part of this is implemented, and it currently tends to be noisy when enabled, so it defaults to off. Each audit command takes an option string that defaults to 'no-type-check' and can be set to other colon-separated values to turn off specific checks. The values are:
|
85
|
+
type-checks (or any other string not below)
|
86
|
+
no-type-check
|
87
|
+
no-boolean
|
88
|
+
no-decimal
|
89
|
+
no-float
|
90
|
+
no-integer
|
91
|
+
no-string
|
92
|
+
no-text
|
93
|
+
no-array
|
94
|
+
|
95
|
+
For example:
|
96
|
+
```
|
97
|
+
rake data_janitor:audit_model[SomeModel,no-string:no-boolean]
|
98
|
+
rake data_janitor:audit[tmp/out.json,false,false,no-string:no-boolean]
|
99
|
+
```
|
100
|
+
|
101
|
+
|
84
102
|
## Contributing
|
85
103
|
|
86
104
|
Bug reports and pull requests are welcome on GitHub at https://github.com/westfield/data_janitor.
|
data/lib/data_janitor.rb
CHANGED
@@ -0,0 +1,80 @@
|
|
1
|
+
# Audit and cleanse helpers that operate on ActiveRecord models.
#
# Public entry points: audit, audit_model, cleanse, cleanse_model.
# Everything else is an internal helper and is hidden with
# private_class_method (a bare `private` does not affect `def self.` methods).
module DataJanitor
  # Audits every loaded model and writes the collected failures as JSON.
  #
  # @param output_file [String, Pathname] path the JSON report is written to
  # @param verbose [Boolean] forwarded to audit_model
  # @param unscoped [Boolean] forwarded to audit_model
  # @param options [String] colon-separated option string forwarded to audit_model
  def self.audit(output_file, verbose, unscoped, options)
    output = {}
    all_models.each do |ar_model|
      begin
        audit_model ar_model, output, verbose, unscoped, options
      rescue ActiveRecord::StatementInvalid # used to catch HABTM and schema migration. Only care about real Models
        puts "skipping #{ar_model}"
      end
    end

    File.write(output_file, output.to_json)

    puts "Wrote results to #{output_file}"
  end

  # Validates every record of one model in the :dj_audit context and collects
  # the failing record ids into output[model name][attribute][error message].
  #
  # @param model [Class] the model to audit
  # @param output [Hash] accumulator that is mutated in place
  # @param verbose [Boolean] when true, prints the accumulated output as JSON
  # @param unscoped [Boolean] when true, iterates model.unscoped instead of model
  # @param options [String] colon-separated option string handed to
  #   validate_field_values on each record
  def self.audit_model(model, output = {}, verbose = false, unscoped = false, options = 'no-type-check')
    total = 0
    failed = 0
    puts "Validating: #{model.name}"
    output[model.name] = {}

    model = model.unscoped if unscoped
    model.include(DataJanitor::UniversalValidator)
    # NOTE(review): this registers a new validation block on every call, so
    # auditing the same model twice in one process looks like it would run the
    # check twice (with both option strings) -- confirm before reusing.
    model.validate do |record|
      record.validate_field_values options
    end

    model.find_each do |rec|
      if rec.invalid?(:dj_audit)
        rec.errors.to_h.each_pair do |attribute, error_message|
          output[model.name][attribute] ||= {}
          output[model.name][attribute][error_message] ||= []
          output[model.name][attribute][error_message] << rec.id
        end

        failed += 1
      end

      total += 1
    end

    puts output.to_json if verbose
    puts "Completed #{total} records with #{failed} failures"
  end

  # Applies the nil clean-up to every loaded model.
  def self.cleanse
    all_models.each do |ar_model|
      cleanse_model! ar_model
    end
  end

  # Applies the nil clean-up to a single model.
  #
  # @param model [String, Class] a model name (anything responding to
  #   #constantize) or the model class itself
  def self.cleanse_model(model)
    # Only constantize names; a class passed by the caller is used as-is.
    model = model.constantize if model.respond_to?(:constantize)
    cleanse_model! model
  end

  # Lists every class descending from ActiveRecord::Base.
  def self.all_models
    # Needs this executed before here: Rails.application.eager_load!
    ActiveRecord::Base.descendants
  end

  # Replaces nil with "" in string/text columns, false in boolean columns and
  # [] in array columns of the given model.
  def self.cleanse_model!(model)
    array_columns, scalar_columns = model.columns.partition { |c| array_column?(c) }
    string_columns = scalar_columns.select { |c| c.type == :string || c.type == :text }
    boolean_columns = scalar_columns.select { |c| c.type == :boolean }

    clean_nils_from! model, string_columns, ""
    clean_nils_from! model, boolean_columns, false
    clean_nils_from! model, array_columns, []
  end

  # For each column, bulk-updates the rows where it is nil to the default and
  # reports the count returned by update_all when it is positive.
  def self.clean_nils_from!(model, columns, default)
    columns.each do |column|
      count = model.where(column.name => nil).update_all(column.name => default)
      puts "Fixed #{count} #{model} records where #{column.name} was nil" if count > 0
    end
  end

  # True when the column reports itself as an array column. Column objects
  # without an `array` reader are treated as non-array instead of raising.
  def self.array_column?(column)
    column.respond_to?(:array) && column.array ? true : false
  end

  private_class_method :all_models, :cleanse_model!, :clean_nils_from!, :array_column?
end
|
@@ -4,21 +4,26 @@ module DataJanitor
|
|
4
4
|
# TODO: Run standard validators instead of home-brewed
|
5
5
|
# validate :validate_field_values
|
6
6
|
# ACCEPTABLE_BOOLEAN_VALUES = %w(t true y yes on 1 f false n no off 0) # this list was taken from Postgres spec. TRUE FALSE, that are also there, are not listed because they are DB-native literals and have no representation in Ruby code
|
7
|
-
def validate_field_values
|
7
|
+
def validate_field_values(options)
|
8
|
+
optional = options.split(':').map(&:strip)
|
9
|
+
return if optional.include? 'no-type-check'
|
10
|
+
|
8
11
|
# selected_attributes = self.changed? ? self.changed_attributes : self.attributes
|
9
12
|
selected_attributes = self.attributes
|
10
13
|
|
11
14
|
selected_attributes.each do |field_name, field_val|
|
12
15
|
column = self.column_for_attribute field_name
|
13
|
-
report_error = lambda {|msg| errors[column.name] << msg}
|
16
|
+
report_error = lambda { |msg| errors[column.name] << msg }
|
14
17
|
|
15
18
|
if column.array
|
19
|
+
next if optional.include? 'no-array'
|
16
20
|
report_error.call "cannot be nil" if field_val.nil?
|
17
21
|
next
|
18
22
|
end
|
19
23
|
|
20
24
|
case column.type
|
21
25
|
when :boolean
|
26
|
+
next if optional.include? 'no-boolean'
|
22
27
|
report_error.call "cannot be nil" if field_val.nil?
|
23
28
|
# report_error.call("must be a valid boolean") unless ACCEPTABLE_BOOLEAN_VALUES.include? field_val
|
24
29
|
when :date
|
@@ -28,15 +33,19 @@ module DataJanitor
|
|
28
33
|
when :datetime
|
29
34
|
# Time.iso8601(field_val) rescue report_error.call("must be a datetime in ISO-8601")
|
30
35
|
when :decimal
|
36
|
+
next if optional.include? 'no-decimal'
|
31
37
|
report_error.call "cannot be nil" if field_val.nil?
|
32
38
|
# TODO: run numericality test
|
33
39
|
when :float
|
40
|
+
next if optional.include? 'no-float'
|
34
41
|
report_error.call "cannot be nil" if field_val.nil?
|
35
42
|
# TODO: run numericality test
|
36
43
|
when :integer
|
44
|
+
next if optional.include? 'no-integer'
|
37
45
|
report_error.call "cannot be nil" if field_val.nil?
|
38
46
|
# TODO: run numericality test
|
39
47
|
when :string, :text
|
48
|
+
next if optional.include?('no-string') || optional.include?('no-text')
|
40
49
|
if field_val.nil?
|
41
50
|
# Almost never does an app need to distinguish between nil and empty string, yet nil needs special handling in all cases
|
42
51
|
report_error.call "cannot be nil. Use an empty string instead if that's what you wanted."
|
data/lib/data_janitor/version.rb
CHANGED
data/lib/tasks/data_janitor.rake
CHANGED
@@ -1,30 +1,26 @@
|
|
1
1
|
namespace :data_janitor do
|
2
2
|
desc 'Summarize invalid database records'
|
3
|
-
task :audit, [:output_file, :verbose, :unscoped] => [:environment] do |_t, args|
|
3
|
+
task :audit, [:output_file, :verbose, :unscoped, :options] => [:environment] do |_t, args|
|
4
4
|
args.with_defaults(
|
5
5
|
output_file: Rails.root.join('tmp', 'data_janitor_results.json'),
|
6
|
-
verbose: false,
|
7
|
-
unscoped: false
|
6
|
+
verbose: 'false',
|
7
|
+
unscoped: 'false',
|
8
|
+
options: 'no-type-check'
|
8
9
|
)
|
10
|
+
verbose = args[:verbose] == 'true'
|
11
|
+
unscoped = args[:unscoped] == 'true'
|
9
12
|
|
10
|
-
|
11
|
-
|
12
|
-
begin
|
13
|
-
audit ar_model, output, args[:verbose], args[:unscoped]
|
14
|
-
rescue ActiveRecord::StatementInvalid # used to catch HABTM and schema migration. Only care about real Models
|
15
|
-
puts "skipping #{ar_model}"
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
File.write(args[:output_file], output.to_json)
|
20
|
-
|
21
|
-
puts "Wrote results to #{args[:output_file]}"
|
13
|
+
Rails.application.eager_load!
|
14
|
+
DataJanitor::audit(args[:output_file], verbose, unscoped, args[:options])
|
22
15
|
end
|
23
16
|
|
24
17
|
desc 'Audit one model for data issues'
|
25
|
-
task :audit_model, [:model] => [:environment] do |_t, args|
|
18
|
+
task :audit_model, [:model, :options] => [:environment] do |_t, args|
|
19
|
+
args.with_defaults(
|
20
|
+
options: 'no-type-check'
|
21
|
+
)
|
26
22
|
Rails.application.eager_load!
|
27
|
-
|
23
|
+
DataJanitor::audit_model args[:model].constantize, {}, true, false, args[:options]
|
28
24
|
end
|
29
25
|
|
30
26
|
# For each model, apply trivial data corrections (those that do not require looking at data semantics).
|
@@ -34,67 +30,14 @@ namespace :data_janitor do
|
|
34
30
|
# - replace all null arrays with []
|
35
31
|
desc 'Apply common and safe data corrections'
|
36
32
|
task cleanse: :environment do
|
37
|
-
|
38
|
-
|
39
|
-
end
|
33
|
+
Rails.application.eager_load!
|
34
|
+
# Runs the nil clean-up over every loaded model.
DataJanitor::cleanse
|
40
35
|
end
|
41
36
|
|
42
37
|
desc 'Apply fixes to one model only'
|
43
38
|
task :cleanse_model, [:model] => [:environment] do |_t, args|
|
44
39
|
Rails.application.eager_load!
|
45
|
-
|
46
|
-
cleanse_model! args[:model].constantize
|
47
|
-
end
|
48
|
-
|
49
|
-
private
|
50
|
-
|
51
|
-
def all_models
|
52
|
-
Rails.application.eager_load!
|
53
|
-
ActiveRecord::Base.descendants
|
40
|
+
# DataJanitor.cleanse_model constantizes the model name itself, so pass the raw task argument.
DataJanitor::cleanse_model args[:model]
|
54
41
|
end
|
55
42
|
|
56
|
-
def audit(model, output = {}, verbose = false, unscoped = false)
|
57
|
-
total = 0
|
58
|
-
failed = 0
|
59
|
-
puts "Validating: #{model.name}"
|
60
|
-
output[model.to_s] = {}
|
61
|
-
model = model.unscoped if unscoped
|
62
|
-
|
63
|
-
model.include(DataJanitor::UniversalValidator)
|
64
|
-
model.validate :validate_field_values
|
65
|
-
|
66
|
-
model.find_each do |rec|
|
67
|
-
if rec.invalid?(:dj_audit)
|
68
|
-
rec.errors.to_h.each_pair do |attribute, error_message|
|
69
|
-
output[model.to_s][attribute] ||= {}
|
70
|
-
output[model.to_s][attribute][error_message] ||= []
|
71
|
-
output[model.to_s][attribute][error_message] << rec.id
|
72
|
-
end
|
73
|
-
|
74
|
-
failed += 1
|
75
|
-
end
|
76
|
-
|
77
|
-
total += 1
|
78
|
-
end
|
79
|
-
|
80
|
-
puts output.to_json if verbose
|
81
|
-
puts "Completed #{total} records with #{failed} failures"
|
82
|
-
end
|
83
|
-
|
84
|
-
def cleanse_model!(model)
|
85
|
-
string_columns = model.columns.select{|c| (c.type == :string || c.type == :text) && c.array == false}
|
86
|
-
boolean_columns = model.columns.select{|c| c.type == :boolean && c.array == false}
|
87
|
-
array_columns = model.columns.select{|c| c.array == true}
|
88
|
-
|
89
|
-
clean_nils_from! model, string_columns, ""
|
90
|
-
clean_nils_from! model, boolean_columns, false
|
91
|
-
clean_nils_from! model, array_columns, []
|
92
|
-
end
|
93
|
-
|
94
|
-
def clean_nils_from!(model, columns, default)
|
95
|
-
columns.each do |column|
|
96
|
-
count = model.where(column.name => nil).update_all(column.name => default)
|
97
|
-
puts "Fixed #{count} #{model} records where #{column.name} was nil" if count > 0
|
98
|
-
end
|
99
|
-
end
|
100
43
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_janitor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Louis Tran
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2017-
|
12
|
+
date: 2017-09-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rails
|
@@ -95,6 +95,7 @@ files:
|
|
95
95
|
- data_janitor.gemspec
|
96
96
|
- lib/data_janitor.rb
|
97
97
|
- lib/data_janitor/audit_validatable.rb
|
98
|
+
- lib/data_janitor/data_janitor.rb
|
98
99
|
- lib/data_janitor/universal_validator.rb
|
99
100
|
- lib/data_janitor/version.rb
|
100
101
|
- lib/tasks/data_janitor.rake
|
@@ -118,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
118
119
|
version: '0'
|
119
120
|
requirements: []
|
120
121
|
rubyforge_project:
|
121
|
-
rubygems_version: 2.
|
122
|
+
rubygems_version: 2.6.13
|
122
123
|
signing_key:
|
123
124
|
specification_version: 4
|
124
125
|
summary: Rake task to check validity of column types and values.
|