data_janitor 0.3.7 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +18 -0
- data/lib/data_janitor.rb +2 -1
- data/lib/data_janitor/data_janitor.rb +80 -0
- data/lib/data_janitor/universal_validator.rb +11 -2
- data/lib/data_janitor/version.rb +1 -1
- data/lib/tasks/data_janitor.rake +16 -73
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8354b3129d9d3776972692c4173df69c43df65c7
|
4
|
+
data.tar.gz: dac7277b118a995afe2b6adb021e3e558d0cb8dc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0a12f79726392bb25ef7953ab47772da7fb5a900cf1aef7c0ccea56b745de6ab6e8c06a4676fd722bc1d57a13472647749d380d86b0a4709dd21bdd71ed589fb
|
7
|
+
data.tar.gz: 36574fa58940279106682e87c1a04662c811c5a5da2294e86f43843607292f03d07aec8e8aaa5ad0c3a8e59e22c301fdc7d004ce4dcb0630371066c34cf250bd
|
data/README.md
CHANGED
@@ -81,6 +81,24 @@ rake data_janitor:cleanse[SomeModel]
|
|
81
81
|
|
82
82
|
This will apply all the fixes that do not require semantic analysis of the data (e.g. replace `nil` values with `""` for strings)
|
83
83
|
|
84
|
+
Data Janitor has an experimental ability to perform some built-in type checks. Only a small part of it is implemented, and it currently tends to be noisy when enabled, so it defaults to off. Each audit command takes an option string that defaults to 'no-type-check' and can be set to other colon-separated values to turn off specific checks. The values are:
|
85
|
+
type-checks (or any other string not below)
|
86
|
+
no-type-check
|
87
|
+
no-boolean
|
88
|
+
no-decimal
|
89
|
+
no-float
|
90
|
+
no-integer
|
91
|
+
no-string
|
92
|
+
no-text
|
93
|
+
no-array
|
94
|
+
|
95
|
+
For example:
|
96
|
+
```
|
97
|
+
rake data_janitor:audit_model[SomeModel,no-string:no-boolean]
|
98
|
+
rake data_janitor:audit[tmp/out.json,false,false,no-string:no-boolean]
|
99
|
+
```
|
100
|
+
|
101
|
+
|
84
102
|
## Contributing
|
85
103
|
|
86
104
|
Bug reports and pull requests are welcome on GitHub at https://github.com/westfield/data_janitor.
|
data/lib/data_janitor.rb
CHANGED
@@ -0,0 +1,80 @@
|
|
1
|
+
module DataJanitor
|
2
|
+
def self.audit(output_file, verbose, unscoped, options)
|
3
|
+
output = {}
|
4
|
+
all_models.each do |ar_model|
|
5
|
+
begin
|
6
|
+
audit_model ar_model, output, verbose, unscoped, options
|
7
|
+
rescue ActiveRecord::StatementInvalid # used to catch HABTM and schema migration. Only care about real Models
|
8
|
+
puts "skipping #{ar_model}"
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
File.write(output_file, output.to_json)
|
13
|
+
|
14
|
+
puts "Wrote results to #{output_file}"
|
15
|
+
end
|
16
|
+
|
17
|
+
def self.audit_model(model, output = {}, verbose = false, unscoped = false, options = 'no-type-check')
|
18
|
+
total = 0
|
19
|
+
failed = 0
|
20
|
+
puts "Validating: #{model.name}"
|
21
|
+
output[model.name] = {}
|
22
|
+
|
23
|
+
model = model.unscoped if unscoped
|
24
|
+
model.include(DataJanitor::UniversalValidator)
|
25
|
+
model.validate do |record|
|
26
|
+
record.validate_field_values options
|
27
|
+
end
|
28
|
+
|
29
|
+
model.find_each do |rec|
|
30
|
+
if rec.invalid?(:dj_audit)
|
31
|
+
rec.errors.to_h.each_pair do |attribute, error_message|
|
32
|
+
output[model.name][attribute] ||= {}
|
33
|
+
output[model.name][attribute][error_message] ||= []
|
34
|
+
output[model.name][attribute][error_message] << rec.id
|
35
|
+
end
|
36
|
+
|
37
|
+
failed += 1
|
38
|
+
end
|
39
|
+
|
40
|
+
total += 1
|
41
|
+
end
|
42
|
+
|
43
|
+
puts output.to_json if verbose
|
44
|
+
puts "Completed #{total} records with #{failed} failures"
|
45
|
+
end
|
46
|
+
|
47
|
+
def self.cleanse
|
48
|
+
all_models.each do |ar_model|
|
49
|
+
cleanse_model! ar_model
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
def self.cleanse_model(model)
|
54
|
+
cleanse_model! model.constantize
|
55
|
+
end
|
56
|
+
|
57
|
+
private
|
58
|
+
|
59
|
+
def self.all_models
|
60
|
+
# Needs this executed before here: Rails.application.eager_load!
|
61
|
+
ActiveRecord::Base.descendants
|
62
|
+
end
|
63
|
+
|
64
|
+
def self.cleanse_model!(model)
|
65
|
+
string_columns = model.columns.select{|c| (c.type == :string || c.type == :text) && c.array == false}
|
66
|
+
boolean_columns = model.columns.select{|c| c.type == :boolean && c.array == false}
|
67
|
+
array_columns = model.columns.select{|c| c.array == true}
|
68
|
+
|
69
|
+
clean_nils_from! model, string_columns, ""
|
70
|
+
clean_nils_from! model, boolean_columns, false
|
71
|
+
clean_nils_from! model, array_columns, []
|
72
|
+
end
|
73
|
+
|
74
|
+
def self.clean_nils_from!(model, columns, default)
|
75
|
+
columns.each do |column|
|
76
|
+
count = model.where(column.name => nil).update_all(column.name => default)
|
77
|
+
puts "Fixed #{count} #{model} records where #{column.name} was nil" if count > 0
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
@@ -4,21 +4,26 @@ module DataJanitor
|
|
4
4
|
# TODO: Run standard validators instead of home-brewed
|
5
5
|
# validate :validate_field_values
|
6
6
|
# ACCEPTABLE_BOOLEAN_VALUES = %w(t true y yes on 1 f false n no off 0) # this list was taken from Postgres spec. TRUE FALSE, that are also there, are not listed because they are DB-native literals and have no representation in Ruby code
|
7
|
-
def validate_field_values
|
7
|
+
def validate_field_values(options)
|
8
|
+
optional = options.split(':').map(&:strip)
|
9
|
+
return if optional.include? 'no-type-check'
|
10
|
+
|
8
11
|
# selected_attributes = self.changed? ? self.changed_attributes : self.attributes
|
9
12
|
selected_attributes = self.attributes
|
10
13
|
|
11
14
|
selected_attributes.each do |field_name, field_val|
|
12
15
|
column = self.column_for_attribute field_name
|
13
|
-
report_error = lambda {|msg| errors[column.name] << msg}
|
16
|
+
report_error = lambda { |msg| errors[column.name] << msg }
|
14
17
|
|
15
18
|
if column.array
|
19
|
+
next if optional.include? 'no-array'
|
16
20
|
report_error.call "cannot be nil" if field_val.nil?
|
17
21
|
next
|
18
22
|
end
|
19
23
|
|
20
24
|
case column.type
|
21
25
|
when :boolean
|
26
|
+
next if optional.include? 'no-boolean'
|
22
27
|
report_error.call "cannot be nil" if field_val.nil?
|
23
28
|
# report_error.call("must be a valid boolean") unless ACCEPTABLE_BOOLEAN_VALUES.include? field_val
|
24
29
|
when :date
|
@@ -28,15 +33,19 @@ module DataJanitor
|
|
28
33
|
when :datetime
|
29
34
|
# Time.iso8601(field_val) rescue report_error.call("must be a datetime in ISO-8601")
|
30
35
|
when :decimal
|
36
|
+
next if optional.include? 'no-decimal'
|
31
37
|
report_error.call "cannot be nil" if field_val.nil?
|
32
38
|
# TODO: run numericality test
|
33
39
|
when :float
|
40
|
+
next if optional.include? 'no-float'
|
34
41
|
report_error.call "cannot be nil" if field_val.nil?
|
35
42
|
# TODO: run numericality test
|
36
43
|
when :integer
|
44
|
+
next if optional.include? 'no-integer'
|
37
45
|
report_error.call "cannot be nil" if field_val.nil?
|
38
46
|
# TODO: run numericality test
|
39
47
|
when :string, :text
|
48
|
+
next if optional.include?('no-string') || optional.include?('no-text')
|
40
49
|
if field_val.nil?
|
41
50
|
# Almost never does an app need to distinguish between nil and empty string, yet nil needs special handling in all cases
|
42
51
|
report_error.call "cannot be nil. Use an empty string instead if that's what you wanted."
|
data/lib/data_janitor/version.rb
CHANGED
data/lib/tasks/data_janitor.rake
CHANGED
@@ -1,30 +1,26 @@
|
|
1
1
|
namespace :data_janitor do
|
2
2
|
desc 'Summarize invalid database records'
|
3
|
-
task :audit, [:output_file, :verbose, :unscoped] => [:environment] do |_t, args|
|
3
|
+
task :audit, [:output_file, :verbose, :unscoped, :options] => [:environment] do |_t, args|
|
4
4
|
args.with_defaults(
|
5
5
|
output_file: Rails.root.join('tmp', 'data_janitor_results.json'),
|
6
|
-
verbose: false,
|
7
|
-
unscoped: false
|
6
|
+
verbose: 'false',
|
7
|
+
unscoped: 'false',
|
8
|
+
options: 'no-type-check'
|
8
9
|
)
|
10
|
+
verbose = args[:verbose] == 'true'
|
11
|
+
unscoped = args[:unscoped] == 'true'
|
9
12
|
|
10
|
-
|
11
|
-
|
12
|
-
begin
|
13
|
-
audit ar_model, output, args[:verbose], args[:unscoped]
|
14
|
-
rescue ActiveRecord::StatementInvalid # used to catch HABTM and schema migration. Only care about real Models
|
15
|
-
puts "skipping #{ar_model}"
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
File.write(args[:output_file], output.to_json)
|
20
|
-
|
21
|
-
puts "Wrote results to #{args[:output_file]}"
|
13
|
+
Rails.application.eager_load!
|
14
|
+
DataJanitor::audit(args[:output_file], verbose, unscoped, args[:options])
|
22
15
|
end
|
23
16
|
|
24
17
|
desc 'Audit one model for data issues'
|
25
|
-
task :audit_model, [:model] => [:environment] do |_t, args|
|
18
|
+
task :audit_model, [:model, :options] => [:environment] do |_t, args|
|
19
|
+
args.with_defaults(
|
20
|
+
options: 'no-type-check'
|
21
|
+
)
|
26
22
|
Rails.application.eager_load!
|
27
|
-
|
23
|
+
DataJanitor::audit_model args[:model].constantize, {}, true, false, args[:options]
|
28
24
|
end
|
29
25
|
|
30
26
|
# For each model, apply trivial data corrections (those that do not require looking at data semantics).
|
@@ -34,67 +30,14 @@ namespace :data_janitor do
|
|
34
30
|
# - replace all null arrays with []
|
35
31
|
desc 'Apply common and safe data corrections'
|
36
32
|
task cleanse: :environment do
|
37
|
-
|
38
|
-
|
39
|
-
end
|
33
|
+
Rails.application.eager_load!
|
34
|
+
DataJanitor::clense
|
40
35
|
end
|
41
36
|
|
42
37
|
desc 'Apply fixes to one model only'
|
43
38
|
task :cleanse_model, [:model] => [:environment] do |_t, args|
|
44
39
|
Rails.application.eager_load!
|
45
|
-
|
46
|
-
cleanse_model! args[:model].constantize
|
47
|
-
end
|
48
|
-
|
49
|
-
private
|
50
|
-
|
51
|
-
def all_models
|
52
|
-
Rails.application.eager_load!
|
53
|
-
ActiveRecord::Base.descendants
|
40
|
+
DataJanitor::clense_model args[:model].constantize
|
54
41
|
end
|
55
42
|
|
56
|
-
def audit(model, output = {}, verbose = false, unscoped = false)
|
57
|
-
total = 0
|
58
|
-
failed = 0
|
59
|
-
puts "Validating: #{model.name}"
|
60
|
-
output[model.to_s] = {}
|
61
|
-
model = model.unscoped if unscoped
|
62
|
-
|
63
|
-
model.include(DataJanitor::UniversalValidator)
|
64
|
-
model.validate :validate_field_values
|
65
|
-
|
66
|
-
model.find_each do |rec|
|
67
|
-
if rec.invalid?(:dj_audit)
|
68
|
-
rec.errors.to_h.each_pair do |attribute, error_message|
|
69
|
-
output[model.to_s][attribute] ||= {}
|
70
|
-
output[model.to_s][attribute][error_message] ||= []
|
71
|
-
output[model.to_s][attribute][error_message] << rec.id
|
72
|
-
end
|
73
|
-
|
74
|
-
failed += 1
|
75
|
-
end
|
76
|
-
|
77
|
-
total += 1
|
78
|
-
end
|
79
|
-
|
80
|
-
puts output.to_json if verbose
|
81
|
-
puts "Completed #{total} records with #{failed} failures"
|
82
|
-
end
|
83
|
-
|
84
|
-
def cleanse_model!(model)
|
85
|
-
string_columns = model.columns.select{|c| (c.type == :string || c.type == :text) && c.array == false}
|
86
|
-
boolean_columns = model.columns.select{|c| c.type == :boolean && c.array == false}
|
87
|
-
array_columns = model.columns.select{|c| c.array == true}
|
88
|
-
|
89
|
-
clean_nils_from! model, string_columns, ""
|
90
|
-
clean_nils_from! model, boolean_columns, false
|
91
|
-
clean_nils_from! model, array_columns, []
|
92
|
-
end
|
93
|
-
|
94
|
-
def clean_nils_from!(model, columns, default)
|
95
|
-
columns.each do |column|
|
96
|
-
count = model.where(column.name => nil).update_all(column.name => default)
|
97
|
-
puts "Fixed #{count} #{model} records where #{column.name} was nil" if count > 0
|
98
|
-
end
|
99
|
-
end
|
100
43
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_janitor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Louis Tran
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2017-
|
12
|
+
date: 2017-09-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rails
|
@@ -95,6 +95,7 @@ files:
|
|
95
95
|
- data_janitor.gemspec
|
96
96
|
- lib/data_janitor.rb
|
97
97
|
- lib/data_janitor/audit_validatable.rb
|
98
|
+
- lib/data_janitor/data_janitor.rb
|
98
99
|
- lib/data_janitor/universal_validator.rb
|
99
100
|
- lib/data_janitor/version.rb
|
100
101
|
- lib/tasks/data_janitor.rake
|
@@ -118,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
118
119
|
version: '0'
|
119
120
|
requirements: []
|
120
121
|
rubyforge_project:
|
121
|
-
rubygems_version: 2.
|
122
|
+
rubygems_version: 2.6.13
|
122
123
|
signing_key:
|
123
124
|
specification_version: 4
|
124
125
|
summary: Rake task to check validity of column types and values.
|