data_janitor 0.3.7 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 28258e2ec98f8f7db00397740aca503713a4ffba
4
- data.tar.gz: 58f562e76d73c73214d49c0c2616d33e80b9b975
3
+ metadata.gz: 8354b3129d9d3776972692c4173df69c43df65c7
4
+ data.tar.gz: dac7277b118a995afe2b6adb021e3e558d0cb8dc
5
5
  SHA512:
6
- metadata.gz: 3be58cb1964db6279c0227cb8efc99c1dd0bcf008d860ea00bb7c71bbc8b18665a78dcb7a28cee88f3ec0507ed8bb0bde0c567015b1eb690694bdbc58e48057e
7
- data.tar.gz: b04f75842fc3bde77e987d264b4ac3946caad5f28b33bf0c414bf774f5e7dbbc03171a7399cc59f7bd5b7b2e4759c24527eb9e0dda2066f927a0efd003b33616
6
+ metadata.gz: 0a12f79726392bb25ef7953ab47772da7fb5a900cf1aef7c0ccea56b745de6ab6e8c06a4676fd722bc1d57a13472647749d380d86b0a4709dd21bdd71ed589fb
7
+ data.tar.gz: 36574fa58940279106682e87c1a04662c811c5a5da2294e86f43843607292f03d07aec8e8aaa5ad0c3a8e59e22c301fdc7d004ce4dcb0630371066c34cf250bd
data/README.md CHANGED
@@ -81,6 +81,24 @@ rake data_janitor:cleanse[SomeModel]
81
81
 
82
82
  This will apply all the fixes that do not require semantic analysis of the data (e.g. replace `nil` values with `""` for strings)
83
83
 
84
+ Data Janitor has an experimental ability to perform some built-in type checks. Only a small part of it is implemented and it tends to be noisy when enabled, so it currently defaults to off. Each audit command takes an option string that defaults to 'no-type-check' and can be set to other colon-separated values to turn off specific checks. The values are:
85
+ type-checks (or any other string not below)
86
+ no-type-check
87
+ no-boolean
88
+ no-decimal
89
+ no-float
90
+ no-integer
91
+ no-string
92
+ no-text
93
+ no-array
94
+
95
+ For example:
96
+ ```
97
+ rake data_janitor:audit_model[SomeModel,no-string:no-boolean]
98
+ rake data_janitor:audit[tmp/out.json,false,false,no-string:no-boolean]
99
+ ```
100
+
101
+
84
102
  ## Contributing
85
103
 
86
104
  Bug reports and pull requests are welcome on GitHub at https://github.com/westfield/data_janitor.
data/lib/data_janitor.rb CHANGED
@@ -1,5 +1,6 @@
1
+ require "rails"
1
2
  require "data_janitor/version"
2
- require 'rails'
3
+ require "data_janitor/data_janitor"
3
4
  require "data_janitor/universal_validator"
4
5
  require "data_janitor/audit_validatable"
5
6
 
@@ -0,0 +1,80 @@
1
+ module DataJanitor
2
+ def self.audit(output_file, verbose, unscoped, options)
3
+ output = {}
4
+ all_models.each do |ar_model|
5
+ begin
6
+ audit_model ar_model, output, verbose, unscoped, options
7
+ rescue ActiveRecord::StatementInvalid # used to catch HABTM and schema migration. Only care about real Models
8
+ puts "skipping #{ar_model}"
9
+ end
10
+ end
11
+
12
+ File.write(output_file, output.to_json)
13
+
14
+ puts "Wrote results to #{output_file}"
15
+ end
16
+
17
+ def self.audit_model(model, output = {}, verbose = false, unscoped = false, options = 'no-type-check')
18
+ total = 0
19
+ failed = 0
20
+ puts "Validating: #{model.name}"
21
+ output[model.name] = {}
22
+
23
+ model = model.unscoped if unscoped
24
+ model.include(DataJanitor::UniversalValidator)
25
+ model.validate do |record|
26
+ record.validate_field_values options
27
+ end
28
+
29
+ model.find_each do |rec|
30
+ if rec.invalid?(:dj_audit)
31
+ rec.errors.to_h.each_pair do |attribute, error_message|
32
+ output[model.name][attribute] ||= {}
33
+ output[model.name][attribute][error_message] ||= []
34
+ output[model.name][attribute][error_message] << rec.id
35
+ end
36
+
37
+ failed += 1
38
+ end
39
+
40
+ total += 1
41
+ end
42
+
43
+ puts output.to_json if verbose
44
+ puts "Completed #{total} records with #{failed} failures"
45
+ end
46
+
47
+ def self.cleanse
48
+ all_models.each do |ar_model|
49
+ cleanse_model! ar_model
50
+ end
51
+ end
52
+
53
+ def self.cleanse_model(model)
54
+ cleanse_model! model.constantize
55
+ end
56
+
57
+ private
58
+
59
+ def self.all_models
60
+ # Needs this executed before here: Rails.application.eager_load!
61
+ ActiveRecord::Base.descendants
62
+ end
63
+
64
+ def self.cleanse_model!(model)
65
+ string_columns = model.columns.select{|c| (c.type == :string || c.type == :text) && c.array == false}
66
+ boolean_columns = model.columns.select{|c| c.type == :boolean && c.array == false}
67
+ array_columns = model.columns.select{|c| c.array == true}
68
+
69
+ clean_nils_from! model, string_columns, ""
70
+ clean_nils_from! model, boolean_columns, false
71
+ clean_nils_from! model, array_columns, []
72
+ end
73
+
74
+ def self.clean_nils_from!(model, columns, default)
75
+ columns.each do |column|
76
+ count = model.where(column.name => nil).update_all(column.name => default)
77
+ puts "Fixed #{count} #{model} records where #{column.name} was nil" if count > 0
78
+ end
79
+ end
80
+ end
@@ -4,21 +4,26 @@ module DataJanitor
4
4
  # TODO: Run standard validators instead of home-brewed
5
5
  # validate :validate_field_values
6
6
  # ACCEPTABLE_BOOLEAN_VALUES = %w(t true y yes on 1 f false n no off 0) # this list was taken from Postgres spec. TRUE FALSE, that are also there, are not listed because they are DB-native literals and have no representation in Ruby code
7
- def validate_field_values
7
+ def validate_field_values(options)
8
+ optional = options.split(':').map(&:strip)
9
+ return if optional.include? 'no-type-check'
10
+
8
11
  # selected_attributes = self.changed? ? self.changed_attributes : self.attributes
9
12
  selected_attributes = self.attributes
10
13
 
11
14
  selected_attributes.each do |field_name, field_val|
12
15
  column = self.column_for_attribute field_name
13
- report_error = lambda {|msg| errors[column.name] << msg}
16
+ report_error = lambda { |msg| errors[column.name] << msg }
14
17
 
15
18
  if column.array
19
+ next if optional.include? 'no-array'
16
20
  report_error.call "cannot be nil" if field_val.nil?
17
21
  next
18
22
  end
19
23
 
20
24
  case column.type
21
25
  when :boolean
26
+ next if optional.include? 'no-boolean'
22
27
  report_error.call "cannot be nil" if field_val.nil?
23
28
  # report_error.call("must be a valid boolean") unless ACCEPTABLE_BOOLEAN_VALUES.include? field_val
24
29
  when :date
@@ -28,15 +33,19 @@ module DataJanitor
28
33
  when :datetime
29
34
  # Time.iso8601(field_val) rescue report_error.call("must be a datetime in ISO-8601")
30
35
  when :decimal
36
+ next if optional.include? 'no-decimal'
31
37
  report_error.call "cannot be nil" if field_val.nil?
32
38
  # TODO: run numericality test
33
39
  when :float
40
+ next if optional.include? 'no-float'
34
41
  report_error.call "cannot be nil" if field_val.nil?
35
42
  # TODO: run numericality test
36
43
  when :integer
44
+ next if optional.include? 'no-integer'
37
45
  report_error.call "cannot be nil" if field_val.nil?
38
46
  # TODO: run numericality test
39
47
  when :string, :text
48
+ next if optional.include?('no-string') || optional.include?('no-text')
40
49
  if field_val.nil?
41
50
  # Almost never does an app need to distinguish between nil and empty string, yet nil needs special handling in all cases
42
51
  report_error.call "cannot be nil. Use an empty string instead if that's what you wanted."
@@ -1,3 +1,3 @@
1
1
  module DataJanitor
2
- VERSION = "0.3.7"
2
+ VERSION = "0.4.0"
3
3
  end
@@ -1,30 +1,26 @@
1
1
  namespace :data_janitor do
2
2
  desc 'Summarize invalid database records'
3
- task :audit, [:output_file, :verbose, :unscoped] => [:environment] do |_t, args|
3
+ task :audit, [:output_file, :verbose, :unscoped, :options] => [:environment] do |_t, args|
4
4
  args.with_defaults(
5
5
  output_file: Rails.root.join('tmp', 'data_janitor_results.json'),
6
- verbose: false,
7
- unscoped: false
6
+ verbose: 'false',
7
+ unscoped: 'false',
8
+ options: 'no-type-check'
8
9
  )
10
+ verbose = args[:verbose] == 'true'
11
+ unscoped = args[:unscoped] == 'true'
9
12
 
10
- output = {}
11
- all_models.each do |ar_model|
12
- begin
13
- audit ar_model, output, args[:verbose], args[:unscoped]
14
- rescue ActiveRecord::StatementInvalid # used to catch HABTM and schema migration. Only care about real Models
15
- puts "skipping #{ar_model}"
16
- end
17
- end
18
-
19
- File.write(args[:output_file], output.to_json)
20
-
21
- puts "Wrote results to #{args[:output_file]}"
13
+ Rails.application.eager_load!
14
+ DataJanitor::audit(args[:output_file], verbose, unscoped, args[:options])
22
15
  end
23
16
 
24
17
  desc 'Audit one model for data issues'
25
- task :audit_model, [:model] => [:environment] do |_t, args|
18
+ task :audit_model, [:model, :options] => [:environment] do |_t, args|
19
+ args.with_defaults(
20
+ options: 'no-type-check'
21
+ )
26
22
  Rails.application.eager_load!
27
- audit args[:model].constantize, {}, true
23
+ DataJanitor::audit_model args[:model].constantize, {}, true, false, args[:options]
28
24
  end
29
25
 
30
26
  # For each model, apply trivial data corrections (those that do not require looking at data semantics).
@@ -34,67 +30,14 @@ namespace :data_janitor do
34
30
  # - replace all null arrays with []
35
31
  desc 'Apply common and safe data corrections'
36
32
  task cleanse: :environment do
37
- all_models.each do |ar_model|
38
- cleanse_model! ar_model
39
- end
33
+ Rails.application.eager_load!
34
+ DataJanitor::clense
40
35
  end
41
36
 
42
37
  desc 'Apply fixes to one model only'
43
38
  task :cleanse_model, [:model] => [:environment] do |_t, args|
44
39
  Rails.application.eager_load!
45
-
46
- cleanse_model! args[:model].constantize
47
- end
48
-
49
- private
50
-
51
- def all_models
52
- Rails.application.eager_load!
53
- ActiveRecord::Base.descendants
40
+ DataJanitor::clense_model args[:model].constantize
54
41
  end
55
42
 
56
- def audit(model, output = {}, verbose = false, unscoped = false)
57
- total = 0
58
- failed = 0
59
- puts "Validating: #{model.name}"
60
- output[model.to_s] = {}
61
- model = model.unscoped if unscoped
62
-
63
- model.include(DataJanitor::UniversalValidator)
64
- model.validate :validate_field_values
65
-
66
- model.find_each do |rec|
67
- if rec.invalid?(:dj_audit)
68
- rec.errors.to_h.each_pair do |attribute, error_message|
69
- output[model.to_s][attribute] ||= {}
70
- output[model.to_s][attribute][error_message] ||= []
71
- output[model.to_s][attribute][error_message] << rec.id
72
- end
73
-
74
- failed += 1
75
- end
76
-
77
- total += 1
78
- end
79
-
80
- puts output.to_json if verbose
81
- puts "Completed #{total} records with #{failed} failures"
82
- end
83
-
84
- def cleanse_model!(model)
85
- string_columns = model.columns.select{|c| (c.type == :string || c.type == :text) && c.array == false}
86
- boolean_columns = model.columns.select{|c| c.type == :boolean && c.array == false}
87
- array_columns = model.columns.select{|c| c.array == true}
88
-
89
- clean_nils_from! model, string_columns, ""
90
- clean_nils_from! model, boolean_columns, false
91
- clean_nils_from! model, array_columns, []
92
- end
93
-
94
- def clean_nils_from!(model, columns, default)
95
- columns.each do |column|
96
- count = model.where(column.name => nil).update_all(column.name => default)
97
- puts "Fixed #{count} #{model} records where #{column.name} was nil" if count > 0
98
- end
99
- end
100
43
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_janitor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.7
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Louis Tran
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2017-08-31 00:00:00.000000000 Z
12
+ date: 2017-09-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rails
@@ -95,6 +95,7 @@ files:
95
95
  - data_janitor.gemspec
96
96
  - lib/data_janitor.rb
97
97
  - lib/data_janitor/audit_validatable.rb
98
+ - lib/data_janitor/data_janitor.rb
98
99
  - lib/data_janitor/universal_validator.rb
99
100
  - lib/data_janitor/version.rb
100
101
  - lib/tasks/data_janitor.rake
@@ -118,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
118
119
  version: '0'
119
120
  requirements: []
120
121
  rubyforge_project:
121
- rubygems_version: 2.4.8
122
+ rubygems_version: 2.6.13
122
123
  signing_key:
123
124
  specification_version: 4
124
125
  summary: Rake task to check validity of column types and values.