data_janitor 0.3.7 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 28258e2ec98f8f7db00397740aca503713a4ffba
4
- data.tar.gz: 58f562e76d73c73214d49c0c2616d33e80b9b975
3
+ metadata.gz: 8354b3129d9d3776972692c4173df69c43df65c7
4
+ data.tar.gz: dac7277b118a995afe2b6adb021e3e558d0cb8dc
5
5
  SHA512:
6
- metadata.gz: 3be58cb1964db6279c0227cb8efc99c1dd0bcf008d860ea00bb7c71bbc8b18665a78dcb7a28cee88f3ec0507ed8bb0bde0c567015b1eb690694bdbc58e48057e
7
- data.tar.gz: b04f75842fc3bde77e987d264b4ac3946caad5f28b33bf0c414bf774f5e7dbbc03171a7399cc59f7bd5b7b2e4759c24527eb9e0dda2066f927a0efd003b33616
6
+ metadata.gz: 0a12f79726392bb25ef7953ab47772da7fb5a900cf1aef7c0ccea56b745de6ab6e8c06a4676fd722bc1d57a13472647749d380d86b0a4709dd21bdd71ed589fb
7
+ data.tar.gz: 36574fa58940279106682e87c1a04662c811c5a5da2294e86f43843607292f03d07aec8e8aaa5ad0c3a8e59e22c301fdc7d004ce4dcb0630371066c34cf250bd
data/README.md CHANGED
@@ -81,6 +81,24 @@ rake data_janitor:cleanse[SomeModel]
81
81
 
82
82
  This will apply all the fixes that do not require semantic analysis of the data (e.g. replace `nil` values with `""` for strings).
83
83
 
84
+ Data Janitor has an experimental ability to perform some built-in type checks. Only a small part of it is implemented, and it tends to be noisy when enabled, so it is off by default. Each audit command takes an option string that defaults to 'no-type-check' and can be set to other colon-separated values to turn off specific checks. The values are:
85
+ type-checks (or any other string not listed below)
86
+ no-type-check
87
+ no-boolean
88
+ no-decimal
89
+ no-float
90
+ no-integer
91
+ no-string
92
+ no-text
93
+ no-array
94
+
95
+ For example:
96
+ ```
97
+ rake data_janitor:audit_model[SomeModel,no-string:no-boolean]
98
+ rake data_janitor:audit[tmp/out.json,false,false,no-string:no-boolean]
99
+ ```
100
+
101
+
84
102
  ## Contributing
85
103
 
86
104
  Bug reports and pull requests are welcome on GitHub at https://github.com/westfield/data_janitor.
data/lib/data_janitor.rb CHANGED
@@ -1,5 +1,6 @@
1
+ require "rails"
1
2
  require "data_janitor/version"
2
- require 'rails'
3
+ require "data_janitor/data_janitor"
3
4
  require "data_janitor/universal_validator"
4
5
  require "data_janitor/audit_validatable"
5
6
 
@@ -0,0 +1,80 @@
1
+ module DataJanitor
2
+ def self.audit(output_file, verbose, unscoped, options)
3
+ output = {}
4
+ all_models.each do |ar_model|
5
+ begin
6
+ audit_model ar_model, output, verbose, unscoped, options
7
+ rescue ActiveRecord::StatementInvalid # used to catch HABTM and schema migration. Only care about real Models
8
+ puts "skipping #{ar_model}"
9
+ end
10
+ end
11
+
12
+ File.write(output_file, output.to_json)
13
+
14
+ puts "Wrote results to #{output_file}"
15
+ end
16
+
17
+ def self.audit_model(model, output = {}, verbose = false, unscoped = false, options = 'no-type-check')
18
+ total = 0
19
+ failed = 0
20
+ puts "Validating: #{model.name}"
21
+ output[model.name] = {}
22
+
23
+ model = model.unscoped if unscoped
24
+ model.include(DataJanitor::UniversalValidator)
25
+ model.validate do |record|
26
+ record.validate_field_values options
27
+ end
28
+
29
+ model.find_each do |rec|
30
+ if rec.invalid?(:dj_audit)
31
+ rec.errors.to_h.each_pair do |attribute, error_message|
32
+ output[model.name][attribute] ||= {}
33
+ output[model.name][attribute][error_message] ||= []
34
+ output[model.name][attribute][error_message] << rec.id
35
+ end
36
+
37
+ failed += 1
38
+ end
39
+
40
+ total += 1
41
+ end
42
+
43
+ puts output.to_json if verbose
44
+ puts "Completed #{total} records with #{failed} failures"
45
+ end
46
+
47
+ def self.cleanse
48
+ all_models.each do |ar_model|
49
+ cleanse_model! ar_model
50
+ end
51
+ end
52
+
53
+ def self.cleanse_model(model)
54
+ cleanse_model! model.constantize
55
+ end
56
+
57
+ private
58
+
59
+ def self.all_models
60
+ # Needs this executed before here: Rails.application.eager_load!
61
+ ActiveRecord::Base.descendants
62
+ end
63
+
64
+ def self.cleanse_model!(model)
65
+ string_columns = model.columns.select{|c| (c.type == :string || c.type == :text) && c.array == false}
66
+ boolean_columns = model.columns.select{|c| c.type == :boolean && c.array == false}
67
+ array_columns = model.columns.select{|c| c.array == true}
68
+
69
+ clean_nils_from! model, string_columns, ""
70
+ clean_nils_from! model, boolean_columns, false
71
+ clean_nils_from! model, array_columns, []
72
+ end
73
+
74
+ def self.clean_nils_from!(model, columns, default)
75
+ columns.each do |column|
76
+ count = model.where(column.name => nil).update_all(column.name => default)
77
+ puts "Fixed #{count} #{model} records where #{column.name} was nil" if count > 0
78
+ end
79
+ end
80
+ end
@@ -4,21 +4,26 @@ module DataJanitor
4
4
  # TODO: Run standard validators instead of home-brewed
5
5
  # validate :validate_field_values
6
6
  # ACCEPTABLE_BOOLEAN_VALUES = %w(t true y yes on 1 f false n no off 0) # this list was taken from Postgres spec. TRUE FALSE, that are also there, are not listed because they are DB-native literals and have no representation in Ruby code
7
- def validate_field_values
7
+ def validate_field_values(options)
8
+ optional = options.split(':').map(&:strip)
9
+ return if optional.include? 'no-type-check'
10
+
8
11
  # selected_attributes = self.changed? ? self.changed_attributes : self.attributes
9
12
  selected_attributes = self.attributes
10
13
 
11
14
  selected_attributes.each do |field_name, field_val|
12
15
  column = self.column_for_attribute field_name
13
- report_error = lambda {|msg| errors[column.name] << msg}
16
+ report_error = lambda { |msg| errors[column.name] << msg }
14
17
 
15
18
  if column.array
19
+ next if optional.include? 'no-array'
16
20
  report_error.call "cannot be nil" if field_val.nil?
17
21
  next
18
22
  end
19
23
 
20
24
  case column.type
21
25
  when :boolean
26
+ next if optional.include? 'no-boolean'
22
27
  report_error.call "cannot be nil" if field_val.nil?
23
28
  # report_error.call("must be a valid boolean") unless ACCEPTABLE_BOOLEAN_VALUES.include? field_val
24
29
  when :date
@@ -28,15 +33,19 @@ module DataJanitor
28
33
  when :datetime
29
34
  # Time.iso8601(field_val) rescue report_error.call("must be a datetime in ISO-8601")
30
35
  when :decimal
36
+ next if optional.include? 'no-decimal'
31
37
  report_error.call "cannot be nil" if field_val.nil?
32
38
  # TODO: run numericality test
33
39
  when :float
40
+ next if optional.include? 'no-float'
34
41
  report_error.call "cannot be nil" if field_val.nil?
35
42
  # TODO: run numericality test
36
43
  when :integer
44
+ next if optional.include? 'no-integer'
37
45
  report_error.call "cannot be nil" if field_val.nil?
38
46
  # TODO: run numericality test
39
47
  when :string, :text
48
+ next if optional.include?('no-string') || optional.include?('no-text')
40
49
  if field_val.nil?
41
50
  # Almost never does an app need to distinguish between nil and empty string, yet nil needs special handling in all cases
42
51
  report_error.call "cannot be nil. Use an empty string instead if that's what you wanted."
@@ -1,3 +1,3 @@
1
1
  module DataJanitor
2
- VERSION = "0.3.7"
2
+ VERSION = "0.4.0"
3
3
  end
@@ -1,30 +1,26 @@
1
1
  namespace :data_janitor do
2
2
  desc 'Summarize invalid database records'
3
- task :audit, [:output_file, :verbose, :unscoped] => [:environment] do |_t, args|
3
+ task :audit, [:output_file, :verbose, :unscoped, :options] => [:environment] do |_t, args|
4
4
  args.with_defaults(
5
5
  output_file: Rails.root.join('tmp', 'data_janitor_results.json'),
6
- verbose: false,
7
- unscoped: false
6
+ verbose: 'false',
7
+ unscoped: 'false',
8
+ options: 'no-type-check'
8
9
  )
10
+ verbose = args[:verbose] == 'true'
11
+ unscoped = args[:unscoped] == 'true'
9
12
 
10
- output = {}
11
- all_models.each do |ar_model|
12
- begin
13
- audit ar_model, output, args[:verbose], args[:unscoped]
14
- rescue ActiveRecord::StatementInvalid # used to catch HABTM and schema migration. Only care about real Models
15
- puts "skipping #{ar_model}"
16
- end
17
- end
18
-
19
- File.write(args[:output_file], output.to_json)
20
-
21
- puts "Wrote results to #{args[:output_file]}"
13
+ Rails.application.eager_load!
14
+ DataJanitor::audit(args[:output_file], verbose, unscoped, args[:options])
22
15
  end
23
16
 
24
17
  desc 'Audit one model for data issues'
25
- task :audit_model, [:model] => [:environment] do |_t, args|
18
+ task :audit_model, [:model, :options] => [:environment] do |_t, args|
19
+ args.with_defaults(
20
+ options: 'no-type-check'
21
+ )
26
22
  Rails.application.eager_load!
27
- audit args[:model].constantize, {}, true
23
+ DataJanitor::audit_model args[:model].constantize, {}, true, false, args[:options]
28
24
  end
29
25
 
30
26
  # For each model, apply trivial data corrections (those that do not require looking at data semantics).
@@ -34,67 +30,14 @@ namespace :data_janitor do
34
30
  # - replace all null arrays with []
35
31
  desc 'Apply common and safe data corrections'
36
32
  task cleanse: :environment do
37
- all_models.each do |ar_model|
38
- cleanse_model! ar_model
39
- end
33
+ Rails.application.eager_load!
34
+ DataJanitor::clense
40
35
  end
41
36
 
42
37
  desc 'Apply fixes to one model only'
43
38
  task :cleanse_model, [:model] => [:environment] do |_t, args|
44
39
  Rails.application.eager_load!
45
-
46
- cleanse_model! args[:model].constantize
47
- end
48
-
49
- private
50
-
51
- def all_models
52
- Rails.application.eager_load!
53
- ActiveRecord::Base.descendants
40
+ DataJanitor::clense_model args[:model].constantize
54
41
  end
55
42
 
56
- def audit(model, output = {}, verbose = false, unscoped = false)
57
- total = 0
58
- failed = 0
59
- puts "Validating: #{model.name}"
60
- output[model.to_s] = {}
61
- model = model.unscoped if unscoped
62
-
63
- model.include(DataJanitor::UniversalValidator)
64
- model.validate :validate_field_values
65
-
66
- model.find_each do |rec|
67
- if rec.invalid?(:dj_audit)
68
- rec.errors.to_h.each_pair do |attribute, error_message|
69
- output[model.to_s][attribute] ||= {}
70
- output[model.to_s][attribute][error_message] ||= []
71
- output[model.to_s][attribute][error_message] << rec.id
72
- end
73
-
74
- failed += 1
75
- end
76
-
77
- total += 1
78
- end
79
-
80
- puts output.to_json if verbose
81
- puts "Completed #{total} records with #{failed} failures"
82
- end
83
-
84
- def cleanse_model!(model)
85
- string_columns = model.columns.select{|c| (c.type == :string || c.type == :text) && c.array == false}
86
- boolean_columns = model.columns.select{|c| c.type == :boolean && c.array == false}
87
- array_columns = model.columns.select{|c| c.array == true}
88
-
89
- clean_nils_from! model, string_columns, ""
90
- clean_nils_from! model, boolean_columns, false
91
- clean_nils_from! model, array_columns, []
92
- end
93
-
94
- def clean_nils_from!(model, columns, default)
95
- columns.each do |column|
96
- count = model.where(column.name => nil).update_all(column.name => default)
97
- puts "Fixed #{count} #{model} records where #{column.name} was nil" if count > 0
98
- end
99
- end
100
43
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_janitor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.7
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Louis Tran
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2017-08-31 00:00:00.000000000 Z
12
+ date: 2017-09-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rails
@@ -95,6 +95,7 @@ files:
95
95
  - data_janitor.gemspec
96
96
  - lib/data_janitor.rb
97
97
  - lib/data_janitor/audit_validatable.rb
98
+ - lib/data_janitor/data_janitor.rb
98
99
  - lib/data_janitor/universal_validator.rb
99
100
  - lib/data_janitor/version.rb
100
101
  - lib/tasks/data_janitor.rake
@@ -118,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
118
119
  version: '0'
119
120
  requirements: []
120
121
  rubyforge_project:
121
- rubygems_version: 2.4.8
122
+ rubygems_version: 2.6.13
122
123
  signing_key:
123
124
  specification_version: 4
124
125
  summary: Rake task to check validity of column types and values.