dump_cleaner 0.5.0
- checksums.yaml +7 -0
- data/.rspec +2 -0
- data/.rubocop.yml +25 -0
- data/CHANGELOG.md +5 -0
- data/LICENSE.txt +21 -0
- data/README.md +295 -0
- data/Rakefile +8 -0
- data/doc/workflow_steps.md +1400 -0
- data/dump_cleaner.gemspec +38 -0
- data/exe/dump_cleaner +7 -0
- data/lib/dump_cleaner/cleaners/base_cleaner.rb +32 -0
- data/lib/dump_cleaner/cleaners/mysql_shell_dump_cleaner.rb +47 -0
- data/lib/dump_cleaner/cleaners/mysql_shell_dump_helpers.rb +11 -0
- data/lib/dump_cleaner/cleaners/mysql_shell_table_cleaner.rb +184 -0
- data/lib/dump_cleaner/cleanup/bytesize_helpers.rb +39 -0
- data/lib/dump_cleaner/cleanup/cleaning.rb +69 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/add_repetition_suffix.rb +23 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/base.rb +33 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/fill_up_with_string.rb +20 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/generate_random_string.rb +37 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/inspect_context.rb +16 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_email.rb +78 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_formatted_number.rb +63 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_number.rb +29 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/select_data_by_bytesize.rb +17 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/select_data_by_pattern.rb +20 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/take_sample.rb +28 -0
- data/lib/dump_cleaner/cleanup/data_source.rb +19 -0
- data/lib/dump_cleaner/cleanup/data_source_steps/base.rb +26 -0
- data/lib/dump_cleaner/cleanup/data_source_steps/group_by_bytesize.rb +37 -0
- data/lib/dump_cleaner/cleanup/data_source_steps/inspect_context.rb +16 -0
- data/lib/dump_cleaner/cleanup/data_source_steps/load_yaml_file.rb +24 -0
- data/lib/dump_cleaner/cleanup/data_source_steps/remove_accents.rb +29 -0
- data/lib/dump_cleaner/cleanup/inspection.rb +37 -0
- data/lib/dump_cleaner/cleanup/step_context.rb +46 -0
- data/lib/dump_cleaner/cleanup/uniqueness.rb +66 -0
- data/lib/dump_cleaner/cleanup/workflow.rb +38 -0
- data/lib/dump_cleaner/conditions.rb +42 -0
- data/lib/dump_cleaner/config.rb +109 -0
- data/lib/dump_cleaner/log.rb +42 -0
- data/lib/dump_cleaner/options.rb +46 -0
- data/lib/dump_cleaner/processor.rb +37 -0
- data/lib/dump_cleaner/version.rb +5 -0
- data/lib/dump_cleaner.rb +10 -0
- metadata +105 -0

data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_formatted_number.rb
@@ -0,0 +1,63 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleanup
    module CleaningSteps
      class RandomizeFormattedNumber < Base
        include Inspection

        def run(format:)
          regex = Regexp.new("\\A#{format}\\z")

          unless regex.names.any? { _1.start_with?("x") }
            raise_params_error('The format has no named group starting with \'x\', e.g. \'(?<x>\d)\')')
          end

          unless current_value.match?(regex)
            if repetition.zero?
              Log.warn { "Invalid value: type=#{type}, id=#{record['id']}, value=#{truncate(current_value)}" }
            end
            step_context.current_value = nil
            return step_context
          end

          random = Random.new(crc32)
          new_value = randomize_named_captures(regex:, random:)

          if new_value.length != current_value.length
            raise ArgumentError, "The new value length does not match the original value length.
                                  Do the named groups in the format regexp match the whole value?".gsub(/\s+/, " ")
          end

          step_context.current_value = new_value
          step_context
        end

        private

        def randomize_named_captures(regex:, random:)
          new_value = String.new

          current_value.match(regex).named_captures.each do |name, capture|
            if name.start_with?("x")
              unless capture.match?(/^\d+$/)
                raise ArgumentError,
                      "Invalid regexp for capture '#{name}' which matched to '#{capture}': it must match numbers only."
              end

              new_value << random_number(capture.length, random:)
            else
              new_value << capture
            end
          end

          new_value
        end

        def random_number(digits, random:)
          random.rand(10**digits - 1).to_s.rjust(digits, "0")
        end
      end
    end
  end
end
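
Note on the step above: every named capture group whose name starts with "x" is replaced by random digits of the same width, while all other named groups are copied through unchanged, so the cleaned value keeps its exact format and length. A minimal standalone sketch of that idea in plain Ruby, outside the gem's step and StepContext machinery; the phone-number format and the seed are invented, and the real step seeds Random with a CRC32 provided by CleaningSteps::Base, which is not shown in this excerpt:

# Illustrative only: mirrors randomize_named_captures/random_number from above.
format = '(?<prefix>\+420 )(?<x1>\d{3})(?<sp1> )(?<x2>\d{3})(?<sp2> )(?<x3>\d{3})'
regex  = Regexp.new("\\A#{format}\\z")
value  = "+420 724 123 456"
random = Random.new(1234) # the real step uses Random.new(crc32) for deterministic output

new_value = value.match(regex).named_captures.map do |name, capture|
  if name.start_with?("x")
    random.rand(10**capture.length - 1).to_s.rjust(capture.length, "0")
  else
    capture
  end
end.join
# => digit groups randomized, the "+420 " prefix and the spaces kept, length preserved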

data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_number.rb
@@ -0,0 +1,29 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleanup
    module CleaningSteps
      class RandomizeNumber < Base
        def run(difference_within: 1.0)
          random = Random.new(crc32)

          new_value = current_value.to_f + random.rand(difference_within.to_f * 2) - difference_within.to_f

          # keep sign to keep string length (warning: this skews the distribution of the random numbers)
          if (current_value.strip[0] == "-") && new_value.positive? ||
             (current_value.strip[0] != "-") && new_value.negative?
            new_value *= -1
          end

          decimal_places = current_value.split(".")[1].to_s.length
          epsilon = 10**-decimal_places
          clamped_value = new_value.clamp(current_value.to_f - difference_within + epsilon,
                                          current_value.to_f + difference_within - epsilon)

          step_context.current_value = format("%0#{current_value.length}.#{decimal_places}f", clamped_value)
          step_context
        end
      end
    end
  end
end
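
The formatting call at the end of RandomizeNumber is what preserves the column's textual shape: the randomized number is zero-padded to the original string length and rendered with the original number of decimal places. A quick illustration of just that part, with invented values:

# Illustration of the width/precision handling above, not the full step:
current_value  = "-12.50"                                  # value from the dump, as a string
decimal_places = current_value.split(".")[1].to_s.length   # => 2
clamped_value  = -12.97                                    # a randomized value near the original

format("%0#{current_value.length}.#{decimal_places}f", clamped_value)
# => "-12.97" – same character count and precision as the original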

data/lib/dump_cleaner/cleanup/cleaning_steps/select_data_by_bytesize.rb
@@ -0,0 +1,17 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleanup
    module CleaningSteps
      class SelectDataByBytesize < Base
        def run
          return step_context if !cleanup_data || cleanup_data.empty?

          step_context.cleanup_data = cleanup_data["#{current_value.length}-#{current_value.bytesize}"] ||
                                      cleanup_data["#{current_value.bytesize}-#{current_value.bytesize}"] # used when current_value is accented but data isn't
          step_context
        end
      end
    end
  end
end

data/lib/dump_cleaner/cleanup/cleaning_steps/select_data_by_pattern.rb
@@ -0,0 +1,20 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleanup
    module CleaningSteps
      class SelectDataByPattern < Base
        def run(patterns:, default_key: nil)
          step_context.cleanup_data = step_context.cleanup_data[match_key(patterns) || default_key]
          step_context
        end

        private

        def match_key(patterns)
          patterns.find { Regexp.new(_1["pattern"], _1["flags"]).match?(step_context.current_value) }&.fetch("key")
        end
      end
    end
  end
end
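
The patterns parameter is expected to be an array of hashes with "pattern", "flags" and "key"; the first pattern that matches the current value selects which sub-key of cleanup_data is used, with default_key as the fallback. A sketch with invented patterns and keys:

# Hypothetical params for SelectDataByPattern; the patterns and keys are invented.
patterns = [
  { "pattern" => "\\.(cz|sk)\\z", "flags" => "i", "key" => "czech_slovak" },
  { "pattern" => "@",             "flags" => nil, "key" => "generic" }
]

# the match_key logic from the step, inlined:
current_value = "user@example.cz"
patterns.find { Regexp.new(_1["pattern"], _1["flags"]).match?(current_value) }&.fetch("key")
# => "czech_slovak"; nil (and therefore default_key) when nothing matches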

data/lib/dump_cleaner/cleanup/cleaning_steps/take_sample.rb
@@ -0,0 +1,28 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleanup
    module CleaningSteps
      class TakeSample < Base
        def run(uniqueness_strategy: :resample)
          if !cleanup_data || cleanup_data.empty?
            step_context.current_value = nil
            return step_context
          end

          uniqueness_strategy = uniqueness_strategy.to_sym
          step_context.current_value =
            if uniqueness_strategy == :resample
              cleanup_data[crc32 % cleanup_data.size]
            elsif uniqueness_strategy == :suffix
              sample = cleanup_data[crc32(use_repetition: false) % cleanup_data.size]
              AddRepetitionSuffix.new(StepContext.new_from(step_context, current_value: sample)).run.current_value
            else
              raise_params_error("Unknown uniqueness strategy: #{uniqueness_strategy}")
            end
          step_context
        end
      end
    end
  end
end
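
Both strategies pick a replacement deterministically by taking a checksum modulo the size of the already-filtered cleanup data, so the same input value always maps to the same sample. The crc32 helper comes from CleaningSteps::Base (+33 lines, not part of this excerpt); the sketch below substitutes Zlib.crc32 of the original value purely to make the idea self-contained, which is an assumption about that helper's behaviour:

require "zlib"

# Stand-in illustration of the :resample strategy only; the real checksum helper
# lives in CleaningSteps::Base and is not shown in this diff.
cleanup_data = %w[Alice Berta Cyril Dana]   # one pre-selected group of sample values
orig_value   = "Karel"

cleanup_data[Zlib.crc32(orig_value) % cleanup_data.size]
# Deterministic pick; on a uniqueness collision the step is re-run with a higher
# repetition, which either changes the checksum (:resample) or appends a numeric
# suffix via AddRepetitionSuffix (:suffix).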

data/lib/dump_cleaner/cleanup/data_source.rb
@@ -0,0 +1,19 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleanup
    class DataSource
      def initialize(config:)
        @config = config
        @workflow = Workflow.new(phase: :data_source)
        @data_cache = {}
      end

      def data_for(type)
        step_context = StepContext.new(type:, cleanup_data: nil)
        @data_cache[type] ||= @workflow.run(step_context, step_configs: @config.steps_for(type, :data_source))
                                       .cleanup_data
      end
    end
  end
end

data/lib/dump_cleaner/cleanup/data_source_steps/base.rb
@@ -0,0 +1,26 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleanup
    module DataSourceSteps
      class Base
        require "forwardable"

        extend Forwardable

        def_delegators :step_context, :cleanup_data, :type

        attr_reader :step_context

        def initialize(step_context)
          @step_context = step_context.dup
        end

        def raise_params_error(error)
          step = self.class.name.split("::").last
          raise ArgumentError, "Invalid data source step params: type=#{type}, step=#{step}: #{error}"
        end
      end
    end
  end
end

data/lib/dump_cleaner/cleanup/data_source_steps/group_by_bytesize.rb
@@ -0,0 +1,37 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleanup
    module DataSourceSteps
      class GroupByBytesize < Base
        def run(under_keys: [])
          validate_params(under_keys:)

          group_by_lambda = -> { "#{_1.length}-#{_1.bytesize}" }

          step_context.cleanup_data = begin
            if under_keys.any?
              new_data = cleanup_data.dup
              under_keys.each do |key|
                new_data[key] = new_data[key].group_by(&group_by_lambda)
              end
              new_data
            else
              cleanup_data.group_by(&group_by_lambda)
            end
          end

          step_context
        end

        private

        def validate_params(under_keys:)
          return if under_keys.all? { cleanup_data.key?(_1) }

          raise_params_error("The under_keys param contains keys not present in cleanup_data.")
        end
      end
    end
  end
end
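
The grouping key is "character length-byte size", so ASCII and accented words of the same visible length land in different groups, and SelectDataByBytesize above can later pick a replacement with exactly the same length and byte size as the original value. The key in isolation:

# The grouping key used above, applied to a tiny word list:
%w[word wörd sòl].group_by { "#{_1.length}-#{_1.bytesize}" }
# => {"4-4"=>["word"], "4-5"=>["wörd"], "3-4"=>["sòl"]}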

data/lib/dump_cleaner/cleanup/data_source_steps/load_yaml_file.rb
@@ -0,0 +1,24 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleanup
    module DataSourceSteps
      class LoadYamlFile < Base
        require "yaml"

        def run(file:, under_key: nil)
          loaded_data = YAML.load_file(file)

          step_context.cleanup_data = if under_key
                                        new_data ||= cleanup_data || {}
                                        new_data[under_key] = loaded_data
                                        new_data
                                      else
                                        loaded_data
                                      end
          step_context
        end
      end
    end
  end
end
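
With under_key the loaded list is nested under that key inside the existing cleanup data, which lets several YAML files feed one type; without it the loaded data replaces cleanup_data entirely. A small illustration, with an invented file name and key:

require "yaml"

# Hypothetical data file, written here only to make the example self-contained:
File.write("first_names.yml", %w[Alice Berta].to_yaml)

loaded_data = YAML.load_file("first_names.yml")    # => ["Alice", "Berta"]

# what run(file: "first_names.yml", under_key: "first_name") produces:
cleanup_data = {}
cleanup_data["first_name"] = loaded_data
cleanup_data                                       # => {"first_name"=>["Alice", "Berta"]}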

data/lib/dump_cleaner/cleanup/data_source_steps/remove_accents.rb
@@ -0,0 +1,29 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleanup
    module DataSourceSteps
      class RemoveAccents < Base
        def run(under_keys: [])
          block = lambda do |word|
            word.match?(/^\p{ASCII}+$/) ? word : word.unicode_normalize(:nfd).gsub(/\p{M}/, "")
          end

          step_context.cleanup_data = begin
            if under_keys.any?
              new_data = cleanup_data.dup
              under_keys.each do |key|
                new_data[key] = new_data[key].map(&block)
              end
              new_data
            else
              cleanup_data.map(&block)
            end
          end

          step_context
        end
      end
    end
  end
end
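
The accent stripping decomposes each word to NFD and drops the combining marks, leaving already-ASCII words untouched. In isolation:

"Dvořák".unicode_normalize(:nfd).gsub(/\p{M}/, "")   # => "Dvorak"
"plain".match?(/^\p{ASCII}+$/)                       # => true, so it is returned as-is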

data/lib/dump_cleaner/cleanup/inspection.rb
@@ -0,0 +1,37 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleanup
    module Inspection
      def inspect_step_context(step_context, message: "Inspecting step context")
        Log.info { message }
        Log.info { "\n#{step_context.pretty_inspect}" }
      end

      def subset(data, values: 10)
        case data
        when Array
          subset_data = data.take(values)
          subset_data << "+ #{data.size - values} more..." if data.size > values
          subset_data.each_with_index { |element, index| subset_data[index] = subset(element, values:) }
        when Hash
          subset_data = data.take(values).to_h
          subset_data["+ #{data.size - values} more..."] = nil if data.size > values
          subset_data.each_key { |key| subset_data[key] = subset(subset_data[key], values:) }
        else
          subset_data = data
        end

        subset_data
      end

      def truncate(value, to: 30, omission: "…")
        return value.dup if value.length <= to

        length_with_room_for_omission = to - omission.length
        stop = length_with_room_for_omission
        +"#{value[0, stop]}#{omission}"
      end
    end
  end
end
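
The two helpers keep log output readable: truncate shortens long values and subset recursively trims large arrays and hashes before they are pretty-printed. Quick examples, assuming the gem is loaded:

include DumpCleaner::Cleanup::Inspection

truncate("a" * 40, to: 10)         # => "aaaaaaaaa…" (9 characters plus the omission)
subset((1..15).to_a, values: 3)    # => [1, 2, 3, "+ 12 more..."]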

data/lib/dump_cleaner/cleanup/step_context.rb
@@ -0,0 +1,46 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleanup
    class StepContext
      require "pp"

      include Inspection

      attr_accessor :cleanup_data, :current_value, :repetition
      attr_reader :orig_value, :type, :record

      def initialize(type:, cleanup_data:, orig_value: nil, record: {}, repetition: 0)
        @type = type
        @cleanup_data = cleanup_data
        @orig_value = @current_value = orig_value
        @record = record
        @repetition = repetition
      end

      def self.new_from(step_context, **params)
        context_copy = step_context.dup
        new_context = new(orig_value: params[:orig_value] || context_copy.orig_value,
                          type: params[:type] || context_copy.type,
                          cleanup_data: params[:cleanup_data] || context_copy.cleanup_data,
                          record: params[:record] || context_copy.record,
                          repetition: params[:repetition] || context_copy.repetition)
        new_context.current_value = params[:current_value] || context_copy.current_value
        new_context
      end

      def to_h(subset: false)
        { orig_value:, current_value:, type:, record:, repetition:,
          cleanup_data: subset ? subset(cleanup_data) : cleanup_data }
      end

      def pretty_print(pp)
        to_h(subset: true).pretty_print(pp)
      end

      def ==(other)
        to_h == other.to_h
      end
    end
  end
end
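
StepContext.new_from copies an existing context and overrides only the given attributes; TakeSample above uses it to run AddRepetitionSuffix on a sampled value without touching the caller's context. For example, with invented values and the gem assumed loaded:

ctx = DumpCleaner::Cleanup::StepContext.new(type: "name", cleanup_data: %w[Alice Berta],
                                            orig_value: "Karel", record: { "id" => 42 })

derived = DumpCleaner::Cleanup::StepContext.new_from(ctx, current_value: "Berta", repetition: 1)
derived.orig_value     # => "Karel" (copied from ctx)
derived.current_value  # => "Berta" (overridden)
derived.repetition     # => 1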

data/lib/dump_cleaner/cleanup/uniqueness.rb
@@ -0,0 +1,66 @@
module DumpCleaner
  module Cleanup
    module Uniqueness
      require "singleton"

      class MaxRetriesReachedError < StandardError; end

      def repeat_until_unique(step_context:, max_retries: 1000, &block)
        n = 0
        result = nil

        loop do
          result = block.call(n)

          break unless result

          if n.positive?
            Log.debug do
              msg = "Uniqueness run: type=#{step_context.type}, id=#{step_context.record['id']}, "
              msg << "orig_value=#{step_context.orig_value}, current_value=#{result}, repetition=#{n}"
            end
          end

          unless CaseInsensitiveCache.instance.known?(type: step_context.type, value: result)
            CaseInsensitiveCache.instance.push(type: step_context.type, value: result)
            break
          end

          if n >= max_retries
            warning = "Max retry count #{n} reached for ID:#{step_context.record['id']}, type:#{step_context.type}, "
            warning << "orig:#{step_context.orig_value}, current:#{result}"
            Log.warn { warning }
            raise MaxRetriesReachedError
          end

          n += 1
        end

        result
      end

      class CaseInsensitiveCache
        include Singleton

        def initialize
          clear
        end

        def clear
          @data = {}
        end

        def known?(type:, value:)
          return false unless @data.key?(type)

          @data[type].include?(value.downcase)
        end

        def push(type:, value:)
          @data[type] ||= Set.new
          @data[type].add(value.downcase)
        end
      end
    end
  end
end
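
repeat_until_unique keeps calling the block with an increasing repetition count until the produced value has not yet been seen (case-insensitively) for the given type, caching every accepted value in the process-wide singleton. The real caller lives in cleanup/cleaning.rb (+69 lines, not in this excerpt), so the wiring below is only a usage sketch:

include DumpCleaner::Cleanup::Uniqueness

ctx = DumpCleaner::Cleanup::StepContext.new(type: "nick", cleanup_data: nil,
                                            orig_value: "karel", record: { "id" => 1 })

repeat_until_unique(step_context: ctx) do |repetition|
  "user#{repetition}"   # produce a candidate value; must differ for each retry
end
# => "user0" on the first call; if "user0" (in any letter case) was already handed
#    out for type "nick", the block is retried with repetition 1, 2, … up to max_retries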

data/lib/dump_cleaner/cleanup/workflow.rb
@@ -0,0 +1,38 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleanup
    class Workflow
      def initialize(phase:)
        @phase = phase
        @workflow_steps_cache = {}
      end

      def run(initial_step_context, step_configs:)
        steps(type: initial_step_context.type, step_configs:).reduce(initial_step_context.dup) do |step_context, step|
          step.call(step_context)
        end
      end

      private

      def steps_namespace(phase)
        phase == :data_source ? DumpCleaner::Cleanup::DataSourceSteps : DumpCleaner::Cleanup::CleaningSteps
      end

      def steps(type:, step_configs:)
        @workflow_steps_cache[cache_key(type:, step_configs:)] ||= step_configs.map do |step_config|
          lambda do |step_context|
            steps_namespace(@phase).const_get(step_config.step).new(step_context).run(**step_config.params)
          rescue NameError => e
            raise DumpCleaner::Config::ConfigurationError, "Invalid step #{step_config.step}"
          end
        end
      end

      def cache_key(type:, step_configs:)
        "#{@phase}-#{type}-#{step_configs.map(&:step).join('-')}"
      end
    end
  end
end
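
The workflow turns each configured step into a lambda and threads a single StepContext through them with reduce, so every step receives the previous step's output. A minimal data_source pipeline built from classes in this diff; it assumes the first_names.yml file from the LoadYamlFile sketch above and constructs CleanupStepConfig records (defined in config.rb below) by hand:

workflow = DumpCleaner::Cleanup::Workflow.new(phase: :data_source)

step_configs = [
  DumpCleaner::Config::CleanupStepConfig.new(step: "LoadYamlFile", params: { file: "first_names.yml" }),
  DumpCleaner::Config::CleanupStepConfig.new(step: "RemoveAccents", params: {}),
  DumpCleaner::Config::CleanupStepConfig.new(step: "GroupByBytesize", params: {})
]

result = workflow.run(DumpCleaner::Cleanup::StepContext.new(type: "first_name", cleanup_data: nil),
                      step_configs: step_configs)
result.cleanup_data   # => {"5-5"=>["Alice", "Berta"]} – loaded, de-accented, grouped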

data/lib/dump_cleaner/conditions.rb
@@ -0,0 +1,42 @@
module DumpCleaner
  class Conditions
    def initialize(condition_config)
      @conditions = condition_config
    end

    def evaluate_to_true?(record:, column_value: nil)
      return false unless @conditions

      Array(@conditions).map do |condition_config|
        column = condition_config.column
        conversion, op, value = parse_condition(condition_config)
        (column ? record[column] : column_value).send(conversion || :itself).send(op, value)
      end.any?
    end

    def self.evaluate_to_true_in_step?(conditions:, step_context:)
      new(conditions).evaluate_to_true?(record: step_context.record, column_value: step_context.orig_value)
    end

    private

    def parse_condition(condition_config)
      condition_value = condition_config.value

      case condition_config.condition
      when "eq"
        [nil, "==", condition_value]
      when "ne"
        [nil, "!=", condition_value]
      when "start_with"
        [nil, :start_with?, condition_value]
      when "end_with"
        [nil, :end_with?, condition_value]
      when "non_zero"
        [:to_i, "!=", 0]
      else
        raise "Unknown condition #{condition_config.condition} for column #{condition_config.column}"
      end
    end
  end
end
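
A condition object only needs to respond to column, condition and value; in practice that is the ConditionConfig record defined in config.rb below. Conditions are OR-ed: the record matches as soon as any single condition evaluates to true. A small evaluation example with a local stand-in for ConditionConfig and invented values:

# Local stand-in mirroring ConditionConfig from config.rb below:
Condition = Data.define(:column, :condition, :value)

conditions = [
  Condition.new(column: "role", condition: "eq",       value: "admin"),
  Condition.new(column: "id",   condition: "non_zero", value: nil)
]

record = { "role" => "user", "id" => "0" }
DumpCleaner::Conditions.new(conditions).evaluate_to_true?(record: record)
# => false – "role" is not "admin" and "id".to_i is zero; either one matching
#    would have returned true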

data/lib/dump_cleaner/config.rb
@@ -0,0 +1,109 @@
# frozen_string_literal: true

module DumpCleaner
  class Config
    require "yaml"

    CleanupTableColumnConfig = Data.define(:name, :cleanup_type, :unique) do
      alias_method :unique_column?, :unique
    end

    CleanupStepConfig = Data.define(:step, :params)

    ConditionConfig = Data.define(:column, :condition, :value)

    class ConfigurationError < StandardError; end

    def initialize(config_file)
      @config = load(config_file) || {}
      @steps_for = {}
      @keep_same_conditions = {}

      set_log_level
    end

    def dump_format
      @config.dig("dump", "format")
    end

    def steps_for(type, phase)
      @steps_for[type] ||= {}
      @steps_for[type][phase.to_s] ||= Array(cleanup_config_for(type)[phase.to_s]).map do
        CleanupStepConfig.new(step: _1["step"], params: (_1["params"] || {}).transform_keys(&:to_sym))
      end
    end

    def keep_same_conditions(type)
      @keep_same_conditions[type] ||= Array(cleanup_config_for(type)["keep_same_conditions"]).map do
        ConditionConfig.new(condition: _1["condition"], value: _1["value"], column: nil)
      end
    end

    def ignore_keep_same_record_conditions?(type)
      cleanup_config_for(type)["ignore_keep_same_record_conditions"] == true
    end

    def cleanup_tables
      cleanup_table_configs.map { [_1.db, _1.table] }
    end

    def cleanup_table_config(db:, table:)
      cleanup_table_configs.find { _1.db == db && _1.table == table }
    end

    private

    def load(config_file)
      YAML.load_file(config_file)
    end

    def set_log_level
      if (level = @config.dig("dump_cleaner", "log_level"))
        Log.instance.level = level
      end
    end

    def cleanup_table_configs
      @cleanup_table_configs ||= Array(@config["cleanup_tables"]).map { CleanupTableConfig.new(_1) }
    end

    def cleanup_config_for(type)
      @config.dig("cleanup_types", type.to_s) ||
        raise(ConfigurationError, "Missing or empty type '#{type}' in the 'cleanup_types' section in config.")
    end

    class CleanupTableConfig
      def initialize(cleanup_table_config)
        @cleanup_table_config = cleanup_table_config
      end

      def db
        @cleanup_table_config["db"]
      end

      def table
        @cleanup_table_config["table"]
      end

      def id_column
        @cleanup_table_config["id_column"] || "id"
      end

      def columns
        @columns ||= Array(@cleanup_table_config["columns"]).map do
          CleanupTableColumnConfig.new(name: _1["name"], cleanup_type: _1["cleanup_type"], unique: _1["unique"] == true)
        end
      end

      def record_context_columns
        @cleanup_table_config["record_context_columns"] || ["id"]
      end

      def keep_same_record_conditions
        @keep_same_record_conditions ||= Array(@cleanup_table_config["keep_same_record_conditions"]).map do
          ConditionConfig.new(condition: _1["condition"], value: _1["value"], column: _1["column"])
        end
      end
    end
  end
end
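
Read together, the accessors above imply roughly the following shape for the parsed YAML config. The key names "dump", "dump_cleaner", "cleanup_tables", "cleanup_types", "data_source", "keep_same_conditions" and "keep_same_record_conditions" all appear in the code above; the dump format value, the cleaning-phase key and the concrete tables, types and steps are illustrative assumptions (the authoritative format lives in README.md and doc/workflow_steps.md listed at the top of this page):

# Rough, partly assumed shape of the hash returned by YAML.load_file(config_file):
{
  "dump_cleaner" => { "log_level" => "info" },
  "dump" => { "format" => "mysql_shell" },             # value assumed
  "cleanup_tables" => [
    { "db" => "app", "table" => "users", "id_column" => "id",
      "record_context_columns" => ["id", "email"],
      "columns" => [
        { "name" => "name",  "cleanup_type" => "first_name" },
        { "name" => "email", "cleanup_type" => "email", "unique" => true }
      ],
      "keep_same_record_conditions" => [
        { "column" => "role", "condition" => "eq", "value" => "admin" }
      ] }
  ],
  "cleanup_types" => {
    "first_name" => {
      "data_source" => [
        { "step" => "LoadYamlFile", "params" => { "file" => "first_names.yml" } },
        { "step" => "GroupByBytesize" }
      ],
      "cleaning" => [                                   # phase key assumed; only :data_source appears in this diff
        { "step" => "SelectDataByBytesize" },
        { "step" => "TakeSample" }
      ],
      "keep_same_conditions" => [{ "condition" => "eq", "value" => "admin" }]
    }
  }
}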