dump_cleaner 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +2 -0
  3. data/.rubocop.yml +25 -0
  4. data/CHANGELOG.md +5 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +295 -0
  7. data/Rakefile +8 -0
  8. data/doc/workflow_steps.md +1400 -0
  9. data/dump_cleaner.gemspec +38 -0
  10. data/exe/dump_cleaner +7 -0
  11. data/lib/dump_cleaner/cleaners/base_cleaner.rb +32 -0
  12. data/lib/dump_cleaner/cleaners/mysql_shell_dump_cleaner.rb +47 -0
  13. data/lib/dump_cleaner/cleaners/mysql_shell_dump_helpers.rb +11 -0
  14. data/lib/dump_cleaner/cleaners/mysql_shell_table_cleaner.rb +184 -0
  15. data/lib/dump_cleaner/cleanup/bytesize_helpers.rb +39 -0
  16. data/lib/dump_cleaner/cleanup/cleaning.rb +69 -0
  17. data/lib/dump_cleaner/cleanup/cleaning_steps/add_repetition_suffix.rb +23 -0
  18. data/lib/dump_cleaner/cleanup/cleaning_steps/base.rb +33 -0
  19. data/lib/dump_cleaner/cleanup/cleaning_steps/fill_up_with_string.rb +20 -0
  20. data/lib/dump_cleaner/cleanup/cleaning_steps/generate_random_string.rb +37 -0
  21. data/lib/dump_cleaner/cleanup/cleaning_steps/inspect_context.rb +16 -0
  22. data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_email.rb +78 -0
  23. data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_formatted_number.rb +63 -0
  24. data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_number.rb +29 -0
  25. data/lib/dump_cleaner/cleanup/cleaning_steps/select_data_by_bytesize.rb +17 -0
  26. data/lib/dump_cleaner/cleanup/cleaning_steps/select_data_by_pattern.rb +20 -0
  27. data/lib/dump_cleaner/cleanup/cleaning_steps/take_sample.rb +28 -0
  28. data/lib/dump_cleaner/cleanup/data_source.rb +19 -0
  29. data/lib/dump_cleaner/cleanup/data_source_steps/base.rb +26 -0
  30. data/lib/dump_cleaner/cleanup/data_source_steps/group_by_bytesize.rb +37 -0
  31. data/lib/dump_cleaner/cleanup/data_source_steps/inspect_context.rb +16 -0
  32. data/lib/dump_cleaner/cleanup/data_source_steps/load_yaml_file.rb +24 -0
  33. data/lib/dump_cleaner/cleanup/data_source_steps/remove_accents.rb +29 -0
  34. data/lib/dump_cleaner/cleanup/inspection.rb +37 -0
  35. data/lib/dump_cleaner/cleanup/step_context.rb +46 -0
  36. data/lib/dump_cleaner/cleanup/uniqueness.rb +66 -0
  37. data/lib/dump_cleaner/cleanup/workflow.rb +38 -0
  38. data/lib/dump_cleaner/conditions.rb +42 -0
  39. data/lib/dump_cleaner/config.rb +109 -0
  40. data/lib/dump_cleaner/log.rb +42 -0
  41. data/lib/dump_cleaner/options.rb +46 -0
  42. data/lib/dump_cleaner/processor.rb +37 -0
  43. data/lib/dump_cleaner/version.rb +5 -0
  44. data/lib/dump_cleaner.rb +10 -0
  45. metadata +105 -0
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/dump_cleaner/version"
4
+
5
# Gem specification for dump_cleaner: package metadata, the list of packaged files,
# the executable and the runtime dependencies.
Gem::Specification.new do |spec|
  spec.name = "dump_cleaner"
  spec.version = DumpCleaner::VERSION
  spec.authors = ["Matouš Borák"]
  spec.email = ["matous.borak@nejremeslnici.cz"]

  spec.summary = "Anonymizes data in logical database dumps."
  spec.description = "Deterministically anonymizes data in logical database dumps. Useful for importing (anonymized) production data into development environments."
  spec.homepage = "https://github.com/NejRemeslnici/dump-cleaner"
  spec.license = "MIT"
  # The library uses Data.define (Ruby 3.2+) and the `chars:` keyword of
  # Random::Formatter#alphanumeric (Ruby 3.3+), so older interpreters fail at runtime.
  spec.required_ruby_version = ">= 3.3.0"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = "https://github.com/NejRemeslnici/dump-cleaner"
  spec.metadata["changelog_uri"] = "https://github.com/NejRemeslnici/dump-cleaner/blob/main/CHANGELOG.md"

  # Specify which files should be added to the gem when it is released.
  # `git ls-files -z` lists the files tracked by git; the gemspec itself and
  # development-only paths are left out of the package.
  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject do |f|
      (File.expand_path(f) == __FILE__) ||
        f.start_with?(*%w[bin/ test/ spec/ features/ .git .github appveyor Gemfile])
    end
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Runtime dependencies of the gem.
  spec.add_dependency "zeitwerk", "~> 2.6"

  # For more information and examples about making a new gem, check out our
  # guide at: https://bundler.io/guides/creating_gem.html
end
data/exe/dump_cleaner ADDED
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "dump_cleaner"
5
+
6
# Build the options from the command-line arguments and run the processor with them.
DumpCleaner::Processor.new(DumpCleaner::Options.new(ARGV)).run
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
module DumpCleaner
  module Cleaners
    # Base class of the cleaners. It stores the config and options and defines the
    # pre_cleanup / clean / post_cleanup lifecycle that subclasses fill in.
    class BaseCleaner
      attr_reader :config, :options

      def initialize(config:, options:)
        @options = options
        @config = config
      end

      # Hook called before #clean. Does nothing here; implement in a subclass if needed.
      def pre_cleanup; end

      # Must be implemented by subclasses.
      #
      # @raise [NotImplementedError] always, in this base class
      def clean
        raise NotImplementedError
      end

      # Hook called after #clean. Does nothing here; implement in a subclass if needed.
      def post_cleanup; end

      # Tells whether the "keep same record" conditions of the table config hold for the record.
      #
      # @return [Boolean] false when the table config defines no such conditions,
      #   otherwise the result of evaluating them against the record
      def keep_same_record?(record, table_config:)
        conditions = table_config.keep_same_record_conditions
        conditions ? Conditions.new(conditions).evaluate_to_true?(record: record) : false
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
module DumpCleaner
  module Cleaners
    # Cleans a MySQL Shell dump directory: every table listed in the config is cleaned by a
    # MysqlShellTableCleaner, and the remaining files of the source dump are then copied over.
    class MysqlShellDumpCleaner < BaseCleaner
      require "fileutils"

      include MysqlShellDumpHelpers

      # Checks that the source dump exists and makes sure the destination directory is present.
      def pre_cleanup
        validate_source_dump
        prepare_destination_dump
      end

      # Runs the full pre_cleanup / clean / post_cleanup cycle of a table cleaner
      # for each configured (db, table) pair.
      def clean
        config.cleanup_tables.each do |db, table|
          table_cleaner = MysqlShellTableCleaner.new(db:, table:, config:, options:)

          table_cleaner.pre_cleanup
          table_cleaner.clean
          table_cleaner.post_cleanup
        end
      end

      # Copies the source files that have no counterpart in the destination yet.
      def post_cleanup
        copy_remaining_files
      end

      private

      # @raise [RuntimeError] when the source dump directory does not exist
      def validate_source_dump
        raise "Source dump path does not exist: #{options.source_dump_path}" unless Dir.exist?(options.source_dump_path)
      end

      def prepare_destination_dump
        # mkdir_p also creates missing parent directories and is a no-op for an existing
        # directory; Dir.mkdir would fail when the parent of the destination is missing.
        FileUtils.mkdir_p(options.destination_dump_path)
      end

      def copy_remaining_files
        Dir.glob("#{options.source_dump_path}/*").each do |file|
          destination_file = destination_file_for(file)
          # Files already present in the destination are left untouched.
          FileUtils.cp(file, destination_file, preserve: true) unless File.exist?(destination_file)
        end
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
module DumpCleaner
  module Cleaners
    # Path helpers shared by the MySQL Shell dump cleaners. The including object
    # must respond to +options+ (source_dump_path and destination_dump_path).
    module MysqlShellDumpHelpers
      # Maps a path inside the source dump to the corresponding path inside the destination dump.
      #
      # @param source_file [String] path of a file in the source dump
      # @return [String] the path with the first occurrence of the source dump path
      #   replaced by the destination dump path
      def destination_file_for(source_file)
        # The block form inserts the destination path literally. A replacement *string* would
        # have its backslash sequences (e.g. "\\0", "\\1") interpreted as back-references.
        source_file.sub(options.source_dump_path) { options.destination_dump_path }
      end
    end
  end
end
@@ -0,0 +1,184 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DumpCleaner
4
+ module Cleaners
5
+ class MysqlShellTableCleaner < BaseCleaner
6
+ require "open3"
7
+
8
+ include MysqlShellDumpHelpers
9
+
10
+ attr_reader :table_info, :cleanup_data, :cleaning
11
+
12
# Sets up a cleaner for one table: remembers the db/table names and creates the
# cleanup data source and the cleaning object from the config.
def initialize(db:, table:, config:, options:)
  super(config: config, options: options)
  @db, @table = db, table
  @cleanup_data = Cleanup::DataSource.new(config: config)
  @cleaning = Cleanup::Cleaning.new(config: config)
end
19
+
20
# Loads the table info of this table from the source dump and validates it.
def pre_cleanup
  source_dump_path = options.source_dump_path
  @table_info = DumpTableInfo.load(db: @db, table: @table, source_dump_path: source_dump_path)
  validate_table_info
end
24
+
25
# Cleans all data files of the table: each matching source file is decompressed through a
# pipe, cleaned line by line and written, recompressed, to the corresponding destination file.
#
# @raise [RuntimeError] when a decompressing or compressing child process does not exit successfully
def clean
  table_config = config.cleanup_table_config(db: @db, table: @table)
  Log.info { "Cleaning table #{table_info.db_dot_table}…" }

  DumpCleaner::Cleanup::Uniqueness::CaseInsensitiveCache.instance.clear

  # Loop-invariant, so look it up only once.
  line_terminator = table_info.dialect.lines_terminated_by

  Dir.glob("#{options.source_dump_path}/#{table_info.db_at_table}@@*.#{table_info.extension}").each do |file|
    source_threads = nil
    sink_threads = nil

    Open3.pipeline_r(pipe_source_args(file)) do |tsv_data, source_wait_threads|
      source_threads = source_wait_threads
      Open3.pipeline_w(pipe_sink_args(destination_file_for(file))) do |zstd_out, sink_wait_threads|
        sink_threads = sink_wait_threads
        tsv_data.each_line do |line|
          line = line.chomp(line_terminator)
          zstd_out.print "#{clean_line(line, table_config:)}#{line_terminator}"
        end
      end
    end

    # Both pipelines have been waited for at this point. A failed child process would
    # otherwise go unnoticed and leave a truncated or missing destination file.
    failed = [*source_threads, *sink_threads].reject { |thread| thread.value.success? }
    raise "Processing of dump file #{file} failed: a child process exited with an error" unless failed.empty?
  end
end
43
+
44
+ private
45
+
46
# Cleans the configured columns of one dump line (already stripped of its line terminator).
#
# @param line [String] tab-separated record
# @param table_config [#columns] cleanup config of the table
# @return [String] the record with the cleaned values, joined by tabs again
# @raise [RuntimeError] when the config names a column that is not in the table info
def clean_line(line, table_config:)
  # The -1 limit keeps trailing empty fields. A plain split drops them, so a record
  # ending in empty columns would come back with fewer columns than it had.
  record = line.split("\t", -1)
  context = record_context(record, table_config:)

  # "id_column" is always present in the record context, whereas "id" is there only
  # when the table config lists a column of that name among the context columns.
  record_id = context["id_column"]
  print "\r#{record_id}… " if (record_id.to_i % 10_000).zero?

  keep_record = keep_same_record?(context, table_config:)

  table_config.columns.each do |column_config|
    column_index = table_info.column_index(column_config.name)
    raise "Invalid column specified in config: #{column_config.name}" unless column_index

    next if record[column_index] == "\\N" # ignore NULL values

    cleanup_data_for_type = cleanup_data.data_for(column_config.cleanup_type)

    record[column_index] = cleaning.clean_value_for(record[column_index],
                                                    type: column_config.cleanup_type,
                                                    cleanup_data: cleanup_data_for_type,
                                                    record: context,
                                                    keep_record:,
                                                    column_config:)
  end

  new_line = record.join("\t")
  warn_on_changed_line_length(line, new_line, id: record_id, record:)

  new_line
end
74
+
75
# Builds the context hash of a record: the value of each configured context column keyed by
# the column name, plus the value of the table's id column under the "id_column" key.
#
# @param record [Array<String>] the column values of one record
# @param table_config [#record_context_columns, #id_column]
# @return [Hash{String => String}]
# @raise [RuntimeError] when a configured column is not present in the table info
def record_context(record, table_config:)
  value_of = lambda do |column|
    index = table_info.column_index(column)
    # Without this check an unknown column surfaces as an opaque TypeError from record[nil].
    raise "Invalid column specified in config: #{column}" unless index

    record[index]
  end

  context = table_config.record_context_columns.to_h { |column| [column, value_of.call(column)] }
  context["id_column"] = value_of.call(table_config.id_column)
  context
end
83
+
84
# Logs an error when cleaning changed the bytesize of a line, listing every column whose
# bytesize differs (or whose cleaned value is missing).
#
# @param orig_line [String] the line before cleaning
# @param new_line [String] the line after cleaning
# @param id [Object] record identifier used in the message
# @param record [Array<String, nil>] the cleaned column values
def warn_on_changed_line_length(orig_line, new_line, id:, record:)
  return if orig_line.bytesize == new_line.bytesize

  # The -1 limit keeps trailing empty columns so that they are compared as well.
  changes = orig_line.split("\t", -1).each_with_index.filter_map do |column, i|
    "#{column} -> #{record[i]}" if !record[i] || column.bytesize != record[i].bytesize
  end

  summary = "ID: #{id} bytesize changed: #{orig_line.bytesize} => #{new_line.bytesize}"
  # The parts are separated explicitly; appending them back to back is unreadable.
  Log.error { [summary, *changes].join(", ") }
end
94
+
95
# Verifies that the dump compression is supported (only "zstd" is) and that the
# zstd command can be run.
#
# @raise [RuntimeError] for any other compression or when `zstd --version` fails
def validate_table_info
  compression = table_info.compression
  raise "Unsupported dump compression format '#{compression}'" unless compression == "zstd"

  system("zstd --version >/dev/null 2>&1") || raise("zstd not found in \$PATH")
end
103
+
104
# Command (as an argv array) that decompresses the given file to standard output.
# Returns nil for any compression other than "zstd".
def pipe_source_args(file)
  ["zstd", "-dc", file] if table_info.compression == "zstd"
end
110
+
111
# Command (as an argv array) that compresses its standard input into the given file.
# Returns nil for any compression other than "zstd".
def pipe_sink_args(file)
  ["zstd", "-qfo", file] if table_info.compression == "zstd"
end
117
+
118
# Read-only view of the parsed "<db>@<table>.json" table info file of a dump.
class DumpTableInfo
  require "json"

  DialectOptions = Data.define(:lines_terminated_by, :fields_terminated_by, :fields_enclosed_by,
                               :fields_optionally_enclosed, :fields_escaped_by)

  class << self
    # Reads and parses the table info file of the given table.
    #
    # @raise [RuntimeError] when the file does not exist
    def load(db:, table:, source_dump_path:)
      path = table_info_file_path(db:, table:, source_dump_path:)
      new(JSON.parse(File.read(path)))
    rescue Errno::ENOENT
      raise "Table info file not found in dump for table '#{db}.#{table}'. Is the table included in the dump?"
    end

    def table_info_file_path(db:, table:, source_dump_path:)
      "#{source_dump_path}/#{db}@#{table}.json"
    end
  end

  # @param table_info [Hash] the parsed JSON content of the table info file
  def initialize(table_info)
    @table_info = table_info
  end

  def db
    @db ||= option("schema")
  end

  def table
    @table ||= option("table")
  end

  def db_dot_table
    [db, table].join(".")
  end

  def db_at_table
    [db, table].join("@")
  end

  def compression
    @table_info["compression"]
  end

  def extension
    @table_info["extension"]
  end

  def columns
    @columns ||= option("columns")
  end

  # Position of the named column in the columns list, or nil when it is not there.
  def column_index(name)
    columns.index(name)
  end

  # The dialect options of the dump; each member is read from the camelCase key
  # of the same name under "options" (e.g. lines_terminated_by <- "linesTerminatedBy").
  def dialect
    @dialect ||= DialectOptions.new(
      **DialectOptions.members.to_h { |member| [member, option(camel_case(member))] }
    )
  end

  private

  def option(key)
    @table_info.dig("options", key)
  end

  # snake_case symbol -> lowerCamelCase string
  def camel_case(member)
    head, *tail = member.to_s.split("_")
    [head, *tail.map(&:capitalize)].join
  end
end
182
+ end
183
+ end
184
+ end
@@ -0,0 +1,39 @@
1
module DumpCleaner
  module Cleanup
    # Helpers for adjusting strings to a given size in bytes (as opposed to characters).
    module BytesizeHelpers
      # Cuts the string at a character boundary so that it does not exceed max_bytesize and,
      # if the cut left it shorter than that, fills it up with the padding.
      # Strings that already fit are returned as they are.
      #
      # inspired by https://stackoverflow.com/a/67825008/1544012
      #
      # @param string [String]
      # @param max_bytesize [Integer]
      # @param padding [String] single-byte string used for filling up
      # @return [String]
      # @raise [ArgumentError] when the padding is longer than one byte, or when it is empty
      #   although the result needs to be filled up
      def truncate_to_bytesize(string, max_bytesize:, padding: " ")
        return string unless string.bytesize > max_bytesize

        check_padding_bytesize(padding)

        # Index of the first character whose inclusion makes the prefix exceed max_bytesize.
        just_over = (0...string.size).bsearch { string[0.._1].bytesize > max_bytesize }
        string = string[0...just_over]

        missing_bytes = max_bytesize - string.bytesize
        return string unless missing_bytes.positive?

        # Appending an empty padding can never reach the requested bytesize
        # (it used to loop forever here).
        raise ArgumentError, "The padding must not be empty when the string needs to be filled up." if padding.empty?

        string << (padding * missing_bytes)
      end

      # Makes the string exactly `bytesize` bytes long: shorter strings are extended by
      # repeating "<padding><string>", longer ones are truncated.
      def set_to_bytesize(string, bytesize:, padding: " ")
        # ljust counts characters, so multibyte strings may overshoot; the truncation below fixes that.
        string = string.ljust(bytesize, "#{padding}#{string}") if string.bytesize < bytesize
        truncate_to_bytesize(string, max_bytesize: bytesize, padding:)
      end

      # Replaces the end of the string with the suffix, keeping the overall bytesize
      # when the front part has to be truncated to make room for it.
      def replace_suffix(string, suffix:, padding: " ")
        front_max_bytes = string.bytesize - suffix.bytesize
        front = truncate_to_bytesize(string, max_bytesize: front_max_bytes, padding:)

        "#{front}#{suffix}"
      end

      private

      def check_padding_bytesize(padding)
        return unless padding.bytesize > 1

        raise ArgumentError,
              "Use only a single-byte string in the padding otherwise it may prevent adjusting the result precisely."
      end
    end
  end
end
@@ -0,0 +1,69 @@
1
+ # frozen_string_literal: true
2
+
3
module DumpCleaner
  module Cleanup
    # Computes the cleaned replacement of a single value by running the configured
    # "cleaning" steps and, when needed, the "failure" steps for the value's type.
    class Cleaning
      include Uniqueness

      attr_reader :config

      def initialize(config:)
        @config = config
        @cleaning_workflow = Workflow.new(phase: :cleaning)
        @failure_workflow = Workflow.new(phase: :failure)
      end

      # Returns the value to store instead of orig_value.
      #
      # The original value is kept (with a repetition suffix for unique columns) when the record
      # is to be kept and the type does not ignore that, or when the type's keep_same conditions
      # hold. Otherwise the workflows are run; for unique columns they are repeated until the
      # result is unique, falling back to the failure workflow once the retries are exhausted.
      def clean_value_for(orig_value, type:, cleanup_data:, column_config:, record: {}, keep_record: false) # rubocop:disable Metrics/ParameterLists
        step_context = StepContext.new(orig_value:, type:, cleanup_data:, record:)

        if keep_original_value?(step_context, keep_record:)
          return orig_value_with_optional_suffix(step_context, column_config:)
        end

        return run_workflows(step_context) unless column_config.unique_column?

        begin
          unique_value(step_context) { run_workflows(step_context) }
        rescue MaxRetriesReachedError
          unique_value(step_context) { run_failure_workflow(step_context) }
        end
      end

      private

      def keep_original_value?(step_context, keep_record:)
        (keep_record && !config.ignore_keep_same_record_conditions?(step_context.type)) ||
          Conditions.evaluate_to_true_in_step?(conditions: config.keep_same_conditions(step_context.type),
                                               step_context:)
      end

      # Runs the block via repeat_until_unique, exposing the current repetition on the step context.
      def unique_value(step_context)
        repeat_until_unique(step_context:) do |repetition|
          step_context.repetition = repetition
          yield
        end
      end

      def orig_value_with_optional_suffix(step_context, column_config:)
        return step_context.orig_value unless column_config.unique_column?

        unique_value(step_context) do
          DumpCleaner::Cleanup::CleaningSteps::AddRepetitionSuffix.new(step_context).run.current_value
        end
      end

      # The failure workflow is run only when the cleaning workflow yields a falsy value.
      def run_workflows(step_context)
        run_cleaning_workflow(step_context) || run_failure_workflow(step_context)
      end

      def run_cleaning_workflow(step_context)
        steps = config.steps_for(step_context.type, :cleaning)
        @cleaning_workflow.run(step_context, step_configs: steps).current_value
      end

      def run_failure_workflow(step_context)
        step_context.current_value = step_context.orig_value # reset current_value
        steps = config.steps_for(step_context.type, :failure)
        @failure_workflow.run(step_context, step_configs: steps).current_value
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
module DumpCleaner
  module Cleanup
    module CleaningSteps
      # Makes the current value differ between repetitions by overwriting its end
      # with the repetition number.
      class AddRepetitionSuffix < Base
        include BytesizeHelpers

        def run
          step_context.current_value = value_for_repetition
          step_context
        end

        private

        # Repetition 0 keeps the value as it is. Later repetitions replace the end of the value
        # with the repetition number (padded with "0") as long as the value is longer than the
        # number; otherwise a random string is generated instead.
        def value_for_repetition
          return current_value if repetition.zero?

          suffix = repetition.to_s
          return replace_suffix(current_value, suffix: suffix, padding: "0") if current_value.bytesize > suffix.bytesize

          GenerateRandomString.new(StepContext.new_from(step_context)).run.current_value
        end
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
module DumpCleaner
  module Cleanup
    module CleaningSteps
      # Common base of the cleaning steps: holds its own copy of the step context
      # and exposes the context's attributes directly.
      class Base
        require "forwardable"
        require "zlib"

        extend Forwardable

        attr_reader :step_context

        def_delegators :step_context, :cleanup_data, :current_value, :orig_value, :type, :record, :repetition

        # @param step_context the context to work on; the step keeps a dup of it
        def initialize(step_context)
          @step_context = step_context.dup
        end

        # CRC32 checksum of "<id_column value>-<current value>", with "-<repetition>"
        # appended for positive repetitions unless use_repetition is false.
        def crc32(use_repetition: true)
          repetition_part = repetition.positive? && use_repetition ? "-#{repetition}" : ""
          Zlib.crc32("#{record['id_column']}-#{current_value}#{repetition_part}")
        end

        # @raise [ArgumentError] always; the message names the cleanup type and the step class
        def raise_params_error(error)
          step = self.class.name.rpartition("::").last
          raise ArgumentError, "Invalid cleanup step params: type=#{type}, step=#{step}: #{error}"
        end
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
module DumpCleaner
  module Cleanup
    module CleaningSteps
      # Replaces the value with a fixed string adjusted to the bytesize of the original value.
      class FillUpWithString < Base
        include BytesizeHelpers

        # @param string [String] the replacement text
        # @param padding [String] passed on to set_to_bytesize
        # @param strict_bytesize_check [Boolean] when true, the string must already have
        #   the bytesize of the original value
        # @return the result of the AddRepetitionSuffix step run on the adjusted string
        # @raise [RuntimeError] when the strict check fails
        def run(string: "anonymized #{type}", padding: " ", strict_bytesize_check: false)
          target_bytesize = orig_value.bytesize

          if strict_bytesize_check && string.bytesize != target_bytesize
            raise "The bytesize of the string must be equal to the bytesize of the original value."
          end

          filled = set_to_bytesize(string, bytesize: target_bytesize, padding: padding)
          AddRepetitionSuffix.new(StepContext.new_from(step_context, current_value: filled)).run
        end
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
module DumpCleaner
  module Cleanup
    module CleaningSteps
      # Replaces the current value with a random string whose length equals the bytesize of the
      # current value. The random generator is seeded with the step's crc32 checksum.
      class GenerateRandomString < Base
        require "random/formatter"

        # @param character_set [String, Symbol, Object] "alphanumeric", "alpha", "lowercase",
        #   "uppercase" or "numeric"; anything else is handed over as the characters themselves
        def run(character_set: "alphanumeric")
          generator = Random.new(crc32)
          length = current_value.bytesize

          step_context.current_value = generator.alphanumeric(length, chars: characters(character_set))
          step_context
        end

        private

        def characters(character_set)
          lowercase = [*"a".."z"]
          uppercase = [*"A".."Z"]

          named_sets = {
            "alphanumeric" => Random::Formatter::ALPHANUMERIC,
            "alpha" => lowercase + uppercase,
            "lowercase" => lowercase,
            "uppercase" => uppercase,
            "numeric" => [*"0".."9"]
          }

          # Unknown names fall through unchanged.
          named_sets.fetch(character_set.to_s, character_set)
        end
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
module DumpCleaner
  module Cleanup
    module CleaningSteps
      # Step that passes its step context to inspect_step_context and returns it.
      class InspectContext < Base
        include Inspection

        def run
          step_context.tap { |context| inspect_step_context(context) }
        end
      end
    end
  end
end
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
module DumpCleaner
  module Cleanup
    module CleaningSteps
      # Replaces an e-mail address: the mailbox parts and the domain name in front of the last
      # dot are swapped for dictionary or random words. Domains listed in the cleanup data
      # under the "domains to keep" key are left as they are.
      class RandomizeEmail < Base
        # @param domains_to_keep_data_key [String, nil] cleanup data key holding the domains to
        #   keep; nil or an empty string is allowed and is not required to be in the cleanup data
        # @param words_data_key [String] cleanup data key holding the dictionary words
        # @return the step context; its current_value is nil when the value is not a valid e-mail
        # @raise [ArgumentError] when the cleanup data lacks a required key
        def run(domains_to_keep_data_key: "domains_to_keep", words_data_key: "words")
          validate_params(domains_to_keep_data_key:, words_data_key:)

          mailbox, domain = current_value.split("@", 2)

          if !mailbox || !domain || mailbox.empty? || domain.empty? || !domain.include?(".")
            # Warn only on the first attempt so that repetitions do not flood the log.
            Log.warn { "Invalid email: type=#{type}, id=#{record['id']}, value=#{current_value}" } if repetition.zero?
            step_context.current_value = nil
            return step_context
          end

          new_mailbox = new_mailbox(mailbox, words: cleanup_data[words_data_key])
          new_domain = new_domain(domain, domains: cleanup_data[domains_to_keep_data_key],
                                          words: cleanup_data[words_data_key])

          step_context.current_value = "#{new_mailbox}@#{new_domain}"
          step_context
        end

        private

        # Replaces each dot-separated part separately, unless the mailbox starts or ends with
        # a dot or contains two dots in a row; then it is replaced as a whole.
        def new_mailbox(mailbox, words:)
          if mailbox !~ /^\.|\.\.|\.$/
            mailbox.split(".").map { dictionary_or_random_word_instead_of(_1, words:) }.join(".")
          else
            dictionary_or_random_word_instead_of(mailbox, words:)
          end
        end

        # Keeps a listed domain; otherwise replaces everything in front of the last dot.
        def new_domain(domain, domains:, words:)
          # `domains` is nil when the domains_to_keep_data_key is nil/empty or the key is not
          # in the cleanup data, which validate_params explicitly permits.
          if domains&.include?(domain)
            domain
          else
            tld2, _dot, tld = domain.rpartition(".")
            new_tld2 = dictionary_or_random_word_instead_of(tld2, words:)
            "#{new_tld2}.#{tld}"
          end
        end

        def dictionary_or_random_word_instead_of(word, words:)
          dictionary_word_instead_of(word, words:) || random_word_instead_of(word)
        end

        def dictionary_word_instead_of(word, words:)
          context = StepContext.new_from(step_context, current_value: word, cleanup_data: words)
          context = SelectDataByBytesize.new(context).run
          TakeSample.new(context).run(uniqueness_strategy: :suffix).current_value
        end

        def random_word_instead_of(word)
          GenerateRandomString.new(StepContext.new_from(step_context, current_value: word))
                              .run(character_set: :lowercase).current_value
        end

        def validate_params(domains_to_keep_data_key:, words_data_key:)
          # The checks below call key?, so that is the method to test for.
          raise("The cleanup_data must be a hash") unless cleanup_data.respond_to?(:key?)

          unless !domains_to_keep_data_key || domains_to_keep_data_key.empty? ||
                 cleanup_data.key?(domains_to_keep_data_key)
            raise_params_error("The cleanup_data does not contain the \"#{domains_to_keep_data_key}\" key.
                                Either add the domains to the cleanup data hash or set the domains_to_keep_data_key
                                to null or an empty string.".gsub(/\s+/, " "))
          end

          return if cleanup_data.key?(words_data_key)

          raise_params_error("The cleanup_data does not contain the \"#{words_data_key}\" key")
        end
      end
    end
  end
end