dump_cleaner 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +2 -0
  3. data/.rubocop.yml +25 -0
  4. data/CHANGELOG.md +5 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +295 -0
  7. data/Rakefile +8 -0
  8. data/doc/workflow_steps.md +1400 -0
  9. data/dump_cleaner.gemspec +38 -0
  10. data/exe/dump_cleaner +7 -0
  11. data/lib/dump_cleaner/cleaners/base_cleaner.rb +32 -0
  12. data/lib/dump_cleaner/cleaners/mysql_shell_dump_cleaner.rb +47 -0
  13. data/lib/dump_cleaner/cleaners/mysql_shell_dump_helpers.rb +11 -0
  14. data/lib/dump_cleaner/cleaners/mysql_shell_table_cleaner.rb +184 -0
  15. data/lib/dump_cleaner/cleanup/bytesize_helpers.rb +39 -0
  16. data/lib/dump_cleaner/cleanup/cleaning.rb +69 -0
  17. data/lib/dump_cleaner/cleanup/cleaning_steps/add_repetition_suffix.rb +23 -0
  18. data/lib/dump_cleaner/cleanup/cleaning_steps/base.rb +33 -0
  19. data/lib/dump_cleaner/cleanup/cleaning_steps/fill_up_with_string.rb +20 -0
  20. data/lib/dump_cleaner/cleanup/cleaning_steps/generate_random_string.rb +37 -0
  21. data/lib/dump_cleaner/cleanup/cleaning_steps/inspect_context.rb +16 -0
  22. data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_email.rb +78 -0
  23. data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_formatted_number.rb +63 -0
  24. data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_number.rb +29 -0
  25. data/lib/dump_cleaner/cleanup/cleaning_steps/select_data_by_bytesize.rb +17 -0
  26. data/lib/dump_cleaner/cleanup/cleaning_steps/select_data_by_pattern.rb +20 -0
  27. data/lib/dump_cleaner/cleanup/cleaning_steps/take_sample.rb +28 -0
  28. data/lib/dump_cleaner/cleanup/data_source.rb +19 -0
  29. data/lib/dump_cleaner/cleanup/data_source_steps/base.rb +26 -0
  30. data/lib/dump_cleaner/cleanup/data_source_steps/group_by_bytesize.rb +37 -0
  31. data/lib/dump_cleaner/cleanup/data_source_steps/inspect_context.rb +16 -0
  32. data/lib/dump_cleaner/cleanup/data_source_steps/load_yaml_file.rb +24 -0
  33. data/lib/dump_cleaner/cleanup/data_source_steps/remove_accents.rb +29 -0
  34. data/lib/dump_cleaner/cleanup/inspection.rb +37 -0
  35. data/lib/dump_cleaner/cleanup/step_context.rb +46 -0
  36. data/lib/dump_cleaner/cleanup/uniqueness.rb +66 -0
  37. data/lib/dump_cleaner/cleanup/workflow.rb +38 -0
  38. data/lib/dump_cleaner/conditions.rb +42 -0
  39. data/lib/dump_cleaner/config.rb +109 -0
  40. data/lib/dump_cleaner/log.rb +42 -0
  41. data/lib/dump_cleaner/options.rb +46 -0
  42. data/lib/dump_cleaner/processor.rb +37 -0
  43. data/lib/dump_cleaner/version.rb +5 -0
  44. data/lib/dump_cleaner.rb +10 -0
  45. metadata +105 -0
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/dump_cleaner/version"
4
+
5
# Gem specification for dump_cleaner.
Gem::Specification.new do |spec|
  spec.name = "dump_cleaner"
  spec.version = DumpCleaner::VERSION
  spec.authors = ["Matouš Borák"]
  spec.email = ["matous.borak@nejremeslnici.cz"]

  spec.summary = "Anonymizes data in logical database dumps."
  spec.description = "Deterministically anonymizes data in logical database dumps. Useful for importing (anonymized) production data into development environments."
  spec.homepage = "https://github.com/NejRemeslnici/dump-cleaner"
  spec.license = "MIT"
  # The library code uses `Data.define` (available since Ruby 3.2) and the `chars:` keyword of
  # `Random::Formatter#alphanumeric` (available since Ruby 3.3), so older Rubies cannot run it.
  spec.required_ruby_version = ">= 3.3.0"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = "https://github.com/NejRemeslnici/dump-cleaner"
  spec.metadata["changelog_uri"] = "https://github.com/NejRemeslnici/dump-cleaner/blob/main/CHANGELOG.md"

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject do |f|
      (File.expand_path(f) == __FILE__) ||
        f.start_with?(*%w[bin/ test/ spec/ features/ .git .github appveyor Gemfile])
    end
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Runtime dependencies of the gem.
  spec.add_dependency "zeitwerk", "~> 2.6"

  # For more information and examples about making a new gem, check out our
  # guide at: https://bundler.io/guides/creating_gem.html
end
data/exe/dump_cleaner ADDED
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "dump_cleaner"
5
+
6
# Parse the command line arguments and hand them straight to the processor.
DumpCleaner::Processor.new(DumpCleaner::Options.new(ARGV)).run
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
module DumpCleaner
  module Cleaners
    # Common ancestor of the cleaners. It stores the config and options and defines the
    # three-phase interface (pre_cleanup / clean / post_cleanup) that subclasses fill in.
    class BaseCleaner
      attr_reader :config, :options

      # @param config the cleanup configuration object
      # @param options the run options object
      def initialize(config:, options:)
        @config = config
        @options = options
      end

      # Hook run before #clean; a no-op unless a subclass overrides it.
      def pre_cleanup; end

      # Performs the cleaning itself; subclasses must override this method.
      #
      # @raise [NotImplementedError] always, in this base implementation
      def clean = raise(NotImplementedError)

      # Hook run after #clean; a no-op unless a subclass overrides it.
      def post_cleanup; end

      # Tells whether the given record should be kept unchanged.
      #
      # @param record the record (context) the conditions are evaluated against
      # @param table_config an object responding to +keep_same_record_conditions+
      # @return [Boolean] false when the table defines no such conditions, otherwise
      #   the result of evaluating them against the record
      def keep_same_record?(record, table_config:)
        conditions = table_config.keep_same_record_conditions
        return false unless conditions

        Conditions.new(conditions).evaluate_to_true?(record:)
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
module DumpCleaner
  module Cleaners
    # Cleans a whole dump directory: validates the source, prepares the destination,
    # runs a table cleaner for every configured table and finally copies over all
    # source files that have no counterpart in the destination yet.
    class MysqlShellDumpCleaner < BaseCleaner
      require "fileutils"

      include MysqlShellDumpHelpers

      # Checks the source dump directory and makes sure the destination directory exists.
      def pre_cleanup
        validate_source_dump
        prepare_destination_dump
      end

      # Runs the three cleanup phases of a MysqlShellTableCleaner for each configured table.
      def clean
        config.cleanup_tables.each do |db, table|
          table_cleaner = MysqlShellTableCleaner.new(db:, table:, config:, options:)

          table_cleaner.pre_cleanup
          table_cleaner.clean
          table_cleaner.post_cleanup
        end
      end

      # Copies the files that were not produced by the table cleaners.
      def post_cleanup
        copy_remaining_files
      end

      private

      # @raise [RuntimeError] when the source dump directory does not exist
      def validate_source_dump
        raise "Source dump path does not exist: #{options.source_dump_path}" unless Dir.exist?(options.source_dump_path)
      end

      # Creates the destination directory including any missing parent directories.
      # mkdir_p is a no-op for an existing directory, so no separate existence check
      # (and no window between the check and the creation) is needed.
      def prepare_destination_dump
        FileUtils.mkdir_p(options.destination_dump_path)
      end

      # Copies every top-level entry of the source dump whose destination counterpart
      # does not exist yet, preserving file attributes.
      def copy_remaining_files
        Dir.glob("#{options.source_dump_path}/*").each do |file|
          destination_file = destination_file_for(file)
          FileUtils.cp(file, destination_file, preserve: true) unless File.exist?(destination_file)
        end
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
module DumpCleaner
  module Cleaners
    # Path helpers shared by the dump cleaners. The including class must respond to
    # +options+ with an object providing +source_dump_path+ and +destination_dump_path+.
    module MysqlShellDumpHelpers
      # Maps a file path in the source dump to the corresponding path in the destination dump
      # by replacing the first occurrence of the source dump path.
      #
      # @param source_file [String] path of a file in the source dump
      # @return [String] the path with the source dump path replaced by the destination dump
      #   path; the path is returned unchanged when it does not contain the source dump path
      def destination_file_for(source_file)
        # The block form inserts the destination path literally. A plain replacement string
        # would have its backslash sequences (e.g. "\1" or "\0") interpreted by String#sub.
        source_file.sub(options.source_dump_path) { options.destination_dump_path }
      end
    end
  end
end
@@ -0,0 +1,184 @@
1
+ # frozen_string_literal: true
2
+
3
module DumpCleaner
  module Cleaners
    # Cleans the data files of a single table of a dump: every matching data file is
    # decompressed, cleaned line by line and written compressed to the destination dump.
    class MysqlShellTableCleaner < BaseCleaner
      require "open3"

      include MysqlShellDumpHelpers

      # Separator used when splitting dump lines into columns and joining them back.
      FIELD_SEPARATOR = "\t"

      attr_reader :table_info, :cleanup_data, :cleaning

      # @param db [String] database (schema) name
      # @param table [String] table name
      # @param config the cleanup configuration object
      # @param options the run options object
      def initialize(db:, table:, config:, options:)
        super(config:, options:)
        @db = db
        @table = table
        @cleanup_data = Cleanup::DataSource.new(config:)
        @cleaning = Cleanup::Cleaning.new(config:)
      end

      # Loads the table info from the source dump and validates the compression setup.
      def pre_cleanup
        @table_info = DumpTableInfo.load(db: @db, table: @table, source_dump_path: options.source_dump_path)
        validate_table_info
      end

      # Streams every data file of the table through the decompressor, cleans each line
      # and pipes the result into the compressor writing the destination file.
      def clean
        table_config = config.cleanup_table_config(db: @db, table: @table)
        Log.info { "Cleaning table #{table_info.db_dot_table}…" }

        DumpCleaner::Cleanup::Uniqueness::CaseInsensitiveCache.instance.clear

        line_terminator = table_info.dialect.lines_terminated_by
        data_files_pattern = "#{options.source_dump_path}/#{table_info.db_at_table}@@*.#{table_info.extension}"

        Dir.glob(data_files_pattern).each do |file|
          Open3.pipeline_r(pipe_source_args(file)) do |tsv_data, _source_wait_threads|
            Open3.pipeline_w(pipe_sink_args(destination_file_for(file))) do |zstd_out, _sink_wait_threads|
              tsv_data.each_line do |line|
                line = line.chomp(line_terminator)
                zstd_out.print "#{clean_line(line, table_config:)}#{line_terminator}"
              end
            end
          end
        end
      end

      private

      # Cleans all configured columns of one dump line.
      #
      # @param line [String] the line without its terminator
      # @param table_config the cleanup configuration of the table
      # @return [String] the cleaned line (again without the terminator)
      # @raise [RuntimeError] when a configured column is not present in the table info
      def clean_line(line, table_config:)
        # The -1 limit keeps trailing empty fields; without it a row ending with empty
        # columns would lose them when the record is joined back together.
        record = line.split(FIELD_SEPARATOR, -1)
        record_context = record_context(record, table_config:)
        print "\r#{record_context['id']}… " if (record_context["id"].to_i % 10_000).zero?

        keep_record = keep_same_record?(record_context, table_config:)

        table_config.columns.each do |column_config|
          column_index = table_info.column_index(column_config.name)
          raise "Invalid column specified in config: #{column_config.name}" unless column_index

          next if record[column_index] == "\\N" # ignore NULL values

          cleanup_data_for_type = cleanup_data.data_for(column_config.cleanup_type)

          record[column_index] = cleaning.clean_value_for(record[column_index],
                                                          type: column_config.cleanup_type,
                                                          cleanup_data: cleanup_data_for_type,
                                                          record: record_context,
                                                          keep_record:,
                                                          column_config:)
        end

        new_line = record.join(FIELD_SEPARATOR)
        warn_on_changed_line_length(line, new_line, id: record_context["id"], record:)

        new_line
      end

      # Builds the hash of original column values that is passed along to the cleaning steps.
      # Besides the configured context columns it always contains the value of the table's
      # id column under the "id_column" key.
      def record_context(record, table_config:)
        context = table_config.record_context_columns.each_with_object({}) do |column, values|
          values[column] = record[table_info.column_index(column)]
        end
        context["id_column"] = record[table_info.column_index(table_config.id_column)]
        context
      end

      # Logs an error listing the changed columns when the cleaned line has a different
      # bytesize than the original one.
      def warn_on_changed_line_length(orig_line, new_line, id:, record:)
        return if orig_line.bytesize == new_line.bytesize

        changed_columns = orig_line.split(FIELD_SEPARATOR, -1).each_with_index.filter_map do |column, i|
          "#{column} -> #{record[i]}" if !record[i] || column.bytesize != record[i].bytesize
        end

        warning = "ID: #{id} bytesize changed: #{orig_line.bytesize} => #{new_line.bytesize}"
        warning = "#{warning} (#{changed_columns.join(', ')})" unless changed_columns.empty?

        Log.error { warning }
      end

      # @raise [RuntimeError] when the dump compression is not supported or the zstd
      #   command cannot be run
      def validate_table_info
        case table_info.compression
        when "zstd"
          system("zstd --version >/dev/null 2>&1") || raise("zstd not found in \$PATH")
        else
          raise "Unsupported dump compression format '#{table_info.compression}'"
        end
      end

      # Command that decompresses the given file to its standard output.
      def pipe_source_args(file)
        case table_info.compression
        when "zstd"
          ["zstd", "-dc", file]
        end
      end

      # Command that compresses its standard input into the given file.
      def pipe_sink_args(file)
        case table_info.compression
        when "zstd"
          ["zstd", "-qfo", file]
        end
      end

      # Read-only view of the table metadata JSON file stored in the dump.
      class DumpTableInfo
        require "json"

        DialectOptions = Data.define(:lines_terminated_by, :fields_terminated_by, :fields_enclosed_by,
                                     :fields_optionally_enclosed, :fields_escaped_by)

        # Loads and parses the metadata file of the given table.
        #
        # @raise [RuntimeError] when the metadata file does not exist in the dump
        def self.load(db:, table:, source_dump_path:)
          new(JSON.parse(File.read(table_info_file_path(db:, table:, source_dump_path:))))
        rescue Errno::ENOENT
          raise "Table info file not found in dump for table '#{db}.#{table}'. Is the table included in the dump?"
        end

        def self.table_info_file_path(db:, table:, source_dump_path:)
          "#{source_dump_path}/#{db}@#{table}.json"
        end

        # @param table_info [Hash] the parsed metadata JSON
        def initialize(table_info)
          @table_info = table_info
        end

        def db
          @db ||= @table_info.dig("options", "schema")
        end

        def table
          @table ||= @table_info.dig("options", "table")
        end

        def db_dot_table
          "#{db}.#{table}"
        end

        def db_at_table
          "#{db}@#{table}"
        end

        def compression
          @table_info["compression"]
        end

        def extension
          @table_info["extension"]
        end

        def columns
          @columns ||= @table_info.dig("options", "columns")
        end

        # @return [Integer, nil] position of the column in the dump lines, nil when unknown
        def column_index(name)
          columns.index(name)
        end

        # Dialect options read from the "options" section of the metadata; the snake_case
        # member names are looked up under their camelCase equivalents
        # (e.g. lines_terminated_by => "linesTerminatedBy").
        def dialect
          @dialect ||= begin
            dialect_options = DialectOptions.members.to_h do |option|
              camel_case_key = option.to_s.gsub(/_([a-z])/) { Regexp.last_match(1).upcase }
              [option, @table_info.dig("options", camel_case_key)]
            end
            DialectOptions.new(**dialect_options)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ module DumpCleaner
2
+ module Cleanup
3
+ module BytesizeHelpers
4
+ # inspired by https://stackoverflow.com/a/67825008/1544012
5
+ def truncate_to_bytesize(string, max_bytesize:, padding: " ")
6
+ return string unless string.bytesize > max_bytesize
7
+
8
+ check_padding_bytesize(padding)
9
+
10
+ just_over = (0...string.size).bsearch { string[0.._1].bytesize > max_bytesize }
11
+ string = string[0...just_over]
12
+
13
+ string << padding while string.bytesize < max_bytesize
14
+ string
15
+ end
16
+
17
+ def set_to_bytesize(string, bytesize:, padding: " ")
18
+ string = string.ljust(bytesize, "#{padding}#{string}") if string.bytesize < bytesize
19
+ truncate_to_bytesize(string, max_bytesize: bytesize, padding:)
20
+ end
21
+
22
+ def replace_suffix(string, suffix:, padding: " ")
23
+ front_max_bytes = string.bytesize - suffix.bytesize
24
+ front = truncate_to_bytesize(string, max_bytesize: front_max_bytes, padding:)
25
+
26
+ "#{front}#{suffix}"
27
+ end
28
+
29
+ private
30
+
31
+ def check_padding_bytesize(padding)
32
+ return unless padding.bytesize > 1
33
+
34
+ raise ArgumentError,
35
+ "Use only a single-byte string in the padding otherwise it may prevent adjusting the result precisely."
36
+ end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,69 @@
1
+ # frozen_string_literal: true
2
+
3
module DumpCleaner
  module Cleanup
    # Produces the cleaned value of a single field by running the configured cleaning
    # workflow, falling back to the failure workflow when needed.
    class Cleaning
      include Uniqueness

      attr_reader :config

      # @param config the cleanup configuration object
      def initialize(config:)
        @config = config
        @cleaning_workflow = Workflow.new(phase: :cleaning)
        @failure_workflow = Workflow.new(phase: :failure)
      end

      # Returns the cleaned replacement for +orig_value+.
      #
      # The original value is kept (with a repetition suffix for unique columns) when the
      # keep-same conditions apply. For unique columns the workflows are repeated until a
      # unique value is found; when the retries run out, the failure workflow is repeated
      # in the same way instead.
      #
      # @param orig_value the original value of the field
      # @param type the cleanup type of the column
      # @param cleanup_data the data available to the cleaning steps for this type
      # @param column_config the column configuration, responding to +unique_column?+
      # @param record [Hash] context values of the current record
      # @param keep_record [Boolean] whether the record matched the keep-same-record conditions
      def clean_value_for(orig_value, type:, cleanup_data:, column_config:, record: {}, keep_record: false) # rubocop:disable Metrics/ParameterLists
        step_context = StepContext.new(orig_value:, type:, cleanup_data:, record:)

        # return orig_value if keep_same conditions are met
        if keep_original_value?(step_context, type:, keep_record:)
          return orig_value_with_optional_suffix(step_context, column_config:)
        end

        return run_workflows(step_context) unless column_config.unique_column?

        begin
          repeat_with_repetition_set(step_context) { run_workflows(step_context) }
        rescue MaxRetriesReachedError
          repeat_with_repetition_set(step_context) { run_failure_workflow(step_context) }
        end
      end

      private

      # True when the record-level or the type-level keep-same conditions are met.
      def keep_original_value?(step_context, type:, keep_record:)
        return true if keep_record && !config.ignore_keep_same_record_conditions?(type)

        Conditions.evaluate_to_true_in_step?(conditions: config.keep_same_conditions(type), step_context:)
      end

      # Runs the block via repeat_until_unique, storing the current repetition
      # in the step context before each attempt.
      def repeat_with_repetition_set(step_context)
        repeat_until_unique(step_context:) do |repetition|
          step_context.repetition = repetition
          yield
        end
      end

      # The original value; for unique columns a repetition suffix is added until the value is unique.
      def orig_value_with_optional_suffix(step_context, column_config:)
        return step_context.orig_value unless column_config.unique_column?

        repeat_with_repetition_set(step_context) do
          DumpCleaner::Cleanup::CleaningSteps::AddRepetitionSuffix.new(step_context).run.current_value
        end
      end

      # The cleaning workflow result, or the failure workflow result when the former is nil/false.
      def run_workflows(step_context)
        run_cleaning_workflow(step_context) || run_failure_workflow(step_context)
      end

      def run_cleaning_workflow(step_context)
        step_configs = config.steps_for(step_context.type, :cleaning)
        @cleaning_workflow.run(step_context, step_configs:).current_value
      end

      def run_failure_workflow(step_context)
        step_context.current_value = step_context.orig_value # reset current_value
        step_configs = config.steps_for(step_context.type, :failure)
        @failure_workflow.run(step_context, step_configs:).current_value
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
module DumpCleaner
  module Cleanup
    module CleaningSteps
      # Makes the current value differ between repetitions while keeping its bytesize.
      class AddRepetitionSuffix < Base
        include BytesizeHelpers

        # Stores the adjusted value in the step context.
        #
        # @return the step context
        def run
          step_context.current_value = value_for_repetition
          step_context
        end

        private

        # Repetition 0 keeps the value as is. Later repetitions replace the end of the value
        # with the repetition number (padded with "0"), or generate a random string when the
        # value is not longer than the number itself.
        def value_for_repetition
          return current_value if repetition.zero?

          suffix = repetition.to_s
          return replace_suffix(current_value, suffix:, padding: "0") if current_value.bytesize > suffix.bytesize

          GenerateRandomString.new(StepContext.new_from(step_context)).run.current_value
        end
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
module DumpCleaner
  module Cleanup
    module CleaningSteps
      # Common ancestor of the cleaning steps. It works on its own copy of the step
      # context and exposes the context attributes directly on the step.
      class Base
        require "forwardable"
        require "zlib"

        extend Forwardable

        attr_reader :step_context

        def_delegators :step_context,
                       :cleanup_data, :current_value, :orig_value, :type, :record, :repetition

        # @param step_context the context to work with; it is duplicated so that the
        #   step does not modify the caller's object
        def initialize(step_context)
          @step_context = step_context.dup
        end

        # CRC32 checksum of "<id_column value>-<current value>", with "-<repetition>"
        # appended for positive repetitions unless +use_repetition+ is false.
        #
        # @return [Integer]
        def crc32(use_repetition: true)
          hashed_text = "#{record['id_column']}-#{current_value}"
          hashed_text = "#{hashed_text}-#{repetition}" if repetition.positive? && use_repetition
          Zlib.crc32(hashed_text)
        end

        # @raise [ArgumentError] always; the message names the cleanup type and the step class
        def raise_params_error(error)
          step_name = self.class.name.split("::").last
          raise ArgumentError, "Invalid cleanup step params: type=#{type}, step=#{step_name}: #{error}"
        end
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
module DumpCleaner
  module Cleanup
    module CleaningSteps
      # Replaces the value with a fixed string adjusted to the bytesize of the original value.
      class FillUpWithString < Base
        include BytesizeHelpers

        # @param string [String] the replacement text
        # @param padding [String] padding passed on to the bytesize adjustment
        # @param strict_bytesize_check [Boolean] when true, the string must already have
        #   the same bytesize as the original value
        # @return the step context returned by the AddRepetitionSuffix step
        # @raise [RuntimeError] when the strict check is on and the bytesizes differ
        def run(string: "anonymized #{type}", padding: " ", strict_bytesize_check: false)
          target_bytesize = orig_value.bytesize
          bytesize_differs = string.bytesize != target_bytesize

          if strict_bytesize_check && bytesize_differs
            raise "The bytesize of the string must be equal to the bytesize of the original value."
          end

          filler = set_to_bytesize(string, bytesize: target_bytesize, padding:)
          AddRepetitionSuffix.new(StepContext.new_from(step_context, current_value: filler)).run
        end
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
module DumpCleaner
  module Cleanup
    module CleaningSteps
      # Replaces the value with a random string of the same bytesize. The random generator
      # is seeded with the step's crc32, so the same context yields the same string.
      class GenerateRandomString < Base
        require "random/formatter"

        # @param character_set [String, Symbol, Array] name of a predefined character set
        #   ("alphanumeric", "alpha", "lowercase", "uppercase", "numeric"); any other value
        #   is passed on as the characters to choose from
        # @return the step context holding the generated string as its current value
        def run(character_set: "alphanumeric")
          seeded_random = Random.new(crc32)
          pool = characters(character_set)

          step_context.current_value = seeded_random.alphanumeric(current_value.bytesize, chars: pool)
          step_context
        end

        private

        # Resolves the character set name to the list of characters;
        # unknown values are returned as they are.
        def characters(character_set)
          lowercase = [*"a".."z"]
          uppercase = [*"A".."Z"]

          named_sets = {
            "alphanumeric" => Random::Formatter::ALPHANUMERIC,
            "alpha" => lowercase + uppercase,
            "lowercase" => lowercase,
            "uppercase" => uppercase,
            "numeric" => [*"0".."9"]
          }

          named_sets.fetch(character_set.to_s, character_set)
        end
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
module DumpCleaner
  module Cleanup
    module CleaningSteps
      # Debugging step: passes the step context to the inspection helper
      # and hands the context on unchanged.
      class InspectContext < Base
        include Inspection

        # @return the step context
        def run
          step_context.tap { |context| inspect_step_context(context) }
        end
      end
    end
  end
end
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
module DumpCleaner
  module Cleanup
    module CleaningSteps
      # Replaces an e-mail address with a different one: the dot-separated parts of the
      # mailbox and the domain name (except for its last label) are replaced by dictionary
      # words or random strings. Domains listed in the cleanup data are kept as they are.
      class RandomizeEmail < Base
        # @param domains_to_keep_data_key [String, nil] cleanup data key holding the domains
        #   that must not be changed; nil or an empty string means no domains are kept
        # @param words_data_key [String] cleanup data key holding the dictionary words
        # @return the step context; its current value is nil when the e-mail is invalid
        # @raise [ArgumentError] when the cleanup data lacks one of the required keys
        def run(domains_to_keep_data_key: "domains_to_keep", words_data_key: "words")
          validate_params(domains_to_keep_data_key:, words_data_key:)

          mailbox, domain = current_value.split("@", 2)

          if !mailbox || !domain || mailbox.empty? || domain.empty? || !domain.include?(".")
            Log.warn { "Invalid email: type=#{type}, id=#{record['id']}, value=#{current_value}" } if repetition.zero?
            step_context.current_value = nil
            return step_context
          end

          words = cleanup_data[words_data_key]
          # The key may legitimately be nil/empty or point to no data (see validate_params),
          # in which case no domain is kept.
          domains_to_keep = cleanup_data[domains_to_keep_data_key] || []

          new_mailbox = new_mailbox(mailbox, words:)
          new_domain = new_domain(domain, domains: domains_to_keep, words:)

          step_context.current_value = "#{new_mailbox}@#{new_domain}"
          step_context
        end

        private

        # Replaces each dot-separated part of the mailbox separately; mailboxes with a
        # leading, trailing or doubled dot are replaced as a whole.
        def new_mailbox(mailbox, words:)
          if mailbox !~ /^\.|\.\.|\.$/
            mailbox.split(".").map { dictionary_or_random_word_instead_of(_1, words:) }.join(".")
          else
            dictionary_or_random_word_instead_of(mailbox, words:)
          end
        end

        # Keeps the domain when it is listed in +domains+, otherwise replaces everything
        # before its last dot and keeps the last label.
        def new_domain(domain, domains:, words:)
          return domain if domains&.include?(domain)

          tld2, _dot, tld = domain.rpartition(".")
          new_tld2 = dictionary_or_random_word_instead_of(tld2, words:)
          "#{new_tld2}.#{tld}"
        end

        def dictionary_or_random_word_instead_of(word, words:)
          dictionary_word_instead_of(word, words:) || random_word_instead_of(word)
        end

        # Result of running the SelectDataByBytesize and TakeSample steps over the words.
        def dictionary_word_instead_of(word, words:)
          context = StepContext.new_from(step_context, current_value: word, cleanup_data: words)
          context = SelectDataByBytesize.new(context).run
          TakeSample.new(context).run(uniqueness_strategy: :suffix).current_value
        end

        def random_word_instead_of(word)
          GenerateRandomString.new(StepContext.new_from(step_context, current_value: word))
                              .run(character_set: :lowercase).current_value
        end

        def validate_params(domains_to_keep_data_key:, words_data_key:)
          # key? is the method actually used below, so that is what must be supported
          raise("The cleanup_data must be a hash") unless cleanup_data.respond_to?(:key?)

          unless !domains_to_keep_data_key || domains_to_keep_data_key.empty? ||
                 cleanup_data.key?(domains_to_keep_data_key)
            raise_params_error("The cleanup_data does not contain the \"#{domains_to_keep_data_key}\" key.
                                Either add the domains to the cleanup data hash or set the domains_to_keep_data_key
                                to null or an empty string.".gsub(/\s+/, " "))
          end

          return if cleanup_data.key?(words_data_key)

          raise_params_error("The cleanup_data does not contain the \"#{words_data_key}\" key")
        end
      end
    end
  end
end