dump_cleaner 0.5.0
- checksums.yaml +7 -0
- data/.rspec +2 -0
- data/.rubocop.yml +25 -0
- data/CHANGELOG.md +5 -0
- data/LICENSE.txt +21 -0
- data/README.md +295 -0
- data/Rakefile +8 -0
- data/doc/workflow_steps.md +1400 -0
- data/dump_cleaner.gemspec +38 -0
- data/exe/dump_cleaner +7 -0
- data/lib/dump_cleaner/cleaners/base_cleaner.rb +32 -0
- data/lib/dump_cleaner/cleaners/mysql_shell_dump_cleaner.rb +47 -0
- data/lib/dump_cleaner/cleaners/mysql_shell_dump_helpers.rb +11 -0
- data/lib/dump_cleaner/cleaners/mysql_shell_table_cleaner.rb +184 -0
- data/lib/dump_cleaner/cleanup/bytesize_helpers.rb +39 -0
- data/lib/dump_cleaner/cleanup/cleaning.rb +69 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/add_repetition_suffix.rb +23 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/base.rb +33 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/fill_up_with_string.rb +20 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/generate_random_string.rb +37 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/inspect_context.rb +16 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_email.rb +78 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_formatted_number.rb +63 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_number.rb +29 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/select_data_by_bytesize.rb +17 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/select_data_by_pattern.rb +20 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/take_sample.rb +28 -0
- data/lib/dump_cleaner/cleanup/data_source.rb +19 -0
- data/lib/dump_cleaner/cleanup/data_source_steps/base.rb +26 -0
- data/lib/dump_cleaner/cleanup/data_source_steps/group_by_bytesize.rb +37 -0
- data/lib/dump_cleaner/cleanup/data_source_steps/inspect_context.rb +16 -0
- data/lib/dump_cleaner/cleanup/data_source_steps/load_yaml_file.rb +24 -0
- data/lib/dump_cleaner/cleanup/data_source_steps/remove_accents.rb +29 -0
- data/lib/dump_cleaner/cleanup/inspection.rb +37 -0
- data/lib/dump_cleaner/cleanup/step_context.rb +46 -0
- data/lib/dump_cleaner/cleanup/uniqueness.rb +66 -0
- data/lib/dump_cleaner/cleanup/workflow.rb +38 -0
- data/lib/dump_cleaner/conditions.rb +42 -0
- data/lib/dump_cleaner/config.rb +109 -0
- data/lib/dump_cleaner/log.rb +42 -0
- data/lib/dump_cleaner/options.rb +46 -0
- data/lib/dump_cleaner/processor.rb +37 -0
- data/lib/dump_cleaner/version.rb +5 -0
- data/lib/dump_cleaner.rb +10 -0
- metadata +105 -0
data/dump_cleaner.gemspec
ADDED
@@ -0,0 +1,38 @@
# frozen_string_literal: true

require_relative "lib/dump_cleaner/version"

Gem::Specification.new do |spec|
  spec.name = "dump_cleaner"
  spec.version = DumpCleaner::VERSION
  spec.authors = ["Matouš Borák"]
  spec.email = ["matous.borak@nejremeslnici.cz"]

  spec.summary = "Anonymizes data in logical database dumps."
  spec.description = "Deterministically anonymizes data in logical database dumps. Useful for importing (anonymized) production data into development environments."
  spec.homepage = "https://github.com/NejRemeslnici/dump-cleaner"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 3.1.0"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = "https://github.com/NejRemeslnici/dump-cleaner"
  spec.metadata["changelog_uri"] = "https://github.com/NejRemeslnici/dump-cleaner/blob/main/CHANGELOG.md"

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject do |f|
      (File.expand_path(f) == __FILE__) ||
        f.start_with?(*%w[bin/ test/ spec/ features/ .git .github appveyor Gemfile])
    end
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Uncomment to register a new dependency of your gem
  spec.add_dependency "zeitwerk", "~> 2.6"

  # For more information and examples about making a new gem, check out our
  # guide at: https://bundler.io/guides/creating_gem.html
end
data/lib/dump_cleaner/cleaners/base_cleaner.rb
ADDED
@@ -0,0 +1,32 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleaners
    class BaseCleaner
      attr_reader :config, :options

      def initialize(config:, options:)
        @config = config
        @options = options
      end

      def pre_cleanup
        # Implement in subclass if needed
      end

      def clean
        raise NotImplementedError
      end

      def post_cleanup
        # Implement in subclass if needed
      end

      def keep_same_record?(record, table_config:)
        return false unless table_config.keep_same_record_conditions

        Conditions.new(table_config.keep_same_record_conditions).evaluate_to_true?(record:)
      end
    end
  end
end
data/lib/dump_cleaner/cleaners/mysql_shell_dump_cleaner.rb
ADDED
@@ -0,0 +1,47 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleaners
    class MysqlShellDumpCleaner < BaseCleaner
      require "fileutils"

      include MysqlShellDumpHelpers

      def pre_cleanup
        validate_source_dump
        prepare_destination_dump
      end

      def clean
        config.cleanup_tables.each do |db, table|
          table_cleaner = MysqlShellTableCleaner.new(db:, table:, config:, options:)

          table_cleaner.pre_cleanup
          table_cleaner.clean
          table_cleaner.post_cleanup
        end
      end

      def post_cleanup
        copy_remaining_files
      end

      private

      def validate_source_dump
        raise "Source dump path does not exist: #{options.source_dump_path}" unless Dir.exist?(options.source_dump_path)
      end

      def prepare_destination_dump
        Dir.mkdir(options.destination_dump_path) unless Dir.exist?(options.destination_dump_path)
      end

      def copy_remaining_files
        Dir.glob("#{options.source_dump_path}/*").each do |file|
          destination_file = destination_file_for(file)
          FileUtils.cp(file, destination_file, preserve: true) unless File.exist?(destination_file)
        end
      end
    end
  end
end
data/lib/dump_cleaner/cleaners/mysql_shell_table_cleaner.rb
ADDED
@@ -0,0 +1,184 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleaners
    class MysqlShellTableCleaner < BaseCleaner
      require "open3"

      include MysqlShellDumpHelpers

      attr_reader :table_info, :cleanup_data, :cleaning

      def initialize(db:, table:, config:, options:)
        super(config:, options:)
        @db = db
        @table = table
        @cleanup_data = Cleanup::DataSource.new(config:)
        @cleaning = Cleanup::Cleaning.new(config:)
      end

      def pre_cleanup
        @table_info = DumpTableInfo.load(db: @db, table: @table, source_dump_path: options.source_dump_path)
        validate_table_info
      end

      def clean
        table_config = config.cleanup_table_config(db: @db, table: @table)
        Log.info { "Cleaning table #{table_info.db_dot_table}…" }

        DumpCleaner::Cleanup::Uniqueness::CaseInsensitiveCache.instance.clear

        Dir.glob("#{options.source_dump_path}/#{table_info.db_at_table}@@*.#{table_info.extension}").each do |file|
          # Open3.pipeline_r(pipe_source_args(file), ["head", "-n", "1000"]) do |tsv_data, _wait_thread|
          Open3.pipeline_r(pipe_source_args(file)) do |tsv_data, _wait_thread|
            Open3.pipeline_w(pipe_sink_args(destination_file_for(file))) do |zstd_out, _wait_thread|
              tsv_data.each_line do |line|
                line = line.chomp(table_info.dialect.lines_terminated_by)
                zstd_out.print "#{clean_line(line, table_config:)}#{table_info.dialect.lines_terminated_by}"
              end
            end
          end
        end
      end

      private

      def clean_line(line, table_config:)
        record = line.split("\t")
        record_context = record_context(record, table_config:)
        print "\r#{record_context['id']}… " if (record_context["id"].to_i % 10_000).zero?

        keep_record = keep_same_record?(record_context, table_config:)

        table_config.columns.each do |column_config|
          column_index = table_info.column_index(column_config.name)
          raise "Invalid column specified in config: #{column_config.name}" unless column_index

          next if record[column_index] == "\\N" # ignore NULL values

          cleanup_data_for_type = cleanup_data.data_for(column_config.cleanup_type)

          record[column_index] = cleaning.clean_value_for(record[column_index],
                                                          type: column_config.cleanup_type,
                                                          cleanup_data: cleanup_data_for_type,
                                                          record: record_context,
                                                          keep_record:,
                                                          column_config:)
        end

        new_line = record.join("\t")
        warn_on_changed_line_length(line, new_line, id: record_context["id"], record:)

        new_line
      end

      def record_context(record, table_config:)
        columns = table_config.record_context_columns
        context = columns.each_with_object({}) do |column, context|
          context[column] = record[table_info.column_index(column)]
        end
        context["id_column"] = record[table_info.column_index(table_config.id_column)]
        context
      end

      def warn_on_changed_line_length(orig_line, new_line, id:, record:)
        return if orig_line.bytesize == new_line.bytesize

        warning = "ID: #{id} bytesize changed: #{orig_line.bytesize} => #{new_line.bytesize}"
        orig_line.split("\t").each_with_index do |column, i|
          warning << "#{column} -> #{record[i]}" if !record[i] || column.bytesize != record[i].bytesize
        end

        Log.error { warning }
      end

      def validate_table_info
        case table_info.compression
        when "zstd"
          system("zstd --version >/dev/null 2>&1") || raise("zstd not found in $PATH")
        else
          raise "Unsupported dump compression format '#{table_info.compression}'"
        end
      end

      def pipe_source_args(file)
        case table_info.compression
        when "zstd"
          ["zstd", "-dc", file]
        end
      end

      def pipe_sink_args(file)
        case table_info.compression
        when "zstd"
          ["zstd", "-qfo", file]
        end
      end

      class DumpTableInfo
        require "json"

        DialectOptions = Data.define(:lines_terminated_by, :fields_terminated_by, :fields_enclosed_by,
                                     :fields_optionally_enclosed, :fields_escaped_by)

        def self.load(db:, table:, source_dump_path:)
          new(JSON.parse(File.read(table_info_file_path(db:, table:, source_dump_path:))))
        rescue Errno::ENOENT
          raise "Table info file not found in dump for table '#{db}.#{table}'. Is the table included in the dump?"
        end

        def self.table_info_file_path(db:, table:, source_dump_path:)
          "#{source_dump_path}/#{db}@#{table}.json"
        end

        def initialize(table_info)
          @table_info = table_info
        end

        def db
          @db ||= @table_info.dig("options", "schema")
        end

        def table
          @table ||= @table_info.dig("options", "table")
        end

        def db_dot_table
          "#{db}.#{table}"
        end

        def db_at_table
          "#{db}@#{table}"
        end

        def compression
          @table_info["compression"]
        end

        def extension
          @table_info["extension"]
        end

        def columns
          @columns ||= @table_info.dig("options", "columns")
        end

        def column_index(name)
          columns.index(name)
        end

        def dialect
          @dialect ||= begin
            dialect_options = DialectOptions.members.each_with_object({}) do |option, options|
              lowercase_option = option.to_s.split("_").each_with_object([]) do |e, buffer|
                buffer.push(buffer.empty? ? e : e.capitalize)
              end.join
              options[option] = @table_info.dig("options", lowercase_option)
            end
            DialectOptions.new(**dialect_options)
          end
        end
      end
    end
  end
end
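The per-chunk streaming in the cleaner above boils down to a decompress, transform, recompress pipe. Here is a minimal standalone sketch of that same pattern (not part of the gem), with hypothetical file names, a trivial stand-in for clean_line, and the assumption that the zstd binary is available on PATH:

require "open3"

# Stream-decompress one zstd-compressed TSV chunk, rewrite each line,
# and stream-compress the result into a new chunk file.
Open3.pipeline_r(["zstd", "-dc", "dump/mydb@users@@0.tsv.zst"]) do |tsv_data, _threads|
  Open3.pipeline_w(["zstd", "-qfo", "dump_out/mydb@users@@0.tsv.zst"]) do |zstd_out, _threads|
    tsv_data.each_line do |line|
      zstd_out.print(line.upcase) # stand-in for clean_line(line, table_config:)
    end
  end
end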
data/lib/dump_cleaner/cleanup/bytesize_helpers.rb
ADDED
@@ -0,0 +1,39 @@
module DumpCleaner
  module Cleanup
    module BytesizeHelpers
      # inspired by https://stackoverflow.com/a/67825008/1544012
      def truncate_to_bytesize(string, max_bytesize:, padding: " ")
        return string unless string.bytesize > max_bytesize

        check_padding_bytesize(padding)

        just_over = (0...string.size).bsearch { string[0.._1].bytesize > max_bytesize }
        string = string[0...just_over]

        string << padding while string.bytesize < max_bytesize
        string
      end

      def set_to_bytesize(string, bytesize:, padding: " ")
        string = string.ljust(bytesize, "#{padding}#{string}") if string.bytesize < bytesize
        truncate_to_bytesize(string, max_bytesize: bytesize, padding:)
      end

      def replace_suffix(string, suffix:, padding: " ")
        front_max_bytes = string.bytesize - suffix.bytesize
        front = truncate_to_bytesize(string, max_bytesize: front_max_bytes, padding:)

        "#{front}#{suffix}"
      end

      private

      def check_padding_bytesize(padding)
        return unless padding.bytesize > 1

        raise ArgumentError,
              "Use only a single-byte string in the padding otherwise it may prevent adjusting the result precisely."
      end
    end
  end
end
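A minimal usage sketch (not part of the gem, assuming the gem is loaded so zeitwerk can resolve the constant) showing that these helpers size strings in bytes rather than characters, which is what keeps cleaned TSV lines at their original byte length:

require "dump_cleaner"

helpers = Object.new.extend(DumpCleaner::Cleanup::BytesizeHelpers)

# truncation counts bytes and never splits a multibyte character
helpers.truncate_to_bytesize("Žluťoučký", max_bytesize: 6)  # => "Žluť" (6 bytes, 4 chars)

# padding repeats "<padding><string>" until the target bytesize is reached
helpers.set_to_bytesize("ab", bytesize: 5)                   # => "ab ab"

# the suffix overwrites the tail while the total bytesize stays the same
helpers.replace_suffix("abcdef", suffix: "42")               # => "abcd42"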
data/lib/dump_cleaner/cleanup/cleaning.rb
ADDED
@@ -0,0 +1,69 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleanup
    class Cleaning
      include Uniqueness

      attr_reader :config

      def initialize(config:)
        @cleaning_workflow = Workflow.new(phase: :cleaning)
        @failure_workflow = Workflow.new(phase: :failure)
        @config = config
      end

      def clean_value_for(orig_value, type:, cleanup_data:, column_config:, record: {}, keep_record: false) # rubocop:disable Metrics/ParameterLists
        step_context = StepContext.new(orig_value:, type:, cleanup_data:, record:)

        # return orig_value if keep_same conditions are met
        if (keep_record && !config.ignore_keep_same_record_conditions?(type)) ||
           Conditions.evaluate_to_true_in_step?(conditions: config.keep_same_conditions(type), step_context:)
          return orig_value_with_optional_suffix(step_context, column_config:)
        end

        if column_config.unique_column?
          begin
            repeat_until_unique(step_context:) do |repetition|
              step_context.repetition = repetition
              run_workflows(step_context)
            end
          rescue MaxRetriesReachedError
            repeat_until_unique(step_context:) do |repetition|
              step_context.repetition = repetition
              run_failure_workflow(step_context)
            end
          end
        else
          run_workflows(step_context)
        end
      end

      private

      def orig_value_with_optional_suffix(step_context, column_config:)
        if column_config.unique_column?
          repeat_until_unique(step_context:) do |repetition|
            step_context.repetition = repetition
            DumpCleaner::Cleanup::CleaningSteps::AddRepetitionSuffix.new(step_context).run.current_value
          end
        else
          step_context.orig_value
        end
      end

      def run_workflows(step_context)
        run_cleaning_workflow(step_context) || run_failure_workflow(step_context)
      end

      def run_cleaning_workflow(step_context)
        @cleaning_workflow.run(step_context, step_configs: config.steps_for(step_context.type, :cleaning)).current_value
      end

      def run_failure_workflow(step_context)
        step_context.current_value = step_context.orig_value # reset current_value
        @failure_workflow.run(step_context, step_configs: config.steps_for(step_context.type, :failure)).current_value
      end
    end
  end
end
data/lib/dump_cleaner/cleanup/cleaning_steps/add_repetition_suffix.rb
ADDED
@@ -0,0 +1,23 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleanup
    module CleaningSteps
      class AddRepetitionSuffix < Base
        include BytesizeHelpers

        def run
          step_context.current_value = if repetition.zero?
                                         current_value
                                       elsif current_value.bytesize > repetition.to_s.bytesize
                                         replace_suffix(current_value, suffix: repetition.to_s, padding: "0")
                                       else
                                         GenerateRandomString.new(StepContext.new_from(step_context))
                                                             .run.current_value
                                       end
          step_context
        end
      end
    end
  end
end
data/lib/dump_cleaner/cleanup/cleaning_steps/base.rb
ADDED
@@ -0,0 +1,33 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleanup
    module CleaningSteps
      class Base
        require "forwardable"
        require "zlib"

        extend Forwardable

        def_delegators :step_context, :cleanup_data, :current_value, :orig_value, :type, :record, :repetition

        attr_reader :step_context

        def initialize(step_context)
          @step_context = step_context.dup
        end

        def crc32(use_repetition: true)
          value_to_hash = "#{record['id_column']}-#{current_value}"
          value_to_hash += "-#{repetition}" if repetition.positive? && use_repetition
          Zlib.crc32(value_to_hash)
        end

        def raise_params_error(error)
          step = self.class.name.split("::").last
          raise ArgumentError, "Invalid cleanup step params: type=#{type}, step=#{step}: #{error}"
        end
      end
    end
  end
end
data/lib/dump_cleaner/cleanup/cleaning_steps/fill_up_with_string.rb
ADDED
@@ -0,0 +1,20 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleanup
    module CleaningSteps
      class FillUpWithString < Base
        include BytesizeHelpers

        def run(string: "anonymized #{type}", padding: " ", strict_bytesize_check: false)
          if strict_bytesize_check && string.bytesize != orig_value.bytesize
            raise "The bytesize of the string must be equal to the bytesize of the original value."
          end

          string = set_to_bytesize(string, bytesize: orig_value.bytesize, padding:)
          AddRepetitionSuffix.new(StepContext.new_from(step_context, current_value: string)).run
        end
      end
    end
  end
end
data/lib/dump_cleaner/cleanup/cleaning_steps/generate_random_string.rb
ADDED
@@ -0,0 +1,37 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleanup
    module CleaningSteps
      class GenerateRandomString < Base
        require "random/formatter"

        def run(character_set: "alphanumeric")
          random = Random.new(crc32)

          step_context.current_value = random.alphanumeric(current_value.bytesize, chars: characters(character_set))
          step_context
        end

        private

        def characters(character_set)
          case character_set.to_s
          when "alphanumeric"
            Random::Formatter::ALPHANUMERIC
          when "alpha"
            [*"a".."z", *"A".."Z"]
          when "lowercase"
            [*"a".."z"]
          when "uppercase"
            [*"A".."Z"]
          when "numeric"
            [*"0".."9"]
          else
            character_set
          end
        end
      end
    end
  end
end
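The replacement strings produced by this step are reproducible because the PRNG is seeded from Base#crc32, a digest of the record id and the current value, rather than from fresh entropy. A minimal sketch of that seeding idea (hypothetical id and value, outside the gem's step classes):

require "zlib"
require "random/formatter"

# The gem seeds with a CRC32 of "<id_column>-<current_value>" (plus "-<repetition>" on retries),
# so the same source row always yields the same pseudo-random replacement.
seed = Zlib.crc32("42-john.doe")
Random.new(seed).alphanumeric(8)  # same 8-character string on every run for this seed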
data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_email.rb
ADDED
@@ -0,0 +1,78 @@
# frozen_string_literal: true

module DumpCleaner
  module Cleanup
    module CleaningSteps
      class RandomizeEmail < Base
        def run(domains_to_keep_data_key: "domains_to_keep", words_data_key: "words")
          validate_params(domains_to_keep_data_key:, words_data_key:)

          mailbox, domain = current_value.split("@", 2)

          if !mailbox || !domain || mailbox.empty? || domain.empty? || !domain.include?(".")
            Log.warn { "Invalid email: type=#{type}, id=#{record['id']}, value=#{current_value}" } if repetition.zero?
            step_context.current_value = nil
            return step_context
          end

          new_mailbox = new_mailbox(mailbox, words: cleanup_data[words_data_key])
          new_domain = new_domain(domain, domains: cleanup_data[domains_to_keep_data_key],
                                          words: cleanup_data[words_data_key])

          step_context.current_value = "#{new_mailbox}@#{new_domain}"
          step_context
        end

        private

        def new_mailbox(mailbox, words:)
          if mailbox !~ /^\.|\.\.|\.$/
            mailbox.split(".").map { dictionary_or_random_word_instead_of(_1, words:) }.join(".")
          else
            dictionary_or_random_word_instead_of(mailbox, words:)
          end
        end

        def new_domain(domain, domains:, words:)
          if domains.include?(domain)
            domain
          else
            tld2, _dot, tld = domain.rpartition(".")
            new_tld2 = dictionary_or_random_word_instead_of(tld2, words:)
            "#{new_tld2}.#{tld}"
          end
        end

        def dictionary_or_random_word_instead_of(word, words:)
          dictionary_word_instead_of(word, words:) || random_word_instead_of(word)
        end

        def dictionary_word_instead_of(word, words:)
          context = StepContext.new_from(step_context, current_value: word, cleanup_data: words)
          context = SelectDataByBytesize.new(context).run
          TakeSample.new(context).run(uniqueness_strategy: :suffix).current_value
        end

        def random_word_instead_of(word)
          GenerateRandomString.new(StepContext.new_from(step_context, current_value: word))
                              .run(character_set: :lowercase).current_value
        end

        def validate_params(domains_to_keep_data_key:, words_data_key:)
          raise("The cleanup_data must be a hash") unless cleanup_data.respond_to?(:key)

          unless !domains_to_keep_data_key || domains_to_keep_data_key.empty? ||
                 cleanup_data.key?(domains_to_keep_data_key)
            raise_params_error("The cleanup_data does not contain the \"#{domains_to_keep_data_key}\" key.
                               Either add the domains to the cleanup data hash or set the domains_to_keep_data_key
                               to null or an empty string.".gsub(/\s+/, " "))
          end

          return if cleanup_data.key?(words_data_key)

          raise_params_error("The cleanup_data does not contain the \"#{words_data_key}\" key")
        end
      end
    end
  end
end