dump_cleaner 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +2 -0
- data/.rubocop.yml +25 -0
- data/CHANGELOG.md +5 -0
- data/LICENSE.txt +21 -0
- data/README.md +295 -0
- data/Rakefile +8 -0
- data/doc/workflow_steps.md +1400 -0
- data/dump_cleaner.gemspec +38 -0
- data/exe/dump_cleaner +7 -0
- data/lib/dump_cleaner/cleaners/base_cleaner.rb +32 -0
- data/lib/dump_cleaner/cleaners/mysql_shell_dump_cleaner.rb +47 -0
- data/lib/dump_cleaner/cleaners/mysql_shell_dump_helpers.rb +11 -0
- data/lib/dump_cleaner/cleaners/mysql_shell_table_cleaner.rb +184 -0
- data/lib/dump_cleaner/cleanup/bytesize_helpers.rb +39 -0
- data/lib/dump_cleaner/cleanup/cleaning.rb +69 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/add_repetition_suffix.rb +23 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/base.rb +33 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/fill_up_with_string.rb +20 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/generate_random_string.rb +37 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/inspect_context.rb +16 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_email.rb +78 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_formatted_number.rb +63 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_number.rb +29 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/select_data_by_bytesize.rb +17 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/select_data_by_pattern.rb +20 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/take_sample.rb +28 -0
- data/lib/dump_cleaner/cleanup/data_source.rb +19 -0
- data/lib/dump_cleaner/cleanup/data_source_steps/base.rb +26 -0
- data/lib/dump_cleaner/cleanup/data_source_steps/group_by_bytesize.rb +37 -0
- data/lib/dump_cleaner/cleanup/data_source_steps/inspect_context.rb +16 -0
- data/lib/dump_cleaner/cleanup/data_source_steps/load_yaml_file.rb +24 -0
- data/lib/dump_cleaner/cleanup/data_source_steps/remove_accents.rb +29 -0
- data/lib/dump_cleaner/cleanup/inspection.rb +37 -0
- data/lib/dump_cleaner/cleanup/step_context.rb +46 -0
- data/lib/dump_cleaner/cleanup/uniqueness.rb +66 -0
- data/lib/dump_cleaner/cleanup/workflow.rb +38 -0
- data/lib/dump_cleaner/conditions.rb +42 -0
- data/lib/dump_cleaner/config.rb +109 -0
- data/lib/dump_cleaner/log.rb +42 -0
- data/lib/dump_cleaner/options.rb +46 -0
- data/lib/dump_cleaner/processor.rb +37 -0
- data/lib/dump_cleaner/version.rb +5 -0
- data/lib/dump_cleaner.rb +10 -0
- metadata +105 -0
| @@ -0,0 +1,38 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            require_relative "lib/dump_cleaner/version"
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            # Gem specification for dump_cleaner: package metadata, the list of
            # packaged files, the executables and the runtime dependencies.
            Gem::Specification.new do |spec|
              spec.name = "dump_cleaner"
              spec.version = DumpCleaner::VERSION
              spec.authors = ["Matouš Borák"]
              spec.email = ["matous.borak@nejremeslnici.cz"]

              spec.summary = "Anonymizes data in logical database dumps."
              spec.description = "Deterministically anonymizes data in logical database dumps. Useful for importing (anonymized) production data into development environments."
              spec.homepage = "https://github.com/NejRemeslnici/dump-cleaner"
              spec.license = "MIT"
              # The library code uses `Data.define`, which is only available since Ruby 3.2,
              # so older Rubies must not be allowed to install the gem.
              spec.required_ruby_version = ">= 3.2.0"

              spec.metadata["homepage_uri"] = spec.homepage
              spec.metadata["source_code_uri"] = "https://github.com/NejRemeslnici/dump-cleaner"
              spec.metadata["changelog_uri"] = "https://github.com/NejRemeslnici/dump-cleaner/blob/main/CHANGELOG.md"

              # Specify which files should be added to the gem when it is released.
              # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
              spec.files = Dir.chdir(__dir__) do
                `git ls-files -z`.split("\x0").reject do |f|
                  (File.expand_path(f) == __FILE__) ||
                    f.start_with?(*%w[bin/ test/ spec/ features/ .git .github appveyor Gemfile])
                end
              end
              spec.bindir = "exe"
              spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
              spec.require_paths = ["lib"]

              # Runtime dependencies
              spec.add_dependency "zeitwerk", "~> 2.6"

              # For more information and examples about making a new gem, check out our
              # guide at: https://bundler.io/guides/creating_gem.html
            end
         | 
    
        data/exe/dump_cleaner
    ADDED
    
    
| @@ -0,0 +1,32 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module DumpCleaner
         | 
| 4 | 
            +
              module Cleaners
         | 
| 5 | 
            +
                class BaseCleaner
         | 
| 6 | 
            +
                  attr_reader :config, :options
         | 
| 7 | 
            +
             | 
| 8 | 
            +
                  def initialize(config:, options:)
         | 
| 9 | 
            +
                    @config = config
         | 
| 10 | 
            +
                    @options = options
         | 
| 11 | 
            +
                  end
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                  def pre_cleanup
         | 
| 14 | 
            +
                    # Implement in subclass if needed
         | 
| 15 | 
            +
                  end
         | 
| 16 | 
            +
             | 
| 17 | 
            +
                  def clean
         | 
| 18 | 
            +
                    raise NotImplementedError
         | 
| 19 | 
            +
                  end
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                  def post_cleanup
         | 
| 22 | 
            +
                    # Implement in subclass if needed
         | 
| 23 | 
            +
                  end
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                  def keep_same_record?(record, table_config:)
         | 
| 26 | 
            +
                    return false unless table_config.keep_same_record_conditions
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                    Conditions.new(table_config.keep_same_record_conditions).evaluate_to_true?(record:)
         | 
| 29 | 
            +
                  end
         | 
| 30 | 
            +
                end
         | 
| 31 | 
            +
              end
         | 
| 32 | 
            +
            end
         | 
| @@ -0,0 +1,47 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module DumpCleaner
         | 
| 4 | 
            +
              module Cleaners
         | 
| 5 | 
            +
                class MysqlShellDumpCleaner < BaseCleaner
         | 
| 6 | 
            +
                  require "fileutils"
         | 
| 7 | 
            +
             | 
| 8 | 
            +
                  include MysqlShellDumpHelpers
         | 
| 9 | 
            +
             | 
| 10 | 
            +
                  def pre_cleanup
         | 
| 11 | 
            +
                    validate_source_dump
         | 
| 12 | 
            +
                    prepare_destination_dump
         | 
| 13 | 
            +
                  end
         | 
| 14 | 
            +
             | 
| 15 | 
            +
                  def clean
         | 
| 16 | 
            +
                    config.cleanup_tables.each do |db, table|
         | 
| 17 | 
            +
                      table_cleaner = MysqlShellTableCleaner.new(db:, table:, config:, options:)
         | 
| 18 | 
            +
             | 
| 19 | 
            +
                      table_cleaner.pre_cleanup
         | 
| 20 | 
            +
                      table_cleaner.clean
         | 
| 21 | 
            +
                      table_cleaner.post_cleanup
         | 
| 22 | 
            +
                    end
         | 
| 23 | 
            +
                  end
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                  def post_cleanup
         | 
| 26 | 
            +
                    copy_remaining_files
         | 
| 27 | 
            +
                  end
         | 
| 28 | 
            +
             | 
| 29 | 
            +
                  private
         | 
| 30 | 
            +
             | 
| 31 | 
            +
                  def validate_source_dump
         | 
| 32 | 
            +
                    raise "Source dump path does not exist: #{options.source_dump_path}" unless Dir.exist?(options.source_dump_path)
         | 
| 33 | 
            +
                  end
         | 
| 34 | 
            +
             | 
| 35 | 
            +
                  # Makes sure the destination dump directory exists.
                  #
                  # The directory is created together with any missing parent
                  # directories; an already existing directory is left as it is.
                  def prepare_destination_dump
                    FileUtils.mkdir_p(options.destination_dump_path)
                  end
         | 
| 38 | 
            +
             | 
| 39 | 
            +
                  def copy_remaining_files
         | 
| 40 | 
            +
                    Dir.glob("#{options.source_dump_path}/*").each do |file|
         | 
| 41 | 
            +
                      destination_file = destination_file_for(file)
         | 
| 42 | 
            +
                      FileUtils.cp(file, destination_file, preserve: true) unless File.exist?(destination_file)
         | 
| 43 | 
            +
                    end
         | 
| 44 | 
            +
                  end
         | 
| 45 | 
            +
                end
         | 
| 46 | 
            +
              end
         | 
| 47 | 
            +
            end
         | 
| @@ -0,0 +1,184 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module DumpCleaner
         | 
| 4 | 
            +
              module Cleaners
         | 
| 5 | 
            +
                class MysqlShellTableCleaner < BaseCleaner
         | 
| 6 | 
            +
                  require "open3"
         | 
| 7 | 
            +
             | 
| 8 | 
            +
                  include MysqlShellDumpHelpers
         | 
| 9 | 
            +
             | 
| 10 | 
            +
                  attr_reader :table_info, :cleanup_data, :cleaning
         | 
| 11 | 
            +
             | 
| 12 | 
            +
                  def initialize(db:, table:, config:, options:)
         | 
| 13 | 
            +
                    super(config:, options:)
         | 
| 14 | 
            +
                    @db = db
         | 
| 15 | 
            +
                    @table = table
         | 
| 16 | 
            +
                    @cleanup_data = Cleanup::DataSource.new(config:)
         | 
| 17 | 
            +
                    @cleaning = Cleanup::Cleaning.new(config:)
         | 
| 18 | 
            +
                  end
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                  def pre_cleanup
         | 
| 21 | 
            +
                    @table_info = DumpTableInfo.load(db: @db, table: @table, source_dump_path: options.source_dump_path)
         | 
| 22 | 
            +
                    validate_table_info
         | 
| 23 | 
            +
                  end
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                  def clean
         | 
| 26 | 
            +
                    table_config = config.cleanup_table_config(db: @db, table: @table)
         | 
| 27 | 
            +
                    Log.info { "Cleaning table #{table_info.db_dot_table}…" }
         | 
| 28 | 
            +
             | 
| 29 | 
            +
                    DumpCleaner::Cleanup::Uniqueness::CaseInsensitiveCache.instance.clear
         | 
| 30 | 
            +
             | 
| 31 | 
            +
                    Dir.glob("#{options.source_dump_path}/#{table_info.db_at_table}@@*.#{table_info.extension}").each do |file|
         | 
| 32 | 
            +
                      # Open3.pipeline_r(pipe_source_args(file), ["head", "-n", "1000"]) do |tsv_data, _wait_thread|
         | 
| 33 | 
            +
                      Open3.pipeline_r(pipe_source_args(file)) do |tsv_data, _wait_thread|
         | 
| 34 | 
            +
                        Open3.pipeline_w(pipe_sink_args(destination_file_for(file))) do |zstd_out, _wait_thread|
         | 
| 35 | 
            +
                          tsv_data.each_line do |line|
         | 
| 36 | 
            +
                            line = line.chomp(table_info.dialect.lines_terminated_by)
         | 
| 37 | 
            +
                            zstd_out.print "#{clean_line(line, table_config:)}#{table_info.dialect.lines_terminated_by}"
         | 
| 38 | 
            +
                          end
         | 
| 39 | 
            +
                        end
         | 
| 40 | 
            +
                      end
         | 
| 41 | 
            +
                    end
         | 
| 42 | 
            +
                  end
         | 
| 43 | 
            +
             | 
| 44 | 
            +
                  private
         | 
| 45 | 
            +
             | 
| 46 | 
            +
                  # Cleans a single data line of the table dump and returns the new line.
                  #
                  # The line is split into fields on tab characters. Each column listed in
                  # +table_config.columns+ is replaced by the value returned from
                  # +cleaning.clean_value_for+; fields holding the NULL marker ("\N") are
                  # left untouched. The fields are then joined by tabs again.
                  #
                  # A progress marker is printed whenever the numeric value of the "id"
                  # entry of the record context is a multiple of 10,000.
                  #
                  # Raises a RuntimeError if a configured column does not exist in the table.
                  def clean_line(line, table_config:)
                    # The negative limit keeps trailing empty fields. Without it a row ending
                    # in empty columns would lose them (and their separators) when re-joined.
                    record = line.split("\t", -1)
                    context = record_context(record, table_config:)
                    print "\r#{context['id']}… " if (context["id"].to_i % 10_000).zero?

                    keep_record = keep_same_record?(context, table_config:)

                    table_config.columns.each do |column_config|
                      column_index = table_info.column_index(column_config.name)
                      raise "Invalid column specified in config: #{column_config.name}" unless column_index

                      next if record[column_index] == "\\N" # ignore NULL values

                      cleanup_data_for_type = cleanup_data.data_for(column_config.cleanup_type)

                      record[column_index] = cleaning.clean_value_for(record[column_index],
                                                                      type: column_config.cleanup_type,
                                                                      cleanup_data: cleanup_data_for_type,
                                                                      record: context,
                                                                      keep_record:,
                                                                      column_config:)
                    end

                    new_line = record.join("\t")
                    warn_on_changed_line_length(line, new_line, id: context["id"], record:)

                    new_line
                  end
         | 
| 74 | 
            +
             | 
| 75 | 
            +
                  def record_context(record, table_config:)
         | 
| 76 | 
            +
                    columns = table_config.record_context_columns
         | 
| 77 | 
            +
                    context = columns.each_with_object({}) do |column, context|
         | 
| 78 | 
            +
                      context[column] = record[table_info.column_index(column)]
         | 
| 79 | 
            +
                    end
         | 
| 80 | 
            +
                    context["id_column"] = record[table_info.column_index(table_config.id_column)]
         | 
| 81 | 
            +
                    context
         | 
| 82 | 
            +
                  end
         | 
| 83 | 
            +
             | 
| 84 | 
            +
                  # Logs an error when cleaning changed the byte size of a line.
                  #
                  # Does nothing when +orig_line+ and +new_line+ have the same byte size.
                  # Otherwise the logged message contains the given +id+, both byte sizes
                  # and an "original -> new" pair for every field whose byte size differs
                  # from (or which is missing in) the cleaned +record+.
                  def warn_on_changed_line_length(orig_line, new_line, id:, record:)
                    return if orig_line.bytesize == new_line.bytesize

                    # The negative limit keeps trailing empty fields so that they are compared too.
                    changes = orig_line.split("\t", -1).each_with_index.filter_map do |column, i|
                      "#{column} -> #{record[i]}" if !record[i] || column.bytesize != record[i].bytesize
                    end

                    summary = "ID: #{id} bytesize changed: #{orig_line.bytesize} => #{new_line.bytesize}"
                    warning = changes.empty? ? summary : "#{summary}: #{changes.join(', ')}"

                    Log.error { warning }
                  end
         | 
| 94 | 
            +
             | 
| 95 | 
            +
                  def validate_table_info
         | 
| 96 | 
            +
                    case table_info.compression
         | 
| 97 | 
            +
                    when "zstd"
         | 
| 98 | 
            +
                      system("zstd --version >/dev/null 2>&1") || raise("zstd not found in \$PATH")
         | 
| 99 | 
            +
                    else
         | 
| 100 | 
            +
                      raise "Unsupported dump compression format '#{table_info.compression}'"
         | 
| 101 | 
            +
                    end
         | 
| 102 | 
            +
                  end
         | 
| 103 | 
            +
             | 
| 104 | 
            +
                  def pipe_source_args(file)
         | 
| 105 | 
            +
                    case table_info.compression
         | 
| 106 | 
            +
                    when "zstd"
         | 
| 107 | 
            +
                      ["zstd", "-dc", file]
         | 
| 108 | 
            +
                    end
         | 
| 109 | 
            +
                  end
         | 
| 110 | 
            +
             | 
| 111 | 
            +
                  def pipe_sink_args(file)
         | 
| 112 | 
            +
                    case table_info.compression
         | 
| 113 | 
            +
                    when "zstd"
         | 
| 114 | 
            +
                      ["zstd", "-qfo", file]
         | 
| 115 | 
            +
                    end
         | 
| 116 | 
            +
                  end
         | 
| 117 | 
            +
             | 
| 118 | 
            +
                  class DumpTableInfo
         | 
| 119 | 
            +
                    require "json"
         | 
| 120 | 
            +
             | 
| 121 | 
            +
                    DialectOptions = Data.define(:lines_terminated_by, :fields_terminated_by, :fields_enclosed_by,
         | 
| 122 | 
            +
                                                 :fields_optionally_enclosed, :fields_escaped_by)
         | 
| 123 | 
            +
             | 
| 124 | 
            +
                    def self.load(db:, table:, source_dump_path:)
         | 
| 125 | 
            +
                      new(JSON.parse(File.read(table_info_file_path(db:, table:, source_dump_path:))))
         | 
| 126 | 
            +
                    rescue Errno::ENOENT
         | 
| 127 | 
            +
                      raise "Table info file not found in dump for table '#{db}.#{table}'. Is the table included in the dump?"
         | 
| 128 | 
            +
                    end
         | 
| 129 | 
            +
             | 
| 130 | 
            +
                    def self.table_info_file_path(db:, table:, source_dump_path:)
         | 
| 131 | 
            +
                      "#{source_dump_path}/#{db}@#{table}.json"
         | 
| 132 | 
            +
                    end
         | 
| 133 | 
            +
             | 
| 134 | 
            +
                    def initialize(table_info)
         | 
| 135 | 
            +
                      @table_info = table_info
         | 
| 136 | 
            +
                    end
         | 
| 137 | 
            +
             | 
| 138 | 
            +
                    def db
         | 
| 139 | 
            +
                      @db ||= @table_info.dig("options", "schema")
         | 
| 140 | 
            +
                    end
         | 
| 141 | 
            +
             | 
| 142 | 
            +
                    def table
         | 
| 143 | 
            +
                      @table ||= @table_info.dig("options", "table")
         | 
| 144 | 
            +
                    end
         | 
| 145 | 
            +
             | 
| 146 | 
            +
                    def db_dot_table
         | 
| 147 | 
            +
                      "#{db}.#{table}"
         | 
| 148 | 
            +
                    end
         | 
| 149 | 
            +
             | 
| 150 | 
            +
                    def db_at_table
         | 
| 151 | 
            +
                      "#{db}@#{table}"
         | 
| 152 | 
            +
                    end
         | 
| 153 | 
            +
             | 
| 154 | 
            +
                    def compression
         | 
| 155 | 
            +
                      @table_info["compression"]
         | 
| 156 | 
            +
                    end
         | 
| 157 | 
            +
             | 
| 158 | 
            +
                    def extension
         | 
| 159 | 
            +
                      @table_info["extension"]
         | 
| 160 | 
            +
                    end
         | 
| 161 | 
            +
             | 
| 162 | 
            +
                    def columns
         | 
| 163 | 
            +
                      @columns ||= @table_info.dig("options", "columns")
         | 
| 164 | 
            +
                    end
         | 
| 165 | 
            +
             | 
| 166 | 
            +
                    def column_index(name)
         | 
| 167 | 
            +
                      columns.index(name)
         | 
| 168 | 
            +
                    end
         | 
| 169 | 
            +
             | 
| 170 | 
            +
                    def dialect
         | 
| 171 | 
            +
                      @dialect ||= begin
         | 
| 172 | 
            +
                        dialect_options = DialectOptions.members.each_with_object({}) do |option, options|
         | 
| 173 | 
            +
                          lowercase_option = option.to_s.split("_").each_with_object([]) do |e, buffer|
         | 
| 174 | 
            +
                            buffer.push(buffer.empty? ? e : e.capitalize)
         | 
| 175 | 
            +
                          end.join
         | 
| 176 | 
            +
                          options[option] = @table_info.dig("options", lowercase_option)
         | 
| 177 | 
            +
                        end
         | 
| 178 | 
            +
                        DialectOptions.new(**dialect_options)
         | 
| 179 | 
            +
                      end
         | 
| 180 | 
            +
                    end
         | 
| 181 | 
            +
                  end
         | 
| 182 | 
            +
                end
         | 
| 183 | 
            +
              end
         | 
| 184 | 
            +
            end
         | 
| @@ -0,0 +1,39 @@ | |
| 1 | 
            +
            module DumpCleaner
              module Cleanup
                # Helpers for adjusting strings to an exact size in bytes (as opposed to
                # a size in characters) without cutting a multi-byte character in half.
                module BytesizeHelpers
                  # Shortens +string+ so that it takes exactly +max_bytesize+ bytes.
                  #
                  # Strings that already fit are returned unchanged. Otherwise the string
                  # is cut at a character boundary and, if the cut leaves fewer than
                  # +max_bytesize+ bytes, filled up with +padding+.
                  #
                  # Raises ArgumentError when truncation is needed and +padding+ is longer
                  # than one byte, or when padding has to be added and +padding+ is empty.
                  #
                  # inspired by https://stackoverflow.com/a/67825008/1544012
                  def truncate_to_bytesize(string, max_bytesize:, padding: " ")
                    return string unless string.bytesize > max_bytesize

                    check_padding_bytesize(padding)

                    # index of the first character that makes the prefix exceed the limit
                    just_over = (0...string.size).bsearch { string[0.._1].bytesize > max_bytesize }
                    string = string[0...just_over]

                    missing_bytes = max_bytesize - string.bytesize
                    return string unless missing_bytes.positive?

                    # An empty padding could never fill the gap; fail instead of looping forever.
                    raise ArgumentError, "Padding must not be empty when the truncated string needs padding." if padding.empty?

                    string << (padding * missing_bytes)
                  end

                  # Returns +string+ adjusted to exactly +bytesize+ bytes.
                  #
                  # A string that is too short is first extended by repeating +padding+
                  # followed by the string itself; the result is then truncated to the
                  # requested byte size.
                  def set_to_bytesize(string, bytesize:, padding: " ")
                    string = string.ljust(bytesize, "#{padding}#{string}") if string.bytesize < bytesize
                    truncate_to_bytesize(string, max_bytesize: bytesize, padding:)
                  end

                  # Returns +string+ with its tail replaced by +suffix+: the string is
                  # truncated to its own byte size minus the byte size of +suffix+ and
                  # +suffix+ is appended.
                  def replace_suffix(string, suffix:, padding: " ")
                    front_max_bytes = string.bytesize - suffix.bytesize
                    front = truncate_to_bytesize(string, max_bytesize: front_max_bytes, padding:)

                    "#{front}#{suffix}"
                  end

                  private

                  # Raises ArgumentError if +padding+ is longer than a single byte.
                  def check_padding_bytesize(padding)
                    return unless padding.bytesize > 1

                    raise ArgumentError,
                          "Use only a single-byte string in the padding otherwise it may prevent adjusting the result precisely."
                  end
                end
              end
            end
         | 
| @@ -0,0 +1,69 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module DumpCleaner
         | 
| 4 | 
            +
              module Cleanup
         | 
| 5 | 
            +
                class Cleaning
         | 
| 6 | 
            +
                  include Uniqueness
         | 
| 7 | 
            +
             | 
| 8 | 
            +
                  attr_reader :config
         | 
| 9 | 
            +
             | 
| 10 | 
            +
                  def initialize(config:)
         | 
| 11 | 
            +
                    @cleaning_workflow = Workflow.new(phase: :cleaning)
         | 
| 12 | 
            +
                    @failure_workflow = Workflow.new(phase: :failure)
         | 
| 13 | 
            +
                    @config = config
         | 
| 14 | 
            +
                  end
         | 
| 15 | 
            +
             | 
| 16 | 
            +
                  # Produces the cleaned replacement for a single value.
                  #
                  # A StepContext is built from the arguments. The original value (with an
                  # optional uniqueness suffix) is returned when either the record is flagged
                  # to be kept and the config does not ignore that flag for this type, or the
                  # configured keep_same conditions evaluate to true. Otherwise the workflows
                  # are run: once for ordinary columns, and inside repeat_until_unique for
                  # unique columns. When the unique attempts raise MaxRetriesReachedError,
                  # the failure workflow alone is retried under repeat_until_unique.
                  def clean_value_for(orig_value, type:, cleanup_data:, column_config:, record: {}, keep_record: false) # rubocop:disable Metrics/ParameterLists
                    step_context = StepContext.new(orig_value:, type:, cleanup_data:, record:)

                    keep_same =
                      (keep_record && !config.ignore_keep_same_record_conditions?(type)) ||
                      Conditions.evaluate_to_true_in_step?(conditions: config.keep_same_conditions(type), step_context:)
                    return orig_value_with_optional_suffix(step_context, column_config:) if keep_same

                    # Non-unique columns need no repetition handling.
                    return run_workflows(step_context) unless column_config.unique_column?

                    begin
                      repeat_until_unique(step_context:) do |attempt|
                        step_context.repetition = attempt
                        run_workflows(step_context)
                      end
                    rescue MaxRetriesReachedError
                      # The regular workflows could not yield a unique value; fall back to
                      # the failure workflow, still enforcing uniqueness.
                      repeat_until_unique(step_context:) do |attempt|
                        step_context.repetition = attempt
                        run_failure_workflow(step_context)
                      end
                    end
                  end
         | 
| 41 | 
            +
             | 
| 42 | 
            +
                  private
         | 
| 43 | 
            +
             | 
| 44 | 
            +
                  # Returns the original value of the step context. For unique columns the
                  # value is passed through the AddRepetitionSuffix step inside
                  # repeat_until_unique, with the current repetition stored on the context
                  # before each attempt.
                  def orig_value_with_optional_suffix(step_context, column_config:)
                    return step_context.orig_value unless column_config.unique_column?

                    repeat_until_unique(step_context:) do |attempt|
                      step_context.repetition = attempt
                      suffix_step = DumpCleaner::Cleanup::CleaningSteps::AddRepetitionSuffix.new(step_context)
                      suffix_step.run.current_value
                    end
                  end
         | 
| 54 | 
            +
             | 
| 55 | 
            +
                  # Runs the cleaning workflow and returns its value; when that value is
                  # nil or false, the failure workflow is run and its value returned instead.
                  def run_workflows(step_context)
                    cleaned_value = run_cleaning_workflow(step_context)
                    return cleaned_value if cleaned_value

                    run_failure_workflow(step_context)
                  end
         | 
| 58 | 
            +
             | 
| 59 | 
            +
                  # Runs the cleaning workflow with the :cleaning steps configured for the
                  # context's type and returns the resulting current_value.
                  def run_cleaning_workflow(step_context)
                    cleaning_steps = config.steps_for(step_context.type, :cleaning)
                    @cleaning_workflow.run(step_context, step_configs: cleaning_steps).current_value
                  end
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                  # Resets the context's current_value back to the original value, then runs
                  # the failure workflow with the :failure steps configured for the context's
                  # type and returns the resulting current_value.
                  def run_failure_workflow(step_context)
                    step_context.current_value = step_context.orig_value # reset current_value

                    failure_steps = config.steps_for(step_context.type, :failure)
                    @failure_workflow.run(step_context, step_configs: failure_steps).current_value
                  end
         | 
| 67 | 
            +
                end
         | 
| 68 | 
            +
              end
         | 
| 69 | 
            +
            end
         | 
| @@ -0,0 +1,23 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module DumpCleaner
         | 
| 4 | 
            +
              module Cleanup
         | 
| 5 | 
            +
                module CleaningSteps
         | 
| 6 | 
            +
                  # Cleaning step that adjusts the current value according to the
                  # repetition number stored on the step context.
                  class AddRepetitionSuffix < Base
                    include BytesizeHelpers

                    # Replaces the step context's current_value with the adjusted value and
                    # returns the step context.
                    def run
                      step_context.current_value = value_for_repetition
                      step_context
                    end

                    private

                    # Chooses the value for the current repetition:
                    # - repetition zero keeps the value unchanged;
                    # - when the value has more bytes than the repetition number's decimal
                    #   string, replace_suffix is called with that string as the suffix
                    #   and "0" as padding;
                    # - otherwise a GenerateRandomString step run on a new context derived
                    #   from this one supplies the value.
                    def value_for_repetition
                      return current_value if repetition.zero?

                      suffix = repetition.to_s
                      if current_value.bytesize > suffix.bytesize
                        replace_suffix(current_value, suffix:, padding: "0")
                      else
                        random_step = GenerateRandomString.new(StepContext.new_from(step_context))
                        random_step.run.current_value
                      end
                    end
                  end
         | 
| 21 | 
            +
                end
         | 
| 22 | 
            +
              end
         | 
| 23 | 
            +
            end
         | 
| @@ -0,0 +1,33 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module DumpCleaner
         | 
| 4 | 
            +
              module Cleanup
         | 
| 5 | 
            +
                module CleaningSteps
         | 
| 6 | 
            +
                  # Common parent of the cleaning steps. Holds a copy of the step context
                  # and exposes its most used attributes directly on the step.
                  class Base
                    require "forwardable"
                    require "zlib"

                    extend Forwardable

                    def_delegators :step_context, *%i[cleanup_data current_value orig_value type record repetition]

                    attr_reader :step_context

                    # Stores a shallow copy (dup) of the given step context.
                    def initialize(step_context)
                      @step_context = step_context.dup
                    end

                    # Returns the CRC32 checksum of "<record['id_column']>-<current_value>".
                    # When the repetition is positive and use_repetition is true,
                    # "-<repetition>" is appended before hashing.
                    def crc32(use_repetition: true)
                      seed = "#{record['id_column']}-#{current_value}"
                      repetition_part = repetition.positive? && use_repetition ? "-#{repetition}" : ""
                      Zlib.crc32(seed + repetition_part)
                    end

                    # Raises ArgumentError describing invalid step params; the message names
                    # the context type and the unqualified class name of the step.
                    def raise_params_error(error)
                      step = self.class.name.rpartition("::").last
                      raise ArgumentError, "Invalid cleanup step params: type=#{type}, step=#{step}: #{error}"
                    end
                  end
         | 
| 31 | 
            +
                end
         | 
| 32 | 
            +
              end
         | 
| 33 | 
            +
            end
         | 
| @@ -0,0 +1,20 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module DumpCleaner
         | 
| 4 | 
            +
              module Cleanup
         | 
| 5 | 
            +
                module CleaningSteps
         | 
| 6 | 
            +
                  # Cleaning step that replaces the value with a fixed string adjusted to
                  # the bytesize of the original value.
                  class FillUpWithString < Base
                    include BytesizeHelpers

                    # string                - replacement text (defaults to "anonymized <type>")
                    # padding               - padding passed on to set_to_bytesize
                    # strict_bytesize_check - when true, raises unless the string already has
                    #                         the same bytesize as the original value
                    #
                    # The string is passed through set_to_bytesize with the original value's
                    # bytesize, and the result is handed to an AddRepetitionSuffix step run
                    # on a new context; that step's return value is returned.
                    def run(string: "anonymized #{type}", padding: " ", strict_bytesize_check: false)
                      target_bytesize = orig_value.bytesize

                      if strict_bytesize_check && string.bytesize != target_bytesize
                        raise "The bytesize of the string must be equal to the bytesize of the original value."
                      end

                      filler = set_to_bytesize(string, bytesize: target_bytesize, padding:)
                      suffix_context = StepContext.new_from(step_context, current_value: filler)
                      AddRepetitionSuffix.new(suffix_context).run
                    end
                  end
         | 
| 18 | 
            +
                end
         | 
| 19 | 
            +
              end
         | 
| 20 | 
            +
            end
         | 
| @@ -0,0 +1,37 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module DumpCleaner
         | 
| 4 | 
            +
              module Cleanup
         | 
| 5 | 
            +
                module CleaningSteps
         | 
| 6 | 
            +
                  # Cleaning step that replaces the current value with a random string of
                  # the same bytesize, drawn from a generator seeded with the step's crc32.
                  class GenerateRandomString < Base
                    require "random/formatter"

                    # character_set - name of a built-in set ("alphanumeric", "alpha",
                    #                 "lowercase", "uppercase", "numeric"; symbols work too
                    #                 since the name is compared via to_s); any other value
                    #                 is handed to the generator unchanged.
                    #
                    # Returns the step context with current_value replaced.
                    def run(character_set: "alphanumeric")
                      generator = Random.new(crc32)
                      length = current_value.bytesize

                      step_context.current_value = generator.alphanumeric(length, chars: characters(character_set))
                      step_context
                    end

                    private

                    # Resolves a character set name to the list of characters to draw from.
                    def characters(character_set)
                      case character_set.to_s
                      when "alphanumeric" then Random::Formatter::ALPHANUMERIC
                      when "alpha" then ("a".."z").to_a + ("A".."Z").to_a
                      when "lowercase" then ("a".."z").to_a
                      when "uppercase" then ("A".."Z").to_a
                      when "numeric" then ("0".."9").to_a
                      else character_set
                      end
                    end
                  end
         | 
| 35 | 
            +
                end
         | 
| 36 | 
            +
              end
         | 
| 37 | 
            +
            end
         | 
| @@ -0,0 +1,78 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module DumpCleaner
         | 
| 4 | 
            +
              module Cleanup
         | 
| 5 | 
            +
                module CleaningSteps
         | 
| 6 | 
            +
                  # Cleaning step that replaces an e-mail address with a randomized one.
                  # The mailbox and the domain part before the last dot are replaced by
                  # dictionary words (or random lowercase strings when no word is found);
                  # the last domain label is preserved, as are whole domains listed in the
                  # "domains to keep" cleanup data.
                  class RandomizeEmail < Base
                    # domains_to_keep_data_key - cleanup_data key holding domains that must
                    #                            stay unchanged; nil or an empty string
                    #                            disables this feature
                    # words_data_key           - cleanup_data key holding the dictionary words
                    #
                    # Returns the step context. Its current_value is the new address, or nil
                    # when the current value is not a usable e-mail (a warning is logged on
                    # the first repetition only).
                    # Raises ArgumentError (via raise_params_error) for missing data keys.
                    def run(domains_to_keep_data_key: "domains_to_keep", words_data_key: "words")
                      validate_params(domains_to_keep_data_key:, words_data_key:)

                      mailbox, domain = current_value.split("@", 2)

                      if !mailbox || !domain || mailbox.empty? || domain.empty? || !domain.include?(".")
                        Log.warn { "Invalid email: type=#{type}, id=#{record['id']}, value=#{current_value}" } if repetition.zero?
                        step_context.current_value = nil
                        return step_context
                      end

                      words = cleanup_data[words_data_key]
                      randomized_mailbox = new_mailbox(mailbox, words:)
                      randomized_domain = new_domain(domain, domains: cleanup_data[domains_to_keep_data_key], words:)

                      step_context.current_value = "#{randomized_mailbox}@#{randomized_domain}"
                      step_context
                    end

                    private

                    # Replaces the mailbox. A mailbox without leading, trailing or doubled
                    # dots is replaced part by part so its dot structure is kept; otherwise
                    # it is replaced as a whole.
                    def new_mailbox(mailbox, words:)
                      if mailbox !~ /^\.|\.\.|\.$/
                        mailbox.split(".").map { dictionary_or_random_word_instead_of(_1, words:) }.join(".")
                      else
                        dictionary_or_random_word_instead_of(mailbox, words:)
                      end
                    end

                    # Returns the domain unchanged when it is among the domains to keep;
                    # otherwise replaces everything before the last dot and keeps the rest.
                    def new_domain(domain, domains:, words:)
                      # `domains` is nil when the domains-to-keep key was disabled (nil or
                      # empty), which validate_params allows; treat that as "keep nothing"
                      # instead of failing on nil.include?.
                      return domain if domains&.include?(domain)

                      tld2, _dot, tld = domain.rpartition(".")
                      new_tld2 = dictionary_or_random_word_instead_of(tld2, words:)
                      "#{new_tld2}.#{tld}"
                    end

                    # Prefers a dictionary word, falling back to a random lowercase string.
                    def dictionary_or_random_word_instead_of(word, words:)
                      dictionary_word_instead_of(word, words:) || random_word_instead_of(word)
                    end

                    # Runs SelectDataByBytesize and then TakeSample (suffix uniqueness
                    # strategy) on a context whose cleanup data is the word list, returning
                    # the resulting current_value.
                    def dictionary_word_instead_of(word, words:)
                      context = StepContext.new_from(step_context, current_value: word, cleanup_data: words)
                      context = SelectDataByBytesize.new(context).run
                      TakeSample.new(context).run(uniqueness_strategy: :suffix).current_value
                    end

                    # Generates a random lowercase replacement via GenerateRandomString.
                    def random_word_instead_of(word)
                      GenerateRandomString.new(StepContext.new_from(step_context, current_value: word))
                                          .run(character_set: :lowercase).current_value
                    end

                    # Verifies that cleanup_data is hash-like and contains the required keys.
                    def validate_params(domains_to_keep_data_key:, words_data_key:)
                      # Check for the method that is actually called below (key?).
                      raise("The cleanup_data must be a hash") unless cleanup_data.respond_to?(:key?)

                      if domains_to_keep_data_key && !domains_to_keep_data_key.empty? &&
                         !cleanup_data.key?(domains_to_keep_data_key)
                        raise_params_error("The cleanup_data does not contain the \"#{domains_to_keep_data_key}\" key.
                                            Either add the domains to the cleanup data hash or set the domains_to_keep_data_key
                                            to null or an empty string.".gsub(/\s+/, " "))
                      end

                      return if cleanup_data.key?(words_data_key)

                      raise_params_error("The cleanup_data does not contain the \"#{words_data_key}\" key")
                    end
                  end
         | 
| 76 | 
            +
                end
         | 
| 77 | 
            +
              end
         | 
| 78 | 
            +
            end
         |