RubyGems - dump_cleaner - Versions diffs - 0.5.0 - Mend

dump_cleaner 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

checksums.yaml +7 -0
data/.rspec +2 -0
data/.rubocop.yml +25 -0
data/CHANGELOG.md +5 -0
data/LICENSE.txt +21 -0
data/README.md +295 -0
data/Rakefile +8 -0
data/doc/workflow_steps.md +1400 -0
data/dump_cleaner.gemspec +38 -0
data/exe/dump_cleaner +7 -0
data/lib/dump_cleaner/cleaners/base_cleaner.rb +32 -0
data/lib/dump_cleaner/cleaners/mysql_shell_dump_cleaner.rb +47 -0
data/lib/dump_cleaner/cleaners/mysql_shell_dump_helpers.rb +11 -0
data/lib/dump_cleaner/cleaners/mysql_shell_table_cleaner.rb +184 -0
data/lib/dump_cleaner/cleanup/bytesize_helpers.rb +39 -0
data/lib/dump_cleaner/cleanup/cleaning.rb +69 -0
data/lib/dump_cleaner/cleanup/cleaning_steps/add_repetition_suffix.rb +23 -0
data/lib/dump_cleaner/cleanup/cleaning_steps/base.rb +33 -0
data/lib/dump_cleaner/cleanup/cleaning_steps/fill_up_with_string.rb +20 -0
data/lib/dump_cleaner/cleanup/cleaning_steps/generate_random_string.rb +37 -0
data/lib/dump_cleaner/cleanup/cleaning_steps/inspect_context.rb +16 -0
data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_email.rb +78 -0
data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_formatted_number.rb +63 -0
data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_number.rb +29 -0
data/lib/dump_cleaner/cleanup/cleaning_steps/select_data_by_bytesize.rb +17 -0
data/lib/dump_cleaner/cleanup/cleaning_steps/select_data_by_pattern.rb +20 -0
data/lib/dump_cleaner/cleanup/cleaning_steps/take_sample.rb +28 -0
data/lib/dump_cleaner/cleanup/data_source.rb +19 -0
data/lib/dump_cleaner/cleanup/data_source_steps/base.rb +26 -0
data/lib/dump_cleaner/cleanup/data_source_steps/group_by_bytesize.rb +37 -0
data/lib/dump_cleaner/cleanup/data_source_steps/inspect_context.rb +16 -0
data/lib/dump_cleaner/cleanup/data_source_steps/load_yaml_file.rb +24 -0
data/lib/dump_cleaner/cleanup/data_source_steps/remove_accents.rb +29 -0
data/lib/dump_cleaner/cleanup/inspection.rb +37 -0
data/lib/dump_cleaner/cleanup/step_context.rb +46 -0
data/lib/dump_cleaner/cleanup/uniqueness.rb +66 -0
data/lib/dump_cleaner/cleanup/workflow.rb +38 -0
data/lib/dump_cleaner/conditions.rb +42 -0
data/lib/dump_cleaner/config.rb +109 -0
data/lib/dump_cleaner/log.rb +42 -0
data/lib/dump_cleaner/options.rb +46 -0
data/lib/dump_cleaner/processor.rb +37 -0
data/lib/dump_cleaner/version.rb +5 -0
data/lib/dump_cleaner.rb +10 -0
metadata +105 -0

data/dump_cleaner.gemspec ADDED Viewed

@@ -0,0 +1,38 @@
+# frozen_string_literal: true
+require_relative "lib/dump_cleaner/version"
+Gem::Specification.new do |spec|
+  spec.name = "dump_cleaner"
+  spec.version = DumpCleaner::VERSION
+  spec.authors = ["Matouš Borák"]
+  spec.email = ["matous.borak@nejremeslnici.cz"]
+  spec.summary = "Anonymizes data in logical database dumps."
+  spec.description = "Deterministically anonymizes data in logical database dumps. Useful for importing (anonymized) production data into development environments."
+  spec.homepage = "https://github.com/NejRemeslnici/dump-cleaner"
+  spec.license = "MIT"
+  spec.required_ruby_version = ">= 3.1.0"
+  spec.metadata["homepage_uri"] = spec.homepage
+  spec.metadata["source_code_uri"] = "https://github.com/NejRemeslnici/dump-cleaner"
+  spec.metadata["changelog_uri"] = "https://github.com/NejRemeslnici/dump-cleaner/blob/main/CHANGELOG.md"
+  # Specify which files should be added to the gem when it is released.
+  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
+  spec.files = Dir.chdir(__dir__) do
+    `git ls-files -z`.split("\x0").reject do |f|
+      (File.expand_path(f) == __FILE__) ||
+        f.start_with?(*%w[bin/ test/ spec/ features/ .git .github appveyor Gemfile])
+    end
+  end
+  spec.bindir = "exe"
+  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
+  spec.require_paths = ["lib"]
+  # Uncomment to register a new dependency of your gem
+  spec.add_dependency "zeitwerk", "~> 2.6"
+  # For more information and examples about making a new gem, check out our
+  # guide at: https://bundler.io/guides/creating_gem.html
+end

data/exe/dump_cleaner ADDED Viewed

@@ -0,0 +1,7 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+require "dump_cleaner"
+# Parse the command-line arguments and run the processor with the resulting options.
+DumpCleaner::Processor.new(DumpCleaner::Options.new(ARGV)).run

data/lib/dump_cleaner/cleaners/base_cleaner.rb ADDED Viewed

@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+module DumpCleaner
+  module Cleaners
+    class BaseCleaner
+      attr_reader :config, :options
+      def initialize(config:, options:)
+        @config = config
+        @options = options
+      end
+      def pre_cleanup
+        # Implement in subclass if needed
+      end
+      def clean
+        raise NotImplementedError
+      end
+      def post_cleanup
+        # Implement in subclass if needed
+      end
+      def keep_same_record?(record, table_config:)
+        return false unless table_config.keep_same_record_conditions
+        Conditions.new(table_config.keep_same_record_conditions).evaluate_to_true?(record:)
+      end
+    end
+  end
+end

data/lib/dump_cleaner/cleaners/mysql_shell_dump_cleaner.rb ADDED Viewed

@@ -0,0 +1,47 @@
+# frozen_string_literal: true
+module DumpCleaner
+  module Cleaners
+    class MysqlShellDumpCleaner < BaseCleaner
+      require "fileutils"
+      include MysqlShellDumpHelpers
+      def pre_cleanup
+        validate_source_dump
+        prepare_destination_dump
+      end
+      def clean
+        config.cleanup_tables.each do |db, table|
+          table_cleaner = MysqlShellTableCleaner.new(db:, table:, config:, options:)
+          table_cleaner.pre_cleanup
+          table_cleaner.clean
+          table_cleaner.post_cleanup
+        end
+      end
+      def post_cleanup
+        copy_remaining_files
+      end
+      private
+      def validate_source_dump
+        raise "Source dump path does not exist: #{options.source_dump_path}" unless Dir.exist?(options.source_dump_path)
+      end
+      def prepare_destination_dump
+        Dir.mkdir(options.destination_dump_path) unless Dir.exist?(options.destination_dump_path)
+      end
+      def copy_remaining_files
+        Dir.glob("#{options.source_dump_path}/*").each do |file|
+          destination_file = destination_file_for(file)
+          FileUtils.cp(file, destination_file, preserve: true) unless File.exist?(destination_file)
+        end
+      end
+    end
+  end
+end

data/lib/dump_cleaner/cleaners/mysql_shell_dump_helpers.rb ADDED Viewed

@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+module DumpCleaner
+  module Cleaners
+    module MysqlShellDumpHelpers
+      def destination_file_for(source_file)
+        source_file.sub(options.source_dump_path, options.destination_dump_path)
+      end
+    end
+  end
+end

data/lib/dump_cleaner/cleaners/mysql_shell_table_cleaner.rb ADDED Viewed

@@ -0,0 +1,184 @@
+# frozen_string_literal: true
+module DumpCleaner
+  module Cleaners
+    class MysqlShellTableCleaner < BaseCleaner
+      require "open3"
+      include MysqlShellDumpHelpers
+      attr_reader :table_info, :cleanup_data, :cleaning
+      def initialize(db:, table:, config:, options:)
+        super(config:, options:)
+        @db = db
+        @table = table
+        @cleanup_data = Cleanup::DataSource.new(config:)
+        @cleaning = Cleanup::Cleaning.new(config:)
+      end
+      def pre_cleanup
+        @table_info = DumpTableInfo.load(db: @db, table: @table, source_dump_path: options.source_dump_path)
+        validate_table_info
+      end
+      def clean
+        table_config = config.cleanup_table_config(db: @db, table: @table)
+        Log.info { "Cleaning table #{table_info.db_dot_table}…" }
+        DumpCleaner::Cleanup::Uniqueness::CaseInsensitiveCache.instance.clear
+        Dir.glob("#{options.source_dump_path}/#{table_info.db_at_table}@@*.#{table_info.extension}").each do |file|
+          # Open3.pipeline_r(pipe_source_args(file), ["head", "-n", "1000"]) do |tsv_data, _wait_thread|
+          Open3.pipeline_r(pipe_source_args(file)) do |tsv_data, _wait_thread|
+            Open3.pipeline_w(pipe_sink_args(destination_file_for(file))) do |zstd_out, _wait_thread|
+              tsv_data.each_line do |line|
+                line = line.chomp(table_info.dialect.lines_terminated_by)
+                zstd_out.print "#{clean_line(line, table_config:)}#{table_info.dialect.lines_terminated_by}"
+              end
+            end
+          end
+        end
+      end
+      private
+      def clean_line(line, table_config:)
+        record = line.split("\t")
+        record_context = record_context(record, table_config:)
+        print "\r#{record_context['id']}… " if (record_context["id"].to_i % 10_000).zero?
+        keep_record = keep_same_record?(record_context, table_config:)
+        table_config.columns.each do |column_config|
+          column_index = table_info.column_index(column_config.name)
+          raise "Invalid column specified in config: #{column_config.name}" unless column_index
+          next if record[column_index] == "\\N" # ignore NULL values
+          cleanup_data_for_type = cleanup_data.data_for(column_config.cleanup_type)
+          record[column_index] = cleaning.clean_value_for(record[column_index],
+                                                          type: column_config.cleanup_type,
+                                                          cleanup_data: cleanup_data_for_type,
+                                                          record: record_context,
+                                                          keep_record:,
+                                                          column_config:)
+        end
+        new_line = record.join("\t")
+        warn_on_changed_line_length(line, new_line, id: record_context["id"], record:)
+        new_line
+      end
+      def record_context(record, table_config:)
+        columns = table_config.record_context_columns
+        context = columns.each_with_object({}) do |column, context|
+          context[column] = record[table_info.column_index(column)]
+        end
+        context["id_column"] = record[table_info.column_index(table_config.id_column)]
+        context
+      end
+      def warn_on_changed_line_length(orig_line, new_line, id:, record:)
+        return if orig_line.bytesize == new_line.bytesize
+        warning = "ID: #{id} bytesize changed: #{orig_line.bytesize} => #{new_line.bytesize}"
+        orig_line.split("\t").each_with_index do |column, i|
+          warning << "#{column} -> #{record[i]}" if !record[i] || column.bytesize != record[i].bytesize
+        end
+        Log.error { warning }
+      end
+      def validate_table_info
+        case table_info.compression
+        when "zstd"
+          system("zstd --version >/dev/null 2>&1") || raise("zstd not found in \$PATH")
+        else
+          raise "Unsupported dump compression format '#{table_info.compression}'"
+        end
+      end
+      def pipe_source_args(file)
+        case table_info.compression
+        when "zstd"
+          ["zstd", "-dc", file]
+        end
+      end
+      def pipe_sink_args(file)
+        case table_info.compression
+        when "zstd"
+          ["zstd", "-qfo", file]
+        end
+      end
+      class DumpTableInfo
+        require "json"
+        DialectOptions = Data.define(:lines_terminated_by, :fields_terminated_by, :fields_enclosed_by,
+                                     :fields_optionally_enclosed, :fields_escaped_by)
+        def self.load(db:, table:, source_dump_path:)
+          new(JSON.parse(File.read(table_info_file_path(db:, table:, source_dump_path:))))
+        rescue Errno::ENOENT
+          raise "Table info file not found in dump for table '#{db}.#{table}'. Is the table included in the dump?"
+        end
+        def self.table_info_file_path(db:, table:, source_dump_path:)
+          "#{source_dump_path}/#{db}@#{table}.json"
+        end
+        def initialize(table_info)
+          @table_info = table_info
+        end
+        def db
+          @db ||= @table_info.dig("options", "schema")
+        end
+        def table
+          @table ||= @table_info.dig("options", "table")
+        end
+        def db_dot_table
+          "#{db}.#{table}"
+        end
+        def db_at_table
+          "#{db}@#{table}"
+        end
+        def compression
+          @table_info["compression"]
+        end
+        def extension
+          @table_info["extension"]
+        end
+        def columns
+          @columns ||= @table_info.dig("options", "columns")
+        end
+        def column_index(name)
+          columns.index(name)
+        end
+        def dialect
+          @dialect ||= begin
+            dialect_options = DialectOptions.members.each_with_object({}) do |option, options|
+              lowercase_option = option.to_s.split("_").each_with_object([]) do |e, buffer|
+                buffer.push(buffer.empty? ? e : e.capitalize)
+              end.join
+              options[option] = @table_info.dig("options", lowercase_option)
+            end
+            DialectOptions.new(**dialect_options)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/dump_cleaner/cleanup/bytesize_helpers.rb ADDED Viewed

@@ -0,0 +1,39 @@
+module DumpCleaner
+  module Cleanup
+    module BytesizeHelpers
+      # inspired by https://stackoverflow.com/a/67825008/1544012
+      def truncate_to_bytesize(string, max_bytesize:, padding: " ")
+        return string unless string.bytesize > max_bytesize
+        check_padding_bytesize(padding)
+        just_over = (0...string.size).bsearch { string[0.._1].bytesize > max_bytesize }
+        string = string[0...just_over]
+        string << padding while string.bytesize < max_bytesize
+        string
+      end
+      def set_to_bytesize(string, bytesize:, padding: " ")
+        string = string.ljust(bytesize, "#{padding}#{string}") if string.bytesize < bytesize
+        truncate_to_bytesize(string, max_bytesize: bytesize, padding:)
+      end
+      def replace_suffix(string, suffix:, padding: " ")
+        front_max_bytes = string.bytesize - suffix.bytesize
+        front = truncate_to_bytesize(string, max_bytesize: front_max_bytes, padding:)
+        "#{front}#{suffix}"
+      end
+      private
+      def check_padding_bytesize(padding)
+        return unless padding.bytesize > 1
+        raise ArgumentError,
+              "Use only a single-byte string in the padding otherwise it may prevent adjusting the result precisely."
+      end
+    end
+  end
+end

data/lib/dump_cleaner/cleanup/cleaning.rb ADDED Viewed

@@ -0,0 +1,69 @@
+# frozen_string_literal: true
+module DumpCleaner
+  module Cleanup
+    class Cleaning
+      include Uniqueness
+      attr_reader :config
+      def initialize(config:)
+        @cleaning_workflow = Workflow.new(phase: :cleaning)
+        @failure_workflow = Workflow.new(phase: :failure)
+        @config = config
+      end
+      def clean_value_for(orig_value, type:, cleanup_data:, column_config:, record: {}, keep_record: false) # rubocop:disable Metrics/ParameterLists
+        step_context = StepContext.new(orig_value:, type:, cleanup_data:, record:)
+        # return orig_value if keep_same conditions are met
+        if (keep_record && !config.ignore_keep_same_record_conditions?(type)) ||
+           Conditions.evaluate_to_true_in_step?(conditions: config.keep_same_conditions(type), step_context:)
+          return orig_value_with_optional_suffix(step_context, column_config:)
+        end
+        if column_config.unique_column?
+          begin
+            repeat_until_unique(step_context:) do |repetition|
+              step_context.repetition = repetition
+              run_workflows(step_context)
+            end
+          rescue MaxRetriesReachedError
+            repeat_until_unique(step_context:) do |repetition|
+              step_context.repetition = repetition
+              run_failure_workflow(step_context)
+            end
+          end
+        else
+          run_workflows(step_context)
+        end
+      end
+      private
+      def orig_value_with_optional_suffix(step_context, column_config:)
+        if column_config.unique_column?
+          repeat_until_unique(step_context:) do |repetition|
+            step_context.repetition = repetition
+            DumpCleaner::Cleanup::CleaningSteps::AddRepetitionSuffix.new(step_context).run.current_value
+          end
+        else
+          step_context.orig_value
+        end
+      end
+      def run_workflows(step_context)
+        run_cleaning_workflow(step_context) || run_failure_workflow(step_context)
+      end
+      def run_cleaning_workflow(step_context)
+        @cleaning_workflow.run(step_context, step_configs: config.steps_for(step_context.type, :cleaning)).current_value
+      end
+      def run_failure_workflow(step_context)
+        step_context.current_value = step_context.orig_value # reset current_value
+        @failure_workflow.run(step_context, step_configs: config.steps_for(step_context.type, :failure)).current_value
+      end
+    end
+  end
+end

data/lib/dump_cleaner/cleanup/cleaning_steps/add_repetition_suffix.rb ADDED Viewed

@@ -0,0 +1,23 @@
+# frozen_string_literal: true
+module DumpCleaner
+  module Cleanup
+    module CleaningSteps
+      class AddRepetitionSuffix < Base
+        include BytesizeHelpers
+        def run
+          step_context.current_value = if repetition.zero?
+                                         current_value
+                                       elsif current_value.bytesize > repetition.to_s.bytesize
+                                         replace_suffix(current_value, suffix: repetition.to_s, padding: "0")
+                                       else
+                                         GenerateRandomString.new(StepContext.new_from(step_context))
+                                                             .run.current_value
+                                       end
+          step_context
+        end
+      end
+    end
+  end
+end

data/lib/dump_cleaner/cleanup/cleaning_steps/base.rb ADDED Viewed

@@ -0,0 +1,33 @@
+# frozen_string_literal: true
+module DumpCleaner
+  module Cleanup
+    module CleaningSteps
+      class Base
+        require "forwardable"
+        require "zlib"
+        extend Forwardable
+        def_delegators :step_context, :cleanup_data, :current_value, :orig_value, :type, :record, :repetition
+        attr_reader :step_context
+        def initialize(step_context)
+          @step_context = step_context.dup
+        end
+        def crc32(use_repetition: true)
+          value_to_hash = "#{record['id_column']}-#{current_value}"
+          value_to_hash += "-#{repetition}" if repetition.positive? && use_repetition
+          Zlib.crc32(value_to_hash)
+        end
+        def raise_params_error(error)
+          step = self.class.name.split("::").last
+          raise ArgumentError, "Invalid cleanup step params: type=#{type}, step=#{step}: #{error}"
+        end
+      end
+    end
+  end
+end

data/lib/dump_cleaner/cleanup/cleaning_steps/fill_up_with_string.rb ADDED Viewed

@@ -0,0 +1,20 @@
+# frozen_string_literal: true
+module DumpCleaner
+  module Cleanup
+    module CleaningSteps
+      class FillUpWithString < Base
+        include BytesizeHelpers
+        def run(string: "anonymized #{type}", padding: " ", strict_bytesize_check: false)
+          if strict_bytesize_check && string.bytesize != orig_value.bytesize
+            raise "The bytesize of the string must be equal to the bytesize of the original value."
+          end
+          string = set_to_bytesize(string, bytesize: orig_value.bytesize, padding:)
+          AddRepetitionSuffix.new(StepContext.new_from(step_context, current_value: string)).run
+        end
+      end
+    end
+  end
+end

data/lib/dump_cleaner/cleanup/cleaning_steps/generate_random_string.rb ADDED Viewed

@@ -0,0 +1,37 @@
+# frozen_string_literal: true
+module DumpCleaner
+  module Cleanup
+    module CleaningSteps
+      class GenerateRandomString < Base
+        require "random/formatter"
+        def run(character_set: "alphanumeric")
+          random = Random.new(crc32)
+          step_context.current_value = random.alphanumeric(current_value.bytesize, chars: characters(character_set))
+          step_context
+        end
+        private
+        def characters(character_set)
+          case character_set.to_s
+          when "alphanumeric"
+            Random::Formatter::ALPHANUMERIC
+          when "alpha"
+            [*"a".."z", *"A".."Z"]
+          when "lowercase"
+            [*"a".."z"]
+          when "uppercase"
+            [*"A".."Z"]
+          when "numeric"
+            [*"0".."9"]
+          else
+            character_set
+          end
+        end
+      end
+    end
+  end
+end

data/lib/dump_cleaner/cleanup/cleaning_steps/inspect_context.rb ADDED Viewed

@@ -0,0 +1,16 @@
+# frozen_string_literal: true
+module DumpCleaner
+  module Cleanup
+    module CleaningSteps
+      class InspectContext < Base
+        include Inspection
+        def run
+          inspect_step_context(step_context)
+          step_context
+        end
+      end
+    end
+  end
+end

data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_email.rb ADDED Viewed

@@ -0,0 +1,78 @@
+# frozen_string_literal: true
+module DumpCleaner
+  module Cleanup
+    module CleaningSteps
+      class RandomizeEmail < Base
+        def run(domains_to_keep_data_key: "domains_to_keep", words_data_key: "words")
+          validate_params(domains_to_keep_data_key:, words_data_key:)
+          mailbox, domain = current_value.split("@", 2)
+          if !mailbox || !domain || mailbox.empty? || domain.empty? || !domain.include?(".")
+            Log.warn { "Invalid email: type=#{type}, id=#{record['id']}, value=#{current_value}" } if repetition.zero?
+            step_context.current_value = nil
+            return step_context
+          end
+          new_mailbox = new_mailbox(mailbox, words: cleanup_data[words_data_key])
+          new_domain = new_domain(domain, domains: cleanup_data[domains_to_keep_data_key],
+                                          words: cleanup_data[words_data_key])
+          step_context.current_value = "#{new_mailbox}@#{new_domain}"
+          step_context
+        end
+        private
+        def new_mailbox(mailbox, words:)
+          if mailbox !~ /^\.|\.\.|\.$/
+            mailbox.split(".").map { dictionary_or_random_word_instead_of(_1, words:) }.join(".")
+          else
+            dictionary_or_random_word_instead_of(mailbox, words:)
+          end
+        end
+        def new_domain(domain, domains:, words:)
+          if domains.include?(domain)
+            domain
+          else
+            tld2, _dot, tld = domain.rpartition(".")
+            new_tld2 = dictionary_or_random_word_instead_of(tld2, words:)
+            "#{new_tld2}.#{tld}"
+          end
+        end
+        def dictionary_or_random_word_instead_of(word, words:)
+          dictionary_word_instead_of(word, words:) || random_word_instead_of(word)
+        end
+        def dictionary_word_instead_of(word, words:)
+          context = StepContext.new_from(step_context, current_value: word, cleanup_data: words)
+          context = SelectDataByBytesize.new(context).run
+          TakeSample.new(context).run(uniqueness_strategy: :suffix).current_value
+        end
+        def random_word_instead_of(word)
+          GenerateRandomString.new(StepContext.new_from(step_context, current_value: word))
+                              .run(character_set: :lowercase).current_value
+        end
+        def validate_params(domains_to_keep_data_key:, words_data_key:)
+          raise("The cleanup_data must be a hash") unless cleanup_data.respond_to?(:key)
+          unless !domains_to_keep_data_key || domains_to_keep_data_key.empty? ||
+                 cleanup_data.key?(domains_to_keep_data_key)
+            raise_params_error("The cleanup_data does not contain the \"#{domains_to_keep_data_key}\" key.
+                                Either add the domains to the cleanup data hash or set the domains_to_keep_data_key
+                                to null or an empty string.".gsub(/\s+/, " "))
+          end
+          return if cleanup_data.key?(words_data_key)
+          raise_params_error("The cleanup_data does not contain the \"#{words_data_key}\" key")
+        end
+      end
+    end
+  end
+end