dump_cleaner 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +2 -0
- data/.rubocop.yml +25 -0
- data/CHANGELOG.md +5 -0
- data/LICENSE.txt +21 -0
- data/README.md +295 -0
- data/Rakefile +8 -0
- data/doc/workflow_steps.md +1400 -0
- data/dump_cleaner.gemspec +38 -0
- data/exe/dump_cleaner +7 -0
- data/lib/dump_cleaner/cleaners/base_cleaner.rb +32 -0
- data/lib/dump_cleaner/cleaners/mysql_shell_dump_cleaner.rb +47 -0
- data/lib/dump_cleaner/cleaners/mysql_shell_dump_helpers.rb +11 -0
- data/lib/dump_cleaner/cleaners/mysql_shell_table_cleaner.rb +184 -0
- data/lib/dump_cleaner/cleanup/bytesize_helpers.rb +39 -0
- data/lib/dump_cleaner/cleanup/cleaning.rb +69 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/add_repetition_suffix.rb +23 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/base.rb +33 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/fill_up_with_string.rb +20 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/generate_random_string.rb +37 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/inspect_context.rb +16 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_email.rb +78 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_formatted_number.rb +63 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_number.rb +29 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/select_data_by_bytesize.rb +17 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/select_data_by_pattern.rb +20 -0
- data/lib/dump_cleaner/cleanup/cleaning_steps/take_sample.rb +28 -0
- data/lib/dump_cleaner/cleanup/data_source.rb +19 -0
- data/lib/dump_cleaner/cleanup/data_source_steps/base.rb +26 -0
- data/lib/dump_cleaner/cleanup/data_source_steps/group_by_bytesize.rb +37 -0
- data/lib/dump_cleaner/cleanup/data_source_steps/inspect_context.rb +16 -0
- data/lib/dump_cleaner/cleanup/data_source_steps/load_yaml_file.rb +24 -0
- data/lib/dump_cleaner/cleanup/data_source_steps/remove_accents.rb +29 -0
- data/lib/dump_cleaner/cleanup/inspection.rb +37 -0
- data/lib/dump_cleaner/cleanup/step_context.rb +46 -0
- data/lib/dump_cleaner/cleanup/uniqueness.rb +66 -0
- data/lib/dump_cleaner/cleanup/workflow.rb +38 -0
- data/lib/dump_cleaner/conditions.rb +42 -0
- data/lib/dump_cleaner/config.rb +109 -0
- data/lib/dump_cleaner/log.rb +42 -0
- data/lib/dump_cleaner/options.rb +46 -0
- data/lib/dump_cleaner/processor.rb +37 -0
- data/lib/dump_cleaner/version.rb +5 -0
- data/lib/dump_cleaner.rb +10 -0
- metadata +105 -0

data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_formatted_number.rb
@@ -0,0 +1,63 @@
+# frozen_string_literal: true
+
+module DumpCleaner
+  module Cleanup
+    module CleaningSteps
+      class RandomizeFormattedNumber < Base
+        include Inspection
+
+        def run(format:)
+          regex = Regexp.new("\\A#{format}\\z")
+
+          unless regex.names.any? { _1.start_with?("x") }
+            raise_params_error('The format has no named group starting with \'x\', e.g. \'(?<x>\d)\')')
+          end
+
+          unless current_value.match?(regex)
+            if repetition.zero?
+              Log.warn { "Invalid value: type=#{type}, id=#{record['id']}, value=#{truncate(current_value)}" }
+            end
+            step_context.current_value = nil
+            return step_context
+          end
+
+          random = Random.new(crc32)
+          new_value = randomize_named_captures(regex:, random:)
+
+          if new_value.length != current_value.length
+            raise ArgumentError, "The new value length does not match the original value length.
+                                  Do the named groups in the format regexp match the whole value?".gsub(/\s+/, " ")
+          end
+
+          step_context.current_value = new_value
+          step_context
+        end
+
+        private
+
+        def randomize_named_captures(regex:, random:)
+          new_value = String.new
+
+          current_value.match(regex).named_captures.each do |name, capture|
+            if name.start_with?("x")
+              unless capture.match?(/^\d+$/)
+                raise ArgumentError,
+                      "Invalid regexp for capture '#{name}' which matched to '#{capture}': it must match numbers only."
+              end
+
+              new_value << random_number(capture.length, random:)
+            else
+              new_value << capture
+            end
+          end
+
+          new_value
+        end
+
+        def random_number(digits, random:)
+          random.rand(10**digits - 1).to_s.rjust(digits, "0")
+        end
+      end
+    end
+  end
+end

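The step above relies on a naming convention in the format regexp: named groups whose names start with "x" are re-rolled digit-for-digit, every other named group is copied through, and together the groups must cover the whole value or the length check raises. A minimal illustrative sketch of calling the step directly (not part of the published diff); it assumes the gem is loaded and that CleaningSteps::Base, which is not shown in this excerpt, exposes the step context and the crc32 seed the way the calls above imply. The phone number, type name and group names are made up:

    require "dump_cleaner"

    context = DumpCleaner::Cleanup::StepContext.new(
      type: "phone", cleanup_data: nil,
      orig_value: "+420 601 123 456", record: { "id" => "42" }
    )
    step = DumpCleaner::Cleanup::CleaningSteps::RandomizeFormattedNumber.new(context)

    # The x-groups are replaced by random digits of the same width; the non-x groups
    # ("front" and the spaces) are copied verbatim, so the named groups together cover
    # the whole value and the length check passes.
    format = '(?<front>\+420 )(?<x1>\d{3})(?<sp1> )(?<x2>\d{3})(?<sp2> )(?<x3>\d{3})'
    step.run(format: format).current_value # same shape as the original, digits randomized
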
data/lib/dump_cleaner/cleanup/cleaning_steps/randomize_number.rb
@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+
+module DumpCleaner
+  module Cleanup
+    module CleaningSteps
+      class RandomizeNumber < Base
+        def run(difference_within: 1.0)
+          random = Random.new(crc32)
+
+          new_value = current_value.to_f + random.rand(difference_within.to_f * 2) - difference_within.to_f
+
+          # keep sign to keep string length (warning: this skews the distribution of the random numbers)
+          if (current_value.strip[0] == "-") && new_value.positive? ||
+             (current_value.strip[0] != "-") && new_value.negative?
+            new_value *= -1
+          end
+
+          decimal_places = current_value.split(".")[1].to_s.length
+          epsilon = 10**-decimal_places
+          clamped_value = new_value.clamp(current_value.to_f - difference_within + epsilon,
+                                          current_value.to_f + difference_within - epsilon)
+
+          step_context.current_value = format("%0#{current_value.length}.#{decimal_places}f", clamped_value)
+          step_context
+        end
+      end
+    end
+  end
+end

data/lib/dump_cleaner/cleanup/cleaning_steps/select_data_by_bytesize.rb
@@ -0,0 +1,17 @@
+# frozen_string_literal: true
+
+module DumpCleaner
+  module Cleanup
+    module CleaningSteps
+      class SelectDataByBytesize < Base
+        def run
+          return step_context if !cleanup_data || cleanup_data.empty?
+
+          step_context.cleanup_data = cleanup_data["#{current_value.length}-#{current_value.bytesize}"] ||
+                                      cleanup_data["#{current_value.bytesize}-#{current_value.bytesize}"] # used when current_value is accented but data isn't
+          step_context
+        end
+      end
+    end
+  end
+end

data/lib/dump_cleaner/cleanup/cleaning_steps/select_data_by_pattern.rb
@@ -0,0 +1,20 @@
+# frozen_string_literal: true
+
+module DumpCleaner
+  module Cleanup
+    module CleaningSteps
+      class SelectDataByPattern < Base
+        def run(patterns:, default_key: nil)
+          step_context.cleanup_data = step_context.cleanup_data[match_key(patterns) || default_key]
+          step_context
+        end
+
+        private
+
+        def match_key(patterns)
+          patterns.find { Regexp.new(_1["pattern"], _1["flags"]).match?(step_context.current_value) }&.fetch("key")
+        end
+      end
+    end
+  end
+end

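As the method above reads it, the patterns param is an array of hashes with "pattern", "flags" and "key" entries; the first pattern that matches the current value selects the corresponding branch of cleanup_data, and default_key is the fallback. An illustrative sketch (not part of the published diff) with made-up data, type and keys, again assuming the CleaningSteps::Base constructor takes the step context as in the other steps:

    require "dump_cleaner"

    context = DumpCleaner::Cleanup::StepContext.new(
      type: "email", orig_value: "john@gmail.com",
      cleanup_data: { "public" => ["gmail.com", "seznam.cz"], "company" => ["example.com"] }
    )
    step = DumpCleaner::Cleanup::CleaningSteps::SelectDataByPattern.new(context)

    # The first matching pattern's "key" picks the branch of cleanup_data.
    step.run(patterns: [{ "pattern" => "@(gmail\\.com|seznam\\.cz)\\z", "flags" => "i", "key" => "public" }],
             default_key: "company").cleanup_data
    # => ["gmail.com", "seznam.cz"]
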
data/lib/dump_cleaner/cleanup/cleaning_steps/take_sample.rb
@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+
+module DumpCleaner
+  module Cleanup
+    module CleaningSteps
+      class TakeSample < Base
+        def run(uniqueness_strategy: :resample)
+          if !cleanup_data || cleanup_data.empty?
+            step_context.current_value = nil
+            return step_context
+          end
+
+          uniqueness_strategy = uniqueness_strategy.to_sym
+          step_context.current_value =
+            if uniqueness_strategy == :resample
+              cleanup_data[crc32 % cleanup_data.size]
+            elsif uniqueness_strategy == :suffix
+              sample = cleanup_data[crc32(use_repetition: false) % cleanup_data.size]
+              AddRepetitionSuffix.new(StepContext.new_from(step_context, current_value: sample)).run.current_value
+            else
+              raise_params_error("Unknown uniqueness strategy: #{uniqueness_strategy}")
+            end
+          step_context
+        end
+      end
+    end
+  end
+end

data/lib/dump_cleaner/cleanup/data_source.rb
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+module DumpCleaner
+  module Cleanup
+    class DataSource
+      def initialize(config:)
+        @config = config
+        @workflow = Workflow.new(phase: :data_source)
+        @data_cache = {}
+      end
+
+      def data_for(type)
+        step_context = StepContext.new(type:, cleanup_data: nil)
+        @data_cache[type] ||= @workflow.run(step_context, step_configs: @config.steps_for(type, :data_source))
+                                       .cleanup_data
+      end
+    end
+  end
+end

data/lib/dump_cleaner/cleanup/data_source_steps/base.rb
@@ -0,0 +1,26 @@
+# frozen_string_literal: true
+
+module DumpCleaner
+  module Cleanup
+    module DataSourceSteps
+      class Base
+        require "forwardable"
+
+        extend Forwardable
+
+        def_delegators :step_context, :cleanup_data, :type
+
+        attr_reader :step_context
+
+        def initialize(step_context)
+          @step_context = step_context.dup
+        end
+
+        def raise_params_error(error)
+          step = self.class.name.split("::").last
+          raise ArgumentError, "Invalid data source step params: type=#{type}, step=#{step}: #{error}"
+        end
+      end
+    end
+  end
+end

data/lib/dump_cleaner/cleanup/data_source_steps/group_by_bytesize.rb
@@ -0,0 +1,37 @@
+# frozen_string_literal: true
+
+module DumpCleaner
+  module Cleanup
+    module DataSourceSteps
+      class GroupByBytesize < Base
+        def run(under_keys: [])
+          validate_params(under_keys:)
+
+          group_by_lambda = -> { "#{_1.length}-#{_1.bytesize}" }
+
+          step_context.cleanup_data = begin
+            if under_keys.any?
+              new_data = cleanup_data.dup
+              under_keys.each do |key|
+                new_data[key] = new_data[key].group_by(&group_by_lambda)
+              end
+              new_data
+            else
+              cleanup_data.group_by(&group_by_lambda)
+            end
+          end
+
+          step_context
+        end
+
+        private
+
+        def validate_params(under_keys:)
+          return if under_keys.all? { cleanup_data.key?(_1) }
+
+          raise_params_error("The under_keys param contains keys not present in cleanup_data.")
+        end
+      end
+    end
+  end
+end

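The "length-bytesize" key produced by the lambda above is the same key that SelectDataByBytesize (earlier in this diff) looks up, which is how replacement values end up with both the same character length and the same byte size as the original. A tiny self-contained illustration in plain Ruby (not from the diff, example words made up):

    words = ["Ann", "Léa", "Carol"]
    words.group_by { "#{_1.length}-#{_1.bytesize}" }
    # => {"3-3"=>["Ann"], "3-4"=>["Léa"], "5-5"=>["Carol"]}
    # "Léa" is 3 characters but 4 bytes in UTF-8, so it lands in its own group.
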
data/lib/dump_cleaner/cleanup/data_source_steps/load_yaml_file.rb
@@ -0,0 +1,24 @@
+# frozen_string_literal: true
+
+module DumpCleaner
+  module Cleanup
+    module DataSourceSteps
+      class LoadYamlFile < Base
+        require "yaml"
+
+        def run(file:, under_key: nil)
+          loaded_data = YAML.load_file(file)
+
+          step_context.cleanup_data = if under_key
+                                        new_data ||= cleanup_data || {}
+                                        new_data[under_key] = loaded_data
+                                        new_data
+                                      else
+                                        loaded_data
+                                      end
+          step_context
+        end
+      end
+    end
+  end
+end

data/lib/dump_cleaner/cleanup/data_source_steps/remove_accents.rb
@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+
+module DumpCleaner
+  module Cleanup
+    module DataSourceSteps
+      class RemoveAccents < Base
+        def run(under_keys: [])
+          block = lambda do |word|
+            word.match?(/^\p{ASCII}+$/) ? word : word.unicode_normalize(:nfd).gsub(/\p{M}/, "")
+          end
+
+          step_context.cleanup_data = begin
+            if under_keys.any?
+              new_data = cleanup_data.dup
+              under_keys.each do |key|
+                new_data[key] = new_data[key].map(&block)
+              end
+              new_data
+            else
+              cleanup_data.map(&block)
+            end
+          end
+
+          step_context
+        end
+      end
+    end
+  end
+end

data/lib/dump_cleaner/cleanup/inspection.rb
@@ -0,0 +1,37 @@
+# frozen_string_literal: true
+
+module DumpCleaner
+  module Cleanup
+    module Inspection
+      def inspect_step_context(step_context, message: "Inspecting step context")
+        Log.info { message }
+        Log.info { "\n#{step_context.pretty_inspect}" }
+      end
+
+      def subset(data, values: 10)
+        case data
+        when Array
+          subset_data = data.take(values)
+          subset_data << "+ #{data.size - values} more..." if data.size > values
+          subset_data.each_with_index { |element, index| subset_data[index] = subset(element, values:) }
+        when Hash
+          subset_data = data.take(values).to_h
+          subset_data["+ #{data.size - values} more..."] = nil if data.size > values
+          subset_data.each_key { |key| subset_data[key] = subset(subset_data[key], values:) }
+        else
+          subset_data = data
+        end
+
+        subset_data
+      end
+
+      def truncate(value, to: 30, omission: "…")
+        return value.dup if value.length <= to
+
+        length_with_room_for_omission = to - omission.length
+        stop = length_with_room_for_omission
+        +"#{value[0, stop]}#{omission}"
+      end
+    end
+  end
+end

data/lib/dump_cleaner/cleanup/step_context.rb
@@ -0,0 +1,46 @@
+# frozen_string_literal: true
+
+module DumpCleaner
+  module Cleanup
+    class StepContext
+      require "pp"
+
+      include Inspection
+
+      attr_accessor :cleanup_data, :current_value, :repetition
+      attr_reader :orig_value, :type, :record
+
+      def initialize(type:, cleanup_data:, orig_value: nil, record: {}, repetition: 0)
+        @type = type
+        @cleanup_data = cleanup_data
+        @orig_value = @current_value = orig_value
+        @record = record
+        @repetition = repetition
+      end
+
+      def self.new_from(step_context, **params)
+        context_copy = step_context.dup
+        new_context = new(orig_value: params[:orig_value] || context_copy.orig_value,
+                          type: params[:type] || context_copy.type,
+                          cleanup_data: params[:cleanup_data] || context_copy.cleanup_data,
+                          record: params[:record] || context_copy.record,
+                          repetition: params[:repetition] || context_copy.repetition)
+        new_context.current_value = params[:current_value] || context_copy.current_value
+        new_context
+      end
+
+      def to_h(subset: false)
+        { orig_value:, current_value:, type:, record:, repetition:,
+          cleanup_data: subset ? subset(cleanup_data) : cleanup_data }
+      end
+
+      def pretty_print(pp)
+        to_h(subset: true).pretty_print(pp)
+      end
+
+      def ==(other)
+        to_h == other.to_h
+      end
+    end
+  end
+end

data/lib/dump_cleaner/cleanup/uniqueness.rb
@@ -0,0 +1,66 @@
+module DumpCleaner
+  module Cleanup
+    module Uniqueness
+      require "singleton"
+
+      class MaxRetriesReachedError < StandardError; end
+
+      def repeat_until_unique(step_context:, max_retries: 1000, &block)
+        n = 0
+        result = nil
+
+        loop do
+          result = block.call(n)
+
+          break unless result
+
+          if n.positive?
+            Log.debug do
+              msg = "Uniqueness run: type=#{step_context.type}, id=#{step_context.record['id']}, "
+              msg << "orig_value=#{step_context.orig_value}, current_value=#{result}, repetition=#{n}"
+            end
+          end
+
+          unless CaseInsensitiveCache.instance.known?(type: step_context.type, value: result)
+            CaseInsensitiveCache.instance.push(type: step_context.type, value: result)
+            break
+          end
+
+          if n >= max_retries
+            warning = "Max retry count #{n} reached for ID:#{step_context.record['id']}, type:#{step_context.type}, "
+            warning << "orig:#{step_context.orig_value}, current:#{result}"
+            Log.warn { warning }
+            raise MaxRetriesReachedError
+          end
+
+          n += 1
+        end
+
+        result
+      end
+
+      class CaseInsensitiveCache
+        include Singleton
+
+        def initialize
+          clear
+        end
+
+        def clear
+          @data = {}
+        end
+
+        def known?(type:, value:)
+          return false unless @data.key?(type)
+
+          @data[type].include?(value.downcase)
+        end
+
+        def push(type:, value:)
+          @data[type] ||= Set.new
+          @data[type].add(value.downcase)
+        end
+      end
+    end
+  end
+end

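A sketch of how the mixin above can be driven (illustrative only; the real caller is Cleanup::Cleaning, which is not shown in this excerpt, and the demo class and values below are made up). The block receives the current repetition counter and is re-run until it returns a value not yet seen, case-insensitively, for the given type:

    require "dump_cleaner"

    class UniquenessDemo
      include DumpCleaner::Cleanup::Uniqueness
    end

    context = DumpCleaner::Cleanup::StepContext.new(type: "nickname", cleanup_data: nil,
                                                    orig_value: "johnny", record: { "id" => "1" })

    UniquenessDemo.new.repeat_until_unique(step_context: context) { |repetition| "johnny#{repetition}" }
    # => "johnny0" on a fresh cache; an identical later run gets "johnny1", and so on,
    #    raising MaxRetriesReachedError once the retry counter reaches max_retries.
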
data/lib/dump_cleaner/cleanup/workflow.rb
@@ -0,0 +1,38 @@
+# frozen_string_literal: true
+
+module DumpCleaner
+  module Cleanup
+    class Workflow
+      def initialize(phase:)
+        @phase = phase
+        @workflow_steps_cache = {}
+      end
+
+      def run(initial_step_context, step_configs:)
+        steps(type: initial_step_context.type, step_configs:).reduce(initial_step_context.dup) do |step_context, step|
+          step.call(step_context)
+        end
+      end
+
+      private
+
+      def steps_namespace(phase)
+        phase == :data_source ? DumpCleaner::Cleanup::DataSourceSteps : DumpCleaner::Cleanup::CleaningSteps
+      end
+
+      def steps(type:, step_configs:)
+        @workflow_steps_cache[cache_key(type:, step_configs:)] ||= step_configs.map do |step_config|
+          lambda do |step_context|
+            steps_namespace(@phase).const_get(step_config.step).new(step_context).run(**step_config.params)
+          rescue NameError => e
+            raise DumpCleaner::Config::ConfigurationError, "Invalid step #{step_config.step}"
+          end
+        end
+      end
+
+      def cache_key(type:, step_configs:)
+        "#{@phase}-#{type}-#{step_configs.map(&:step).join('-')}"
+      end
+    end
+  end
+end

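A hand-rolled run of a :data_source workflow, shown only to illustrate how the class above resolves and chains steps (in the gem this wiring is done by DataSource from the YAML config; the CleanupStepConfig records come from Config, further down in this diff, and the fixture file and type name are made up):

    require "dump_cleaner"
    require "yaml"

    File.write("first_names.yml", ["Ann", "Léa", "Carol"].to_yaml) # throw-away fixture

    step_configs = [
      DumpCleaner::Config::CleanupStepConfig.new(step: "LoadYamlFile", params: { file: "first_names.yml" }),
      DumpCleaner::Config::CleanupStepConfig.new(step: "GroupByBytesize", params: {})
    ]

    workflow = DumpCleaner::Cleanup::Workflow.new(phase: :data_source)
    context = DumpCleaner::Cleanup::StepContext.new(type: "first_name", cleanup_data: nil)

    # Each step name is resolved in the DataSourceSteps namespace and receives the
    # step context returned by the previous step.
    workflow.run(context, step_configs:).cleanup_data
    # => {"3-3"=>["Ann"], "3-4"=>["Léa"], "5-5"=>["Carol"]}
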
data/lib/dump_cleaner/conditions.rb
@@ -0,0 +1,42 @@
+module DumpCleaner
+  class Conditions
+    def initialize(condition_config)
+      @conditions = condition_config
+    end
+
+    def evaluate_to_true?(record:, column_value: nil)
+      return false unless @conditions
+
+      Array(@conditions).map do |condition_config|
+        column = condition_config.column
+        conversion, op, value = parse_condition(condition_config)
+        (column ? record[column] : column_value).send(conversion || :itself).send(op, value)
+      end.any?
+    end
+
+    def self.evaluate_to_true_in_step?(conditions:, step_context:)
+      new(conditions).evaluate_to_true?(record: step_context.record, column_value: step_context.orig_value)
+    end
+
+    private
+
+    def parse_condition(condition_config)
+      condition_value = condition_config.value
+
+      case condition_config.condition
+      when "eq"
+        [nil, "==", condition_value]
+      when "ne"
+        [nil, "!=", condition_value]
+      when "start_with"
+        [nil, :start_with?, condition_value]
+      when "end_with"
+        [nil, :end_with?, condition_value]
+      when "non_zero"
+        [:to_i, "!=", 0]
+      else
+        raise "Unknown condition #{condition_config.condition} for column #{condition_config.column}"
+      end
+    end
+  end
+end

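Conditions are OR-ed together: each entry either reads a named column from the record or, when column is nil, the current column value. A small illustrative sketch (not from the diff) using the ConditionConfig record defined in config.rb below; the column names and values are made up:

    require "dump_cleaner"

    keep_same = [
      DumpCleaner::Config::ConditionConfig.new(column: "role", condition: "eq", value: "admin"),
      DumpCleaner::Config::ConditionConfig.new(column: nil, condition: "start_with", value: "test_")
    ]
    conditions = DumpCleaner::Conditions.new(keep_same)

    conditions.evaluate_to_true?(record: { "role" => "admin" }, column_value: "jane")    # => true (first condition)
    conditions.evaluate_to_true?(record: { "role" => "user" }, column_value: "test_42")  # => true (second condition)
    conditions.evaluate_to_true?(record: { "role" => "user" }, column_value: "jane")     # => false
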
data/lib/dump_cleaner/config.rb
@@ -0,0 +1,109 @@
+# frozen_string_literal: true
+
+module DumpCleaner
+  class Config
+    require "yaml"
+
+    CleanupTableColumnConfig = Data.define(:name, :cleanup_type, :unique) do
+      alias_method :unique_column?, :unique
+    end
+
+    CleanupStepConfig = Data.define(:step, :params)
+
+    ConditionConfig = Data.define(:column, :condition, :value)
+
+    class ConfigurationError < StandardError; end
+
+    def initialize(config_file)
+      @config = load(config_file) || {}
+      @steps_for = {}
+      @keep_same_conditions = {}
+
+      set_log_level
+    end
+
+    def dump_format
+      @config.dig("dump", "format")
+    end
+
+    def steps_for(type, phase)
+      @steps_for[type] ||= {}
+      @steps_for[type][phase.to_s] ||= Array(cleanup_config_for(type)[phase.to_s]).map do
+        CleanupStepConfig.new(step: _1["step"], params: (_1["params"] || {}).transform_keys(&:to_sym))
+      end
+    end
+
+    def keep_same_conditions(type)
+      @keep_same_conditions[type] ||= Array(cleanup_config_for(type)["keep_same_conditions"]).map do
+        ConditionConfig.new(condition: _1["condition"], value: _1["value"], column: nil)
+      end
+    end
+
+    def ignore_keep_same_record_conditions?(type)
+      cleanup_config_for(type)["ignore_keep_same_record_conditions"] == true
+    end
+
+    def cleanup_tables
+      cleanup_table_configs.map { [_1.db, _1.table] }
+    end
+
+    def cleanup_table_config(db:, table:)
+      cleanup_table_configs.find { _1.db == db && _1.table == table }
+    end
+
+    private
+
+    def load(config_file)
+      YAML.load_file(config_file)
+    end
+
+    def set_log_level
+      if (level = @config.dig("dump_cleaner", "log_level"))
+        Log.instance.level = level
+      end
+    end
+
+    def cleanup_table_configs
+      @cleanup_table_configs ||= Array(@config["cleanup_tables"]).map { CleanupTableConfig.new(_1) }
+    end
+
+    def cleanup_config_for(type)
+      @config.dig("cleanup_types", type.to_s) ||
+        raise(ConfigurationError, "Missing or empty type '#{type}' in the 'cleanup_types' section in config.")
+    end
+
+    class CleanupTableConfig
+      def initialize(cleanup_table_config)
+        @cleanup_table_config = cleanup_table_config
+      end
+
+      def db
+        @cleanup_table_config["db"]
+      end
+
+      def table
+        @cleanup_table_config["table"]
+      end
+
+      def id_column
+        @cleanup_table_config["id_column"] || "id"
+      end
+
+      def columns
+        @columns ||= Array(@cleanup_table_config["columns"]).map do
+          CleanupTableColumnConfig.new(name: _1["name"], cleanup_type: _1["cleanup_type"], unique: _1["unique"] == true)
+        end
+      end
+
+      def record_context_columns
+        @cleanup_table_config["record_context_columns"] || ["id"]
+      end
+
+      def keep_same_record_conditions
+        @keep_same_record_conditions ||= Array(@cleanup_table_config["keep_same_record_conditions"]).map do
+          ConditionConfig.new(condition: _1["condition"], value: _1["value"], column: _1["column"])
+        end
+      end
+    end
+  end
+end

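To tie the accessors above together, a hypothetical minimal config file and the values the Config object would return for it. The YAML keys shown are only those this file actually reads; the dump format value, database, table and type names are assumptions for illustration, not taken from the gem's documentation:

    require "dump_cleaner"

    File.write("demo_config.yml", <<~YAML)
      dump:
        format: mysql_shell
      cleanup_tables:
        - db: mydb
          table: users
          columns:
            - name: phone
              cleanup_type: phone
      cleanup_types:
        phone:
          data_source:
            - step: LoadYamlFile
              params:
                file: phones.yml
    YAML

    config = DumpCleaner::Config.new("demo_config.yml")
    config.dump_format                            # => "mysql_shell"
    config.cleanup_tables                         # => [["mydb", "users"]]
    config.cleanup_table_config(db: "mydb", table: "users").columns.first.cleanup_type # => "phone"
    config.steps_for("phone", :data_source).first # => step "LoadYamlFile" with params {file: "phones.yml"}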