RubyGems - cure - Versions diffs - 0.1.2 → 0.4.0 - Mend

cure 0.1.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (112) hide show

checksums.yaml +4 -4
data/.rubocop.yml +13 -3
data/.tool-versions +1 -0
data/Dockerfile +1 -1
data/Gemfile +1 -0
data/Gemfile.lock +25 -6
data/README.md +61 -93
data/docs/README.md +33 -0
data/docs/about.md +219 -0
data/docs/builder/add.md +52 -0
data/docs/builder/black_white_list.md +83 -0
data/docs/builder/copy.md +48 -0
data/docs/builder/explode.md +70 -0
data/docs/builder/main.md +43 -0
data/docs/builder/remove.md +46 -0
data/docs/examples/examples.md +164 -0
data/docs/export/main.md +37 -0
data/docs/extract/main.md +89 -0
data/docs/metadata/main.md +29 -0
data/docs/query/main.md +45 -0
data/docs/sources/main.md +36 -0
data/docs/transform/main.md +53 -0
data/docs/validate/main.md +42 -0
data/exe/cure +12 -41
data/exe/cure.old +59 -0
data/lib/cure/builder/base_builder.rb +151 -0
data/lib/cure/builder/candidate.rb +56 -0
data/lib/cure/cli/command.rb +105 -0
data/lib/cure/cli/generate_command.rb +54 -0
data/lib/cure/cli/new_command.rb +52 -0
data/lib/cure/cli/run_command.rb +19 -0
data/lib/cure/cli/templates/README.md.erb +1 -0
data/lib/cure/cli/templates/gemfile.erb +5 -0
data/lib/cure/cli/templates/gitignore.erb +181 -0
data/lib/cure/cli/templates/new_template.rb.erb +31 -0
data/lib/cure/cli/templates/tool-versions.erb +1 -0
data/lib/cure/config.rb +142 -18
data/lib/cure/coordinator.rb +61 -25
data/lib/cure/database.rb +191 -0
data/lib/cure/dsl/builder.rb +26 -0
data/lib/cure/dsl/exporters.rb +45 -0
data/lib/cure/dsl/extraction.rb +60 -0
data/lib/cure/dsl/metadata.rb +33 -0
data/lib/cure/dsl/queries.rb +36 -0
data/lib/cure/dsl/source_files.rb +36 -0
data/lib/cure/dsl/template.rb +131 -0
data/lib/cure/dsl/transformations.rb +95 -0
data/lib/cure/dsl/validator.rb +22 -0
data/lib/cure/export/base_processor.rb +194 -0
data/lib/cure/export/manager.rb +24 -0
data/lib/cure/extract/base_processor.rb +47 -0
data/lib/cure/extract/csv_lookup.rb +14 -3
data/lib/cure/extract/extractor.rb +41 -84
data/lib/cure/extract/filter.rb +118 -0
data/lib/cure/extract/named_range.rb +94 -0
data/lib/cure/extract/named_range_processor.rb +128 -0
data/lib/cure/extract/variable.rb +25 -0
data/lib/cure/extract/variable_processor.rb +57 -0
data/lib/cure/generator/base_generator.rb +14 -4
data/lib/cure/generator/case_generator.rb +10 -3
data/lib/cure/generator/character_generator.rb +9 -3
data/lib/cure/generator/erb_generator.rb +21 -0
data/lib/cure/generator/eval_generator.rb +34 -0
data/lib/cure/generator/faker_generator.rb +7 -1
data/lib/cure/generator/guid_generator.rb +7 -2
data/lib/cure/generator/hex_generator.rb +6 -1
data/lib/cure/generator/imports.rb +4 -0
data/lib/cure/generator/number_generator.rb +6 -1
data/lib/cure/generator/placeholder_generator.rb +7 -1
data/lib/cure/generator/proc_generator.rb +21 -0
data/lib/cure/generator/redact_generator.rb +9 -3
data/lib/cure/generator/static_generator.rb +21 -0
data/lib/cure/generator/variable_generator.rb +11 -5
data/lib/cure/helpers/file_helpers.rb +12 -2
data/lib/cure/helpers/object_helpers.rb +5 -17
data/lib/cure/helpers/perf_helpers.rb +30 -0
data/lib/cure/helpers/string.rb +54 -0
data/lib/cure/launcher.rb +125 -0
data/lib/cure/log.rb +7 -0
data/lib/cure/planner.rb +136 -0
data/lib/cure/strategy/append_strategy.rb +4 -0
data/lib/cure/strategy/base_strategy.rb +19 -44
data/lib/cure/strategy/contain_strategy.rb +51 -0
data/lib/cure/strategy/end_with_strategy.rb +7 -1
data/lib/cure/strategy/full_strategy.rb +4 -0
data/lib/cure/strategy/history/history_cache.rb +82 -0
data/lib/cure/strategy/imports.rb +2 -0
data/lib/cure/strategy/match_strategy.rb +7 -2
data/lib/cure/strategy/prepend_strategy.rb +28 -0
data/lib/cure/strategy/regex_strategy.rb +7 -1
data/lib/cure/strategy/split_strategy.rb +8 -3
data/lib/cure/strategy/start_with_strategy.rb +7 -1
data/lib/cure/transformation/candidate.rb +32 -35
data/lib/cure/transformation/transform.rb +22 -56
data/lib/cure/validator/base_rule.rb +78 -0
data/lib/cure/validator/candidate.rb +54 -0
data/lib/cure/validator/manager.rb +21 -0
data/lib/cure/validators.rb +3 -3
data/lib/cure/version.rb +1 -1
data/lib/cure.rb +19 -11
data/templates/dsl_example.rb +48 -0
data/templates/empty_template.rb +31 -0
metadata +132 -21
data/lib/cure/export/exporter.rb +0 -74
data/lib/cure/extract/builder.rb +0 -27
data/lib/cure/main.rb +0 -72
data/lib/cure/template/dispatch.rb +0 -30
data/lib/cure/template/extraction.rb +0 -38
data/lib/cure/template/template.rb +0 -28
data/lib/cure/template/transformations.rb +0 -26
data/templates/aws_cur_template.json +0 -145
data/templates/example_template.json +0 -54

data/lib/cure/export/base_processor.rb ADDED Viewed

@@ -0,0 +1,194 @@
+# frozen_string_literal: true
+require "csv"
+require "cure/log"
+require "cure/helpers/file_helpers"
+module Cure
+  module Export
+    class BaseProcessor
+      include Log
+      attr_reader :named_range
+      def initialize(named_range, opts)
+        @named_range = named_range
+        @opts = opts
+        @limit_rows = opts.fetch(:limit_rows, nil)
+        @processed = 0
+      end
+      # @param [Hash]
+      def process_row(row)
+        process(row) unless @limit_rows && @limit_rows <= @processed
+        @processed += 1
+      end
+      # @param [Hash]
+      def process(_row)
+        raise NotImplementedError, "#{self.class} has not implemented method '#{__method__}'"
+      end
+      def setup
+        raise NotImplementedError, "#{self.class} has not implemented method '#{__method__}'"
+      end
+      def cleanup
+        raise NotImplementedError, "#{self.class} has not implemented method '#{__method__}'"
+      end
+    end
+    require "terminal-table"
+    class TerminalProcessor < BaseProcessor
+      attr_reader :table, :limit_rows, :processed
+      def process(row)
+        @table.headings = row.keys if @processed.zero?
+        @table.add_row(row.values)
+      end
+      def setup
+        # Markdown mode
+        Terminal::Table::Style.defaults = {
+          border_top: false,
+          border_bottom: false,
+          border_x: "-",
+          border_y: "|",
+          border_i: "|"
+        }
+        log_info "Exporting [#{@named_range}] to terminal."
+        @table = Terminal::Table.new(title: @opts[:title] || "<No Title Set>")
+      end
+      def cleanup
+        puts @table
+      end
+    end
+    class CsvProcessor < BaseProcessor
+      include Helpers::FileHelpers
+      attr_reader :csv_file
+      def process(row)
+        @csv_file.write(row.keys.to_csv) if @processed.zero?
+        @csv_file.write(row.values.to_csv)
+      end
+      def setup
+        log_info "Exporting [#{@named_range}] to CSV..."
+        output_dir = @opts[:directory]
+        file_name = @opts[:file_name]
+        log_info("Exporting file to [#{output_dir}/#{file_name}]")
+        # file_name = "#{file_name}-#{Time.now.utc.strftime("%Y-%m-%dT%H:%M:%S%-z")}"
+        path = "#{output_dir}/#{file_name}"
+        # clean_dir(output_dir)
+        dir = File.dirname(path)
+        FileUtils.mkdir_p(dir) unless File.directory?(dir)
+        path = "#{path}.csv"
+        @csv_file = File.open(path, "w")
+        @processed = 0
+      end
+      def cleanup
+      ensure
+        log_info File.basename(@csv_file)
+        @csv_file.close
+      end
+    end
+    class ChunkCsvProcessor < BaseProcessor
+      include Helpers::FileHelpers
+      attr_reader :current_csv_file,
+                  :file_name_prefix,
+                  :directory,
+                  :chunk_size,
+                  :include_headers,
+                  :row_count
+      def process(row)
+        chunked_file_handler do |csv_file|
+          if @processed.zero? || (@processed % @chunk_size).zero? || (@processed % @chunk_size).zero?
+            csv_file.write(row.keys.to_csv)
+          end
+          csv_file.write(row.values.to_csv)
+        end
+      end
+      def setup
+        log_info "Exporting [#{@named_range}] to CSV..."
+        extract_opts
+        log_info("Exporting file to [#{@output_dir}/#{@file_name_prefix}]")
+        clean_dir(@output_dir)
+        dir = File.dirname("#{@output_dir}/#{@file_name_prefix}")
+        FileUtils.mkdir_p(dir) unless File.directory?(dir)
+        @processed = 0
+        @current_chunk = 0
+      end
+      def cleanup
+      ensure
+        @current_csv_file.close
+      end
+      def extract_opts
+        # TODO: Add offset? pick a slice?
+        @output_dir = @opts[:directory]
+        @file_name_prefix = @opts[:file_name_prefix]
+        @directory = @opts[:directory]
+        @chunk_size = @opts[:chunk_size]
+        @include_headers = @opts.fetch(:include_headers, true)
+      end
+      def chunked_file_handler(&block)
+        raise "No block" unless block
+        if @processed.zero? || (@processed % @chunk_size).zero?
+          @current_csv_file&.close
+          @current_chunk += 1
+          log_info "Writing file to #{current_file_path}"
+          @current_csv_file = File.open(current_file_path, "w")
+        end
+        yield @current_csv_file
+      end
+      def current_file_path
+        "#{@output_dir}/#{@current_chunk}-#{@file_name_prefix}.csv"
+      end
+    end
+    class YieldRowProcessor < BaseProcessor
+      attr_reader :proc
+      def process_row(row)
+        @proc.call(row)
+      end
+      def setup
+        @proc = @opts.fetch(:proc)
+      end
+      def cleanup; end
+    end
+  end
+end

data/lib/cure/export/manager.rb ADDED Viewed

@@ -0,0 +1,24 @@
+# frozen_string_literal: true
+module Cure
+  module Export
+    class Manager
+      # @param [Array<Cure::Export::BaseProcessor>] candidates
+      attr_reader :processors
+      def initialize(named_range, processors)
+        @named_range = named_range
+        @processors = processors
+      end
+      def with_processors
+        @processors.each(&:setup)
+        yield @processors
+        @processors.each(&:cleanup)
+      end
+    end
+  end
+end

data/lib/cure/extract/base_processor.rb ADDED Viewed

@@ -0,0 +1,47 @@
+# frozen_string_literal: true
+require "cure/log"
+require "cure/config"
+require "cure/extract/csv_lookup"
+require "csv"
+module Cure
+  module Extract
+    class BaseProcessor
+      # @return [Cure::DatabaseService]
+      attr_reader :database_service
+      # @param [Cure::DatabaseService] database_service
+      def initialize(database_service)
+        @database_service = database_service
+      end
+      protected
+      # @param [String] tbl_name
+      # @param [Array<Object>] columns
+      def create_table(tbl_name, columns)
+        candidate_column_names = []
+        columns.each_with_index do |col, idx|
+          candidate_column_names << (col || "col_#{idx}")
+        end
+        @database_service.create_table(tbl_name.to_sym, candidate_column_names)
+      end
+      # @param [String] tbl_name
+      # @param [Array<Object>] values
+      def insert_record(tbl_name, values)
+        @database_service.insert_row(tbl_name.to_sym, values)
+      end
+      # @param [String] tbl_name
+      # @param [Array<Array<Object>>] values
+      def insert_batched_rows(tbl_name, values)
+        @database_service.insert_batched_rows(tbl_name.to_sym, values)
+      end
+    end
+  end
+end

data/lib/cure/extract/csv_lookup.rb CHANGED Viewed

@@ -4,10 +4,12 @@ module Cure
   module Extract
     class CsvLookup
-      # @param [String] position - [Ex A1:B1, A1:B1,A2:B2]
+      # @param [String,Integer] position - [Ex A1:B1, A1:B1,A2:B2]
       # @return [Array] [column_start_idx, column_end_idx, row_start_idx, row_end_idx]
       def self.array_position_lookup(position)
-        return [0, -1, 0, -1] if position.is_a?(Integer) && position == -1 # Whole sheet
+        # Workaround: -1 doesn't work as a range end, so the whole sheet is expressed with explicit upper bounds until a better approach is found.
+        # return [0, -1, 0, -1] if position.is_a?(Integer) && position == -1
+        return [0, 1_023, 0, 10_000_000] if position.is_a?(Integer) && position == -1 # Whole sheet
         start, finish, *_excess = position.split(":")
         raise "Invalid format" unless start || finish
@@ -20,10 +22,19 @@ module Cure
         ]
       end
+      # @param [String] range
       def self.position_for_letter(range)
-        range.upcase.scan(/[A-Z]+/).first.ord - 65 # A (65) - 65 = 0 idx
+        result = 0
+        range.upcase.scan(/[A-Z]+/).first&.each_char do |n|
+          result *= 26
+          result += n.ord - 65 + 1
+        end
+        # Excel columns are not 0th indexed.
+        result - 1
       end
+      # @param [String] range
       def self.position_for_digit(range)
         range.upcase.scan(/\d+/).first.to_i - 1
       end

data/lib/cure/extract/extractor.rb CHANGED Viewed

@@ -2,17 +2,26 @@
 require "cure/log"
 require "cure/config"
+require "cure/database"
 require "cure/extract/csv_lookup"
 require "cure/helpers/file_helpers"
+require "cure/helpers/perf_helpers"
+require "cure/extract/named_range_processor"
+require "cure/extract/variable_processor"
+require "csv"
+require "objspace"
 module Cure
   module Extract
     class Extractor
       include Log
+      include Database
       include Configuration
       include Helpers::FileHelpers
+      include Helpers::PerfHelpers
-      # @param [Hash] opts
+      # @return [Hash] opts
       attr_reader :opts
       # @param [Hash] opts
@@ -20,103 +29,51 @@ module Cure
         @opts = opts
       end
-      # @param [String] csv_file_location
-      # @return [WrappedCSV]
-      def extract_from_file(csv_file_location)
-        file_contents = read_file(csv_file_location)
-        extract_from_contents(file_contents)
-      end
-      # @param [String] file_contents
-      # @return [WrappedCSV]
-      def extract_from_contents(file_contents)
-        parsed_content = parse_csv(file_contents, header: :none)
-        log_info("Parsed CSV into #{parsed_content.content.length} sections.")
-        parsed_content
-      end
-      # private
+      # @param [Pathname,String] file - location of file
+      # @param [String] ref_name - name of reference file
+      def parse_csv(file, ref_name:)
+        nr_processor = named_range_processor(ref_name: ref_name)
+        v_processor = variable_processor(ref_name: ref_name)
-      # @param [String] file_contents
-      # @param [Hash] opts
-      # @return [WrappedCSV]
-      def parse_csv(file_contents, opts={})
-        csv_rows = []
+        sample_rows = config.template.extraction.sample_rows
+        row_count = 0
-        Rcsv.parse(file_contents, opts) { |row| csv_rows << row }
+        database_service.with_transaction do
+          CSV.foreach(file, liberal_parsing: true) do |row|
+            next if sample_rows && row_count >= sample_rows
-        result = WrappedCSV.new
-        result.content = extract_named_ranges(csv_rows)
-        result.variables = extract_variables(csv_rows)
+            nr_processor.process_row(row_count, row)
+            v_processor.process_row(row_count, row)
+            row_count += 1
-        result
-      end
+            log_info "#{row_count} rows processed [#{Time.now}]" if (row_count % 1_000).zero?
+          end
-      # @param [Array<Array>] csv_rows
-      # @return [Array<Hash>]
-      def extract_named_ranges(csv_rows)
-        # Use only the NR's that are defined from the candidates list
-        candidates = config.template.transformations.candidates
-        candidate_nrs = config.template.extraction.required_named_ranges(candidates.map(&:named_range).uniq)
-        candidate_nrs.map do |nr|
-          {
-            "rows" => extract_from_rows(csv_rows, nr["section"]),
-            "name" => nr["name"]
-          }
+          nr_processor.after_process
         end
-      end
-      # @param [Array<Array>] csv_rows
-      # @return [Hash]
-      def extract_variables(csv_rows)
-        config.template.extraction.variables.each_with_object({}) do |variable, hash|
-          hash[variable["name"]] = lookup_location(csv_rows, variable["location"])
-        end
+        log_info "[#{row_count}] total rows parsed from CSV"
       end
-      # @param [Array<Array>] rows
-      def extract_from_rows(rows, named_range)
-        psx = CsvLookup.array_position_lookup(named_range)
+      private
-        ret_val = []
-        rows.each_with_index do |row, idx|
-          # If the position of the end row is -1, we need all,
-          # otherwise if its between/equal to start/finish
-          ret_val << row[psx[0]..psx[1]] if psx[3] == -1 || (idx >= psx[2] && idx <= psx[3])
-        end
-        ret_val
-      end
-      # @param [Array<Array>] rows
-      # @param [String] variable_location
-      def lookup_location(rows, variable_location)
-        psx = [CsvLookup.position_for_letter(variable_location),
-               CsvLookup.position_for_digit(variable_location)]
-        rows[psx[1]][psx[0]]
-      end
+      # @param [String] ref_name - name of reference file
+      # @return [Cure::Extract::NamedRangeProcessor]
+      def named_range_processor(ref_name:)
+        candidate_nrs = config.template.extraction.required_named_ranges(ref_name: ref_name)
-      # @param [Integer] row_idx
-      # @param [Array] row
-      # @param [Array] psx
-      # @return [Array, nil]
-      def handle_row(row_idx, row, psx)
-        return nil unless psx[3] == -1 || (row_idx >= psx[2] && row_idx <= psx[3])
+        if candidate_nrs.empty?
+          candidate_nrs = [NamedRange.default_named_range(name: ref_name)]
+        end
-        row[psx[0]..psx[1]]
+        NamedRangeProcessor.new(database_service, candidate_nrs)
       end
-    end
-    class WrappedCSV
-      # @return [Array<Hash>]
-      attr_accessor :content
-      # @return [Hash]
-      attr_accessor :variables
-      def initialize
-        @content = []
-        @variables = {}
+      # @param [String] ref_name - name of reference file
+      # @return [Cure::Extract::VariableProcessor]
+      def variable_processor(ref_name:)
+        variables = config.template.extraction.required_variables(ref_name: ref_name)
+        VariableProcessor.new(database_service, variables || [])
       end
     end
   end

data/lib/cure/extract/filter.rb ADDED Viewed

@@ -0,0 +1,118 @@
+# frozen_string_literal: true
+module Cure
+  module Extract
+    # Holds the column and row filter definitions configured through the
+    # #columns and #rows blocks.
+    class Filter
+      # @return [Filter::RowHandler] row_handler
+      attr_reader :row_handler
+      # @return [Filter::ColumnHandler] col_handler
+      attr_reader :col_handler
+      def initialize
+        @col_handler = ColumnHandler.new
+        @row_handler = RowHandler.new
+      end
+      # Evaluates the block in the context of the column handler.
+      # Does nothing when no block is given.
+      def columns(&block)
+        @col_handler.instance_eval(&block) if block
+      end
+      # Evaluates the block in the context of the row handler.
+      # Does nothing when no block is given.
+      def rows(&block)
+        @row_handler.instance_eval(&block) if block
+      end
+      # Records which source columns to keep and what to rename them to.
+      class ColumnHandler
+        attr_reader :definitions, :source_col_positions
+        def initialize
+          @source_col_positions = nil
+          @definitions = []
+        end
+        # Registers a column to keep; it keeps its source name unless `as` is given.
+        # @param [String] source
+        # @param [String] as
+        # @return [ColumnHandler] self, so calls can be chained
+        def with(source:, as: nil)
+          @definitions.push({source: source, as: as || source})
+          self
+        end
+        # Maps each definition to the index of its source name in the given
+        # header row. A source that is not found is stored under a nil key.
+        # @param [Array<String>] columns_arr
+        def set_col_positions(columns_arr)
+          @source_col_positions = @definitions.to_h do |definition|
+            [columns_arr.index(definition[:source]), definition]
+          end
+        end
+        # Renames the selected headers in place and returns the new names.
+        # Returns the array untouched when no columns were defined.
+        # @param [Array<String>] columns_arr
+        # @raise [RuntimeError] when a source column was not found
+        def translate_headers(columns_arr)
+          return columns_arr unless has_content?
+          @source_col_positions.map do |idx, definition|
+            raise "Cannot find header position for #{definition[:source]}. Please check it exists." if idx.nil?
+            columns_arr[idx] = definition[:as]
+          end
+        end
+        # Picks the values at the selected column positions.
+        # Returns the array untouched when no columns were defined.
+        # @param [Array<String>] columns_arr
+        def filter_row(columns_arr)
+          return columns_arr unless has_content?
+          @source_col_positions.each_key.map { |idx| columns_arr[idx] }
+        end
+        # @return [TrueClass, FalseClass]
+        def has_content?
+          @definitions.any?
+        end
+      end
+      # Records the start, finish and including conditions for row filtering.
+      class RowHandler
+        attr_accessor :start_proc, :finish_proc, :including_proc
+        # @param [String] where
+        # @param [Hash] options
+        # @return [RowHandler] self, so calls can be chained
+        def start(where:, options: {})
+          @start_proc = {where: where, options: options}
+          self
+        end
+        # @param [String] where
+        # @param [Hash] options
+        # @return [RowHandler] self, so calls can be chained
+        def finish(where:, options: {})
+          @finish_proc = {where: where, options: options}
+          self
+        end
+        # @param [String] where
+        # @param [Hash] options
+        # @return [RowHandler] self, so calls can be chained
+        def including(where:, options: {})
+          @including_proc = {where: where, options: options}
+          self
+        end
+        # True once any of the three conditions has been set.
+        # @return [TrueClass, FalseClass]
+        def has_content?
+          [@start_proc, @finish_proc, @including_proc].any?
+        end
+      end
+    end
+  end
+end

data/lib/cure/extract/named_range.rb ADDED Viewed

@@ -0,0 +1,94 @@
+# frozen_string_literal: true
+require "cure/extract/filter"
+module Cure
+  module Extract
+    class NamedRange
+      def self.default_named_range(name: nil)
+        name ||= "_default"
+        new(name, -1)
+      end
+      attr_accessor :filter, :row_count
+      attr_reader :name, :section, :headers, :ref_name, :placeholder
+      # This is complex purely to support headers not being the 0th row.
+      # A template can specify that the headers row be completely disconnected
+      # from the content, thus we have three bounds:
+      # - Content bounds
+      # - Header bounds
+      # - Sheet bounds (headers AND content)
+      # @param [String] ref_name - file reference (for multiple files)
+      def initialize(name, section, headers: nil, ref_name: nil, placeholder: false)
+        @name = name
+        @filter = Filter.new
+        @section = Extract::CsvLookup.array_position_lookup(section)
+        @headers = calculate_headers(headers)
+        @row_count = 0
+        @placeholder = placeholder
+        @ref_name = ref_name || "_default"
+      end
+      # @param [Integer] row_idx
+      # @return [TrueClass, FalseClass]
+      def row_in_bounds?(row_idx)
+        row_bounds_range.cover?(row_idx)
+      end
+      # @param [Integer] row_idx
+      # @return [TrueClass, FalseClass]
+      def header_in_bounds?(row_idx)
+        header_bounds_range.cover?(row_idx)
+      end
+      # @param [Integer] row_idx
+      # @return [TrueClass, FalseClass]
+      def content_in_bounds?(row_idx)
+        content_bounds_range.cover?(row_idx)
+      end
+      # @return [Range]
+      def row_bounds_range
+        @row_bounds_range ||= (row_bounds&.first..row_bounds&.last)
+      end
+      def row_bounds
+        @row_bounds ||= content_bounds.concat(header_bounds).uniq.minmax
+      end
+      # @return [Range]
+      def content_bounds_range
+        @content_bounds_range ||= (content_bounds[0]..content_bounds[1])
+      end
+      def content_bounds
+        @content_bounds ||= @section[2..3]
+      end
+      # @return [Range]
+      def header_bounds_range
+        @header_bounds_range ||= (header_bounds&.first..header_bounds&.last)
+      end
+      def header_bounds
+        @header_bounds ||= @headers[2..3]
+      end
+      def active_row_count(row_idx)
+        row_idx - @row_count
+      end
+      private
+      def calculate_headers(headers)
+        return Extract::CsvLookup.array_position_lookup(headers) if headers
+        [@section[0], @section[1], @section[2], @section[2]]
+      end
+    end
+  end
+end