RubyGems - remi - Versions diffs - 0.2.42 → 0.3.0 - Mend

remi 0.2.42 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (94) hide show

checksums.yaml +4 -4
data/.yardopts +7 -0
data/Gemfile +1 -1
data/Gemfile.lock +13 -26
data/README.md +1 -1
data/features/step_definitions/remi_step.rb +33 -13
data/features/sub_job_example.feature +24 -0
data/features/sub_transform_example.feature +35 -0
data/features/sub_transform_many_to_many.feature +49 -0
data/features/support/env_app.rb +1 -1
data/jobs/all_jobs_shared.rb +19 -16
data/jobs/copy_source_job.rb +11 -9
data/jobs/csv_file_target_job.rb +10 -9
data/jobs/json_job.rb +18 -14
data/jobs/metadata_job.rb +33 -28
data/jobs/parameters_job.rb +14 -11
data/jobs/sample_job.rb +106 -77
data/jobs/sftp_file_target_job.rb +14 -13
data/jobs/sub_job_example_job.rb +86 -0
data/jobs/sub_transform_example_job.rb +43 -0
data/jobs/sub_transform_many_to_many_job.rb +46 -0
data/jobs/transforms/concatenate_job.rb +16 -12
data/jobs/transforms/data_frame_sieve_job.rb +24 -19
data/jobs/transforms/date_diff_job.rb +15 -11
data/jobs/transforms/nvl_job.rb +16 -12
data/jobs/transforms/parse_date_job.rb +17 -14
data/jobs/transforms/partitioner_job.rb +27 -19
data/jobs/transforms/prefix_job.rb +13 -10
data/jobs/transforms/truncate_job.rb +14 -10
data/jobs/transforms/truthy_job.rb +11 -8
data/lib/remi.rb +25 -11
data/lib/remi/data_frame.rb +4 -4
data/lib/remi/data_frame/daru.rb +1 -37
data/lib/remi/data_subject.rb +234 -48
data/lib/remi/data_subjects/csv_file.rb +171 -0
data/lib/remi/data_subjects/data_frame.rb +106 -0
data/lib/remi/data_subjects/file_system.rb +115 -0
data/lib/remi/data_subjects/local_file.rb +109 -0
data/lib/remi/data_subjects/none.rb +31 -0
data/lib/remi/data_subjects/postgres.rb +186 -0
data/lib/remi/data_subjects/s3_file.rb +84 -0
data/lib/remi/data_subjects/salesforce.rb +211 -0
data/lib/remi/data_subjects/sftp_file.rb +196 -0
data/lib/remi/data_subjects/sub_job.rb +50 -0
data/lib/remi/dsl.rb +74 -0
data/lib/remi/encoder.rb +45 -0
data/lib/remi/extractor.rb +21 -0
data/lib/remi/field_symbolizers.rb +1 -0
data/lib/remi/job.rb +279 -113
data/lib/remi/job/parameters.rb +90 -0
data/lib/remi/job/sub_job.rb +35 -0
data/lib/remi/job/transform.rb +165 -0
data/lib/remi/loader.rb +22 -0
data/lib/remi/monkeys/daru.rb +4 -0
data/lib/remi/parser.rb +44 -0
data/lib/remi/testing/business_rules.rb +17 -23
data/lib/remi/testing/data_stub.rb +2 -2
data/lib/remi/version.rb +1 -1
data/remi.gemspec +3 -0
data/spec/data_subject_spec.rb +475 -11
data/spec/data_subjects/csv_file_spec.rb +69 -0
data/spec/data_subjects/data_frame_spec.rb +52 -0
data/spec/{extractor → data_subjects}/file_system_spec.rb +0 -0
data/spec/{extractor → data_subjects}/local_file_spec.rb +0 -0
data/spec/data_subjects/none_spec.rb +41 -0
data/spec/data_subjects/postgres_spec.rb +80 -0
data/spec/{extractor → data_subjects}/s3_file_spec.rb +0 -0
data/spec/data_subjects/salesforce_spec.rb +117 -0
data/spec/{extractor → data_subjects}/sftp_file_spec.rb +16 -0
data/spec/data_subjects/sub_job_spec.rb +33 -0
data/spec/encoder_spec.rb +38 -0
data/spec/extractor_spec.rb +11 -0
data/spec/fixtures/sf_bulk_helper_stubs.rb +443 -0
data/spec/job/transform_spec.rb +257 -0
data/spec/job_spec.rb +507 -0
data/spec/loader_spec.rb +11 -0
data/spec/parser_spec.rb +38 -0
data/spec/sf_bulk_helper_spec.rb +117 -0
data/spec/testing/data_stub_spec.rb +5 -3
metadata +109 -27
data/features/aggregate.feature +0 -42
data/jobs/aggregate_job.rb +0 -31
data/jobs/transforms/transform_jobs.rb +0 -4
data/lib/remi/data_subject/csv_file.rb +0 -162
data/lib/remi/data_subject/data_frame.rb +0 -52
data/lib/remi/data_subject/postgres.rb +0 -134
data/lib/remi/data_subject/salesforce.rb +0 -136
data/lib/remi/data_subject/sftp_file.rb +0 -65
data/lib/remi/extractor/file_system.rb +0 -92
data/lib/remi/extractor/local_file.rb +0 -43
data/lib/remi/extractor/s3_file.rb +0 -57
data/lib/remi/extractor/sftp_file.rb +0 -83
data/spec/data_subject/csv_file_spec.rb +0 -79
data/spec/data_subject/data_frame.rb +0 -27

data/lib/remi/data_subjects/csv_file.rb ADDED

@@ -0,0 +1,171 @@
+module Remi
+  # @api private
+  #
+  # Contains methods shared between CsvFile Parser/Encoder
+  module DataSubject::CsvFile
+    def self.included(base)
+      base.extend(CsvFileClassMethods)
+    end
+    module CsvFileClassMethods
+      def default_csv_options
+        @default_csv_options ||= CSV::DEFAULT_OPTIONS.merge({
+          headers: true,
+          header_converters: Remi::FieldSymbolizers[:standard],
+          converters: [],
+          col_sep: ',',
+          encoding: 'UTF-8',
+          quote_char: '"'
+        })
+      end
+    end
+  end
+  # @api public
+  #
+  # CsvFile parser
+  #
+  # @example
+  #
+  #  class MyJob < Remi::Job
+  #    source :some_file do
+  #      extractor Remi::Extractor::LocalFile.new(
+  #        remote_path: 'some_file.csv'
+  #      )
+  #      parser Remi::Parser::CsvFile.new(
+  #        csv_options: {
+  #          headers: true,
+  #          col_sep: '|'
+  #        }
+  #      )
+  #    end
+  #  end
+  #
+  #  job = MyJob.new
+  #  job.some_file.df
+  #  # =>#<Daru::DataFrame:70153153438500 @name = 4c59cfdd-7de7-4264-8666-83153f46a9e4 @size = 3>
+  #  #                    id       name
+  #  #          0          1     Albert
+  #  #          1          2      Betsy
+  #  #          2          3       Camu
+  class Parser::CsvFile < Parser
+    include Remi::DataSubject::CsvFile
+    # @param csv_options [Hash] Standard Ruby CSV parsing options.
+    # @param filename_field [Symbol] Name of the field to be used to write
+    #  the filename of the CSV being parsed (default: nil, meaning no field will be used)
+    # @param preprocessor [Proc] A proc used to pre-process lines of the CSV file before being parsed
+    def initialize(*args, **kargs, &block)
+      super
+      init_csv_file(*args, **kargs, &block)
+    end
+    # @return [Hash] Csv options hash
+    attr_reader :csv_options
+    # Converts a list of filenames into a dataframe after parsing them
+    # according ot the csv options that were set
+    # @param data [Object] Extracted data that needs to be parsed
+    # @return [Remi::DataFrame] The data converted into a dataframe
+    def parse(data)
+      # Assumes that each file has exactly the same structure
+      result_df = nil
+      Array(data).each_with_index do |filename, idx|
+        filename = filename.to_s
+        logger.info "Converting #{filename} to a dataframe"
+        processed_filename = preprocess(filename)
+        csv_df = Daru::DataFrame.from_csv processed_filename, @csv_options
+        csv_df[@filename_field] = Daru::Vector.new([filename] * csv_df.size, index: csv_df.index) if @filename_field
+        if idx == 0
+          result_df = csv_df
+        else
+          result_df = result_df.concat csv_df
+        end
+      end
+      Remi::DataFrame.create(:daru, result_df)
+    end
+    private
+    def preprocess(filename)
+      return filename unless @preprocessor
+      logger.info "Preprocessing #{filename}"
+      tmp_filename = File.join(Remi::Settings.work_dir, "#{Pathname.new(filename).basename}-#{SecureRandom.uuid}")
+      dirname = Pathname.new(tmp_filename).dirname
+      FileUtils.mkdir_p(dirname) unless File.directory? dirname
+      File.open(tmp_filename, 'w') do |outfile|
+        File.foreach(filename) do |in_line|
+          outfile.write @preprocessor.call(in_line)
+        end
+      end
+      tmp_filename
+    end
+    def init_csv_file(*args, csv_options: {}, filename_field: nil, preprocessor: nil, **kargs, &block)
+      @csv_options = self.class.default_csv_options.merge(csv_options)
+      @filename_field = filename_field
+      @preprocessor = preprocessor
+    end
+  end
+  # CsvFile Encoder
+  #
+  # @example
+  #  class MyJob < Remi::Job
+  #    target :my_target do
+  #      encoder Remi::Encoder::CsvFile.new(
+  #        csv_options: { col_sep: '|' }
+  #      )
+  #      loader Remi::Loader::LocalFile.new(
+  #        path: 'test.csv'
+  #      )
+  #    end
+  #  end
+  #
+  #  my_df = Daru::DataFrame.new({ a: 1.upto(5).to_a, b: 6.upto(10) })
+  #  job = MyJob.new
+  #  job.my_target.df = my_df
+  #  job.my_target.load
+  class Encoder::CsvFile < Encoder
+    include Remi::DataSubject::CsvFile
+    # @param work_path [String,Pathname] Path to a directory used to temporarily store CSV files (default: Settings.work_dir)
+    # @param csv_options [Hash] Standard Ruby CSV parser options.
+    def initialize(*args, **kargs, &block)
+      super
+      init_csv_file_encoder(*args, **kargs, &block)
+    end
+    default_csv_options[:row_sep] = "\n"
+    # @return [Hash] Csv options hash
+    attr_reader :csv_options
+    # Converts the dataframe to a CSV file stored in the local work directory.
+    #
+    # @param dataframe [Remi::DataFrame] The dataframe to be encoded
+    # @return [Object] The path to the file
+    def encode(dataframe)
+      logger.info "Writing CSV file to temporary location #{@working_file}"
+      dataframe.write_csv @working_file, @csv_options
+      @working_file
+    end
+    private
+    def init_csv_file_encoder(*args, work_path: Settings.work_dir, csv_options: {}, **kargs, &block)
+      @working_file = File.join(work_path, SecureRandom.uuid)
+      @csv_options = self.class.default_csv_options.merge(csv_options)
+    end
+  end
+end

data/lib/remi/data_subjects/data_frame.rb ADDED

@@ -0,0 +1,106 @@
+module Remi
+  # DataFrame extractor.
+  # This class is used to hard-code a dataframe as a simple array of rows.
+  #
+  # @example
+  #
+  #   class MyJob < Remi::Job
+  #     source :my_df do
+  #       fields ({ id: {}, name: {}})
+  #       extractor Remi::Extractor::DataFrame.new(
+  #         data: [
+  #           [1, 'Albert'],
+  #           [2, 'Betsy'],
+  #           [3, 'Camu']
+  #         ]
+  #       )
+  #       parser Remi::Parser::DataFrame.new
+  #     end
+  #   end
+  #
+  #   job = MyJob.new
+  #   job.my_df.df.inspect
+  #   # =>#<Daru::DataFrame:70153153438500 @name = 4c59cfdd-7de7-4264-8666-83153f46a9e4 @size = 3>
+  #   #                    id       name
+  #   #          0          1     Albert
+  #   #          1          2      Betsy
+  #   #          2          3       Camu
+  class Extractor::DataFrame < Extractor
+    # @param data [Array<Array>] An array of arrays representing rows of a dataframe.
+    def initialize(*args, **kargs, &block)
+      super
+      init_data_frame_extractor(*args, **kargs, &block)
+    end
+    attr_accessor :data
+    # @return [Object] self
+    def extract
+      self
+    end
+    private
+    def init_data_frame_extractor(*args, data: [], **kargs, &block)
+      @data = data
+    end
+  end
+  # DataFrame parser.
+  # In order for the data held by an Extractor::DataFrame to be parsed correctly,
+  # fields must be defined on the data subject.
+  #
+  # @example
+  #
+  #   class MyJob < Remi::Job
+  #     source :my_df do
+  #       fields ({ id: {}, name: {}})
+  #       extractor Remi::Extractor::DataFrame.new(
+  #         data: [
+  #           [1, 'Albert'],
+  #           [2, 'Betsy'],
+  #           [3, 'Camu']
+  #         ]
+  #       )
+  #       parser Remi::Parser::DataFrame.new
+  #     end
+  #   end
+  #
+  #   job = MyJob.new
+  #   job.my_df.df.inspect
+  #   # =>#<Daru::DataFrame:70153153438500 @name = 4c59cfdd-7de7-4264-8666-83153f46a9e4 @size = 3>
+  #   #                    id       name
+  #   #          0          1     Albert
+  #   #          1          2      Betsy
+  #   #          2          3       Camu
+  class Parser::DataFrame < Parser
+    # @param df_extract [Extractor::DataFrame] An object containing data extracted from memory
+    # @return [Remi::DataFrame] The data converted into a dataframe
+    def parse(df_extract)
+      Remi::DataFrame.create(:daru, df_extract.data.transpose, order: fields.keys)
+    end
+  end
+  # DataFrame encoder
+  class Encoder::DataFrame < Encoder
+    # Pass-through encoder: the dataframe is handed back unchanged.
+    # @param data_frame [Remi::DataFrame] The data_frame to be encoded
+    # @return [Object] The same data_frame object that was passed in
+    def encode(data_frame)
+      data_frame
+    end
+  end
+  # DataFrame loader
+  # Not sure this is needed, right?
+  # Maybe on SubJobs?
+  class Loader::DataFrame < Loader
+    # No-op load: the argument is ignored and nothing is written anywhere.
+    # @param data [Object] Encoded data (presumably the dataframe returned by
+    #   Encoder::DataFrame#encode - TODO confirm against callers)
+    # @return [true] Always
+    def load(data)
+      true
+    end
+  end
+end

data/lib/remi/data_subjects/file_system.rb ADDED

@@ -0,0 +1,115 @@
+module Remi
+  # Defines properties of an entry in a filesystem.
+  class Extractor::FileSystemEntry
+    # @param pathname [String] The path the file system entry
+    # @param create_time [Time] The time the entry was created
+    # @param modified_time [Time] The time the entry was last modified
+    # @param raw [Object] An object that captures all other aspects of the entry, native to system the entry lives on
+    def initialize(pathname:, create_time:, modified_time:, raw: nil)
+      @pathname = Pathname.new(pathname)
+      @create_time = create_time
+      @modified_time = modified_time
+      @raw = raw
+    end
+    attr_reader :pathname, :create_time, :modified_time, :raw
+    # @return [String] the base name of the entry
+    def name
+      @pathname.basename.to_s
+    end
+  end
+  # Parent class used to describe things that behave like file systems (e.g.,
+  # local file systems, ftp servers, S3 objects) to be used for extraction.
+  #
+  # @param remote_path [String] Path on the remote system that contains the files
+  # @param pattern [Regexp] Only files with a name that matches this regular
+  #  expression are extracted
+  # @param local_path [String] Local path to put copies of extracted files
+  # @param most_recent_only [true,false] Only extract the most recent file
+  #  that matches the given pattern
+  # @param group_by [Regexp] A regular expression used to group files together
+  #  and only extract the most recent file in each group
+  # @param most_recent_by [Symbol] Indicates the FileSystemEntry property used to determine which
+  #   file is the most recent (`:create_time` (default), `:modified_time`, `:name`)
+  class Extractor::FileSystem < Extractor
+    class FileNotFoundError < StandardError; end
+    def initialize(*args, **kargs, &block)
+      super
+      init_file_system(*args, **kargs)
+    end
+    attr_reader :remote_path
+    attr_reader :pattern
+    attr_reader :local_path
+    attr_reader :most_recent_only
+    attr_reader :group_by
+    attr_reader :most_recent_by
+    # Public: Called to extract files from the source filesystem.
+    #
+    # Returns an array with containing the paths to all files extracted.
+    def extract
+      raise NoMethodError, "#{__method__} not defined for#{self.class.name}"
+    end
+    # Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
+    # NOTE: all_entries is responsible for matching the path using @remote_path
+    def all_entries
+      raise NoMethodError, "#{__method__} not defined for#{self.class.name}"
+    end
+    # Public: Returns just the entries that are to be extracted.
+    def entries
+      if @group_by
+        most_recent_matching_entry_in_group
+      elsif @most_recent_only
+        Array(most_recent_matching_entry)
+      else
+        matching_entries
+      end
+    end
+    def matching_entries
+      all_entries.select { |e| @pattern.match e.name }
+    end
+    def most_recent_matching_entry
+      matching_entries.sort_by { |e| e.send(@most_recent_by) }.reverse.first
+    end
+    def most_recent_matching_entry_in_group
+      entries_with_group = matching_entries.map do |entry|
+        match = entry.name.match(@group_by)
+        next unless match
+        group = match.to_a[1..-1]
+        { group: group, entry: entry }
+      end.compact
+      sorted_entries_with_group = entries_with_group.sort_by { |e| [e[:group], e[:entry].send(@most_recent_by)] }.reverse
+      last_group = nil
+      sorted_entries_with_group.map do |entry|
+        next unless entry[:group] != last_group
+        last_group = entry[:group]
+        entry[:entry]
+      end.compact
+    end
+    private
+    def init_file_system(*args, remote_path:, pattern: /.*/, local_path: Settings.work_dir, most_recent_only: false, group_by: nil, most_recent_by: :create_time, **kargs, &block)
+      @remote_path = Pathname.new(remote_path)
+      @pattern = pattern
+      @local_path = Pathname.new(local_path)
+      @most_recent_only = most_recent_only
+      @group_by = group_by
+      @most_recent_by = most_recent_by
+    end
+  end
+end

data/lib/remi/data_subjects/local_file.rb ADDED

@@ -0,0 +1,109 @@
+module Remi
+  # Local file extractor
+  # Used to "extract" a file from a local filesystem.
+  # Note that even though the file is local, we still use the parameter `remote_path`
+  # to indicate the path.  This makes this class consistent with Remi::Extractor::FileSystem.
+  #
+  # @example
+  #
+  #  class MyJob < Remi::Job
+  #    source :some_file do
+  #      extractor Remi::Extractor::LocalFile.new(
+  #        remote_path: 'some_file.csv'
+  #      )
+  #      parser Remi::Parser::CsvFile.new(
+  #        csv_options: {
+  #          headers: true,
+  #          col_sep: '|'
+  #        }
+  #      )
+  #    end
+  #  end
+  #
+  #  job = MyJob.new
+  #  job.some_file.df
+  #  # =>#<Daru::DataFrame:70153153438500 @name = 4c59cfdd-7de7-4264-8666-83153f46a9e4 @size = 3>
+  #  #                    id       name
+  #  #          0          1     Albert
+  #  #          1          2      Betsy
+  #  #          2          3       Camu
+  class Extractor::LocalFile < Extractor::FileSystem
+    def initialize(*args, **kargs)
+      super
+      init_local_file(*args, **kargs)
+    end
+    # Called to extract files from the source filesystem.
+    # @return [Array<String>] An array of paths to a local copy of the files extacted
+    def extract
+      entries.map(&:pathname)
+    end
+    # @return [Array<Extractor::FileSystemEntry>] List of objects in the remote path
+    def all_entries
+      @all_entries ||= all_entries!
+    end
+    # @return [Array<Extractor::FileSystemEntry>] List of objects in the remote path
+    def all_entries!
+      dir = @remote_path.directory? ? @remote_path + '*' : @remote_path
+      Dir[dir].map do |entry|
+        path = Pathname.new(entry)
+        if path.file?
+          Extractor::FileSystemEntry.new(
+            pathname: path.realpath.to_s,
+            create_time: path.ctime,
+            modified_time: path.mtime
+          )
+        end
+      end.compact
+    end
+    private
+    def init_local_file(*args, **kargs)
+    end
+  end
+  # Local file loader
+  # Used to output files to a local filesystem
+  # @example
+  #  class MyJob < Remi::Job
+  #    target :my_target do
+  #      encoder Remi::Encoder::CsvFile.new(
+  #        csv_options: { col_sep: '|' }
+  #      )
+  #      loader Remi::Loader::LocalFile.new(
+  #        path: 'test.csv'
+  #      )
+  #    end
+  #  end
+  #
+  #  my_df = Daru::DataFrame.new({ a: 1.upto(5).to_a, b: 6.upto(10) })
+  #  job = MyJob.new
+  #  job.my_target.df = my_df
+  #  job.my_target.load
+  class Loader::LocalFile < Loader
+    def initialize(*args, **kargs)
+      super
+      init_local_file_loader(*args, **kargs)
+    end
+    # Moves the file from the temporary workspace to another local path
+    # @param data [Object] The path to the file in the temporary work location
+    # @return [true] On success
+    def load(data)
+      logger.info "Writing file #{@local_path}"
+      FileUtils.mv(data, @local_path)
+    end
+    private
+    def init_local_file_loader(*args, path:, **kargs)
+      @local_path = path
+    end
+  end
+end