RubyGems - remi - Versions diffs - 0.2.42 → 0.3.0 - Mend

remi 0.2.42 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (94) hide show

checksums.yaml +4 -4
data/.yardopts +7 -0
data/Gemfile +1 -1
data/Gemfile.lock +13 -26
data/README.md +1 -1
data/features/step_definitions/remi_step.rb +33 -13
data/features/sub_job_example.feature +24 -0
data/features/sub_transform_example.feature +35 -0
data/features/sub_transform_many_to_many.feature +49 -0
data/features/support/env_app.rb +1 -1
data/jobs/all_jobs_shared.rb +19 -16
data/jobs/copy_source_job.rb +11 -9
data/jobs/csv_file_target_job.rb +10 -9
data/jobs/json_job.rb +18 -14
data/jobs/metadata_job.rb +33 -28
data/jobs/parameters_job.rb +14 -11
data/jobs/sample_job.rb +106 -77
data/jobs/sftp_file_target_job.rb +14 -13
data/jobs/sub_job_example_job.rb +86 -0
data/jobs/sub_transform_example_job.rb +43 -0
data/jobs/sub_transform_many_to_many_job.rb +46 -0
data/jobs/transforms/concatenate_job.rb +16 -12
data/jobs/transforms/data_frame_sieve_job.rb +24 -19
data/jobs/transforms/date_diff_job.rb +15 -11
data/jobs/transforms/nvl_job.rb +16 -12
data/jobs/transforms/parse_date_job.rb +17 -14
data/jobs/transforms/partitioner_job.rb +27 -19
data/jobs/transforms/prefix_job.rb +13 -10
data/jobs/transforms/truncate_job.rb +14 -10
data/jobs/transforms/truthy_job.rb +11 -8
data/lib/remi.rb +25 -11
data/lib/remi/data_frame.rb +4 -4
data/lib/remi/data_frame/daru.rb +1 -37
data/lib/remi/data_subject.rb +234 -48
data/lib/remi/data_subjects/csv_file.rb +171 -0
data/lib/remi/data_subjects/data_frame.rb +106 -0
data/lib/remi/data_subjects/file_system.rb +115 -0
data/lib/remi/data_subjects/local_file.rb +109 -0
data/lib/remi/data_subjects/none.rb +31 -0
data/lib/remi/data_subjects/postgres.rb +186 -0
data/lib/remi/data_subjects/s3_file.rb +84 -0
data/lib/remi/data_subjects/salesforce.rb +211 -0
data/lib/remi/data_subjects/sftp_file.rb +196 -0
data/lib/remi/data_subjects/sub_job.rb +50 -0
data/lib/remi/dsl.rb +74 -0
data/lib/remi/encoder.rb +45 -0
data/lib/remi/extractor.rb +21 -0
data/lib/remi/field_symbolizers.rb +1 -0
data/lib/remi/job.rb +279 -113
data/lib/remi/job/parameters.rb +90 -0
data/lib/remi/job/sub_job.rb +35 -0
data/lib/remi/job/transform.rb +165 -0
data/lib/remi/loader.rb +22 -0
data/lib/remi/monkeys/daru.rb +4 -0
data/lib/remi/parser.rb +44 -0
data/lib/remi/testing/business_rules.rb +17 -23
data/lib/remi/testing/data_stub.rb +2 -2
data/lib/remi/version.rb +1 -1
data/remi.gemspec +3 -0
data/spec/data_subject_spec.rb +475 -11
data/spec/data_subjects/csv_file_spec.rb +69 -0
data/spec/data_subjects/data_frame_spec.rb +52 -0
data/spec/{extractor → data_subjects}/file_system_spec.rb +0 -0
data/spec/{extractor → data_subjects}/local_file_spec.rb +0 -0
data/spec/data_subjects/none_spec.rb +41 -0
data/spec/data_subjects/postgres_spec.rb +80 -0
data/spec/{extractor → data_subjects}/s3_file_spec.rb +0 -0
data/spec/data_subjects/salesforce_spec.rb +117 -0
data/spec/{extractor → data_subjects}/sftp_file_spec.rb +16 -0
data/spec/data_subjects/sub_job_spec.rb +33 -0
data/spec/encoder_spec.rb +38 -0
data/spec/extractor_spec.rb +11 -0
data/spec/fixtures/sf_bulk_helper_stubs.rb +443 -0
data/spec/job/transform_spec.rb +257 -0
data/spec/job_spec.rb +507 -0
data/spec/loader_spec.rb +11 -0
data/spec/parser_spec.rb +38 -0
data/spec/sf_bulk_helper_spec.rb +117 -0
data/spec/testing/data_stub_spec.rb +5 -3
metadata +109 -27
data/features/aggregate.feature +0 -42
data/jobs/aggregate_job.rb +0 -31
data/jobs/transforms/transform_jobs.rb +0 -4
data/lib/remi/data_subject/csv_file.rb +0 -162
data/lib/remi/data_subject/data_frame.rb +0 -52
data/lib/remi/data_subject/postgres.rb +0 -134
data/lib/remi/data_subject/salesforce.rb +0 -136
data/lib/remi/data_subject/sftp_file.rb +0 -65
data/lib/remi/extractor/file_system.rb +0 -92
data/lib/remi/extractor/local_file.rb +0 -43
data/lib/remi/extractor/s3_file.rb +0 -57
data/lib/remi/extractor/sftp_file.rb +0 -83
data/spec/data_subject/csv_file_spec.rb +0 -79
data/spec/data_subject/data_frame.rb +0 -27

data/lib/remi/data_subjects/sftp_file.rb ADDED

@@ -0,0 +1,196 @@
+module Remi
+  # Sftp File extractor
+  # Used to extract files from an SFTP server
+  #
+  # @example
+  #
+  # class MyJob < Remi::Job
+  #   source :some_file do
+  #     extractor Remi::Extractor::SftpFile.new(
+  #       credentials: {
+  #         host: 'coolserver.com',
+  #         username: 'myself',
+  #         password: 'secret'
+  #       },
+  #       remote_path: '/',
+  #       pattern: /^some_file_\d{14}\.csv/,
+  #       most_recent_only: true
+  #     )
+  #
+  #     parser Remi::Parser::CsvFile.new(
+  #       csv_options: {
+  #         headers: true,
+  #         col_sep: ','
+  #       }
+  #     )
+  #   end
+  # end
+  #
+  # job = MyJob.new
+  # job.some_file.df
+  #  # =>#<Daru::DataFrame:70153153438500 @name = 4c59cfdd-7de7-4264-8666-83153f46a9e4 @size = 3>
+  #  #                    id       name
+  #  #          0          1     Albert
+  #  #          1          2      Betsy
+  #  #          2          3       Camu
+  class Extractor::SftpFile < Extractor::FileSystem
+    N_RETRY = 3
+    # @param credentials [Hash] Options hash containing login credentials
+    # @option credentials [String] :host SFTP host (e.g., coolserver.com)
+    # @option credentials [String] :username SFTP username
+    # @option credentials [String] :password SFTP password
+    # @option credentials [String] :port SFTP port (default: '22')
+    def initialize(*args, **kargs, &block)
+      super
+      init_sftp_extractor(*args, **kargs)
+    end
+    attr_reader :host
+    attr_reader :username
+    attr_reader :password
+    attr_reader :port
+    # Called to extract files from the source filesystem.
+    # @return [Array<String>] An array of paths to a local copy of the files extracted
+    def extract
+      connection do |sftp|
+        entries.map do |entry|
+          local_file = File.join(@local_path, entry.name)
+          logger.info "Downloading #{entry.name} to #{local_file}"
+          retry_download { sftp.download!(File.join(@remote_path, entry.name), local_file) }
+          local_file
+        end
+      end
+    end
+    # @return [Array<Extractor::FileSystemEntry>] (Memoized) list of entries found in the remote path
+    def all_entries
+      @all_entries ||= all_entries!
+    end
+    # @return [Array<Extractor::FileSystemEntry>] List of entries found in the remote path (queries the server on every call; not memoized)
+    def all_entries!
+      sftp_entries = connection { |sftp| sftp.dir.entries(@remote_path) }
+      sftp_entries.map do |entry|
+        # Early versions of the protocol don't support create time, fake it with modified time?
+        FileSystemEntry.new(
+          pathname: File.join(@remote_path, entry.name),
+          create_time: entry.attributes.respond_to?(:createtime) ? entry.attributes.createtime : entry.attributes.mtime,
+          modified_time: entry.attributes.mtime
+        )
+      end
+    end
+    private
+    def init_sftp_extractor(*args, credentials:, **kargs)
+      @host     = credentials.fetch(:host)
+      @username = credentials.fetch(:username)
+      @password = credentials.fetch(:password)
+      @port     = credentials.fetch(:port, '22')
+    end
+    def connection(&block)
+      result = nil
+      Net::SFTP.start(@host, @username, password: @password, port: @port) do |sftp|
+        result = yield sftp
+      end
+      result
+    end
+    def retry_download(&block)
+      1.upto(N_RETRY).each do |itry|
+        begin
+          block.call
+          break
+        rescue RuntimeError => err
+          raise err unless itry < N_RETRY
+          logger.error "Download failed with error: #{err.message}"
+          logger.error "Retry attempt #{itry}/#{N_RETRY-1}"
+          sleep(1)
+        end
+      end
+    end
+  end
+  # SFTP file loader
+  #
+  # @example
+  #  class MyJob < Remi::Job
+  #    target :my_target do
+  #      encoder Remi::Encoder::CsvFile.new(
+  #        csv_options: { col_sep: '|' }
+  #      )
+  #      loader Remi::Loader::SftpFile.new(
+  #        credentials: { },
+  #        remote_path: 'some_test.csv'
+  #      )
+  #      loader Remi::Loader::SftpFile.new(
+  #        credentials: { },
+  #        remote_path: 'some_other_test.csv'
+  #      )
+  #    end
+  #  end
+  #
+  #  my_df = Daru::DataFrame.new({ a: 1.upto(5).to_a, b: 6.upto(10).to_a })
+  #  job = MyJob.new
+  #  job.my_target.df = my_df
+  #  job.my_target.load
+  class Loader::SftpFile < Loader
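+    # @param credentials [Hash] Login credentials (:host, :username, :password, and optionally :port, which defaults to '22')
+    # @param remote_path [String, Pathname] Full path to the file to be created on the target filesystem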
+    def initialize(*args, **kargs, &block)
+      super
+      init_sftp_loader(*args, **kargs, &block)
+    end
+    attr_reader :remote_path
+    # Copies data to the SFTP Server
+    # @param data [Object] The path to the file in the temporary work location
+    # @return [true] On success
+    def load(data)
+      logger.info "Uploading #{data} to #{@credentials[:username]}@#{@credentials[:host]}: #{@remote_path}"
+      connection do |sftp|
+        retry_upload { sftp.upload! data, @remote_path }
+      end
+      true
+    end
+    private
+    def init_sftp_loader(*args, credentials:, remote_path:, **kargs, &block)
+      @credentials = credentials
+      @remote_path = remote_path
+    end
+    def connection(&block)
+      result = nil
+      Net::SFTP.start(@credentials[:host], @credentials[:username], password: @credentials[:password], port: @credentials[:port] || '22') do |sftp|
+        result = yield sftp
+      end
+      result
+    end
+    def retry_upload(ntry=2, &block)
+      1.upto(ntry).each do |itry|
+        begin
+          block.call
+          break
+        rescue RuntimeError => err
+          raise err unless itry < ntry
+          logger.error "Upload failed with error: #{err.message}"
+          logger.error "Retry attempt #{itry}/#{ntry-1}"
+          sleep(1)
+        end
+      end
+    end
+  end
+end

data/lib/remi/data_subjects/sub_job.rb ADDED

@@ -0,0 +1,50 @@
+module Remi
+  class Extractor::SubJob < Extractor
+    # @param sub_job [Object] The name (relative to parent job) of the subjob to use
+    # @param data_subject [Symbol] The name (relative to the sub job) of the sub job's data frame
+    def initialize(*args, **kargs, &block)
+      super
+      init_sub_job_extractor(*args, **kargs, &block)
+    end
+    attr_accessor :sub_job, :data_subject
+    def extract
+      sub_job.job.send(data_subject).df
+    end
+    private
+    def init_sub_job_extractor(*args, sub_job:, data_subject:, **kargs, &block)
+      @sub_job = sub_job
+      @data_subject = data_subject
+    end
+  end
+  class Loader::SubJob < Loader
+    # @param sub_job [Object] The name (relative to parent job) of the subjob to use
+    # @param data_subject [Symbol] The name (relative to the sub job) of the sub job's data frame
+    def initialize(*args, **kargs, &block)
+      super
+      init_sub_job_loader(*args, **kargs, &block)
+    end
+    attr_accessor :sub_job, :data_subject
+    # @param data_frame [Object] Data frame to load to target sub job data subject
+    # @return [true] On success
+    def load(data_frame)
+      sub_job.job.send(data_subject).df = data_frame
+      true
+    end
+    private
+    def init_sub_job_loader(*args, sub_job:, data_subject:, **kargs, &block)
+      @sub_job = sub_job
+      @data_subject = data_subject
+    end
+  end
+end

data/lib/remi/dsl.rb ADDED

@@ -0,0 +1,74 @@
+module Remi
+  # @api private
+  #
+  # A namespace for functions relating to the execution of a block against a
+  # proxy object.
+  #
+  # Much of this code was borrowed from [Docile](https://github.com/ms-ati/docile)
+  # and was modified to support different fallback contexts.
+  # @see Docile [Docile](https://github.com/ms-ati/docile)
+  module Dsl
+    # Execute a block in the context of an object whose methods represent the
+    # commands in a DSL, using a specific proxy class.
+    #
+    # @param dsl          [Object] context object whose methods make up the
+    #                              (initial) DSL
+    # @param fallback_dsl [Object] context object that the DSL should fall back
+    #                              to if the primary context fails to resolve
+    # @param proxy_type   [FallbackContextProxy, ChainingFallbackContextProxy]
+    #                              which class to instantiate as proxy context
+    # @param args         [Array]  arguments to be passed to the block
+    # @param block        [Proc]   the block of DSL commands to be executed
+    # @return             [Object] the return value of the block
+    def exec_in_proxy_context(dsl, fallback_dsl, proxy_type, *args, &block)
+      block_context = fallback_dsl
+      proxy_context = proxy_type.new(dsl, block_context)
+      begin
+        block_context.instance_variables.each do |ivar|
+          value_from_block = block_context.instance_variable_get(ivar)
+          proxy_context.instance_variable_set(ivar, value_from_block)
+        end
+        proxy_context.instance_exec(*args, &block)
+      ensure
+        block_context.instance_variables.each do |ivar|
+          value_from_dsl_proxy = proxy_context.instance_variable_get(ivar)
+          block_context.instance_variable_set(ivar, value_from_dsl_proxy)
+        end
+      end
+    end
+    module_function :exec_in_proxy_context
+    # Execute a block in the context of an object whose methods represent the
+    # commands in a DSL.
+    #
+    # @note Use with an *imperative* DSL (commands modify the context object)
+    #
+    # Use this method to execute an *imperative* DSL, which means that:
+    #
+    #   1. Each command mutates the state of the DSL context object
+    #   2. The return value of each command is ignored
+    #   3. The final return value is the original context object
+    #
+    #
+    # @param dsl            [Object] context object whose methods make up the DSL
+    # @param fallback_dsl   [Object] context object that the DSL should fallback to
+    # @param args           [Array]  arguments to be passed to the block
+    # @param block          [Proc]   the block of DSL commands to be executed against the
+    #                                `dsl` context object
+    # @return               [Object] the `dsl` context object after executing the block
+    def dsl_eval(dsl, fallback_dsl, *args, &block)
+      exec_in_proxy_context(dsl, fallback_dsl, Docile::FallbackContextProxy, *args, &block)
+      dsl
+    end
+    module_function :dsl_eval
+    def dsl_return(dsl, fallback_dsl, *args, &block)
+      exec_in_proxy_context(dsl, fallback_dsl, Docile::FallbackContextProxy, *args, &block)
+    end
+    module_function :dsl_return
+  end
+end

data/lib/remi/encoder.rb ADDED

@@ -0,0 +1,45 @@
+module Remi
+  # An encoder is an object that converts a dataframe into a form that can
+  # be used by a Remi::Loader.  This is a parent class meant to be
+  # inherited by child classes that define specific ways to encode
+  # data.
+  class Encoder
+    # @param context [Object] The context (e.g., DataTarget) for the encoder (default: `nil`)
+    # @param field_symbolizer [Proc] The field symbolizer to use for this encoder
+    # @param fields [Remi::Fields] A hash of field metadata to be used by the encoder
+    def initialize(*args, context: nil, field_symbolizer: Remi::FieldSymbolizers[:standard], fields: Remi::Fields.new({}), logger: Remi::Settings.logger, **kargs, &block)
+      @context = context
+      @field_symbolizer = field_symbolizer
+      @fields = fields
+      @logger = logger
+    end
+    attr_accessor :context
+    attr_accessor :logger
+    attr_writer :field_symbolizer
+    attr_writer :fields
+    # Any child classes need to define an encode method that converts the
+    # data subject's dataframe into a structure that can be loaded into the
+    # target system.
+    # @param dataframe [Remi::DataFrame] The dataframe to be encoded
+    # @return [Object] The encoded data to be loaded into the target
+    def encode(dataframe)
+      raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
+    end
+    # @return [Proc] The field symbolizer (uses the context field symbolizer if defined)
+    def field_symbolizer
+      return context.field_symbolizer if context if context.respond_to? :field_symbolizer
+      @field_symbolizer
+    end
+    # @return [Remi::Fields] The fields (uses the context fields if defined)
+    def fields
+      return context.fields if context if context.respond_to? :fields
+      @fields
+    end
+  end
+end

data/lib/remi/extractor.rb ADDED

@@ -0,0 +1,21 @@
+module Remi
+  # An extractor is an object meant to extract data from some external system.
+  # This is a parent class meant to be inherited by child classes that
+  # define specific ways to extract data.
+  class Extractor
+    def initialize(*args, logger: Remi::Settings.logger, **kargs, &block)
+      @logger = logger
+    end
+    # @return [Object] The logger object used by the extractor
+    attr_accessor :logger
+    # Any child classes need to define an extract method that returns data
+    # in a format that an appropriate parser can use to convert into a dataframe
+    def extract
+      raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
+    end
+  end
+end

data/lib/remi/field_symbolizers.rb CHANGED

@@ -12,6 +12,7 @@ module Remi
                                            gsub(/[^0-9a-zA-Z_.]+/, "").to_sym
         }
       }
     end
   end
 end

data/lib/remi/job.rb CHANGED

@@ -1,176 +1,342 @@
 module Remi
-  module Job
-    module JobClassMethods
-      attr_accessor :params
-      attr_accessor :sources
-      attr_accessor :targets
-      attr_accessor :transforms
-      def define_param(key, value)
-        @params ||= Hash.new { |h, key| raise "Parameter #{key} is not defined" }
-        @params[key] = value
+  # The Job class is the foundation for all Remi ETL jobs.  It
+  # provides a DSL for defining Remi jobs in a way that is natural for
+  # ETL style applications.  In a Remi job, the user defines all of
+  # the sources, transforms, and targets necessary to transform data.
+  # Any number of sources, transforms, and targets can be defined.
+  # Transforms can call other parameterized sub-transforms.  Jobs can
+  # collect data from other parameterized sub-jobs, pass data to other
+  # sub-jobs, or both pass and collect data from other sub-jobs.
+  #
+  # Jobs are executed by calling the `#execute` method in an instance
+  # of the job.  This triggers all transforms to be executed in the
+  # order they are defined.  Sub-transforms are only executed if they
+  # are referenced in a transform.  After all transforms have
+  # executed, the targets are loaded in the order they are defined.
+  #
+  #
+  #
+  # @example
+  #
+  #   class MyJob < Remi::Job
+  #     source :my_csv_file do
+  #       extractor my_extractor
+  #       parser my_parser
+  #       enforce_types
+  #     end
+  #
+  #     target :my_transformed_file do
+  #       loader my_loader
+  #     end
+  #
+  #     transform :transform_data do
+  #       # Data sources are converted into a dataframe the first time the #df method is called.
+  #       transform_work = my_csv_file.df.dup # => a copy of the my_csv_file.df dataframe
+  #
+  #       # Any arbitrary Ruby is allowed in a transform block.  Remi provides a convenient
+  #       # source to target map DSL to map fields from sources to targets
+  #       Remi::SourceToTargetMap.apply(transform_work, my_transformed_file.df) do
+  #         map source(:source_field_id) .target(:prefixed_id)
+  #           .transform(->(v) { "PREFIX#{v}" })
+  #       end
+  #     end
+  #   end
+  #
+  #   # The job is executed when `#execute` is called on an instance of the job.
+  #   # Transforms are executed in the order they are defined.  Targets are loaded
+  #   # in the order they are defined after all transforms have been executed.
+  #   job = MyJob.new
+  #   job.execute
+  #
+  #
+  #
+  # @todo MOAR Examples!  Subtransforms, subjobs, parameters, references to even more
+  #   complete sample jobs.
+  class Job
+    class << self
+      def inherited(base)
+        base.instance_variable_set(:@params, params.clone)
+        base.instance_variable_set(:@sources, sources.dup)
+        base.instance_variable_set(:@targets, targets.dup)
+        base.instance_variable_set(:@transforms, transforms.dup)
+        base.instance_variable_set(:@sub_jobs, sub_jobs.dup)
       end
-      def define_source(name, type_class, **options)
+      # @return [Job::Parameters] all parameters defined at the class level
+      def params
+        @params ||= Parameters.new
+      end
+      # Defines a job parameter.
+      # @example
+      #
+      #   class MyJob < Job
+      #     param(:my_param) { 'the best parameter' }
+      #   end
+      #
+      #   job = MyJob.new
+      #   job.params[:my_param] #=> 'the best parameter'
+      def param(name, &block)
+        params.__define__(name, &block)
+      end
+      # @return [Array<Symbol>] the list of data source names
+      def sources
         @sources ||= []
-        @sources << name unless @sources.include? name
+      end
-        define_method(name) do
-          iv_name = instance_variable_get("@#{name}")
-          return iv_name if iv_name
-          source = type_class.new(options)
+      # @return [Array<Symbol>] the list of sub-jobs
+      def sub_jobs
+        @sub_jobs ||= []
+      end
+      # Defines a sub job resource for this job.
+      # Note that the return value of the DSL block must be an instance of a Remi::Job
+      # @example
+      #
+      #   class MyJob < Job
+      #     sub_job(:my_sub_job) { MySubJob.new }
+      #   end
+      #
+      #   job = MyJob.new
+      #   job.my_sub_job.job #=> An instance of MySubJob
+      def sub_job(name, &block)
+        sub_jobs << name unless sub_jobs.include? name
+        attr_accessor name
+        define_method("__init_#{name}__".to_sym) do
+          sub_job = Job::SubJob.new(self, name: name, &block)
+          instance_variable_set("@#{name}", sub_job)
+        end
+      end
+      # Defines a data source.
+      # @example
+      #
+      #   class MyJob < Job
+      #     source :my_source do
+      #       extractor my_extractor
+      #       parser my_parser
+      #     end
+      #   end
+      #
+      #   job = MyJob.new
+      #   job.my_source.df #=> a dataframe generated after extracting and parsing
+      def source(name, &block)
+        sources << name unless sources.include? name
+        attr_accessor name
+        define_method("__init_#{name}__".to_sym) do
+          source = DataSource.new(self, name: name, &block)
           instance_variable_set("@#{name}", source)
         end
       end
-      def define_target(name, type_class, **options)
+      # @return [Array<Symbol>] the list of data target names
+      def targets
         @targets ||= []
-        @targets << name unless @targets.include? name
-        define_method(name) do
-          iv_name = instance_variable_get("@#{name}")
-          return iv_name if iv_name
+      end
-          target = type_class.new(options)
+      # Defines a data target.
+      # @example
+      #
+      #   class MyJob < Job
+      #     target :my_target do
+      #       encoder my_encoder
+      #       loader my_loader
+      #     end
+      #   end
+      #
+      #   job = MyJob.new
+      #   job.my_target.df = my_df
+      #   job.my_target.load #=> encodes my_df and loads it into the target
+      def target(name, &block)
+        targets << name unless targets.include? name
+        attr_accessor name
+        define_method("__init_#{name}__".to_sym) do
+          target = DataTarget.new(self, name: name, &block)
           instance_variable_set("@#{name}", target)
         end
       end
-      def define_transform(name, sources: [], targets: [], &block)
-        @transforms ||= {}
-        @transforms[name] = { sources: Array(sources), targets: Array(targets) }
+      # @return [Array<Symbol>] the list of transform names
+      def transforms
+        @transforms ||= []
+      end
-        define_method(name) do
-          instance_eval { @logger.info "Running transformation #{__method__}" }
-          instance_eval(&block)
+      # Defines a transform.
+      # @example
+      #
+      #   class MyJob < Job
+      #     transform :my_transform do
+      #       puts "hello from my_transform!"
+      #     end
+      #   end
+      #
+      #   job = MyJob.new
+      #   job.my_transform.execute #=>(stdout) 'hello from my_transform!'
+      def transform(name, &block)
+        transforms << name unless transforms.include? name
+        attr_accessor name
+        define_method("__init_#{name}__".to_sym) do
+          transform = Transform.new(self, name: name, &block)
+          instance_variable_set("@#{name}", transform)
         end
       end
-      def params
-        @params || {}
+      # Defines a sub-transform.
+      # @example
+      #
+      #   class MyJob < Job
+      #     sub_transform :my_sub_transform, greeting: 'hello' do
+      #       puts "#{params[:greeting]} from my_sub_transform!"
+      #     end
+      #
+      #     transform :my_transform do
+      #       import :my_sub_transform, greeting: 'bonjour' do
+      #       end
+      #     end
+      #   end
+      #
+      #   job = MyJob.new
+      #   job.my_transform.execute #=>(stdout) 'bonjour from my_sub_transform!'
+      def sub_transform(name, **kargs, &block)
+        define_method(name) do
+          Transform.new(self, name: name, **kargs, &block)
+        end
       end
+    end
-      def sources
-        @sources || []
-      end
+    # Initializes the job
+    #
+    # @param work_dir [String, Path] sets the working directory for this job
+    # @param logger [Object] sets the logger for the job
+    # @param kargs [Hash] Optional job parameters (can be referenced in the job via `#params`)
+    def initialize(work_dir: Settings.work_dir, logger: Settings.logger, **kargs)
+      @work_dir = work_dir
+      @logger = logger
+      create_work_dir
-      def targets
-        @targets || []
-      end
+      __init_params__ **kargs
+      __init_sub_jobs__
+      __init_sources__
+      __init_targets__
+      __init_transforms__
+    end
-      def transforms
-        @transforms || {}
-      end
+    # @return [String] the working directory used for temporary data
+    attr_reader :work_dir
+    # @return [Object] the logging object
+    attr_reader :logger
-      def work_dir
-        Settings.work_dir
-      end
+    # @return [Job::Parameters] parameters defined at the class level or during instantiation
+    attr_reader :params
-      def self.extended(receiver)
-      end
+    # @return [Array] list of sub_jobs defined in the job
+    attr_reader :sub_jobs
-      def included(receiver)
-        receiver.extend(JobClassMethods)
-        receiver.params     = self.params.merge(receiver.params)
-        receiver.sources    = self.sources + receiver.sources
-        receiver.targets    = self.targets + receiver.targets
-        receiver.transforms = self.transforms.merge(receiver.transforms)
-      end
-    end
+    # @return [Array] list of sources defined in the job
+    attr_reader :sources
-    def self.included(receiver)
-      receiver.extend(JobClassMethods)
-    end
+    # @return [Array] list of targets defined in the job
+    attr_reader :targets
+    # @return [Array] list of transforms defined in the job
+    attr_reader :transforms
-    def params
-      self.class.params
-    end
-    def sources
-      self.class.sources
+    # Creates a temporary working directory for the job
+    def create_work_dir
+      @logger.info "Creating working directory #{work_dir}"
+      FileUtils.mkdir_p work_dir
     end
-    def targets
-      self.class.targets
+    # @return [self] the job object (needed to reference parent job in transform DSL)
+    def job
+      self
     end
-    def transforms
-      self.class.transforms
+    def to_s
+      inspect
     end
+    def inspect
+      "#<#{Remi::Job}>: #{self.class}\n" +
+        "  parameters: #{params.to_h.keys}\n" +
+        "  sources: #{sources}\n" +
+        "  targets: #{targets}\n" +
+        "  transforms: #{transforms}"
+    end
-    def initialize(runtime_params: {}, delete_work_dir: true, logger: Settings.logger)
-      @runtime_params = runtime_params
-      @delete_work_dir = delete_work_dir
-      @logger = logger
-      create_work_dir
+    # Execute the specified components of the job.
+    #
+    # @param components [Array<Symbol>] list of components to execute (e.g., `:transforms`, `:load_targets`)
+    #
+    # @return [self]
+    def execute(*components)
+      execute_transforms if components.empty? || components.include?(:transforms)
+      execute_load_targets if components.empty? || components.include?(:load_targets)
+      self
     end
-    attr_accessor :runtime_params
+    private
-    def work_dir
-      self.class.work_dir
+    def __init_params__(**kargs)
+      @params = self.class.params.clone
+      add_params **kargs
+      params.context = self
     end
-    def finalize
-      delete_work_dir
+    def __init_sub_jobs__
+      @sub_jobs = self.class.sub_jobs
+      @sub_jobs.each do |sub_job|
+        send("__init_#{sub_job}__".to_sym)
+      end
     end
-    def delete_work_dir
-      if @delete_work_dir && (work_dir.match /^#{Dir.tmpdir}/)
-        @logger.info "Deleting temporary directory #{work_dir}"
-        FileUtils.rm_r work_dir
-      else
-        @logger.debug "Not going to delete working directory #{work_dir}"
-        nil
+    def __init_sources__
+      @sources = self.class.sources
+      @sources.each do |source|
+        send("__init_#{source}__".to_sym)
       end
     end
-    def create_work_dir
-      @logger.info "Creating working directory #{work_dir}"
-      FileUtils.mkdir_p work_dir
+    def __init_targets__
+      @targets = self.class.targets
+      @targets.each do |target|
+        send("__init_#{target}__".to_sym)
+      end
     end
-    # Public: Runs any transforms that use the sources and targets selected.  If
-    # source and target is not specified, then all transforms will be run.
-    # If only the source is specified, then all transforms that use any of the
-    # sources will be run.  Same for specified transforms.
-    #
-    # sources - Array of source names
-    # targets - Array of target names
-    #
-    # Returns an array containing the result of each transform.
-    def run_transforms_using(sources: nil, targets: nil)
-      transforms.map do |t, st|
-        selected_sources = (st[:sources] & Array(sources || st[:sources])).size > 0
-        selected_targets = (st[:targets] & Array(targets || st[:targets])).size > 0
-        self.send(t) if selected_sources && selected_targets
+    def __init_transforms__
+      @transforms = self.class.transforms
+      @transforms.each do |transform|
+        send("__init_#{transform}__".to_sym)
       end
     end
-    def run_all_transforms
-      transforms.map { |t, st| self.send(t) }
+    # Executes all transforms defined
+    def execute_transforms
+      transforms.map { |t| send(t).execute }
+      self
     end
-    def load_all_targets
-      targets.each do |target|
-        @logger.info "Loading target #{target}"
-        self.send(target).tap { |t| t.respond_to?(:load) ? t.load : nil }
-      end
+    # Loads all targets defined
+    def execute_load_targets
+      targets.each { |t| send(t).load }
+      self
     end
-    # Public: Runs all transforms defined in the job.
-    #
-    # Returns the job instance.
-    def run
-      # Do all of the stuff here
-      run_all_transforms
-      load_all_targets
-      self
+    # Adds all parameters listed to the job parameters
+    def add_params(**kargs)
+      kargs.each { |k,v| params[k] = v }
     end
   end
 end