ductr 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +14 -0
- data/.vscode/settings.json +18 -0
- data/COPYING +674 -0
- data/COPYING.LESSER +165 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +121 -0
- data/README.md +37 -0
- data/Rakefile +37 -0
- data/bin/console +15 -0
- data/bin/setup +8 -0
- data/ductr.gemspec +50 -0
- data/exe/ductr +24 -0
- data/lib/ductr/adapter.rb +94 -0
- data/lib/ductr/cli/default.rb +25 -0
- data/lib/ductr/cli/main.rb +60 -0
- data/lib/ductr/cli/new_project_generator.rb +72 -0
- data/lib/ductr/cli/templates/project/bin_ductr.rb +7 -0
- data/lib/ductr/cli/templates/project/config_app.rb +5 -0
- data/lib/ductr/cli/templates/project/config_development.yml +8 -0
- data/lib/ductr/cli/templates/project/config_environment_development.rb +18 -0
- data/lib/ductr/cli/templates/project/gemfile.rb +6 -0
- data/lib/ductr/cli/templates/project/rubocop.yml +14 -0
- data/lib/ductr/cli/templates/project/tool-versions +1 -0
- data/lib/ductr/configuration.rb +145 -0
- data/lib/ductr/etl/controls/buffered_destination.rb +65 -0
- data/lib/ductr/etl/controls/buffered_transform.rb +76 -0
- data/lib/ductr/etl/controls/control.rb +46 -0
- data/lib/ductr/etl/controls/destination.rb +28 -0
- data/lib/ductr/etl/controls/paginated_source.rb +47 -0
- data/lib/ductr/etl/controls/source.rb +21 -0
- data/lib/ductr/etl/controls/transform.rb +28 -0
- data/lib/ductr/etl/fiber_control.rb +136 -0
- data/lib/ductr/etl/fiber_runner.rb +68 -0
- data/lib/ductr/etl/kiba_runner.rb +26 -0
- data/lib/ductr/etl/parser.rb +115 -0
- data/lib/ductr/etl/runner.rb +37 -0
- data/lib/ductr/etl_job.rb +161 -0
- data/lib/ductr/job.rb +58 -0
- data/lib/ductr/job_etl_runner.rb +37 -0
- data/lib/ductr/job_status.rb +56 -0
- data/lib/ductr/kiba_job.rb +130 -0
- data/lib/ductr/log/formatters/color_formatter.rb +48 -0
- data/lib/ductr/log/logger.rb +169 -0
- data/lib/ductr/log/outputs/file_output.rb +30 -0
- data/lib/ductr/log/outputs/standard_output.rb +39 -0
- data/lib/ductr/pipeline.rb +133 -0
- data/lib/ductr/pipeline_runner.rb +95 -0
- data/lib/ductr/pipeline_step.rb +92 -0
- data/lib/ductr/registry.rb +55 -0
- data/lib/ductr/rufus_trigger.rb +106 -0
- data/lib/ductr/scheduler.rb +117 -0
- data/lib/ductr/store/job_serializer.rb +59 -0
- data/lib/ductr/store/job_store.rb +59 -0
- data/lib/ductr/store/pipeline_serializer.rb +106 -0
- data/lib/ductr/store/pipeline_store.rb +48 -0
- data/lib/ductr/store.rb +81 -0
- data/lib/ductr/trigger.rb +49 -0
- data/lib/ductr/version.rb +6 -0
- data/lib/ductr.rb +143 -0
- data/sig/ductr.rbs +1107 -0
- metadata +292 -0
# frozen_string_literal: true

module Ductr
  #
  # Base class for ETL jobs backed by kiba's streaming runner.
  #
  # Controls (sources, transforms, lookups and destinations) are declared with
  # annotations on plain instance methods. Example using the SQLite adapter:
  #
  #   class MyKibaJob < Ductr::KibaJob
  #     source :some_adapter, :paginated, page_size: 4
  #     def select_some_stuff(db, offset, limit)
  #       db[:items].offset(offset).limit(limit)
  #     end
  #
  #     lookup :some_adapter, :match, merge: [:id, :item], buffer_size: 4
  #     def merge_with_stuff(db, ids)
  #       db[:items_bis].select(:id, Sequel.as(:name, :name_bis), :item).where(item: ids)
  #     end
  #
  #     transform
  #     def generate_more_stuff(row)
  #       { name: "#{row[:name]}_#{row[:name_bis]}" }
  #     end
  #
  #     destination :some_other_adapter, :basic
  #     def my_destination(row, db)
  #       logger.trace("Hello destination: #{row}")
  #       db[:new_items].insert(name: row[:name])
  #     end
  #   end
  #
  # @see The chosen adapter documentation for further information on controls usage.
  #
  class KibaJob < Job
    # @return [Class] The ETL runner class used by the job
    ETL_RUNNER_CLASS = ETL::KibaRunner
    include JobETLRunner

    include ETL::Parser

    #
    # @!method self.source(adapter_name, source_type, **source_options)
    #   Annotation declaring the annotated method as an ETL source.
    #   @param adapter_name [Symbol] The adapter the source is running on
    #   @param source_type [Symbol] The type of source to run
    #   @param **source_options [Hash<Symbol: Object>] The options to pass to the source
    #
    #   @example Source with Sequel SQLite adapter
    #     source :my_adapter, :paginated, page_size: 42
    #     def my_source(db, offset, limit)
    #       db[:items].offset(offset).limit(limit)
    #     end
    #
    #   @see The chosen adapter documentation for further information on sources usage.
    #
    #   @return [void]
    #
    annotable :source

    #
    # @!method self.transform(transform_class, **transform_options)
    #   Annotation declaring the annotated method as an ETL transform.
    #   @param transform_class [Class, nil] The class the transform is running on
    #   @param **transform_options [Hash<Symbol: Object>] The options to pass to the transform
    #
    #   @example Transform without params
    #     transform
    #     def rename_keys(row)
    #       row[:new_name] = row.delete[:old_name]
    #       row[:new_email] = row.delete[:old_email]
    #     end
    #
    #   @example Transform with params
    #     class RenameTransform < Ductr::ETL::Transform
    #       def process(row)
    #         call_method.each do |actual_name, new_name|
    #           new_key = "#{options[:prefix]}#{new_name}".to_sym
    #
    #           row[new_key] = row.delete(actual_name)
    #         end
    #       end
    #     end
    #
    #     transform RenameTransform, prefix: "some_"
    #     def rename
    #       { old_name: :new_name, old_email: :new_email }
    #     end
    #
    #   @return [void]
    #
    annotable :transform

    #
    # @!method self.lookup(adapter_name, lookup_type, **lookup_options)
    #   Annotation declaring the annotated method as an ETL lookup.
    #   @param adapter_name [Symbol] The adapter the lookup is running on
    #   @param lookup_type [Symbol] The type of lookup to run
    #   @param **lookup_options [Hash<Symbol: Object>] The options to pass to the lookup
    #
    #   @example Lookup with Sequel SQLite adapter
    #     lookup :my_other_adapter, :match, merge: [:id, :item], buffer_size: 4
    #     def joining_different_adapters(db, ids)
    #       db[:items_bis].select(:id, :item, :name).where(item: ids)
    #     end
    #
    #   @see The chosen adapter documentation for further information on lookups usage.
    #
    #   @return [void]
    #
    annotable :lookup

    #
    # @!method self.destination(adapter_name, destination_type, **destination_options)
    #   Annotation declaring the annotated method as an ETL destination.
    #   @param adapter_name [Symbol] The adapter the destination is running on
    #   @param destination_type [Symbol] The type of destination to run
    #   @param **destination_options [Hash<Symbol: Object>] The options to pass to the destination
    #
    #   @example Destination with Sequel SQLite adapter
    #     destination :my_other_adapter, :basic
    #     def my_destination(row, db)
    #       db[:new_items].insert(name: row[:name], new_name: row[:new_name])
    #     end
    #
    #   @see The chosen adapter documentation for further information on destinations usage.
    #
    #   @return [void]
    #
    annotable :destination
  end
end
# frozen_string_literal: true

require "logger"
require "colorized_string"

module Ductr
  module Log
    #
    # A log formatter which colorizes the text with ANSI colors.
    #
    class ColorFormatter < ::Logger::Formatter
      # @return [Hash{String => Symbol, Hash}] The color to apply per severity label.
      #   FATAL maps to a background color instead of a foreground one.
      #   Hoisted to a frozen constant so the hash isn't rebuilt on every log line.
      LEVEL_COLORS = {
        "DEBUG" => :green, "INFO" => :cyan, "WARN" => :yellow, "ERROR" => :red, "FATAL" => { background: :red }
      }.freeze

      #
      # Colorizes the given log entry.
      #
      # @param [Integer] level The log's severity level
      # @param [Time] time The log's timestamp
      # @param [Symbol] prog_name The log's "program" name, used to add job method name to the log
      # @param [String] message The log's message
      #
      # @return [String] The formatted log
      #
      def call(level, time, prog_name, message)
        format(format_str(level), level[0], format_datetime(time), Process.pid, level, prog_name, msg2str(message))
      end

      private

      #
      # Generates the colorized format string based on the log level.
      #
      # @param [String] level The log level
      #
      # @return [String] The colored format string
      #
      def format_str(level)
        timestamp = ColorizedString["%s, [%s #%d]"].colorize(:light_black)
        level_name = ColorizedString["%5s"].colorize(LEVEL_COLORS[level])
        prog_name = ColorizedString["%s:"].colorize(:blue)

        "#{timestamp} #{level_name} -- #{prog_name} %s\n"
      end
    end
  end
end
# frozen_string_literal: true

require "logger"

module Ductr
  module Log
    #
    # A ractor compatible logger to be used inside jobs or anywhere else in your ductr project.
    #
    class Logger
      # @return [Hash{Symbol => Integer}] Mapping between symbolic level names and
      #   the stdlib ::Logger severity integers.
      LEVELS = {
        debug: ::Logger::DEBUG,
        info: ::Logger::INFO,
        warn: ::Logger::WARN,
        error: ::Logger::ERROR,
        fatal: ::Logger::FATAL
      }.freeze

      class << self
        #
        # Allows to add another log output.
        # Making possible to write logs in multiple places at the same time, e.g. in STDOUT and in logs files
        #
        # @param [StandardOutput] output The new output to write logs to
        # @param [::Logger::Formatter] formatter The formatter to use when writing logs
        # @param [Hash] **options The formatter options
        #
        # @return [void]
        #
        def add_output(output, formatter = ::Logger::Formatter, **options)
          @outputs ||= []
          @outputs.push([output, [formatter, options]])
        end

        #
        # The configured outputs list
        #
        # @return [Array<Array<StandardOutput, Array<::Logger::Formatter, Hash>>>]
        #   The list of outputs with their formatters and configurations
        #
        def outputs
          @outputs || [[StandardOutput, [::Logger::Formatter]]]
        end

        #
        # Configure the logging level.
        #
        # @param [Symbol, String] lvl The desired logging level
        #
        # @raise [ArgumentError] When the given level is unknown
        # @return [void]
        #
        def level=(lvl)
          new_level = LEVELS[lvl.to_s.downcase.to_sym]
          # Validate before assigning: the previous implementation wrote the lookup
          # result into @level first, so an invalid value wiped the configured
          # level (falling back to DEBUG) even though it raised.
          raise ArgumentError, "invalid log level: #{lvl}" unless new_level

          @level = new_level
        end

        #
        # @return [Integer] The current logging level, default ::Logger::DEBUG
        #
        def level
          @level || ::Logger::DEBUG
        end
      end

      #
      # Create configured outputs instances, meaning that you can't add outputs in an already instantiated logger.
      #
      def initialize(prog_name = nil)
        @prog_name = prog_name

        @outputs = self.class.outputs.map do |output_with_params|
          out, params = *output_with_params
          formatter, options = *params

          out.new(formatter, **options || {})
        end
      end

      #
      # Logs a message with the `debug` level.
      #
      # @param [String] message The message to log
      # @param [String, Symbol] prog_name The program name of the message
      #
      # @return [void]
      # @yield The message
      #
      def debug(...)
        write(::Logger::DEBUG, ...)
      end

      #
      # Logs a message with the `info` level.
      #
      # @param [String] message The message to log
      # @param [String, Symbol] prog_name The program name of the message
      #
      # @return [void]
      # @yield The message
      #
      def info(...)
        write(::Logger::INFO, ...)
      end

      #
      # Logs a message with the `warn` level.
      #
      # @param [String] message The message to log
      # @param [String, Symbol] prog_name The program name of the message
      #
      # @return [void]
      # @yield The message
      #
      def warn(...)
        write(::Logger::WARN, ...)
      end

      #
      # Logs a message with the `error` level.
      #
      # @param [String] message The message to log
      # @param [String, Symbol] prog_name The program name of the message
      #
      # @return [void]
      # @yield The message
      #
      def error(...)
        write(::Logger::ERROR, ...)
      end

      #
      # Logs a message with the `fatal` level.
      #
      # @param [String] message The message to log
      # @param [String, Symbol] prog_name The program name of the message
      #
      # @return [void]
      # @yield The message
      #
      def fatal(...)
        write(::Logger::FATAL, ...)
      end

      private

      #
      # Writes the message with the given level into all outputs.
      #
      # @param [Integer] severity The severity level of the message
      # @param [String] message The message to write
      # @param [String] prog_name The program name of the message
      #
      # @return [void]
      #
      def write(severity, message = nil, prog_name = nil, &)
        return if severity < self.class.level

        message ||= yield

        called_method = "#{@prog_name}##{caller_locations(2, 1).first.label}"
        prog_name ||= @prog_name.is_a?(Class) ? called_method : @prog_name

        @outputs.each do |output|
          output.write severity, prog_name, message
        end
      end
    end
  end
end
# frozen_string_literal: true

require "fileutils"

module Ductr
  module Log
    #
    # An output to write logs in a file
    #
    class FileOutput < StandardOutput
      #
      # Creates the output with the given formatter, path and options
      #
      # @param [::Logger::Formatter] formatter The formatter to use when writing logs
      # @param [String] path The path to write the logs
      # @param [Hash] **options The options to write files
      #
      # @see The ruby's logger documentation to get options documentation
      #
      def initialize(formatter, path:, **options) # rubocop:disable Lint/MissingSuper
        FileUtils.mkdir_p(File.dirname(path))
        # `FileUtils.touch` creates the file when missing without leaking a file
        # descriptor: the previous `File.new(path, "w")` opened the file and
        # never closed it.
        FileUtils.touch(path)

        @formatter = formatter.new
        @log_device = ::Logger::LogDevice.new path, **options
      end
    end
  end
end
# frozen_string_literal: true

require "logger"

module Ductr
  module Log
    #
    # The STDOUT logger output
    #
    class StandardOutput
      # @return [Array<String>] The labels to associate to severity integers
      SEVERITY_LABELS = %w[DEBUG INFO WARN ERROR FATAL ANY].freeze

      #
      # Creates a logger output instance
      #
      # @param [::Logger::Formatter] formatter The formatter to use to write the logs in STDOUT
      # @param [Hash] **options The LogDevice options
      #
      def initialize(formatter, **options)
        @formatter = formatter.new
        @log_device = ::Logger::LogDevice.new($stdout, **options)
      end

      #
      # Writes the log to the STDOUT
      #
      # @param [Integer] severity The log's severity level
      # @param [Symbol] prog_name The "program" name, used to add job method name to the log
      # @param [String] message The log message
      #
      # @return [void]
      #
      def write(severity, prog_name, message)
        label = SEVERITY_LABELS[severity]
        formatted = @formatter.call(label, Time.now, prog_name, message)

        @log_device.write(formatted)
      end
    end
  end
end
# frozen_string_literal: true

module Ductr
  #
  # Pipelines allows to easily declare rich data pipelines.
  #
  # By using the `after` annotation, you can define steps execution hierarchy.
  #
  # `sync` and `async` are useful to define job sequences inside step methods.
  #
  # `Pipeline` inherits from `Job` which means that pipeline are enqueued as any other job.
  # Pipelines are enqueued in the :ductr_pipelines queue.
  #
  #   class MyPipeline < Ductr::Pipeline
  #     def first_step
  #       sync(MyJob, 1)
  #       async(SomeJob) # Executed when `MyJob` is done
  #     end
  #
  #     after :first_step
  #     def first_parallel_step # Returns when all three `HelloJob` are done
  #       async(HelloJob, :one)
  #       async(HelloJob, :two)
  #       async(HelloJob, :three)
  #     end
  #
  #     after :first_step
  #     def second_parallel_step # Executed concurrently with :first_parallel_step
  #       async(SomeJob)
  #       async(SomeOtherJob)
  #       sync(HelloJob, :one) # Executed when `SomeJob` and `SomeOtherJob` are done
  #     end
  #
  #     after :first_parallel_step, :second_parallel_step
  #     def last_step # Executed when `first_parallel_step` and `second_parallel_step` jobs are done
  #       sync(ByeJob)
  #     end
  #   end
  #
  # You can define pipelines with only one step by using `after` annotation without parameter:
  #
  #   class MonoStepPipeline < Ductr::Pipeline
  #     after
  #     def unique_step
  #       async(MyJob)
  #       async(MyJob)
  #     end
  #   end
  #
  # A pipeline can inherit from another, allowing you to overload and add steps to the parent pipeline:
  #
  #   class InheritPipeline < MonoStepPipeline
  #     after :unique_step
  #     def not_that_unique
  #       async(MyJob)
  #     end
  #   end
  #
  class Pipeline < Job
    #
    # @!method self.after
    #   Annotation to define preceding steps on a pipeline step method.
    #   @params *step_names [Array<Symbol>] The preceding steps methods names
    #   @example
    #     after :some_step_method, :some_other_step_method
    #     def my_step
    #       # ...
    #     end
    #
    #   @return [void]
    #
    annotable :after

    queue_as :ductr_pipelines

    # @return [PipelineRunner] The pipeline's runner instance
    attr_reader :runner

    #
    # @!method run
    #   Starts the pipeline runner.
    #   @return [void]
    #
    def_delegators :@runner, :run

    #
    # Initializes the pipeline runner
    #
    def initialize(...)
      super(...)

      @runner = PipelineRunner.new(self)
    end

    #
    # Puts the given job in the queue and waits for it to be done.
    #
    # @param [Class<Job>] job_class The job to enqueue
    # @param [Array<Object>] *params The job's params
    #
    # @return [void]
    #
    def sync(job_class, *params)
      step = @runner.current_step

      step.flush_jobs
      step.enqueue_job(job_class.new(*params))
      step.flush_jobs
    end

    #
    # Enqueues the given job.
    #
    # @param [Class<Job>] job_class The job to enqueue
    # @param [Array<Object>] *params The job's params
    #
    # @return [void]
    #
    def async(job_class, *params)
      @runner.current_step.enqueue_job(job_class.new(*params))
    end

    #
    # Writes the pipeline's status into the Ductr's store.
    #
    # @param [Symbol] status The status of the job
    #
    # @return [void]
    #
    def status=(status)
      @status = status
      Store.write_pipeline(self)
    end
  end
end
# frozen_string_literal: true

module Ductr
  #
  # In charge to parse pipeline annotations, initializing and running pipeline steps.
  #
  class PipelineRunner
    # @return [Float] Time to wait in second before resuming all alive steps
    TICK = 0.1

    # @return [Array<PipelineStep>] All the steps declared in the pipeline
    attr_reader :steps
    # @return [Array<PipelineStep>] The remaining steps to run
    attr_reader :remaining_steps

    #
    # Parses and initializes the given pipeline's steps.
    #
    # @param [Pipeline] pipeline The pipeline to parse and run.
    #
    def initialize(pipeline)
      annotated_methods = pipeline.class.annotated_methods

      @steps = step_names(annotated_methods).map do |name|
        PipelineStep.new(pipeline, name)
      end

      annotated_methods.each do |method|
        step_by(name: method.name).left = method.find_annotation(:after).params.map do |left_step_name|
          step_by(name: left_step_name)
        end
      end

      @remaining_steps = @steps.dup
    end

    #
    # Actually runs the pipeline.
    # Resumes step's fibers until they are all finished.
    #
    # @return [void]
    #
    def run
      until @remaining_steps.empty?
        # Prune finished steps before resuming: the previous implementation
        # called `@remaining_steps.delete(step)` inside `each`, and deleting
        # while iterating makes `each` skip the element right after each
        # deletion, delaying those steps by a tick.
        @remaining_steps.select!(&:alive?)
        break if @remaining_steps.empty?

        @remaining_steps.each(&:resume)

        sleep(TICK)
      end
    end

    #
    # Returns the current step based on fiber execution context.
    #
    # @return [PipelineStep] The currently running step.
    #
    def current_step
      step_by fiber: Fiber.current
    end

    #
    # Parses given annotated methods and extract all step names.
    #
    # @param [Array<Annotable::AnnotatedMethod>] annotated_methods The annotated method to parse
    #
    # @return [Array<Symbol>] The declared step's names
    #
    def step_names(annotated_methods)
      annotated_methods.flat_map do |method|
        [method.name, *method.find_annotation(:after).params]
      end.uniq
    end

    #
    # Finds a step corresponding to the given name and value.
    #
    # @example Finds a step named `my-step`
    #   step_by(name: :my_step)
    #
    # @param [Hash<Symbol: Object>] **name_and_val Step attribute's name and value
    #
    # @return [PipelineStep, Nil] Found step if any
    #
    def step_by(**name_and_val)
      name, value = *name_and_val.to_a.first

      steps.find do |step|
        step.send(name) == value
      end
    end
  end
end