shikibu 0.1.0
This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +487 -0
- data/lib/shikibu/activity.rb +135 -0
- data/lib/shikibu/app.rb +299 -0
- data/lib/shikibu/channels.rb +360 -0
- data/lib/shikibu/constants.rb +70 -0
- data/lib/shikibu/context.rb +208 -0
- data/lib/shikibu/errors.rb +137 -0
- data/lib/shikibu/integrations/active_job.rb +95 -0
- data/lib/shikibu/integrations/sidekiq.rb +104 -0
- data/lib/shikibu/locking.rb +110 -0
- data/lib/shikibu/middleware/rack_app.rb +197 -0
- data/lib/shikibu/notify/notify_base.rb +67 -0
- data/lib/shikibu/notify/pg_notify.rb +217 -0
- data/lib/shikibu/notify/wake_event.rb +56 -0
- data/lib/shikibu/outbox/relayer.rb +227 -0
- data/lib/shikibu/replay.rb +361 -0
- data/lib/shikibu/retry_policy.rb +81 -0
- data/lib/shikibu/storage/migrations.rb +179 -0
- data/lib/shikibu/storage/sequel_storage.rb +883 -0
- data/lib/shikibu/version.rb +5 -0
- data/lib/shikibu/worker.rb +389 -0
- data/lib/shikibu/workflow.rb +398 -0
- data/lib/shikibu.rb +152 -0
- data/schema/LICENSE +21 -0
- data/schema/README.md +57 -0
- data/schema/db/migrations/mysql/20251217000000_initial_schema.sql +284 -0
- data/schema/db/migrations/postgresql/20251217000000_initial_schema.sql +284 -0
- data/schema/db/migrations/sqlite/20251217000000_initial_schema.sql +284 -0
- data/schema/docs/column-values.md +91 -0
- metadata +231 -0
data/lib/shikibu/outbox/relayer.rb
@@ -0,0 +1,227 @@
# frozen_string_literal: true

require 'net/http'
require 'json'

module Shikibu
  module Outbox
    # Background relayer for publishing outbox events to external message brokers.
    #
    # The relayer polls the database for pending events and publishes them
    # as CloudEvents to a configured HTTP endpoint. It implements exponential
    # backoff for retries and graceful shutdown.
    #
    # @example
    #   relayer = Shikibu::Outbox::Relayer.new(
    #     storage: storage,
    #     broker_url: 'http://broker-ingress.default.svc.cluster.local'
    #   )
    #   relayer.start
    #   # ... later ...
    #   relayer.stop
    #
    class Relayer
      DEFAULT_POLL_INTERVAL = 1.0
      DEFAULT_MAX_RETRIES = 3
      DEFAULT_BATCH_SIZE = 10
      MAX_BACKOFF = 30.0
      HTTP_OPEN_TIMEOUT = 10
      HTTP_READ_TIMEOUT = 30

      attr_reader :storage, :broker_url, :poll_interval, :max_retries, :batch_size, :max_age_hours

      def initialize(storage:, broker_url:, wake_event: nil, poll_interval: DEFAULT_POLL_INTERVAL,
                     max_retries: DEFAULT_MAX_RETRIES, batch_size: DEFAULT_BATCH_SIZE, max_age_hours: nil)
        @storage = storage
        @broker_url = URI.parse(broker_url)
        @wake_event = wake_event
        @poll_interval = poll_interval
        @max_retries = max_retries
        @batch_size = batch_size
        @max_age_hours = max_age_hours
        @running = false
        @thread = nil
      end

      def start
        return if @running

        @running = true
        @thread = Thread.new { poll_loop }
        log_info("started (broker=#{@broker_url}, poll_interval=#{@poll_interval}s)")
      end

      def stop
        return unless @running

        @running = false
        @wake_event&.signal # Wake up if waiting
        @thread&.join(5)
        log_info('stopped')
      end

      def running?
        @running
      end

      private

      def poll_loop
        consecutive_empty = 0

        while @running
          begin
            count = poll_and_publish
            consecutive_empty = count.zero? ? consecutive_empty + 1 : 0
          rescue StandardError => e
            log_error('poll_loop', e)
            consecutive_empty = 0 # Reset on error to avoid long backoffs
          end

          backoff = calculate_backoff(consecutive_empty)
          wait_with_wake(backoff)
        end
      end

      def poll_and_publish
        events = @storage.get_pending_outbox_events(limit: @batch_size)
        return 0 if events.empty?

        log_debug("processing #{events.size} pending outbox events")

        events.each do |event|
          break unless @running

          publish_event(event)
        end

        events.size
      end

      def publish_event(event)
        event_id = event[:event_id]
        retry_count = event[:retry_count] || 0

        # Check max age
        if expired?(event)
          @storage.mark_outbox_expired(event_id, "Exceeded max age (#{@max_age_hours} hours)")
          log_warn("event #{event_id} exceeded max age, marking as expired")
          return
        end

        # Check max retries
        if retry_count >= @max_retries
          @storage.mark_outbox_invalid(event_id, "Exceeded max retries (#{@max_retries})")
          log_warn("event #{event_id} exceeded max retries, marking as invalid")
          return
        end

        # Build and send CloudEvent
        response = send_cloud_event(event)

        case response
        when Net::HTTPSuccess
          @storage.mark_outbox_published(event_id)
          log_info("published event #{event_id}")
        when Net::HTTPClientError
          # 4xx errors are permanent failures
          @storage.mark_outbox_invalid(event_id, "HTTP #{response.code}: #{response.message}")
          log_error_msg("permanent error for event #{event_id}: HTTP #{response.code}")
        else
          # 5xx or other errors are retryable
          @storage.mark_outbox_failed(event_id, "HTTP #{response.code}: #{response.message}")
          retry_msg = "retry #{retry_count + 1}/#{@max_retries}"
          log_warn("server error for event #{event_id} (#{retry_msg}): HTTP #{response.code}")
        end
      rescue SocketError, Errno::ECONNREFUSED, Errno::ETIMEDOUT, Net::OpenTimeout, Net::ReadTimeout => e
        # Network errors are retryable
        @storage.mark_outbox_failed(event_id, "#{e.class}: #{e.message}")
        log_warn("network error for event #{event_id} (retry #{retry_count + 1}/#{@max_retries}): #{e.message}")
      rescue StandardError => e
        # Unknown errors are retryable (safety net)
        @storage.mark_outbox_failed(event_id, "#{e.class}: #{e.message}")
        log_error('publish_event', e)
      end

      def send_cloud_event(event)
        http = Net::HTTP.new(@broker_url.host, @broker_url.port)
        http.use_ssl = @broker_url.scheme == 'https'
        http.open_timeout = HTTP_OPEN_TIMEOUT
        http.read_timeout = HTTP_READ_TIMEOUT

        path = @broker_url.path.empty? ? '/' : @broker_url.path
        request = Net::HTTP::Post.new(path)
        request['Content-Type'] = 'application/cloudevents+json'

        cloud_event = {
          specversion: '1.0',
          id: event[:event_id],
          type: event[:event_type],
          source: event[:event_source],
          datacontenttype: event[:content_type] || 'application/json',
          time: format_time(event[:created_at]),
          data: event[:event_data]
        }

        request.body = cloud_event.to_json
        http.request(request)
      end

      def format_time(time)
        return time.iso8601 if time.respond_to?(:iso8601)

        time.to_s
      end

      def expired?(event)
        return false unless @max_age_hours

        created_at = event[:created_at]
        return false unless created_at

        age_hours = (Time.now - created_at) / 3600.0
        age_hours > @max_age_hours
      end

      def calculate_backoff(consecutive_empty)
        return @poll_interval if consecutive_empty.zero?

        # Exponential backoff: 2s, 4s, 8s, 16s, max 30s
        exp = [consecutive_empty, 4].min
        backoff = @poll_interval * (2**exp)
        jitter = rand * backoff * 0.3
        [backoff + jitter, MAX_BACKOFF].min
      end

      def wait_with_wake(backoff)
        if @wake_event
          @wake_event.wait(backoff)
        else
          sleep(backoff)
        end
      end

      def log_info(message)
        warn "[Shikibu::Outbox::Relayer] #{message}"
      end

      def log_warn(message)
        warn "[Shikibu::Outbox::Relayer] WARNING: #{message}"
      end

      def log_debug(message)
        # Only log in debug mode if needed
        # warn "[Shikibu::Outbox::Relayer] DEBUG: #{message}"
      end

      def log_error(context, error)
        warn "[Shikibu::Outbox::Relayer] ERROR in #{context}: #{error.class}: #{error.message}"
        warn error.backtrace.first(5).join("\n") if error.backtrace
      end

      def log_error_msg(message)
        warn "[Shikibu::Outbox::Relayer] ERROR: #{message}"
      end
    end
  end
end
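For reference, `send_cloud_event` above POSTs a structured-mode CloudEvents 1.0 envelope with `Content-Type: application/cloudevents+json`. A minimal sketch of the payload it would build for a hypothetical outbox row (all field values below are illustrative, not taken from the gem):

require 'json'
require 'time'

# Hypothetical outbox row, shaped like the event hashes the relayer reads from storage.
event = {
  event_id: '018f3c2a-0000-7000-8000-000000000000',
  event_type: 'order.shipped',
  event_source: '/workflows/order-fulfillment',
  content_type: 'application/json',
  created_at: Time.now.utc,
  event_data: { order_id: 42 }
}

# Mirrors the envelope assembled in Relayer#send_cloud_event.
cloud_event = {
  specversion: '1.0',
  id: event[:event_id],
  type: event[:event_type],
  source: event[:event_source],
  datacontenttype: event[:content_type] || 'application/json',
  time: event[:created_at].iso8601,
  data: event[:event_data]
}

# Prints the JSON body the relayer would send to the broker endpoint.
puts cloud_event.to_json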
data/lib/shikibu/replay.rb
@@ -0,0 +1,361 @@
# frozen_string_literal: true

module Shikibu
  # Orchestrates workflow execution with deterministic replay
  class ReplayEngine
    attr_reader :storage, :worker_id, :hooks

    def initialize(storage:, worker_id:, hooks: nil)
      @storage = storage
      @worker_id = worker_id
      @hooks = hooks
    end

    # Start a new workflow instance
    # @param workflow_class [Class] Workflow class
    # @param instance_id [String] Instance ID
    # @param input [Hash] Input parameters
    # @return [Object, nil] Workflow result or nil if suspended
    def start_workflow(workflow_class, instance_id:, **input)
      # Save workflow definition
      storage.save_workflow_definition(
        workflow_name: workflow_class.workflow_name,
        source_hash: workflow_class.source_hash,
        source_code: workflow_class.source_code
      )

      # Create instance record
      storage.create_instance(
        instance_id: instance_id,
        workflow_name: workflow_class.workflow_name,
        source_hash: workflow_class.source_hash,
        owner_service: 'default',
        input_data: input,
        status: Status::RUNNING
      )

      # Execute the workflow
      execute_workflow(instance_id, workflow_class, input, replaying: false)
    end

    # Resume a workflow from its current state
    # @param instance_id [String] Instance ID
    # @return [Object, nil] Workflow result or nil if suspended
    def resume_workflow(instance_id)
      instance = storage.get_instance(instance_id)
      raise WorkflowNotFoundError, instance_id unless instance

      # Handle crash recovery for compensating workflows (Romancy/Edda compatible)
      return resume_compensating_workflow(instance_id) if instance[:status] == Status::COMPENSATING

      workflow_class = Shikibu.get_workflow(instance[:workflow_name])
      raise WorkflowNotRegisteredError, instance[:workflow_name] unless workflow_class

      # Load history and build cache
      history = storage.get_history(instance_id)
      history_cache = build_history_cache(history)

      execute_workflow(
        instance_id,
        workflow_class,
        instance[:input_data],
        replaying: true,
        history_cache: history_cache
      )
    end

    # Resume a workflow that was in compensating state when it crashed
    # This executes remaining compensations from DB using the global registry
    # @param instance_id [String] Instance ID
    # @return [nil]
    def resume_compensating_workflow(instance_id)
      # Acquire lock
      raise LockNotAcquiredError, instance_id unless storage.try_acquire_lock(instance_id, worker_id, timeout: 300)

      begin
        execute_compensations_from_db(instance_id)

        # Clear compensations and update status
        storage.clear_compensations(instance_id)
        storage.update_instance_status(instance_id, Status::FAILED)

        nil
      ensure
        storage.release_lock(instance_id, worker_id)
      end
    end

    # Execute compensations from DB (for crash recovery)
    # Uses global registry to find compensation functions
    # @param instance_id [String] Instance ID
    def execute_compensations_from_db(instance_id)
      compensations = storage.get_compensations(instance_id)

      # Get already executed compensation IDs from history (idempotency)
      history = storage.get_history(instance_id)
      executed_ids = history
                     .select { |e| e[:event_type] == EventType::COMPENSATION_EXECUTED }
                     .map { |e| e[:data]&.dig(:compensation_id) }
                     .compact
                     .to_set

      # Execute each compensation in order (already LIFO from DB)
      compensations.each do |comp|
        next if executed_ids.include?(comp[:id])

        execute_compensation_from_registry(instance_id, comp)
      end
    end

    # Execute a single compensation from registry
    # @param instance_id [String] Instance ID
    # @param comp [Hash] Compensation record from DB
    def execute_compensation_from_registry(instance_id, comp)
      compensation_fn = Shikibu.get_compensation(comp[:activity_name])

      if compensation_fn.nil?
        # Inline block or unregistered compensation - cannot recover, skip with warning
        record_compensation_skipped(instance_id, comp)
        return
      end

      # Execute the compensation
      args = comp[:args] || {}
      symbolized_args = args.transform_keys(&:to_sym)
      compensation_fn.call(nil, **symbolized_args)

      # Record success
      storage.append_history(
        instance_id: instance_id,
        activity_id: "compensation:#{comp[:id]}",
        event_type: EventType::COMPENSATION_EXECUTED,
        event_data: {
          compensation_id: comp[:id],
          activity_id: comp[:activity_id],
          activity_name: comp[:activity_name]
        }
      )
    rescue StandardError => e
      # Record failure but continue
      storage.append_history(
        instance_id: instance_id,
        activity_id: "compensation:#{comp[:id]}",
        event_type: EventType::COMPENSATION_FAILED,
        event_data: {
          compensation_id: comp[:id],
          activity_id: comp[:activity_id],
          activity_name: comp[:activity_name],
          error_type: e.class.name,
          error_message: e.message
        }
      )
    end

    # Record that a compensation was skipped (inline or unregistered)
    def record_compensation_skipped(instance_id, comp)
      storage.append_history(
        instance_id: instance_id,
        activity_id: "compensation:#{comp[:id]}",
        event_type: EventType::COMPENSATION_FAILED,
        event_data: {
          compensation_id: comp[:id],
          activity_id: comp[:activity_id],
          activity_name: comp[:activity_name],
          error_type: 'CompensationNotFound',
          error_message: "Compensation '#{comp[:activity_name]}' not found in registry " \
                         '(inline blocks cannot be recovered after crash)'
        }
      )
    end

    private

    def execute_workflow(instance_id, workflow_class, input, replaying:, history_cache: {})
      # Acquire lock
      lock_timeout = workflow_class.lock_timeout
      unless storage.try_acquire_lock(instance_id, worker_id, timeout: lock_timeout)
        raise LockNotAcquiredError, instance_id
      end

      begin
        # Create context
        ctx = WorkflowContext.new(
          instance_id: instance_id,
          workflow_name: workflow_class.workflow_name,
          worker_id: worker_id,
          storage: storage,
          hooks: hooks,
          history_cache: history_cache,
          replaying: replaying
        )

        # Call hooks
        hooks&.on_workflow_start&.call(instance_id, workflow_class.workflow_name, input)

        # Create and execute workflow
        workflow = workflow_class.allocate
        workflow.instance_variable_set(:@pending_compensations, [])
        workflow.context = ctx

        # Symbolize input keys
        symbolized_input = symbolize_keys(input)
        result = workflow.execute(**symbolized_input)

        # Mark completed
        storage.update_instance_status(instance_id, Status::COMPLETED, output_data: result)
        storage.clear_compensations(instance_id)

        # Cleanup direct subscriptions
        cleanup_subscriptions(ctx)

        # Call hooks
        hooks&.on_workflow_complete&.call(instance_id, workflow_class.workflow_name, result)

        result
      rescue WaitForTimerSignal => e
        handle_timer_suspend(instance_id, e)
        nil
      rescue WaitForChannelSignal => e
        handle_channel_suspend(instance_id, e)
        nil
      rescue RecurSignal => e
        handle_recur(instance_id, workflow_class, e)
        nil
      rescue WorkflowCancelledError
        storage.update_instance_status(instance_id, Status::CANCELLED)
        hooks&.on_workflow_cancelled&.call(instance_id, workflow_class.workflow_name)
        raise
      rescue StandardError => e
        handle_failure(instance_id, workflow_class, workflow, e)
        raise
      ensure
        storage.release_lock(instance_id, worker_id)
      end
    end

    def handle_timer_suspend(instance_id, signal)
      # Update status
      storage.update_instance_status(
        instance_id,
        Status::WAITING_FOR_TIMER,
        current_activity_id: signal.activity_id
      )

      # Register timer
      storage.register_timer(
        instance_id: instance_id,
        timer_id: signal.timer_id,
        expires_at: signal.expires_at,
        activity_id: signal.activity_id
      )
    end

    def handle_channel_suspend(instance_id, signal)
      # Update status
      storage.update_instance_status(
        instance_id,
        Status::WAITING_FOR_MESSAGE,
        current_activity_id: signal.activity_id
      )

      # Update subscription with activity_id and timeout
      storage.subscribe_to_channel(
        instance_id: instance_id,
        channel: signal.channel,
        mode: signal.mode,
        activity_id: signal.activity_id,
        timeout_at: signal.timeout_at
      )
    end

    def handle_recur(instance_id, workflow_class, signal)
      # Archive history
      storage.archive_history(instance_id)

      # Update instance with new input and reset status
      storage.update_instance_status(instance_id, Status::RECURRED)

      # Create new instance with continued_from link
      new_instance_id = SecureRandom.uuid
      storage.create_instance(
        instance_id: new_instance_id,
        workflow_name: workflow_class.workflow_name,
        source_hash: workflow_class.source_hash,
        owner_service: 'default',
        input_data: signal.new_input,
        status: Status::RUNNING
      )

      # The new instance will be picked up by the worker
    end

    def handle_failure(instance_id, workflow_class, workflow, error)
      # Record failure in history
      storage.append_history(
        instance_id: instance_id,
        activity_id: 'workflow_failed',
        event_type: EventType::WORKFLOW_FAILED,
        event_data: {
          error_type: error.class.name,
          error_message: error.message
        }
      )

      # Update status to compensating
      storage.update_instance_status(instance_id, Status::COMPENSATING)

      # Execute compensations via workflow instance (LIFO order)
      workflow.run_compensations

      # Clear compensation records from database
      storage.clear_compensations(instance_id)

      # Update final status
      storage.update_instance_status(instance_id, Status::FAILED)

      # Call hooks
      hooks&.on_workflow_failed&.call(instance_id, workflow_class.workflow_name, error)
    end

    def cleanup_subscriptions(ctx)
      ctx.direct_subscriptions.each do |channel|
        storage.unsubscribe_from_channel(
          instance_id: ctx.instance_id,
          channel: channel
        )
      end
    end

    def build_history_cache(history)
      cache = {}
      history.each do |event|
        event_type = event[:event_type]
        data = event[:data]

        cache[event[:activity_id]] = case event_type
                                     when EventType::CHANNEL_MESSAGE_RECEIVED, EventType::MESSAGE_TIMEOUT,
                                          EventType::TIMER_EXPIRED
                                       # For channel messages and timer events, preserve the full data structure
                                       { event_type: event_type, data: data }
                                     else
                                       # For activity completed/failed
                                       {
                                         event_type: event_type,
                                         result: data&.dig(:result),
                                         error_type: data&.dig(:error_type),
                                         error_message: data&.dig(:error_message)
                                       }
                                     end
      end
      cache
    end

    def symbolize_keys(hash)
      return hash unless hash.is_a?(Hash)

      hash.transform_keys do |key|
        key.is_a?(String) ? key.to_sym : key
      end
    end
  end
end
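Note that `execute_workflow` never calls `new` on the workflow class: it uses `allocate`, seeds `@pending_compensations`, injects the context via `context=`, and then calls `execute`. A minimal hypothetical stand-in showing only the interface the engine relies on (in the gem this surface is presumably supplied by the base class in data/lib/shikibu/workflow.rb, which is not part of this hunk; all names below are illustrative):

# Hypothetical stand-in exposing just what ReplayEngine#execute_workflow touches.
class FakeWorkflow
  # Class-level metadata persisted via save_workflow_definition / create_instance.
  def self.workflow_name
    'fake_workflow'
  end

  def self.source_hash
    'demo-hash'
  end

  def self.source_code
    'def execute(**input); end'
  end

  # Seconds requested for the per-instance lock in try_acquire_lock.
  def self.lock_timeout
    300
  end

  # The engine injects the WorkflowContext here after allocate.
  attr_writer :context

  # Re-executed from the top on both the first run and every replay.
  def execute(**input)
    { echoed: input }
  end

  # Called by handle_failure; walks @pending_compensations (seeded by the engine) in LIFO order.
  def run_compensations; end
end

# engine = Shikibu::ReplayEngine.new(storage: storage, worker_id: 'worker-1')
# engine.start_workflow(FakeWorkflow, instance_id: SecureRandom.uuid, user_id: 42)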
data/lib/shikibu/retry_policy.rb
@@ -0,0 +1,81 @@
# frozen_string_literal: true

module Shikibu
  # Configuration for retry behavior on activity failures
  class RetryPolicy
    attr_reader :max_attempts, :base_delay, :max_delay, :backoff_coefficient,
                :max_duration, :retryable_errors, :non_retryable_errors

    # @param max_attempts [Integer, nil] Maximum number of attempts (nil = infinite)
    # @param base_delay [Float] Initial delay between retries in seconds
    # @param max_delay [Float] Maximum delay between retries in seconds
    # @param backoff_coefficient [Float] Multiplier for exponential backoff
    # @param max_duration [Float, nil] Maximum total duration for all retries in seconds
    # @param retryable_errors [Array<Class>] Error classes that should be retried
    # @param non_retryable_errors [Array<Class>] Error classes that should not be retried
    def initialize(
      max_attempts: 5,
      base_delay: 1.0,
      max_delay: 60.0,
      backoff_coefficient: 2.0,
      max_duration: 300.0,
      retryable_errors: [StandardError],
      non_retryable_errors: []
    )
      @max_attempts = max_attempts
      @base_delay = base_delay.to_f
      @max_delay = max_delay.to_f
      @backoff_coefficient = backoff_coefficient.to_f
      @max_duration = max_duration&.to_f
      @retryable_errors = Array(retryable_errors)
      @non_retryable_errors = Array(non_retryable_errors)
    end

    # Check if an error should be retried
    # @param error [Exception] The error to check
    # @return [Boolean]
    def retryable?(error)
      # TerminalError is never retried
      return false if error.is_a?(TerminalError)

      # Non-retryable errors take precedence
      return false if @non_retryable_errors.any? { |klass| error.is_a?(klass) }

      # Check if error matches retryable classes
      @retryable_errors.any? { |klass| error.is_a?(klass) }
    end

    # Check if we should continue retrying
    # @param attempt [Integer] Current attempt number (1-based)
    # @param started_at [Time] When retries started
    # @return [Boolean]
    def should_retry?(attempt, started_at = nil)
      return false if @max_attempts && attempt >= @max_attempts

      if @max_duration && started_at
        elapsed = Time.now - started_at
        return false if elapsed >= @max_duration
      end

      true
    end

    # Calculate delay for a given attempt
    # @param attempt [Integer] Current attempt number (1-based)
    # @return [Float] Delay in seconds
    def delay_for(attempt)
      delay = @base_delay * (@backoff_coefficient**(attempt - 1))
      [delay, @max_delay].min
    end

    # Default retry policy
    def self.default
      @default ||= new
    end

    # No retry policy (single attempt)
    def self.none
      @none ||= new(max_attempts: 1)
    end
  end
end
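With the defaults shown above (base_delay 1.0, backoff_coefficient 2.0, max_delay 60.0, max_attempts 5), the policy produces a capped exponential schedule. A quick sketch of the values the public methods return, following directly from delay_for and should_retry?:

policy = Shikibu::RetryPolicy.default

policy.delay_for(1)  # => 1.0   (1.0 * 2**0)
policy.delay_for(2)  # => 2.0
policy.delay_for(3)  # => 4.0
policy.delay_for(4)  # => 8.0
policy.delay_for(7)  # => 60.0  (64.0 capped at max_delay)

policy.should_retry?(4)  # => true
policy.should_retry?(5)  # => false (attempt >= max_attempts)

Shikibu::RetryPolicy.none.should_retry?(1)  # => false (single attempt)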