fractor 0.1.9 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +28 -91
- data/docs/ARCHITECTURE.md +317 -0
- data/docs/PERFORMANCE_TUNING.md +355 -0
- data/docs/TROUBLESHOOTING.md +463 -0
- data/lib/fractor/callback_registry.rb +106 -0
- data/lib/fractor/config_schema.rb +170 -0
- data/lib/fractor/main_loop_handler.rb +4 -8
- data/lib/fractor/main_loop_handler3.rb +10 -12
- data/lib/fractor/main_loop_handler4.rb +48 -20
- data/lib/fractor/result_cache.rb +58 -10
- data/lib/fractor/shutdown_handler.rb +12 -6
- data/lib/fractor/supervisor.rb +100 -13
- data/lib/fractor/version.rb +1 -1
- data/lib/fractor/workflow/execution/dependency_resolver.rb +149 -0
- data/lib/fractor/workflow/execution/fallback_job_handler.rb +68 -0
- data/lib/fractor/workflow/execution/job_executor.rb +242 -0
- data/lib/fractor/workflow/execution/result_builder.rb +76 -0
- data/lib/fractor/workflow/execution/workflow_execution_logger.rb +241 -0
- data/lib/fractor/workflow/workflow_executor.rb +97 -476
- data/lib/fractor/wrapped_ractor.rb +2 -4
- data/lib/fractor.rb +11 -0
- metadata +12 -2
data/lib/fractor/supervisor.rb
CHANGED
@@ -12,7 +12,7 @@ module Fractor
   # Supervises multiple WrappedRactors, distributes work, and aggregates results.
   class Supervisor
     attr_reader :work_queue, :workers, :results, :worker_pools, :debug,
-                :error_reporter, :logger, :performance_monitor
+                :error_reporter, :logger, :performance_monitor, :callback_registry

     # Initializes the Supervisor.
     # - worker_pools: An array of worker pool configurations, each containing:

@@ -81,13 +81,14 @@ module Fractor
       @ractors_map = {} # Map Ractor object to WrappedRactor instance
       @continuous_mode = continuous_mode
       @running = false
-      @work_callbacks = []
       @wakeup_ractor = nil # Control ractor for unblocking select
       @timer_thread = nil # Timer thread for periodic wakeup
       @error_reporter = ErrorReporter.new # Track errors and statistics
-      @error_callbacks = [] # Custom error callbacks
       @performance_monitor = nil # Performance monitor instance

+      # Initialize callback registry for managing work and error callbacks
+      @callback_registry = CallbackRegistry.new(debug: @debug)
+
       # Initialize performance monitor if enabled
       if enable_performance_monitoring
         require_relative "performance_monitor"

@@ -112,6 +113,7 @@ module Fractor
         @timer_thread,
         @performance_monitor,
         debug: @debug,
+        continuous_mode: @continuous_mode,
       )

       # Initialize signal handler for graceful shutdown

@@ -172,14 +174,14 @@ module Fractor
     # Register a callback to provide new work items
     # The callback should return nil or empty array when no new work is available
     def register_work_source(&callback)
-      @
+      @callback_registry.register_work_source(&callback)
     end

     # Register a callback to handle errors
     # The callback receives (error_result, worker_name, worker_class)
     # Example: supervisor.on_error { |err, worker, klass| puts "Error in #{klass}: #{err.error}" }
     def on_error(&callback)
-      @
+      @callback_registry.register_error_callback(&callback)
     end

     # Starts the worker Ractors for all worker pools.

@@ -188,11 +190,8 @@ module Fractor
       # Pass as parameter to avoid isolation error
       debug_mode = @debug

-      # Check if running on Ruby 4.0
-      ruby_4_0 = Gem::Version.new(RUBY_VERSION) >= Gem::Version.new("4.0.0")
-
       # Create a wakeup Ractor for unblocking Ractor.select
-      if
+      if Fractor::RUBY_4_0_OR_HIGHER
         # In Ruby 4.0, wakeup uses ports too
         @wakeup_port = Ractor::Port.new
         @wakeup_ractor = Ractor.new(@wakeup_port, debug_mode) do |port, debug|

@@ -231,7 +230,7 @@ module Fractor

       pool[:workers] = (1..num_workers).map do |i|
         # In Ruby 4.0, create a response port for each worker
-        response_port = if
+        response_port = if Fractor::RUBY_4_0_OR_HIGHER
                           Ractor::Port.new
                         end

@@ -322,7 +321,9 @@ module Fractor
       end

       # Start timer thread for continuous mode to periodically check work sources
-
+      # CRITICAL: Always start timer thread in continuous mode to ensure main loop
+      # can periodically check for worker termination during shutdown
+      start_timer_thread if @continuous_mode

       begin
         # Run the main event loop through MainLoopHandler

@@ -361,6 +362,18 @@ module Fractor

       @running = false

+      # CRITICAL: Send immediate wakeup signal to unblock main loop from Ractor.select
+      # This is especially important for Ruby 3.4+ where Ractor.select may block indefinitely
+      # without periodic checks of @running. The timer thread might take time to exit,
+      # so we send the signal here immediately.
+      if @wakeup_ractor
+        begin
+          @wakeup_ractor.send(:shutdown)
+        rescue StandardError => e
+          puts "Error sending shutdown to wakeup ractor: #{e.message}" if @debug
+        end
+      end
+
       # Update shutdown handler with current references before shutdown
       @shutdown_handler.instance_variable_set(:@workers, @workers)
       @shutdown_handler.instance_variable_set(:@wakeup_ractor, @wakeup_ractor)

@@ -377,13 +390,29 @@ module Fractor

     # Start the timer thread for continuous mode.
     # This thread periodically wakes up the main loop to check for new work.
+    # CRITICAL: Always start the timer thread in continuous mode, even without callbacks,
+    # to ensure the main loop can periodically check for worker termination during shutdown.
     #
     # @return [void]
     def start_timer_thread
       @timer_thread = Thread.new do
-
+        # Keep running during shutdown to allow periodic checks for worker termination
+        # Only exit when @shutting_down is true AND workers are closed
+        loop do
          sleep(0.1) # Check work sources every 100ms
-
+
+          # Exit if we're no longer running AND (not in continuous mode OR workers are closed)
+          break if !@running && (!@continuous_mode || workers.all?(&:closed?))
+
+          # Send wakeup signals if running, or during shutdown in continuous mode until workers close
+          should_send = if @running
+                          @running
+                        else
+                          # During shutdown in continuous mode, keep sending until workers close
+                          @continuous_mode && !workers.all?(&:closed?)
+                        end
+
+          if @wakeup_ractor && should_send
            begin
              @wakeup_ractor.send(:wakeup)
            rescue StandardError => e

@@ -550,5 +579,63 @@ module Fractor

       @performance_monitor.snapshot
     end
+
+    # Class-level documentation for Supervisor configuration options.
+    # Provides a summary of valid configuration parameters for the initialize method.
+    #
+    # @example Print configuration help
+    #   puts Fractor::Supervisor.configuration_help
+    #
+    # @return [String] Configuration documentation
+    def self.configuration_help
+      <<~HELP
+        Fractor::Supervisor Configuration Options
+        ==========================================
+
+        The Supervisor accepts the following keyword arguments to initialize():
+
+        worker_pools (Array, required)
+          Array of worker pool configuration hashes.
+          Each hash must contain:
+          - worker_class: Class inheriting from Fractor::Worker (required)
+          - num_workers: Positive integer for number of workers (optional, defaults to CPU count)
+
+          Example:
+            worker_pools: [
+              { worker_class: MyWorker, num_workers: 4 },
+              { worker_class: AnotherWorker, num_workers: 2 }
+            ]
+
+        continuous_mode (Boolean, optional, default: false)
+          Whether to run in continuous mode (long-running) or batch mode.
+          - false: Batch mode - processes all work items and exits
+          - true: Continuous mode - runs until stopped, accepts work from callbacks
+
+        debug (Boolean, optional, default: false)
+          Enable verbose debug output for all state changes.
+          Can also be enabled via FRACTOR_DEBUG=1 environment variable.
+
+        logger (Logger, optional, default: Fractor.logger)
+          Optional logger instance for this Supervisor.
+          Provides isolation when multiple gems use Fractor in the same process.
+
+        tracer_enabled (Boolean, optional)
+          Override for ExecutionTracer. nil uses global setting.
+
+        tracer_stream (IO, optional)
+          Optional trace stream for this Supervisor. nil uses global setting.
+
+        enable_performance_monitoring (Boolean, optional, default: false)
+          Enable performance monitoring (latency, throughput, etc.).
+          When enabled, performance_metrics() returns current metrics.
+
+        Validation
+        ----------
+        All configuration is validated at initialization time with detailed error messages.
+        Invalid configurations will raise ArgumentError with helpful fix suggestions.
+
+        For more information, see the Supervisor class documentation.
+      HELP
+    end
   end
 end
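The supervisor changes above replace the old instance-level `@work_callbacks`/`@error_callbacks` arrays with a `CallbackRegistry`. Below is a minimal, hypothetical sketch of how the registration API reads from the caller's side; it assumes a `MyWorker` class that subclasses `Fractor::Worker` and is defined elsewhere (not part of this diff), and uses only the methods visible in this diff.

```ruby
require "fractor"

# New in 0.1.10: class-level configuration summary.
puts Fractor::Supervisor.configuration_help

supervisor = Fractor::Supervisor.new(
  worker_pools: [{ worker_class: MyWorker, num_workers: 2 }], # MyWorker assumed
  continuous_mode: true
)

# Both callbacks are now stored in supervisor.callback_registry rather than
# in arrays held directly by the Supervisor.
supervisor.register_work_source do
  # Return nil (or an empty array) when no new work is available.
  nil
end

supervisor.on_error do |error_result, worker_name, worker_class|
  warn "Error in #{worker_class} (#{worker_name}): #{error_result.error}"
end
```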
data/lib/fractor/workflow/execution/dependency_resolver.rb
ADDED
@@ -0,0 +1,149 @@
+# frozen_string_literal: true
+
+require "set"
+require "digest"
+
+module Fractor
+  class Workflow
+    # Computes the execution order for workflow jobs using topological sort.
+    # Jobs are grouped into levels where all jobs in a level can be executed
+    # in parallel (their dependencies are satisfied).
+    #
+    # Caches execution order based on job structure to avoid recomputing
+    # topological sort for static workflow definitions.
+    class DependencyResolver
+      # Class-level cache for execution orders.
+      # Keyed by workflow signature (hash of job structure).
+      @cache = {}
+      @mutex = Mutex.new
+
+      class << self
+        attr_reader :cache
+
+        # Clear the entire execution order cache.
+        # Useful for testing or when workflows are dynamically modified.
+        def clear_cache
+          @mutex.synchronize { @cache.clear }
+        end
+
+        # Clear cache entries for a specific workflow.
+        #
+        # @param workflow_signature [String] The workflow signature to clear
+        def clear_cache_for(workflow_signature)
+          @mutex.synchronize { @cache.delete(workflow_signature) }
+        end
+      end
+
+      # Initialize the resolver with a workflow's jobs.
+      #
+      # @param jobs [Hash] Hash of job_name => Job objects
+      # @param enable_cache [Boolean] Whether to use cached execution order (default: true)
+      def initialize(jobs, enable_cache: true)
+        @jobs = jobs
+        @enable_cache = enable_cache
+        @signature = compute_signature if enable_cache
+      end
+
+      # Compute the execution order using topological sort.
+      # Returns an array of arrays, where each inner array contains job names
+      # that can be executed in parallel (their dependencies are satisfied).
+      #
+      # Results are cached based on the workflow's job structure (job names
+      # and their dependencies). This provides significant performance benefits
+      # for workflows that are executed multiple times.
+      #
+      # @return [Array<Array<String>>] Execution order as grouped job names
+      def execution_order
+        # Try to get from cache first
+        if @enable_cache && @signature && cached_execution_order
+          return cached_execution_order
+        end
+
+        # Compute the execution order
+        order = compute_order
+
+        # Cache the result
+        cache_execution_order(order) if @enable_cache && @signature
+
+        order
+      end
+
+      # Invalidate the cache for this workflow's execution order.
+      # Call this if the workflow definition changes dynamically.
+      def invalidate_cache
+        return unless @enable_cache && @signature
+
+        self.class.clear_cache_for(@signature)
+        @cached = false
+      end
+
+      private
+
+      # Get the cached execution order for this workflow.
+      #
+      # @return [Array<Array<String>>, nil] Cached execution order or nil
+      def cached_execution_order
+        self.class.cache[@signature]
+      end
+
+      # Cache an execution order for this workflow.
+      #
+      # @param order [Array<Array<String>>] The execution order to cache
+      def cache_execution_order(order)
+        DependencyResolver.cache[@signature] = order
+      end
+
+      # Compute a unique signature for this workflow's job structure.
+      # The signature is based on job names and their dependencies.
+      #
+      # @return [String] A hash representing the workflow structure
+      def compute_signature
+        # Build a deterministic representation of the workflow structure
+        structure = {}
+        @jobs.each do |name, job|
+          structure[name] = {
+            dependencies: Array(job.dependencies).sort,
+          }
+        end
+
+        # Sort by job name for deterministic hashing
+        sorted_structure = structure.sort.to_h
+
+        # Generate SHA256 hash of the structure
+        Digest::SHA256.hexdigest(JSON.dump(sorted_structure))
+      end
+
+      # Compute the execution order using topological sort.
+      #
+      # @return [Array<Array<String>>] Execution order as grouped job names
+      def compute_order
+        order = []
+        remaining = @jobs.keys.to_set
+        processed = Set.new
+
+        until remaining.empty?
+          # Find jobs whose dependencies are all satisfied
+          ready = remaining.select do |job_name|
+            job = @jobs[job_name]
+            job.dependencies.all? { |dep| processed.include?(dep) }
+          end
+
+          if ready.empty?
+            # This should not happen if validation was done correctly
+            raise WorkflowExecutionError,
+                  "Cannot find next jobs to execute. Remaining: #{remaining.to_a.join(', ')}"
+          end
+
+          order << ready
+          ready.each do |job_name|
+            processed.add(job_name)
+            remaining.delete(job_name)
+          end
+        end
+
+        puts "Execution order: #{order.inspect}" if ENV["FRACTOR_DEBUG"]
+        order
+      end
+    end
+  end
+end
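As a rough usage sketch of the resolver above (assuming the gem's top-level require loads the workflow execution classes), Struct instances stand in for the workflow's Job objects, since the resolver only calls `#dependencies` on them; the job names below are illustrative, not from the gem.

```ruby
require "fractor"
require "json" # compute_signature serializes the job structure with JSON.dump

# Stand-in for Workflow::Job; only #dependencies is needed by the resolver.
FakeJob = Struct.new(:dependencies)

jobs = {
  "fetch"   => FakeJob.new([]),
  "parse"   => FakeJob.new(["fetch"]),
  "enrich"  => FakeJob.new(["fetch"]),
  "publish" => FakeJob.new(%w[parse enrich]),
}

resolver = Fractor::Workflow::DependencyResolver.new(jobs)
# Jobs grouped into levels that can run in parallel, e.g.:
# [["fetch"], ["parse", "enrich"], ["publish"]]
p resolver.execution_order
```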
data/lib/fractor/workflow/execution/fallback_job_handler.rb
ADDED
@@ -0,0 +1,68 @@
+# frozen_string_literal: true
+
+module Fractor
+  class Workflow
+    # Handles fallback job execution when a primary job fails.
+    # Manages the lifecycle of executing a fallback job and integrating
+    # its result back into the workflow context.
+    class FallbackJobHandler
+      # Initialize the fallback handler.
+      #
+      # @param workflow [Workflow] The workflow instance
+      # @param context [WorkflowContext] The execution context
+      # @param hooks [ExecutionHooks] Execution hooks for event notification
+      # @param logger [WorkflowLogger] The workflow logger
+      def initialize(workflow, context, hooks, logger)
+        @workflow = workflow
+        @context = context
+        @hooks = hooks
+        @logger = logger
+      end
+
+      # Execute a fallback job for a failed job.
+      #
+      # @param original_job [Job] The job that failed
+      # @param original_error [Exception] The error that occurred
+      # @param job_trace [ExecutionTrace::JobTrace, nil] Optional job trace
+      # @param job_executor [JobExecutor] The job executor to use
+      # @param start_time [Time] The original job start time (for duration calculation)
+      # @return [Object] The output from the fallback job
+      def execute_fallback(original_job, original_error, job_trace,
+                           job_executor, start_time)
+        fallback_job_name = original_job.fallback_job
+        fallback_job = @workflow.class.jobs[fallback_job_name]
+
+        unless fallback_job
+          raise WorkflowExecutionError,
+                "Fallback job '#{fallback_job_name}' not found for job '#{original_job.name}'"
+        end
+
+        @logger.fallback_execution(original_job.name, fallback_job.name,
+                                   original_error)
+
+        begin
+          # Execute fallback job using job_executor
+          output = job_executor.execute_once(fallback_job, job_trace)
+
+          # Store output under original job name as well
+          @context.store_job_output(original_job.name, output)
+          original_job.state(:completed)
+
+          duration = Time.now - start_time
+
+          # Update trace
+          job_trace&.complete!(output: output)
+
+          @logger.job_complete(original_job.name, duration)
+          @hooks.trigger(:job_complete, original_job, output, duration)
+
+          output
+        rescue StandardError => e
+          @logger.fallback_failed(original_job.name, fallback_job.name, e)
+          raise WorkflowExecutionError,
+                "Job '#{original_job.name}' and fallback '#{fallback_job_name}' both failed"
+        end
+      end
+    end
+  end
+end
data/lib/fractor/workflow/execution/job_executor.rb
ADDED
@@ -0,0 +1,242 @@
+# frozen_string_literal: true
+
+require_relative "../../supervisor"
+require_relative "../../work"
+require_relative "../retry_orchestrator"
+
+module Fractor
+  class Workflow
+    # Executes a single workflow job, handling all aspects of job execution
+    # including input building, work creation, and supervisor orchestration.
+    class JobExecutor
+      attr_reader :context, :logger, :dead_letter_queue
+
+      # Initialize the job executor.
+      #
+      # @param context [WorkflowContext] The workflow execution context
+      # @param logger [WorkflowLogger] The workflow logger
+      # @param workflow [Workflow] The workflow instance
+      # @param completed_jobs [Set<String>] Set of completed job names
+      # @param failed_jobs [Set<String>] Set of failed job names
+      # @param dead_letter_queue [DeadLetterQueue, nil] Optional DLQ for failed jobs
+      # @param circuit_breakers [CircuitBreakerRegistry] Circuit breaker registry
+      def initialize(context, logger, workflow: nil, completed_jobs: nil, failed_jobs: nil,
+                     dead_letter_queue: nil, circuit_breakers: nil)
+        @context = context
+        @logger = logger
+        @workflow = workflow
+        @completed_jobs = completed_jobs || Set.new
+        @failed_jobs = failed_jobs || Set.new
+        @dead_letter_queue = dead_letter_queue
+        @circuit_breakers = circuit_breakers || CircuitBreakerRegistry.new
+      end
+
+      # Execute a job once (no retry logic).
+      #
+      # @param job [Job] The job to execute
+      # @param job_trace [ExecutionTrace::JobTrace, nil] Optional job trace
+      # @return [Object] The job output
+      def execute_once(job, job_trace = nil)
+        # Build input for this job
+        job_input = @context.build_job_input(job)
+        job_trace&.set_input(job_input)
+
+        # Create work item - if job_input is already a Work object, use it directly
+        # to avoid double-wrapping (e.g., when using custom Work subclasses)
+        work = if job_input.is_a?(Work)
+                 job_input
+               else
+                 Work.new(job_input)
+               end
+
+        # Execute with circuit breaker if configured
+        if job.circuit_breaker_enabled?
+          execute_with_circuit_breaker(job, work, job_trace)
+        else
+          execute_with_supervisor(job, work)
+        end
+      end
+
+      # Execute a job with retry logic.
+      #
+      # @param job [Job] The job to execute
+      # @param job_trace [ExecutionTrace::JobTrace, nil] Optional job trace
+      # @return [Object] The job output
+      def execute_with_retry(job, job_trace = nil)
+        retry_config = job.retry_config
+
+        # Create retry orchestrator with the job's retry configuration
+        orchestrator = RetryOrchestrator.new(retry_config,
+                                             debug: ENV["FRACTOR_DEBUG"] == "1")
+
+        # Execute with retry logic
+        orchestrator.execute_with_retry(job) do |j|
+          execute_once(j, job_trace)
+        end
+      rescue StandardError => e
+        # Get retry state for DLQ entry
+        retry_state = orchestrator.state
+        add_to_dead_letter_queue(job, e, retry_state)
+        raise e
+      end
+
+      # Execute a job using a supervisor.
+      #
+      # @param job [Job] The job to execute
+      # @param work [Work] The work item to process
+      # @return [Object] The job output
+      def execute_with_supervisor(job, work)
+        supervisor = Supervisor.new(
+          worker_pools: [
+            {
+              worker_class: job.worker_class,
+              num_workers: job.num_workers || 1,
+            },
+          ],
+        )
+
+        supervisor.add_work_item(work)
+        supervisor.run
+
+        # Check for errors first (before checking results)
+        unless supervisor.results.errors.empty?
+          error = supervisor.results.errors.first
+          raise WorkflowExecutionError,
+                "Job '#{job.name}' encountered error: #{error.error}"
+        end
+
+        # Get the result
+        results = supervisor.results.results
+        if results.empty?
+          raise WorkflowExecutionError, "Job '#{job.name}' produced no results"
+        end
+
+        results.first.result
+      end
+
+      # Execute a job with circuit breaker protection.
+      #
+      # @param job [Job] The job to execute
+      # @param work [Work] The work item to process
+      # @param job_trace [ExecutionTrace::JobTrace, nil] Optional job trace
+      # @return [Object] The job output
+      def execute_with_circuit_breaker(job, work, _job_trace = nil)
+        breaker_key = job.circuit_breaker_key
+
+        # Get or create circuit breaker orchestrator for this job
+        orchestrator = @circuit_breakers.get_or_create_orchestrator(
+          breaker_key,
+          **job.circuit_breaker_config.slice(:threshold, :timeout,
+                                             :half_open_calls),
+          job_name: job.name,
+          debug: ENV["FRACTOR_DEBUG"] == "1",
+        )
+
+        # Log circuit state before execution
+        log_circuit_breaker_state(job, orchestrator)
+
+        begin
+          orchestrator.execute_with_breaker(job) do
+            execute_with_supervisor(job, work)
+          end
+        rescue Workflow::CircuitOpenError => e
+          log_circuit_breaker_open(job, orchestrator)
+          raise WorkflowExecutionError,
+                "Circuit breaker open for job '#{job.name}': #{e.message}"
+        end
+      end
+
+      private
+
+      # Add failed job to dead letter queue.
+      #
+      # @param job [Job] The job that failed
+      # @param error [Exception] The error that occurred
+      # @param retry_state [Object, nil] Optional retry state
+      def add_to_dead_letter_queue(job, error, retry_state = nil)
+        return unless @dead_letter_queue
+
+        # Build job input for DLQ entry
+        job_input = @context.build_job_input(job)
+        work = Work.new(job_input)
+
+        # Build metadata about the failure
+        metadata = build_failure_metadata(job, error, retry_state)
+
+        # Build context from workflow
+        context = {
+          workflow_input: @context.workflow_input,
+          completed_jobs: @completed_jobs.to_a,
+          failed_jobs: @failed_jobs.to_a,
+        }
+
+        @dead_letter_queue.add(work, error, context: context,
+                               metadata: metadata)
+
+        @logger.added_to_dead_letter_queue(job.name, error,
+                                           @dead_letter_queue.size)
+      end
+
+      # Build failure metadata for dead letter queue.
+      #
+      # @param job [Job] The job that failed
+      # @param error [Exception] The error that occurred
+      # @param retry_state [Object, nil] Optional retry state
+      # @return [Hash] Failure metadata
+      def build_failure_metadata(job, _error, retry_state)
+        metadata = {
+          job_name: job.name,
+          worker_class: job.worker_class.name,
+          correlation_id: @context.correlation_id,
+          workflow_name: @workflow.class.workflow_name,
+        }
+
+        # Add retry information if available
+        if retry_state
+          # Handle both RetryState object and Hash from orchestrator
+          if retry_state.is_a?(Hash)
+            # From RetryOrchestrator.state
+            metadata[:retry_attempts] = retry_state[:attempts] - 1
+            metadata[:max_attempts] = retry_state[:max_attempts]
+            metadata[:last_error] = retry_state[:last_error]
+            metadata[:total_retry_time] = retry_state[:total_time]
+            metadata[:all_errors] = retry_state[:all_errors]
+          else
+            # From RetryState object
+            metadata[:retry_attempts] = retry_state.attempt - 1
+            metadata[:total_retry_time] = retry_state.total_time
+            metadata[:all_errors] = retry_state.summary[:errors]
+          end
+        end
+
+        metadata
+      end
+
+      # Log circuit breaker state.
+      #
+      # @param job [Job] The job
+      # @param orchestrator [CircuitBreakerOrchestrator] The circuit breaker orchestrator
+      def log_circuit_breaker_state(job, orchestrator)
+        @logger.circuit_breaker_state(
+          job.name,
+          orchestrator.state,
+          failure_count: orchestrator.failure_count,
+          threshold: orchestrator.breaker.threshold,
+        )
+      end
+
+      # Log circuit breaker open.
+      #
+      # @param job [Job] The job
+      # @param orchestrator [CircuitBreakerOrchestrator] The circuit breaker orchestrator
+      def log_circuit_breaker_open(job, orchestrator)
+        @logger.circuit_breaker_open(
+          job.name,
+          orchestrator.failure_count,
+          orchestrator.breaker.threshold,
+          last_failure: orchestrator.breaker.last_failure_time,
+        )
+      end
+    end
+  end
+end