RubyGems - swarm_sdk - Versions diffs - 2.5.4 → 2.6.0 - Mend

swarm_sdk 2.5.4 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

checksums.yaml +4 -4
data/lib/swarm_sdk/agent/builder.rb +29 -11
data/lib/swarm_sdk/agent/chat.rb +298 -53
data/lib/swarm_sdk/agent/definition.rb +19 -4
data/lib/swarm_sdk/builders/base_builder.rb +14 -3
data/lib/swarm_sdk/config.rb +2 -0
data/lib/swarm_sdk/configuration/parser.rb +4 -1
data/lib/swarm_sdk/configuration/translator.rb +5 -2
data/lib/swarm_sdk/defaults.rb +14 -0
data/lib/swarm_sdk/swarm/all_agents_builder.rb +12 -5
data/lib/swarm_sdk/swarm/builder.rb +7 -0
data/lib/swarm_sdk/swarm/executor.rb +84 -7
data/lib/swarm_sdk/swarm.rb +14 -2
data/lib/swarm_sdk/version.rb +1 -1
data/lib/swarm_sdk.rb +10 -0
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 4729a555c9f839d1c507a4353c74d522cfe21b8fdf50a7727d6ee078c89609e6
-  data.tar.gz: f21e5971305b0011f924861afc3738e30f3517e25da1ba2e1b26ad3b9052ccca
+  metadata.gz: 100b5eeda25839a9c9a02270edf2d84b4623e267a55f215fa79c7479a0333f96
+  data.tar.gz: f63cf4bf9726f769edad0b620b8738223879d2e8ceb4f2111b9be317868ace46
 SHA512:
-  metadata.gz: 3117334f14af1d526b949b9a21d20b5bd2098a34b06aaf1dfde2499a98b94ec8e284269eeff2030856d4a6fa1ad3fe37d3cd05f6121c57495c7a11baf217d804
-  data.tar.gz: e076f6ccde790b5a5b209cd2c46e7d534e7c45a703338f306c191a131cb8e40eb5c40e064f3410e5a897e6d654f586bb89a641404910604c90f40e48d0b2472f
+  metadata.gz: 742e655084a0c9307694ef1d10b868cee0f3b3af77f7c5b3bd0bd6bf25f3a97a677bb31e6dda8a9d9074f072099e575fbc8ba7bb12e6e9b56431fc839436e30b
+  data.tar.gz: 8d943e6f17b77eac4d747ba9b9a73fdc1eb2600a40646b641ae226b6e239acb7daab3271b345a8b712cf0d405170f2251f5a7b29eae77f4d991c56f9af8ce828

data/lib/swarm_sdk/agent/builder.rb CHANGED Viewed

@@ -49,7 +49,8 @@ module SwarmSDK
         @directory = "."
         @parameters = {}
         @headers = {}
-        @timeout = nil
+        @request_timeout = nil
+        @turn_timeout = nil
         @mcp_servers = []
         @disable_default_tools = nil # nil = include all default tools
         @bypass_permissions = false
@@ -112,11 +113,18 @@ module SwarmSDK
         @headers = header_hash
       end
-      # Set/get timeout
-      def timeout(seconds = :__not_provided__)
-        return @timeout if seconds == :__not_provided__
+      # Set/get request timeout
+      def request_timeout(seconds = :__not_provided__)
+        return @request_timeout if seconds == :__not_provided__
-        @timeout = seconds
+        @request_timeout = seconds
+      end
+      # Set/get turn timeout
+      def turn_timeout(seconds = :__not_provided__)
+        return @turn_timeout if seconds == :__not_provided__
+        @turn_timeout = seconds
       end
       # Add an MCP server configuration
@@ -386,13 +394,22 @@ module SwarmSDK
         !@api_version.nil?
       end
-      # Check if timeout has been explicitly set
+      # Check if request_timeout has been explicitly set
+      #
+      # Used by Swarm::Builder to determine if all_agents request_timeout should apply.
+      #
+      # @return [Boolean] true if request_timeout was explicitly set
+      def request_timeout_set?
+        !@request_timeout.nil?
+      end
+      # Check if turn_timeout has been explicitly set
       #
-      # Used by Swarm::Builder to determine if all_agents timeout should apply.
+      # Used by Swarm::Builder to determine if all_agents turn_timeout should apply.
       #
-      # @return [Boolean] true if timeout was explicitly set
-      def timeout_set?
-        !@timeout.nil?
+      # @return [Boolean] true if turn_timeout was explicitly set
+      def turn_timeout_set?
+        !@turn_timeout.nil?
       end
       # Check if coding_agent has been explicitly set
@@ -448,7 +465,8 @@ module SwarmSDK
         agent_config[:context_window] = @context_window if @context_window
         agent_config[:parameters] = @parameters if @parameters.any?
         agent_config[:headers] = @headers if @headers.any?
-        agent_config[:timeout] = @timeout if @timeout
+        agent_config[:request_timeout] = @request_timeout if @request_timeout
+        agent_config[:turn_timeout] = @turn_timeout if @turn_timeout
         agent_config[:mcp_servers] = @mcp_servers if @mcp_servers.any?
         agent_config[:disable_default_tools] = @disable_default_tools unless @disable_default_tools.nil?
         agent_config[:bypass_permissions] = @bypass_permissions

data/lib/swarm_sdk/agent/chat.rb CHANGED Viewed

@@ -122,7 +122,7 @@ module SwarmSDK
         max_concurrent_tools = definition[:max_concurrent_tools]
         base_url = definition[:base_url]
         api_version = definition[:api_version]
-        timeout = definition[:timeout] || SwarmSDK.config.agent_request_timeout
+        request_timeout = definition[:request_timeout] || SwarmSDK.config.agent_request_timeout
         assume_model_exists = definition[:assume_model_exists]
         system_prompt = definition[:system_prompt]
         parameters = definition[:parameters]
@@ -131,6 +131,9 @@ module SwarmSDK
         # Agent identifier (for plugin callbacks)
         @agent_name = agent_name
+        # Turn timeout (external timeout for entire ask() call)
+        @turn_timeout = definition[:turn_timeout]
         # Context manager for ephemeral messages
         @context_manager = ContextManager.new
@@ -162,7 +165,7 @@ module SwarmSDK
           provider_name: provider_name,
           base_url: base_url,
           api_version: api_version,
-          timeout: timeout,
+          timeout: request_timeout,
           assume_model_exists: assume_model_exists,
           max_concurrent_tools: max_concurrent_tools,
         )
@@ -461,48 +464,11 @@ module SwarmSDK
       # @return [RubyLLM::Message] LLM response
       def ask(prompt, **options)
         @ask_semaphore.acquire do
-          is_first = first_message?
-          # Collect system reminders to inject as ephemeral content
-          reminders = collect_system_reminders(prompt, is_first)
-          # Trigger user_prompt hook (with clean prompt, not reminders)
-          source = options.delete(:source) || "user"
-          final_prompt = prompt
-          if @hook_executor
-            hook_result = trigger_user_prompt(prompt, source: source)
-            if hook_result[:halted]
-              return RubyLLM::Message.new(
-                role: :assistant,
-                content: hook_result[:halt_message],
-                model_id: model_id,
-              )
-            end
-            final_prompt = hook_result[:modified_prompt] if hook_result[:modified_prompt]
-          end
-          # Add CLEAN user message to history (no reminders embedded)
-          @llm_chat.add_message(role: :user, content: final_prompt)
-          # Track reminders as ephemeral content for this LLM call only
-          # They'll be injected by around_llm_request hook but not stored
-          reminders.each do |reminder|
-            @context_manager.add_ephemeral_reminder(reminder, messages_array: @llm_chat.messages)
-          end
-          # Execute complete() which handles tool loop and ephemeral injection
-          response = execute_with_global_semaphore do
-            catch(:finish_agent) do
-              catch(:finish_swarm) do
-                @llm_chat.complete(**options)
-              end
-            end
+          if @turn_timeout
+            execute_with_turn_timeout(prompt, options)
+          else
+            execute_ask(prompt, options)
           end
-          # Handle finish markers from hooks
-          handle_finish_marker(response)
         end
       end
@@ -559,6 +525,103 @@ module SwarmSDK
       private
+      # Execute ask with turn timeout wrapper
+      def execute_with_turn_timeout(prompt, options)
+        task = Async::Task.current
+        # Use barrier to track child tasks spawned during this turn
+        # (includes RubyLLM's async tool execution when max_concurrent_tools is set)
+        barrier = Async::Barrier.new
+        begin
+          task.with_timeout(
+            @turn_timeout,
+            TurnTimeoutError,
+            "Agent turn timed out after #{@turn_timeout}s",
+          ) do
+            # Execute inside barrier to track child tasks
+            barrier.async do
+              execute_ask(prompt, options)
+            end.wait
+          end
+        rescue TurnTimeoutError
+          # Stop all child tasks
+          barrier.stop
+          emit_turn_timeout_event
+          # Return error message as response so caller can handle gracefully
+          # Format like other tool/delegation errors for natural flow
+          # This message goes to the swarm/caller, NOT added to agent's conversation history
+          RubyLLM::Message.new(
+            role: :assistant,
+            content: "Error: Request timed out after #{@turn_timeout}s. The agent did not complete its response within the time limit. Please try a simpler request or increase the turn timeout.",
+            model_id: model_id,
+          )
+        ensure
+          # Cleanup barrier if not already stopped
+          barrier.stop unless barrier.empty?
+        end
+      end
+      # Emit turn timeout event
+      def emit_turn_timeout_event
+        LogStream.emit(
+          type: "turn_timeout",
+          agent: @agent_name,
+          swarm_id: @agent_context&.swarm_id,
+          parent_swarm_id: @agent_context&.parent_swarm_id,
+          limit: @turn_timeout,
+          message: "Agent turn timed out after #{@turn_timeout}s",
+        )
+      end
+      # Execute ask without timeout (original ask implementation)
+      def execute_ask(prompt, options)
+        is_first = first_message?
+        # Collect system reminders to inject as ephemeral content
+        reminders = collect_system_reminders(prompt, is_first)
+        # Trigger user_prompt hook (with clean prompt, not reminders)
+        source = options.delete(:source) || "user"
+        final_prompt = prompt
+        if @hook_executor
+          hook_result = trigger_user_prompt(prompt, source: source)
+          if hook_result[:halted]
+            return RubyLLM::Message.new(
+              role: :assistant,
+              content: hook_result[:halt_message],
+              model_id: model_id,
+            )
+          end
+          final_prompt = hook_result[:modified_prompt] if hook_result[:modified_prompt]
+        end
+        # Add CLEAN user message to history (no reminders embedded)
+        @llm_chat.add_message(role: :user, content: final_prompt)
+        # Track reminders as ephemeral content for this LLM call only
+        # They'll be injected by around_llm_request hook but not stored
+        reminders.each do |reminder|
+          @context_manager.add_ephemeral_reminder(reminder, messages_array: @llm_chat.messages)
+        end
+        # Execute complete() which handles tool loop and ephemeral injection
+        response = execute_with_global_semaphore do
+          catch(:finish_agent) do
+            catch(:finish_swarm) do
+              @llm_chat.complete(**options)
+            end
+          end
+        end
+        # Handle finish markers from hooks
+        handle_finish_marker(response)
+      end
       # --- Tool Execution Hook ---
       # Setup around_tool_execution hook for SwarmSDK orchestration
@@ -712,17 +775,64 @@ module SwarmSDK
       # --- LLM Call Retry Logic ---
-      # Call LLM provider with retry logic for transient failures
+      # Call LLM provider with smart retry logic based on error type
+      #
+      # ## Error Categorization
+      #
+      # **Non-Retryable Client Errors (4xx)**: Return error message immediately
+      # - 400 Bad Request (after orphan tool call recovery attempt)
+      # - 401 Unauthorized (invalid API key)
+      # - 402 Payment Required (billing issue)
+      # - 403 Forbidden (permission denied)
+      # - 422 Unprocessable Entity (invalid parameters)
+      # - Other 4xx errors
+      #
+      # **Retryable Rate-Limit/Server Errors (429, 5xx)**: Retry with delays
+      # - 429 Rate Limit (RubyLLM already retried 3x)
+      # - 500 Server Error (RubyLLM already retried 3x)
+      # - 502-503 Service Unavailable (RubyLLM already retried 3x)
+      # - 529 Overloaded (RubyLLM already retried 3x)
+      # Note: If we see these errors, RubyLLM has already tried 3 times
+      #
+      # **Network Errors**: Retry with delays
+      # - Timeouts, connection failures, etc.
       #
-      # Includes special handling for 400 Bad Request errors:
+      # ## Special Handling
+      #
+      # **400 Bad Request with Orphan Tool Calls**:
       # - Attempts to prune orphan tool calls (tool_use without tool_result)
       # - If pruning succeeds, retries immediately without counting as retry
+      # - If pruning fails or not applicable, returns error message immediately
       #
-      # @param max_retries [Integer] Maximum retry attempts
+      # ## Error Response Format
+      #
+      # Non-retryable errors return as assistant messages for natural delegation flow:
+      # ```ruby
+      # RubyLLM::Message.new(
+      #   role: :assistant,
+      #   content: "I encountered an error: [details]"
+      # )
+      # ```
+      #
+      # @param max_retries [Integer] Maximum retry attempts at SDK level
+      #   Note: RubyLLM already retries 429/5xx errors 3 times before this
       # @param delay [Integer] Delay between retries in seconds
       # @yield Block that performs the LLM call
-      # @return [Object] Result from block
-      def call_llm_with_retry(max_retries: 10, delay: 10, &block)
+      # @return [RubyLLM::Message, Object] Result from block or error message
+      #
+      # @example Handling 401 Unauthorized
+      #   result = call_llm_with_retry do
+      #     @llm_chat.complete
+      #   end
+      #   # Returns immediately: Message with "Unauthorized" error
+      #
+      # @example Handling 500 Server Error
+      #   result = call_llm_with_retry(max_retries: 3, delay: 15) do
+      #     @llm_chat.complete
+      #   end
+      #   # Retries up to 3 times with 15s delays
+      #   # (RubyLLM already tried 3x, so 6 total attempts)
+      def call_llm_with_retry(max_retries: 3, delay: 15, &block)
         attempts = 0
         pruning_attempted = false
@@ -731,22 +841,68 @@ module SwarmSDK
           begin
             return yield
+          # === CATEGORY A: NON-RETRYABLE CLIENT ERRORS ===
           rescue RubyLLM::BadRequestError => e
-            # Try to recover from 400 Bad Request by pruning orphan tool calls
-            # This can happen when tool execution is interrupted mid-stream
+            # Special case: Try orphan tool call recovery ONCE
+            # This handles interrupted tool executions (tool_use without tool_result)
             unless pruning_attempted
               pruned = recover_from_orphan_tool_calls(e)
               if pruned > 0
                 pruning_attempted = true
-                # Don't count this as a regular retry, try again immediately
-                attempts -= 1
+                attempts -= 1 # Don't count as retry
                 next
               end
             end
-            # Fall through to standard retry logic
+            # No recovery possible - fail immediately with error message
+            emit_non_retryable_error(e, "BadRequest")
+            return build_error_message(e)
+          rescue RubyLLM::UnauthorizedError => e
+            # 401: Authentication failed - won't fix by retrying
+            emit_non_retryable_error(e, "Unauthorized")
+            return build_error_message(e)
+          rescue RubyLLM::PaymentRequiredError => e
+            # 402: Billing issue - won't fix by retrying
+            emit_non_retryable_error(e, "PaymentRequired")
+            return build_error_message(e)
+          rescue RubyLLM::ForbiddenError => e
+            # 403: Permission denied - won't fix by retrying
+            emit_non_retryable_error(e, "Forbidden")
+            return build_error_message(e)
+          # === CATEGORY B: RETRYABLE SERVER ERRORS ===
+          # IMPORTANT: Must come BEFORE generic RubyLLM::Error to avoid being caught by it
+          rescue RubyLLM::RateLimitError,
+                 RubyLLM::ServerError,
+                 RubyLLM::ServiceUnavailableError,
+                 RubyLLM::OverloadedError => e
+            # These errors indicate temporary provider issues
+            # RubyLLM already retried 3 times with exponential backoff (~0.7s)
+            # Retry a few more times with longer delays to give provider time
             handle_retry_or_raise(e, attempts, max_retries, delay)
+          # === CATEGORY A (CONTINUED): OTHER CLIENT ERRORS ===
+          # IMPORTANT: Must come AFTER specific error classes (including server errors)
+          rescue RubyLLM::Error => e
+            # Generic RubyLLM::Error - check for specific status codes
+            if e.response&.status == 422
+              # 422: Unprocessable Entity - semantic validation failure
+              emit_non_retryable_error(e, "UnprocessableEntity")
+              return build_error_message(e)
+            elsif e.response&.status && (400..499).include?(e.response.status)
+              # Other 4xx errors - conservative: don't retry unknown client errors
+              emit_non_retryable_error(e, "ClientError")
+              return build_error_message(e)
+            end
+            # No status code, or a non-4xx status not handled above - conservative: don't retry
+            emit_non_retryable_error(e, "UnknownAPIError")
+            return build_error_message(e)
+          # === CATEGORY C: NETWORK/OTHER ERRORS ===
           rescue StandardError => e
+            # Network errors, timeouts, unknown errors - retry with delays
             handle_retry_or_raise(e, attempts, max_retries, delay)
           end
         end
@@ -792,6 +948,95 @@ module SwarmSDK
         sleep(delay)
       end
+      # Build an error message as an assistant response
+      #
+      # Non-retryable errors are returned as assistant messages instead of raising.
+      # This allows errors to flow naturally through delegation - parent agents
+      # can see child agent errors and respond appropriately.
+      #
+      # @param error [RubyLLM::Error, StandardError] The error that occurred
+      # @return [RubyLLM::Message] Assistant message containing formatted error
+      #
+      # @example Error message for delegation
+      #   error = RubyLLM::UnauthorizedError.new(response, "Invalid API key")
+      #   message = build_error_message(error)
+      #   # => Message with role: :assistant, content: "I encountered an error: ..."
+      def build_error_message(error)
+        content = format_error_message(error)
+        RubyLLM::Message.new(
+          role: :assistant,
+          content: content,
+          model_id: model_id,
+        )
+      end
+      # Format error details into user-friendly message
+      #
+      # @param error [RubyLLM::Error, StandardError] The error to format
+      # @return [String] Formatted error message with type, status, and guidance
+      #
+      # @example Formatting 401 error
+      #   format_error_message(unauthorized_error)
+      #   # => "I encountered an error while processing your request:
+      #   #     **Error Type:** UnauthorizedError
+      #   #     **Status Code:** 401
+      #   #     **Message:** Invalid API key
+      #   #     Please check your API credentials."
+      def format_error_message(error)
+        status = error.respond_to?(:response) ? error.response&.status : nil
+        msg = "I encountered an error while processing your request:\n\n"
+        msg += "**Error Type:** #{error.class.name.split("::").last}\n"
+        msg += "**Status Code:** #{status}\n" if status
+        msg += "**Message:** #{error.message}\n\n"
+        msg += "This error indicates a problem that cannot be automatically recovered. "
+        # Add context-specific guidance based on error type
+        msg += case error
+        when RubyLLM::UnauthorizedError
+          "Please check your API credentials."
+        when RubyLLM::PaymentRequiredError
+          "Please check your account billing status."
+        when RubyLLM::ForbiddenError
+          "You may not have permission to access this resource."
+        when RubyLLM::BadRequestError
+          "The request format may be invalid."
+        else
+          "Please review the error and try again."
+        end
+        msg
+      end
+      # Emit llm_request_failed event for non-retryable errors
+      #
+      # This event provides visibility into errors that fail immediately
+      # without retry attempts. Useful for monitoring auth failures,
+      # billing issues, and other non-transient problems.
+      #
+      # @param error [RubyLLM::Error, StandardError] The error that occurred
+      # @param error_type [String] Friendly error type name for logging
+      # @return [void]
+      #
+      # @example Emitting unauthorized error event
+      #   emit_non_retryable_error(error, "Unauthorized")
+      #   # Emits: { type: "llm_request_failed", error_type: "Unauthorized", ... }
+      def emit_non_retryable_error(error, error_type)
+        LogStream.emit(
+          type: "llm_request_failed",
+          agent: @agent_name,
+          swarm_id: @agent_context&.swarm_id,
+          parent_swarm_id: @agent_context&.parent_swarm_id,
+          model: model_id,
+          error_type: error_type,
+          error_class: error.class.name,
+          error_message: error.message,
+          status_code: error.respond_to?(:response) ? error.response&.status : nil,
+          retryable: false,
+        )
+      end
       # Recover from 400 Bad Request by pruning orphan tool calls
       #
       # @param error [RubyLLM::BadRequestError] The error that occurred

data/lib/swarm_sdk/agent/definition.rb CHANGED Viewed

@@ -32,7 +32,8 @@ module SwarmSDK
         :mcp_servers,
         :parameters,
         :headers,
-        :timeout,
+        :request_timeout,
+        :turn_timeout,
         :disable_default_tools,
         :coding_agent,
         :default_permissions,
@@ -74,9 +75,16 @@ module SwarmSDK
         @context_window = coerce_to_integer(config[:context_window]) # Explicit context window override
         @parameters = config[:parameters] || {}
         @headers = Utils.stringify_keys(config[:headers] || {})
-        @timeout = config[:timeout] || SwarmSDK.config.agent_request_timeout
+        @request_timeout = config[:request_timeout] || SwarmSDK.config.agent_request_timeout
         @bypass_permissions = config[:bypass_permissions] || false
         @max_concurrent_tools = config[:max_concurrent_tools]
+        # Use default from config unless explicitly set (including nil to disable)
+        @turn_timeout = if config.key?(:turn_timeout)
+          config[:turn_timeout] # Could be a number OR nil (to disable)
+        else
+          SwarmSDK.config.default_turn_timeout
+        end
         # Always assume model exists - SwarmSDK validates models separately using models.json
         # This prevents RubyLLM from trying to validate models in its registry
         @assume_model_exists = true
@@ -160,7 +168,8 @@ module SwarmSDK
           mcp_servers: @mcp_servers,
           parameters: @parameters,
           headers: @headers,
-          timeout: @timeout,
+          request_timeout: @request_timeout,
+          turn_timeout: @turn_timeout,
           bypass_permissions: @bypass_permissions,
           disable_default_tools: @disable_default_tools,
           coding_agent: @coding_agent,
@@ -294,7 +303,8 @@ module SwarmSDK
           :context_window,
           :parameters,
           :headers,
-          :timeout,
+          :request_timeout,
+          :turn_timeout,
           :bypass_permissions,
           :max_concurrent_tools,
           :assume_model_exists,
@@ -465,6 +475,11 @@ module SwarmSDK
       def validate!
         raise ConfigurationError, "Agent '#{@name}' missing required 'description' field" unless @description
+        # Validate turn_timeout is positive if set
+        if @turn_timeout && @turn_timeout <= 0
+          raise ConfigurationError, "Agent '#{@name}' turn_timeout must be positive (got #{@turn_timeout})"
+        end
         # Validate api_version can only be set for OpenAI-compatible providers
         if @api_version
           openai_compatible = ["openai", "deepseek", "perplexity", "mistral", "openrouter"]

data/lib/swarm_sdk/builders/base_builder.rb CHANGED Viewed

@@ -260,7 +260,8 @@ module SwarmSDK
         builder.context_window(config[:context_window]) if config[:context_window]
         builder.system_prompt(config[:system_prompt]) if config[:system_prompt]
         builder.directory(config[:directory]) if config[:directory]
-        builder.timeout(config[:timeout]) if config[:timeout]
+        builder.request_timeout(config[:request_timeout]) if config[:request_timeout]
+        builder.turn_timeout(config[:turn_timeout]) if config[:turn_timeout]
         builder.parameters(config[:parameters]) if config[:parameters]
         builder.headers(config[:headers]) if config[:headers]
         builder.coding_agent(config[:coding_agent]) unless config[:coding_agent].nil?
@@ -337,6 +338,12 @@ module SwarmSDK
             merged[:parameters] = (merged[:parameters] || {}).merge(value || {})
           when :headers
             merged[:headers] = (merged[:headers] || {}).merge(value || {})
+          when :turn_timeout
+            # Agent-specific turn_timeout overrides all_agents
+            merged[key] = value
+          when :request_timeout
+            # Agent-specific request_timeout overrides all_agents
+            merged[key] = value
           else
             merged[key] = value
           end
@@ -372,8 +379,12 @@ module SwarmSDK
           agent_builder.api_version(all_agents_hash[:api_version])
         end
-        if all_agents_hash[:timeout] && !agent_builder.timeout_set?
-          agent_builder.timeout(all_agents_hash[:timeout])
+        if all_agents_hash[:request_timeout] && !agent_builder.request_timeout_set?
+          agent_builder.request_timeout(all_agents_hash[:request_timeout])
+        end
+        if all_agents_hash[:turn_timeout] && !agent_builder.turn_timeout_set?
+          agent_builder.turn_timeout(all_agents_hash[:turn_timeout])
         end
         if all_agents_hash[:parameters]

data/lib/swarm_sdk/config.rb CHANGED Viewed

@@ -81,6 +81,8 @@ module SwarmSDK
       chars_per_token_prose: ["SWARM_SDK_CHARS_PER_TOKEN_PROSE", -> { Defaults::TokenEstimation::CHARS_PER_TOKEN_PROSE }],
       chars_per_token_code: ["SWARM_SDK_CHARS_PER_TOKEN_CODE", -> { Defaults::TokenEstimation::CHARS_PER_TOKEN_CODE }],
       mcp_log_level: ["SWARM_SDK_MCP_LOG_LEVEL", -> { Defaults::Logging::MCP_LOG_LEVEL }],
+      default_execution_timeout: ["SWARM_SDK_DEFAULT_EXECUTION_TIMEOUT", -> { Defaults::Timeouts::EXECUTION_TIMEOUT_SECONDS }],
+      default_turn_timeout: ["SWARM_SDK_DEFAULT_TURN_TIMEOUT", -> { Defaults::Timeouts::TURN_TIMEOUT_SECONDS }],
     }.freeze
     # WebFetch and control settings

data/lib/swarm_sdk/configuration/parser.rb CHANGED Viewed

@@ -28,7 +28,8 @@ module SwarmSDK
         :all_agents_hooks,
         :scratchpad_mode,
         :nodes,
-        :external_swarms
+        :external_swarms,
+        :execution_timeout
       # Initialize parser with YAML content and options
       #
@@ -54,6 +55,7 @@ module SwarmSDK
         @external_swarms = {}
         @nodes = {}
         @scratchpad_mode = :disabled
+        @execution_timeout = nil
       end
       def parse
@@ -134,6 +136,7 @@ module SwarmSDK
         @swarm_name = @root_config[:name]
         @swarm_id = @root_config[:id]
         @scratchpad_mode = parse_scratchpad_mode(@root_config[:scratchpad])
+        @execution_timeout = @root_config[:execution_timeout]
         load_all_agents_config
         load_hooks_config

data/lib/swarm_sdk/configuration/translator.rb CHANGED Viewed

@@ -40,6 +40,7 @@ module SwarmSDK
         builder.id(@parser.swarm_id) if @parser.swarm_id
         builder.name(@parser.swarm_name)
         builder.scratchpad(@parser.scratchpad_mode)
+        builder.execution_timeout(@parser.execution_timeout) if @parser.execution_timeout
         if @parser.external_swarms&.any?
           external_swarms = @parser.external_swarms
@@ -92,7 +93,8 @@ module SwarmSDK
           provider(all_agents_cfg[:provider]) if all_agents_cfg[:provider]
           base_url(all_agents_cfg[:base_url]) if all_agents_cfg[:base_url]
           api_version(all_agents_cfg[:api_version]) if all_agents_cfg[:api_version]
-          timeout(all_agents_cfg[:timeout]) if all_agents_cfg[:timeout]
+          request_timeout(all_agents_cfg[:request_timeout]) if all_agents_cfg[:request_timeout]
+          turn_timeout(all_agents_cfg[:turn_timeout]) if all_agents_cfg[:turn_timeout]
           parameters(all_agents_cfg[:parameters]) if all_agents_cfg[:parameters]
           headers(all_agents_cfg[:headers]) if all_agents_cfg[:headers]
           coding_agent(all_agents_cfg[:coding_agent]) unless all_agents_cfg[:coding_agent].nil?
@@ -152,7 +154,8 @@ module SwarmSDK
           context_window(config[:context_window]) if config[:context_window]
           system_prompt(config[:system_prompt]) if config[:system_prompt]
           directory(config[:directory]) if config[:directory]
-          timeout(config[:timeout]) if config[:timeout]
+          request_timeout(config[:request_timeout]) if config[:request_timeout]
+          turn_timeout(config[:turn_timeout]) if config[:turn_timeout]
           parameters(config[:parameters]) if config[:parameters]
           headers(config[:headers]) if config[:headers]
           coding_agent(config[:coding_agent]) unless config[:coding_agent].nil?

data/lib/swarm_sdk/defaults.rb CHANGED Viewed

@@ -75,6 +75,20 @@ module SwarmSDK
       # data transformation operations while preventing stalls.
       TRANSFORMER_COMMAND_SECONDS = 60
+      # Execution timeout (seconds)
+      #
+      # Maximum wall-clock time for entire swarm.execute() call.
+      # 30 minutes allows complex multi-agent workflows while preventing
+      # runaway execution.
+      EXECUTION_TIMEOUT_SECONDS = 1800
+      # Turn timeout (seconds)
+      #
+      # Maximum time for a single agent.ask() call, including all LLM requests
+      # and tool executions. 30 minutes accommodates extended thinking models
+      # and complex tool chains.
+      TURN_TIMEOUT_SECONDS = 1800
       # OpenAI responses API ID TTL (seconds)
       #
       # Time-to-live for cached response IDs. 5 minutes allows conversation

data/lib/swarm_sdk/swarm/all_agents_builder.rb CHANGED Viewed

@@ -29,7 +29,8 @@ module SwarmSDK
         @provider = nil
         @base_url = nil
         @api_version = nil
-        @timeout = nil
+        @request_timeout = nil
+        @turn_timeout = nil
         @parameters = nil
         @headers = nil
         @coding_agent = nil
@@ -56,9 +57,14 @@ module SwarmSDK
         @api_version = version
       end
-      # Set timeout for all agents
-      def timeout(seconds)
-        @timeout = seconds
+      # Set request timeout for all agents
+      def request_timeout(seconds)
+        @request_timeout = seconds
+      end
+      # Set turn timeout for all agents
+      def turn_timeout(seconds)
+        @turn_timeout = seconds
       end
       # Set parameters for all agents
@@ -153,7 +159,8 @@ module SwarmSDK
           provider: @provider,
           base_url: @base_url,
           api_version: @api_version,
-          timeout: @timeout,
+          request_timeout: @request_timeout,
+          turn_timeout: @turn_timeout,
           parameters: @parameters,
           headers: @headers,
           coding_agent: @coding_agent,

data/lib/swarm_sdk/swarm/builder.rb CHANGED Viewed

@@ -49,6 +49,12 @@ module SwarmSDK
         @lead_agent = nil
         @swarm_hooks = []
         @observer_configs = []
+        @execution_timeout = nil
+      end
+      # Set execution timeout (seconds)
+      def execution_timeout(seconds)
+        @execution_timeout = seconds
       end
       # Set lead agent
@@ -142,6 +148,7 @@ module SwarmSDK
           swarm_id: @swarm_id,
           scratchpad_mode: @scratchpad,
           allow_filesystem_tools: @allow_filesystem_tools,
+          execution_timeout: @execution_timeout,
         )
         # Setup swarm registry if external swarms are registered

data/lib/swarm_sdk/swarm/executor.rb CHANGED Viewed

@@ -32,15 +32,26 @@ module SwarmSDK
       # Blocking execution using Sync
       def run_blocking(prompt, logs:, has_logging:)
+        result = nil
         Sync do |task|
-          execute_in_task(prompt, logs: logs, has_logging: has_logging) do |lead, current_prompt|
-            task.async(finished: false) { lead.ask(current_prompt) }.wait
+          start_time = Time.now
+          result = if @swarm.execution_timeout
+            execute_with_execution_timeout(task, prompt, logs, has_logging, start_time)
+          else
+            execute_in_task(prompt, logs: logs, has_logging: has_logging) do |lead, current_prompt|
+              # Execute directly - no child task needed
+              # This keeps execution in same fiber context for better control
+              lead.ask(current_prompt)
+            end
           end
         ensure
           # Always wait for observer tasks, even if main execution raises
           # This is INSIDE Sync block, so async tasks can still complete
           @swarm.wait_for_observers
         end
+        result
       ensure
         # Restore original fiber storage (preserves parent context for nested swarms)
         restore_fiber_storage
@@ -51,9 +62,17 @@ module SwarmSDK
         parent = Async::Task.current
         raise ConfigurationError, "wait: false requires an async context. Use Sync { swarm.execute(..., wait: false) }" unless parent
-        parent.async(finished: false) do
-          execute_in_task(prompt, logs: logs, has_logging: has_logging) do |lead, current_prompt|
-            Async(finished: false) { lead.ask(current_prompt) }.wait
+        # NOTE: The block receives |task| as the spawned Async::Task when arity > 0
+        parent.async(finished: false) do |task|
+          start_time = Time.now
+          if @swarm.execution_timeout
+            execute_with_execution_timeout(task, prompt, logs, has_logging, start_time)
+          else
+            execute_in_task(prompt, logs: logs, has_logging: has_logging) do |lead, current_prompt|
+              # Execute directly - no child task needed
+              lead.ask(current_prompt)
+            end
           end
         end
       end
@@ -77,8 +96,9 @@ module SwarmSDK
           result = execution_loop(current_prompt, logs, start_time, &block)
           swarm_stop_triggered = true
-        rescue ConfigurationError, AgentNotFoundError
-          # Re-raise configuration errors - these should be fixed, not caught
+        rescue ConfigurationError, AgentNotFoundError, ExecutionTimeoutError, TurnTimeoutError
+          # Re-raise configuration errors and timeouts - these should not be caught here
+          # Timeouts are handled by execute_with_execution_timeout wrapper
           raise
         rescue TypeError => e
           result = handle_type_error(e, logs, start_time)
@@ -208,6 +228,63 @@ module SwarmSDK
         LogCollector.reset!
         LogStream.reset!
       end
+      # Execute with execution timeout wrapper
+      def execute_with_execution_timeout(task, prompt, logs, has_logging, start_time)
+        # Use Async::Task.current to get the actual current task context
+        current_task = Async::Task.current || task
+        # Use barrier to track ALL child tasks spawned during execution
+        # This includes RubyLLM's async tool execution (when max_concurrent_tools is set)
+        barrier = Async::Barrier.new
+        begin
+          current_task.with_timeout(
+            @swarm.execution_timeout,
+            ExecutionTimeoutError,
+            "Swarm execution timed out after #{@swarm.execution_timeout}s",
+          ) do
+            # Execute inside barrier to track child tasks (tool executions)
+            barrier.async do
+              execute_in_task(prompt, logs: logs, has_logging: has_logging) do |lead, current_prompt|
+                lead.ask(current_prompt)
+              end
+            end.wait
+          end
+        rescue ExecutionTimeoutError => e
+          # Stop ALL child tasks (interrupts ongoing tool executions and delegations)
+          barrier.stop
+          emit_execution_timeout_event(@swarm.execution_timeout)
+          build_timeout_result(e, logs, Time.now - start_time)
+        ensure
+          # Cleanup barrier if not already stopped
+          barrier.stop unless barrier.empty?
+        end
+      end
+      # Emit execution timeout event
+      def emit_execution_timeout_event(limit)
+        LogStream.emit(
+          type: "execution_timeout",
+          swarm_id: @swarm.swarm_id,
+          parent_swarm_id: @swarm.parent_swarm_id,
+          limit: limit,
+          message: "Swarm execution timed out after #{limit}s",
+        )
+      end
+      # Build timeout result
+      #
+      # Wraps a timeout error in a Result with no content and
+      # metadata { timeout: true }.
+      #
+      # @param error [Exception] the timeout error to attach to the result
+      # @param logs [Object] logs to attach to the result
+      # @param duration [Object] elapsed time to report on the result
+      # @return [Result] result attributed to the lead agent, or "unknown"
+      #   when the swarm has no lead agent
+      def build_timeout_result(error, logs, duration)
+        lead_name = @swarm.lead_agent&.to_s || "unknown"
+        attributes = {
+          content: nil,
+          agent: lead_name,
+          error: error,
+          logs: logs,
+          duration: duration,
+          metadata: { timeout: true },
+        }
+        Result.new(**attributes)
+      end
     end
   end
 end

data/lib/swarm_sdk/swarm.rb CHANGED Viewed

@@ -72,7 +72,7 @@ module SwarmSDK
     # Default tools available to all agents
     DEFAULT_TOOLS = ToolConfigurator::DEFAULT_TOOLS
-    attr_reader :name, :agents, :lead_agent, :mcp_clients, :delegation_instances, :agent_definitions, :swarm_id, :parent_swarm_id, :swarm_registry, :scratchpad_storage, :allow_filesystem_tools, :hook_registry, :global_semaphore, :plugin_storages, :config_for_hooks, :observer_configs
+    attr_reader :name, :agents, :lead_agent, :mcp_clients, :delegation_instances, :agent_definitions, :swarm_id, :parent_swarm_id, :swarm_registry, :scratchpad_storage, :allow_filesystem_tools, :hook_registry, :global_semaphore, :plugin_storages, :config_for_hooks, :observer_configs, :execution_timeout
     attr_accessor :delegation_call_stack
     # Check if scratchpad tools are enabled
@@ -139,13 +139,25 @@ module SwarmSDK
     # @param scratchpad [Tools::Stores::Scratchpad, nil] Optional scratchpad instance (for testing/internal use)
     # @param scratchpad_mode [Symbol, String] Scratchpad mode (:enabled or :disabled). :per_node not allowed for non-node swarms.
     # @param allow_filesystem_tools [Boolean, nil] Whether to allow filesystem tools (nil uses global setting)
-    def initialize(name:, swarm_id: nil, parent_swarm_id: nil, global_concurrency: nil, default_local_concurrency: nil, scratchpad: nil, scratchpad_mode: :enabled, allow_filesystem_tools: nil)
+    def initialize(name:, swarm_id: nil, parent_swarm_id: nil, global_concurrency: nil, default_local_concurrency: nil, scratchpad: nil, scratchpad_mode: :enabled, allow_filesystem_tools: nil, execution_timeout: :__use_default__)
       @name = name
       @swarm_id = swarm_id || generate_swarm_id(name)
       @parent_swarm_id = parent_swarm_id
       @global_concurrency = global_concurrency || SwarmSDK.config.global_concurrency_limit
       @default_local_concurrency = default_local_concurrency || SwarmSDK.config.local_concurrency_limit
+      # Use default from config unless explicitly set (including nil to disable)
+      @execution_timeout = if execution_timeout == :__use_default__
+        SwarmSDK.config.default_execution_timeout
+      else
+        execution_timeout # Could be a number OR nil (to disable)
+      end
+      # Validate execution_timeout is positive if set
+      if @execution_timeout && @execution_timeout <= 0
+        raise ConfigurationError, "execution_timeout must be positive (got #{@execution_timeout})"
+      end
       # Handle scratchpad_mode parameter
       # For Swarm: :enabled or :disabled (not :per_node - that's for nodes)
       @scratchpad_mode = validate_swarm_scratchpad_mode(scratchpad_mode)

data/lib/swarm_sdk/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module SwarmSDK
-  VERSION = "2.5.4"
+  VERSION = "2.6.0"
 end

data/lib/swarm_sdk.rb CHANGED Viewed

@@ -13,6 +13,7 @@ require "set"
 require "yaml"
 require "async"
+require "async/barrier"
 require "async/semaphore"
 require "ruby_llm"
 require "ruby_llm/mcp"
@@ -61,6 +62,15 @@ module SwarmSDK
   class LLMError < Error; end
   class StateError < Error; end
+  # Base class for SwarmSDK timeout errors
+  class TimeoutError < Error; end
+  # Raised when swarm execution exceeds execution_timeout
+  class ExecutionTimeoutError < TimeoutError; end
+  # Raised when agent turn exceeds turn_timeout
+  class TurnTimeoutError < TimeoutError; end
   class << self
     # Get the global configuration instance
     #

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: swarm_sdk
 version: !ruby/object:Gem::Version
-  version: 2.5.4
+  version: 2.6.0
 platform: ruby
 authors:
 - Paulo Arruda