swarm_sdk 2.5.4 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4729a555c9f839d1c507a4353c74d522cfe21b8fdf50a7727d6ee078c89609e6
4
- data.tar.gz: f21e5971305b0011f924861afc3738e30f3517e25da1ba2e1b26ad3b9052ccca
3
+ metadata.gz: 100b5eeda25839a9c9a02270edf2d84b4623e267a55f215fa79c7479a0333f96
4
+ data.tar.gz: f63cf4bf9726f769edad0b620b8738223879d2e8ceb4f2111b9be317868ace46
5
5
  SHA512:
6
- metadata.gz: 3117334f14af1d526b949b9a21d20b5bd2098a34b06aaf1dfde2499a98b94ec8e284269eeff2030856d4a6fa1ad3fe37d3cd05f6121c57495c7a11baf217d804
7
- data.tar.gz: e076f6ccde790b5a5b209cd2c46e7d534e7c45a703338f306c191a131cb8e40eb5c40e064f3410e5a897e6d654f586bb89a641404910604c90f40e48d0b2472f
6
+ metadata.gz: 742e655084a0c9307694ef1d10b868cee0f3b3af77f7c5b3bd0bd6bf25f3a97a677bb31e6dda8a9d9074f072099e575fbc8ba7bb12e6e9b56431fc839436e30b
7
+ data.tar.gz: 8d943e6f17b77eac4d747ba9b9a73fdc1eb2600a40646b641ae226b6e239acb7daab3271b345a8b712cf0d405170f2251f5a7b29eae77f4d991c56f9af8ce828
@@ -49,7 +49,8 @@ module SwarmSDK
49
49
  @directory = "."
50
50
  @parameters = {}
51
51
  @headers = {}
52
- @timeout = nil
52
+ @request_timeout = nil
53
+ @turn_timeout = nil
53
54
  @mcp_servers = []
54
55
  @disable_default_tools = nil # nil = include all default tools
55
56
  @bypass_permissions = false
@@ -112,11 +113,18 @@ module SwarmSDK
112
113
  @headers = header_hash
113
114
  end
114
115
 
115
- # Set/get timeout
116
- def timeout(seconds = :__not_provided__)
117
- return @timeout if seconds == :__not_provided__
116
+ # Set/get request timeout
117
+ def request_timeout(seconds = :__not_provided__)
118
+ return @request_timeout if seconds == :__not_provided__
118
119
 
119
- @timeout = seconds
120
+ @request_timeout = seconds
121
+ end
122
+
123
+ # Set/get turn timeout
124
+ def turn_timeout(seconds = :__not_provided__)
125
+ return @turn_timeout if seconds == :__not_provided__
126
+
127
+ @turn_timeout = seconds
120
128
  end
121
129
 
122
130
  # Add an MCP server configuration
@@ -386,13 +394,22 @@ module SwarmSDK
386
394
  !@api_version.nil?
387
395
  end
388
396
 
389
- # Check if timeout has been explicitly set
397
+ # Check if request_timeout has been explicitly set
398
+ #
399
+ # Used by Swarm::Builder to determine if all_agents request_timeout should apply.
400
+ #
401
+ # @return [Boolean] true if request_timeout was explicitly set
402
+ def request_timeout_set?
403
+ !@request_timeout.nil?
404
+ end
405
+
406
+ # Check if turn_timeout has been explicitly set
390
407
  #
391
- # Used by Swarm::Builder to determine if all_agents timeout should apply.
408
+ # Used by Swarm::Builder to determine if all_agents turn_timeout should apply.
392
409
  #
393
- # @return [Boolean] true if timeout was explicitly set
394
- def timeout_set?
395
- !@timeout.nil?
410
+ # @return [Boolean] true if turn_timeout was explicitly set
411
+ def turn_timeout_set?
412
+ !@turn_timeout.nil?
396
413
  end
397
414
 
398
415
  # Check if coding_agent has been explicitly set
@@ -448,7 +465,8 @@ module SwarmSDK
448
465
  agent_config[:context_window] = @context_window if @context_window
449
466
  agent_config[:parameters] = @parameters if @parameters.any?
450
467
  agent_config[:headers] = @headers if @headers.any?
451
- agent_config[:timeout] = @timeout if @timeout
468
+ agent_config[:request_timeout] = @request_timeout if @request_timeout
469
+ agent_config[:turn_timeout] = @turn_timeout if @turn_timeout
452
470
  agent_config[:mcp_servers] = @mcp_servers if @mcp_servers.any?
453
471
  agent_config[:disable_default_tools] = @disable_default_tools unless @disable_default_tools.nil?
454
472
  agent_config[:bypass_permissions] = @bypass_permissions
@@ -122,7 +122,7 @@ module SwarmSDK
122
122
  max_concurrent_tools = definition[:max_concurrent_tools]
123
123
  base_url = definition[:base_url]
124
124
  api_version = definition[:api_version]
125
- timeout = definition[:timeout] || SwarmSDK.config.agent_request_timeout
125
+ request_timeout = definition[:request_timeout] || SwarmSDK.config.agent_request_timeout
126
126
  assume_model_exists = definition[:assume_model_exists]
127
127
  system_prompt = definition[:system_prompt]
128
128
  parameters = definition[:parameters]
@@ -131,6 +131,9 @@ module SwarmSDK
131
131
  # Agent identifier (for plugin callbacks)
132
132
  @agent_name = agent_name
133
133
 
134
+ # Turn timeout (external timeout for entire ask() call)
135
+ @turn_timeout = definition[:turn_timeout]
136
+
134
137
  # Context manager for ephemeral messages
135
138
  @context_manager = ContextManager.new
136
139
 
@@ -162,7 +165,7 @@ module SwarmSDK
162
165
  provider_name: provider_name,
163
166
  base_url: base_url,
164
167
  api_version: api_version,
165
- timeout: timeout,
168
+ timeout: request_timeout,
166
169
  assume_model_exists: assume_model_exists,
167
170
  max_concurrent_tools: max_concurrent_tools,
168
171
  )
@@ -461,48 +464,11 @@ module SwarmSDK
461
464
  # @return [RubyLLM::Message] LLM response
462
465
  def ask(prompt, **options)
463
466
  @ask_semaphore.acquire do
464
- is_first = first_message?
465
-
466
- # Collect system reminders to inject as ephemeral content
467
- reminders = collect_system_reminders(prompt, is_first)
468
-
469
- # Trigger user_prompt hook (with clean prompt, not reminders)
470
- source = options.delete(:source) || "user"
471
- final_prompt = prompt
472
- if @hook_executor
473
- hook_result = trigger_user_prompt(prompt, source: source)
474
-
475
- if hook_result[:halted]
476
- return RubyLLM::Message.new(
477
- role: :assistant,
478
- content: hook_result[:halt_message],
479
- model_id: model_id,
480
- )
481
- end
482
-
483
- final_prompt = hook_result[:modified_prompt] if hook_result[:modified_prompt]
484
- end
485
-
486
- # Add CLEAN user message to history (no reminders embedded)
487
- @llm_chat.add_message(role: :user, content: final_prompt)
488
-
489
- # Track reminders as ephemeral content for this LLM call only
490
- # They'll be injected by around_llm_request hook but not stored
491
- reminders.each do |reminder|
492
- @context_manager.add_ephemeral_reminder(reminder, messages_array: @llm_chat.messages)
493
- end
494
-
495
- # Execute complete() which handles tool loop and ephemeral injection
496
- response = execute_with_global_semaphore do
497
- catch(:finish_agent) do
498
- catch(:finish_swarm) do
499
- @llm_chat.complete(**options)
500
- end
501
- end
467
+ if @turn_timeout
468
+ execute_with_turn_timeout(prompt, options)
469
+ else
470
+ execute_ask(prompt, options)
502
471
  end
503
-
504
- # Handle finish markers from hooks
505
- handle_finish_marker(response)
506
472
  end
507
473
  end
508
474
 
@@ -559,6 +525,103 @@ module SwarmSDK
559
525
 
560
526
  private
561
527
 
528
+ # Execute ask with turn timeout wrapper
529
+ def execute_with_turn_timeout(prompt, options)
530
+ task = Async::Task.current
531
+
532
+ # Use barrier to track child tasks spawned during this turn
533
+ # (includes RubyLLM's async tool execution when max_concurrent_tools is set)
534
+ barrier = Async::Barrier.new
535
+
536
+ begin
537
+ task.with_timeout(
538
+ @turn_timeout,
539
+ TurnTimeoutError,
540
+ "Agent turn timed out after #{@turn_timeout}s",
541
+ ) do
542
+ # Execute inside barrier to track child tasks
543
+ barrier.async do
544
+ execute_ask(prompt, options)
545
+ end.wait
546
+ end
547
+ rescue TurnTimeoutError
548
+ # Stop all child tasks
549
+ barrier.stop
550
+
551
+ emit_turn_timeout_event
552
+
553
+ # Return error message as response so caller can handle gracefully
554
+ # Format like other tool/delegation errors for natural flow
555
+ # This message goes to the swarm/caller, NOT added to agent's conversation history
556
+ RubyLLM::Message.new(
557
+ role: :assistant,
558
+ content: "Error: Request timed out after #{@turn_timeout}s. The agent did not complete its response within the time limit. Please try a simpler request or increase the turn timeout.",
559
+ model_id: model_id,
560
+ )
561
+ ensure
562
+ # Cleanup barrier if not already stopped
563
+ barrier.stop unless barrier.empty?
564
+ end
565
+ end
566
+
567
+ # Emit turn timeout event
568
+ def emit_turn_timeout_event
569
+ LogStream.emit(
570
+ type: "turn_timeout",
571
+ agent: @agent_name,
572
+ swarm_id: @agent_context&.swarm_id,
573
+ parent_swarm_id: @agent_context&.parent_swarm_id,
574
+ limit: @turn_timeout,
575
+ message: "Agent turn timed out after #{@turn_timeout}s",
576
+ )
577
+ end
578
+
579
+ # Execute ask without timeout (original ask implementation)
580
+ def execute_ask(prompt, options)
581
+ is_first = first_message?
582
+
583
+ # Collect system reminders to inject as ephemeral content
584
+ reminders = collect_system_reminders(prompt, is_first)
585
+
586
+ # Trigger user_prompt hook (with clean prompt, not reminders)
587
+ source = options.delete(:source) || "user"
588
+ final_prompt = prompt
589
+ if @hook_executor
590
+ hook_result = trigger_user_prompt(prompt, source: source)
591
+
592
+ if hook_result[:halted]
593
+ return RubyLLM::Message.new(
594
+ role: :assistant,
595
+ content: hook_result[:halt_message],
596
+ model_id: model_id,
597
+ )
598
+ end
599
+
600
+ final_prompt = hook_result[:modified_prompt] if hook_result[:modified_prompt]
601
+ end
602
+
603
+ # Add CLEAN user message to history (no reminders embedded)
604
+ @llm_chat.add_message(role: :user, content: final_prompt)
605
+
606
+ # Track reminders as ephemeral content for this LLM call only
607
+ # They'll be injected by around_llm_request hook but not stored
608
+ reminders.each do |reminder|
609
+ @context_manager.add_ephemeral_reminder(reminder, messages_array: @llm_chat.messages)
610
+ end
611
+
612
+ # Execute complete() which handles tool loop and ephemeral injection
613
+ response = execute_with_global_semaphore do
614
+ catch(:finish_agent) do
615
+ catch(:finish_swarm) do
616
+ @llm_chat.complete(**options)
617
+ end
618
+ end
619
+ end
620
+
621
+ # Handle finish markers from hooks
622
+ handle_finish_marker(response)
623
+ end
624
+
562
625
  # --- Tool Execution Hook ---
563
626
 
564
627
  # Setup around_tool_execution hook for SwarmSDK orchestration
@@ -712,17 +775,64 @@ module SwarmSDK
712
775
 
713
776
  # --- LLM Call Retry Logic ---
714
777
 
715
- # Call LLM provider with retry logic for transient failures
778
+ # Call LLM provider with smart retry logic based on error type
779
+ #
780
+ # ## Error Categorization
781
+ #
782
+ # **Non-Retryable Client Errors (4xx)**: Return error message immediately
783
+ # - 400 Bad Request (after orphan tool call recovery attempt)
784
+ # - 401 Unauthorized (invalid API key)
785
+ # - 402 Payment Required (billing issue)
786
+ # - 403 Forbidden (permission denied)
787
+ # - 422 Unprocessable Entity (invalid parameters)
788
+ # - Other 4xx errors (except 429 Rate Limit, which is retried below)
789
+ #
790
+ # **Retryable Rate-Limit and Server Errors (429, 5xx)**: Retry with delays
791
+ # - 429 Rate Limit (RubyLLM already retried 3x)
792
+ # - 500 Server Error (RubyLLM already retried 3x)
793
+ # - 502-503 Service Unavailable (RubyLLM already retried 3x)
794
+ # - 529 Overloaded (RubyLLM already retried 3x)
795
+ # Note: If we see these errors, RubyLLM has already tried 3 times
796
+ #
797
+ # **Network Errors**: Retry with delays
798
+ # - Timeouts, connection failures, etc.
716
799
  #
717
- # Includes special handling for 400 Bad Request errors:
800
+ # ## Special Handling
801
+ #
802
+ # **400 Bad Request with Orphan Tool Calls**:
718
803
  # - Attempts to prune orphan tool calls (tool_use without tool_result)
719
804
  # - If pruning succeeds, retries immediately without counting as retry
805
+ # - If pruning fails or not applicable, returns error message immediately
720
806
  #
721
- # @param max_retries [Integer] Maximum retry attempts
807
+ # ## Error Response Format
808
+ #
809
+ # Non-retryable errors return as assistant messages for natural delegation flow:
810
+ # ```ruby
811
+ # RubyLLM::Message.new(
812
+ # role: :assistant,
813
+ # content: "I encountered an error: [details]"
814
+ # )
815
+ # ```
816
+ #
817
+ # @param max_retries [Integer] Maximum retry attempts at SDK level
818
+ # Note: RubyLLM already retries 429/5xx errors 3 times before this
722
819
  # @param delay [Integer] Delay between retries in seconds
723
820
  # @yield Block that performs the LLM call
724
- # @return [Object] Result from block
725
- def call_llm_with_retry(max_retries: 10, delay: 10, &block)
821
+ # @return [RubyLLM::Message, Object] Result from block or error message
822
+ #
823
+ # @example Handling 401 Unauthorized
824
+ # result = call_llm_with_retry do
825
+ # @llm_chat.complete
826
+ # end
827
+ # # Returns immediately: Message with "Unauthorized" error
828
+ #
829
+ # @example Handling 500 Server Error
830
+ # result = call_llm_with_retry(max_retries: 3, delay: 15) do
831
+ # @llm_chat.complete
832
+ # end
833
+ # # Retries up to 3 times with 15s delays
834
+ # # (RubyLLM already tried 3x, so 6 total attempts)
835
+ def call_llm_with_retry(max_retries: 3, delay: 15, &block)
726
836
  attempts = 0
727
837
  pruning_attempted = false
728
838
 
@@ -731,22 +841,68 @@ module SwarmSDK
731
841
 
732
842
  begin
733
843
  return yield
844
+
845
+ # === CATEGORY A: NON-RETRYABLE CLIENT ERRORS ===
734
846
  rescue RubyLLM::BadRequestError => e
735
- # Try to recover from 400 Bad Request by pruning orphan tool calls
736
- # This can happen when tool execution is interrupted mid-stream
847
+ # Special case: Try orphan tool call recovery ONCE
848
+ # This handles interrupted tool executions (tool_use without tool_result)
737
849
  unless pruning_attempted
738
850
  pruned = recover_from_orphan_tool_calls(e)
739
851
  if pruned > 0
740
852
  pruning_attempted = true
741
- # Don't count this as a regular retry, try again immediately
742
- attempts -= 1
853
+ attempts -= 1 # Don't count as retry
743
854
  next
744
855
  end
745
856
  end
746
857
 
747
- # Fall through to standard retry logic
858
+ # No recovery possible - fail immediately with error message
859
+ emit_non_retryable_error(e, "BadRequest")
860
+ return build_error_message(e)
861
+ rescue RubyLLM::UnauthorizedError => e
862
+ # 401: Authentication failed - won't fix by retrying
863
+ emit_non_retryable_error(e, "Unauthorized")
864
+ return build_error_message(e)
865
+ rescue RubyLLM::PaymentRequiredError => e
866
+ # 402: Billing issue - won't fix by retrying
867
+ emit_non_retryable_error(e, "PaymentRequired")
868
+ return build_error_message(e)
869
+ rescue RubyLLM::ForbiddenError => e
870
+ # 403: Permission denied - won't fix by retrying
871
+ emit_non_retryable_error(e, "Forbidden")
872
+ return build_error_message(e)
873
+
874
+ # === CATEGORY B: RETRYABLE SERVER ERRORS ===
875
+ # IMPORTANT: Must come BEFORE generic RubyLLM::Error to avoid being caught by it
876
+ rescue RubyLLM::RateLimitError,
877
+ RubyLLM::ServerError,
878
+ RubyLLM::ServiceUnavailableError,
879
+ RubyLLM::OverloadedError => e
880
+ # These errors indicate temporary provider issues
881
+ # RubyLLM already retried 3 times with exponential backoff (~0.7s)
882
+ # Retry a few more times with longer delays to give provider time
748
883
  handle_retry_or_raise(e, attempts, max_retries, delay)
884
+
885
+ # === CATEGORY A (CONTINUED): OTHER CLIENT ERRORS ===
886
+ # IMPORTANT: Must come AFTER specific error classes (including server errors)
887
+ rescue RubyLLM::Error => e
888
+ # Generic RubyLLM::Error - check for specific status codes
889
+ if e.response&.status == 422
890
+ # 422: Unprocessable Entity - semantic validation failure
891
+ emit_non_retryable_error(e, "UnprocessableEntity")
892
+ return build_error_message(e)
893
+ elsif e.response&.status && (400..499).include?(e.response.status)
894
+ # Other 4xx errors - conservative: don't retry unknown client errors
895
+ emit_non_retryable_error(e, "ClientError")
896
+ return build_error_message(e)
897
+ end
898
+
899
+ # Unknown error type (no status code, or a status outside 4xx) - conservative: don't retry
900
+ emit_non_retryable_error(e, "UnknownAPIError")
901
+ return build_error_message(e)
902
+
903
+ # === CATEGORY C: NETWORK/OTHER ERRORS ===
749
904
  rescue StandardError => e
905
+ # Network errors, timeouts, unknown errors - retry with delays
750
906
  handle_retry_or_raise(e, attempts, max_retries, delay)
751
907
  end
752
908
  end
@@ -792,6 +948,95 @@ module SwarmSDK
792
948
  sleep(delay)
793
949
  end
794
950
 
951
+ # Build an error message as an assistant response
952
+ #
953
+ # Non-retryable errors are returned as assistant messages instead of raising.
954
+ # This allows errors to flow naturally through delegation - parent agents
955
+ # can see child agent errors and respond appropriately.
956
+ #
957
+ # @param error [RubyLLM::Error, StandardError] The error that occurred
958
+ # @return [RubyLLM::Message] Assistant message containing formatted error
959
+ #
960
+ # @example Error message for delegation
961
+ # error = RubyLLM::UnauthorizedError.new(response, "Invalid API key")
962
+ # message = build_error_message(error)
963
+ # # => Message with role: :assistant, content: "I encountered an error: ..."
964
+ def build_error_message(error)
965
+ content = format_error_message(error)
966
+
967
+ RubyLLM::Message.new(
968
+ role: :assistant,
969
+ content: content,
970
+ model_id: model_id,
971
+ )
972
+ end
973
+
974
+ # Format error details into user-friendly message
975
+ #
976
+ # @param error [RubyLLM::Error, StandardError] The error to format
977
+ # @return [String] Formatted error message with type, status, and guidance
978
+ #
979
+ # @example Formatting 401 error
980
+ # format_error_message(unauthorized_error)
981
+ # # => "I encountered an error while processing your request:
982
+ # # **Error Type:** UnauthorizedError
983
+ # # **Status Code:** 401
984
+ # # **Message:** Invalid API key
985
+ # # Please check your API credentials."
986
+ def format_error_message(error)
987
+ status = error.respond_to?(:response) ? error.response&.status : nil
988
+
989
+ msg = "I encountered an error while processing your request:\n\n"
990
+ msg += "**Error Type:** #{error.class.name.split("::").last}\n"
991
+ msg += "**Status Code:** #{status}\n" if status
992
+ msg += "**Message:** #{error.message}\n\n"
993
+ msg += "This error indicates a problem that cannot be automatically recovered. "
994
+
995
+ # Add context-specific guidance based on error type
996
+ msg += case error
997
+ when RubyLLM::UnauthorizedError
998
+ "Please check your API credentials."
999
+ when RubyLLM::PaymentRequiredError
1000
+ "Please check your account billing status."
1001
+ when RubyLLM::ForbiddenError
1002
+ "You may not have permission to access this resource."
1003
+ when RubyLLM::BadRequestError
1004
+ "The request format may be invalid."
1005
+ else
1006
+ "Please review the error and try again."
1007
+ end
1008
+
1009
+ msg
1010
+ end
1011
+
1012
+ # Emit llm_request_failed event for non-retryable errors
1013
+ #
1014
+ # This event provides visibility into errors that fail immediately
1015
+ # without retry attempts. Useful for monitoring auth failures,
1016
+ # billing issues, and other non-transient problems.
1017
+ #
1018
+ # @param error [RubyLLM::Error, StandardError] The error that occurred
1019
+ # @param error_type [String] Friendly error type name for logging
1020
+ # @return [void]
1021
+ #
1022
+ # @example Emitting unauthorized error event
1023
+ # emit_non_retryable_error(error, "Unauthorized")
1024
+ # # Emits: { type: "llm_request_failed", error_type: "Unauthorized", ... }
1025
+ def emit_non_retryable_error(error, error_type)
1026
+ LogStream.emit(
1027
+ type: "llm_request_failed",
1028
+ agent: @agent_name,
1029
+ swarm_id: @agent_context&.swarm_id,
1030
+ parent_swarm_id: @agent_context&.parent_swarm_id,
1031
+ model: model_id,
1032
+ error_type: error_type,
1033
+ error_class: error.class.name,
1034
+ error_message: error.message,
1035
+ status_code: error.respond_to?(:response) ? error.response&.status : nil,
1036
+ retryable: false,
1037
+ )
1038
+ end
1039
+
795
1040
  # Recover from 400 Bad Request by pruning orphan tool calls
796
1041
  #
797
1042
  # @param error [RubyLLM::BadRequestError] The error that occurred
@@ -32,7 +32,8 @@ module SwarmSDK
32
32
  :mcp_servers,
33
33
  :parameters,
34
34
  :headers,
35
- :timeout,
35
+ :request_timeout,
36
+ :turn_timeout,
36
37
  :disable_default_tools,
37
38
  :coding_agent,
38
39
  :default_permissions,
@@ -74,9 +75,16 @@ module SwarmSDK
74
75
  @context_window = coerce_to_integer(config[:context_window]) # Explicit context window override
75
76
  @parameters = config[:parameters] || {}
76
77
  @headers = Utils.stringify_keys(config[:headers] || {})
77
- @timeout = config[:timeout] || SwarmSDK.config.agent_request_timeout
78
+ @request_timeout = config[:request_timeout] || SwarmSDK.config.agent_request_timeout
78
79
  @bypass_permissions = config[:bypass_permissions] || false
79
80
  @max_concurrent_tools = config[:max_concurrent_tools]
81
+
82
+ # Use default from config unless explicitly set (including nil to disable)
83
+ @turn_timeout = if config.key?(:turn_timeout)
84
+ config[:turn_timeout] # Could be a number OR nil (to disable)
85
+ else
86
+ SwarmSDK.config.default_turn_timeout
87
+ end
80
88
  # Always assume model exists - SwarmSDK validates models separately using models.json
81
89
  # This prevents RubyLLM from trying to validate models in its registry
82
90
  @assume_model_exists = true
@@ -160,7 +168,8 @@ module SwarmSDK
160
168
  mcp_servers: @mcp_servers,
161
169
  parameters: @parameters,
162
170
  headers: @headers,
163
- timeout: @timeout,
171
+ request_timeout: @request_timeout,
172
+ turn_timeout: @turn_timeout,
164
173
  bypass_permissions: @bypass_permissions,
165
174
  disable_default_tools: @disable_default_tools,
166
175
  coding_agent: @coding_agent,
@@ -294,7 +303,8 @@ module SwarmSDK
294
303
  :context_window,
295
304
  :parameters,
296
305
  :headers,
297
- :timeout,
306
+ :request_timeout,
307
+ :turn_timeout,
298
308
  :bypass_permissions,
299
309
  :max_concurrent_tools,
300
310
  :assume_model_exists,
@@ -465,6 +475,11 @@ module SwarmSDK
465
475
  def validate!
466
476
  raise ConfigurationError, "Agent '#{@name}' missing required 'description' field" unless @description
467
477
 
478
+ # Validate turn_timeout is positive if set
479
+ if @turn_timeout && @turn_timeout <= 0
480
+ raise ConfigurationError, "Agent '#{@name}' turn_timeout must be positive (got #{@turn_timeout})"
481
+ end
482
+
468
483
  # Validate api_version can only be set for OpenAI-compatible providers
469
484
  if @api_version
470
485
  openai_compatible = ["openai", "deepseek", "perplexity", "mistral", "openrouter"]
@@ -260,7 +260,8 @@ module SwarmSDK
260
260
  builder.context_window(config[:context_window]) if config[:context_window]
261
261
  builder.system_prompt(config[:system_prompt]) if config[:system_prompt]
262
262
  builder.directory(config[:directory]) if config[:directory]
263
- builder.timeout(config[:timeout]) if config[:timeout]
263
+ builder.request_timeout(config[:request_timeout]) if config[:request_timeout]
264
+ builder.turn_timeout(config[:turn_timeout]) if config[:turn_timeout]
264
265
  builder.parameters(config[:parameters]) if config[:parameters]
265
266
  builder.headers(config[:headers]) if config[:headers]
266
267
  builder.coding_agent(config[:coding_agent]) unless config[:coding_agent].nil?
@@ -337,6 +338,12 @@ module SwarmSDK
337
338
  merged[:parameters] = (merged[:parameters] || {}).merge(value || {})
338
339
  when :headers
339
340
  merged[:headers] = (merged[:headers] || {}).merge(value || {})
341
+ when :turn_timeout
342
+ # Agent-specific turn_timeout overrides all_agents
343
+ merged[key] = value
344
+ when :request_timeout
345
+ # Agent-specific request_timeout overrides all_agents
346
+ merged[key] = value
340
347
  else
341
348
  merged[key] = value
342
349
  end
@@ -372,8 +379,12 @@ module SwarmSDK
372
379
  agent_builder.api_version(all_agents_hash[:api_version])
373
380
  end
374
381
 
375
- if all_agents_hash[:timeout] && !agent_builder.timeout_set?
376
- agent_builder.timeout(all_agents_hash[:timeout])
382
+ if all_agents_hash[:request_timeout] && !agent_builder.request_timeout_set?
383
+ agent_builder.request_timeout(all_agents_hash[:request_timeout])
384
+ end
385
+
386
+ if all_agents_hash[:turn_timeout] && !agent_builder.turn_timeout_set?
387
+ agent_builder.turn_timeout(all_agents_hash[:turn_timeout])
377
388
  end
378
389
 
379
390
  if all_agents_hash[:parameters]
@@ -81,6 +81,8 @@ module SwarmSDK
81
81
  chars_per_token_prose: ["SWARM_SDK_CHARS_PER_TOKEN_PROSE", -> { Defaults::TokenEstimation::CHARS_PER_TOKEN_PROSE }],
82
82
  chars_per_token_code: ["SWARM_SDK_CHARS_PER_TOKEN_CODE", -> { Defaults::TokenEstimation::CHARS_PER_TOKEN_CODE }],
83
83
  mcp_log_level: ["SWARM_SDK_MCP_LOG_LEVEL", -> { Defaults::Logging::MCP_LOG_LEVEL }],
84
+ default_execution_timeout: ["SWARM_SDK_DEFAULT_EXECUTION_TIMEOUT", -> { Defaults::Timeouts::EXECUTION_TIMEOUT_SECONDS }],
85
+ default_turn_timeout: ["SWARM_SDK_DEFAULT_TURN_TIMEOUT", -> { Defaults::Timeouts::TURN_TIMEOUT_SECONDS }],
84
86
  }.freeze
85
87
 
86
88
  # WebFetch and control settings
@@ -28,7 +28,8 @@ module SwarmSDK
28
28
  :all_agents_hooks,
29
29
  :scratchpad_mode,
30
30
  :nodes,
31
- :external_swarms
31
+ :external_swarms,
32
+ :execution_timeout
32
33
 
33
34
  # Initialize parser with YAML content and options
34
35
  #
@@ -54,6 +55,7 @@ module SwarmSDK
54
55
  @external_swarms = {}
55
56
  @nodes = {}
56
57
  @scratchpad_mode = :disabled
58
+ @execution_timeout = nil
57
59
  end
58
60
 
59
61
  def parse
@@ -134,6 +136,7 @@ module SwarmSDK
134
136
  @swarm_name = @root_config[:name]
135
137
  @swarm_id = @root_config[:id]
136
138
  @scratchpad_mode = parse_scratchpad_mode(@root_config[:scratchpad])
139
+ @execution_timeout = @root_config[:execution_timeout]
137
140
 
138
141
  load_all_agents_config
139
142
  load_hooks_config
@@ -40,6 +40,7 @@ module SwarmSDK
40
40
  builder.id(@parser.swarm_id) if @parser.swarm_id
41
41
  builder.name(@parser.swarm_name)
42
42
  builder.scratchpad(@parser.scratchpad_mode)
43
+ builder.execution_timeout(@parser.execution_timeout) if @parser.execution_timeout
43
44
 
44
45
  if @parser.external_swarms&.any?
45
46
  external_swarms = @parser.external_swarms
@@ -92,7 +93,8 @@ module SwarmSDK
92
93
  provider(all_agents_cfg[:provider]) if all_agents_cfg[:provider]
93
94
  base_url(all_agents_cfg[:base_url]) if all_agents_cfg[:base_url]
94
95
  api_version(all_agents_cfg[:api_version]) if all_agents_cfg[:api_version]
95
- timeout(all_agents_cfg[:timeout]) if all_agents_cfg[:timeout]
96
+ request_timeout(all_agents_cfg[:request_timeout]) if all_agents_cfg[:request_timeout]
97
+ turn_timeout(all_agents_cfg[:turn_timeout]) if all_agents_cfg[:turn_timeout]
96
98
  parameters(all_agents_cfg[:parameters]) if all_agents_cfg[:parameters]
97
99
  headers(all_agents_cfg[:headers]) if all_agents_cfg[:headers]
98
100
  coding_agent(all_agents_cfg[:coding_agent]) unless all_agents_cfg[:coding_agent].nil?
@@ -152,7 +154,8 @@ module SwarmSDK
152
154
  context_window(config[:context_window]) if config[:context_window]
153
155
  system_prompt(config[:system_prompt]) if config[:system_prompt]
154
156
  directory(config[:directory]) if config[:directory]
155
- timeout(config[:timeout]) if config[:timeout]
157
+ request_timeout(config[:request_timeout]) if config[:request_timeout]
158
+ turn_timeout(config[:turn_timeout]) if config[:turn_timeout]
156
159
  parameters(config[:parameters]) if config[:parameters]
157
160
  headers(config[:headers]) if config[:headers]
158
161
  coding_agent(config[:coding_agent]) unless config[:coding_agent].nil?
@@ -75,6 +75,20 @@ module SwarmSDK
75
75
  # data transformation operations while preventing stalls.
76
76
  TRANSFORMER_COMMAND_SECONDS = 60
77
77
 
78
+ # Execution timeout (seconds)
79
+ #
80
+ # Maximum wall-clock time for entire swarm.execute() call.
81
+ # 30 minutes allows complex multi-agent workflows while preventing
82
+ # runaway execution.
83
+ EXECUTION_TIMEOUT_SECONDS = 1800
84
+
85
+ # Turn timeout (seconds)
86
+ #
87
+ # Maximum time for a single agent.ask() call, including all LLM requests
88
+ # and tool executions. 30 minutes accommodates extended thinking models
89
+ # and complex tool chains.
90
+ TURN_TIMEOUT_SECONDS = 1800
91
+
78
92
  # OpenAI responses API ID TTL (seconds)
79
93
  #
80
94
  # Time-to-live for cached response IDs. 5 minutes allows conversation
@@ -29,7 +29,8 @@ module SwarmSDK
29
29
  @provider = nil
30
30
  @base_url = nil
31
31
  @api_version = nil
32
- @timeout = nil
32
+ @request_timeout = nil
33
+ @turn_timeout = nil
33
34
  @parameters = nil
34
35
  @headers = nil
35
36
  @coding_agent = nil
@@ -56,9 +57,14 @@ module SwarmSDK
56
57
  @api_version = version
57
58
  end
58
59
 
59
- # Set timeout for all agents
60
- def timeout(seconds)
61
- @timeout = seconds
60
+ # Set request timeout for all agents
61
+ def request_timeout(seconds)
62
+ @request_timeout = seconds
63
+ end
64
+
65
+ # Set turn timeout for all agents
66
+ def turn_timeout(seconds)
67
+ @turn_timeout = seconds
62
68
  end
63
69
 
64
70
  # Set parameters for all agents
@@ -153,7 +159,8 @@ module SwarmSDK
153
159
  provider: @provider,
154
160
  base_url: @base_url,
155
161
  api_version: @api_version,
156
- timeout: @timeout,
162
+ request_timeout: @request_timeout,
163
+ turn_timeout: @turn_timeout,
157
164
  parameters: @parameters,
158
165
  headers: @headers,
159
166
  coding_agent: @coding_agent,
@@ -49,6 +49,12 @@ module SwarmSDK
49
49
  @lead_agent = nil
50
50
  @swarm_hooks = []
51
51
  @observer_configs = []
52
+ @execution_timeout = nil
53
+ end
54
+
55
+ # Set execution timeout (seconds)
56
+ def execution_timeout(seconds)
57
+ @execution_timeout = seconds
52
58
  end
53
59
 
54
60
  # Set lead agent
@@ -142,6 +148,7 @@ module SwarmSDK
142
148
  swarm_id: @swarm_id,
143
149
  scratchpad_mode: @scratchpad,
144
150
  allow_filesystem_tools: @allow_filesystem_tools,
151
+ execution_timeout: @execution_timeout,
145
152
  )
146
153
 
147
154
  # Setup swarm registry if external swarms are registered
@@ -32,15 +32,26 @@ module SwarmSDK
32
32
 
33
33
  # Blocking execution using Sync
34
34
  def run_blocking(prompt, logs:, has_logging:)
35
+ result = nil
35
36
  Sync do |task|
36
- execute_in_task(prompt, logs: logs, has_logging: has_logging) do |lead, current_prompt|
37
- task.async(finished: false) { lead.ask(current_prompt) }.wait
37
+ start_time = Time.now
38
+
39
+ result = if @swarm.execution_timeout
40
+ execute_with_execution_timeout(task, prompt, logs, has_logging, start_time)
41
+ else
42
+ execute_in_task(prompt, logs: logs, has_logging: has_logging) do |lead, current_prompt|
43
+ # Execute directly - no child task needed
44
+ # This keeps execution in same fiber context for better control
45
+ lead.ask(current_prompt)
46
+ end
38
47
  end
39
48
  ensure
40
49
  # Always wait for observer tasks, even if main execution raises
41
50
  # This is INSIDE Sync block, so async tasks can still complete
42
51
  @swarm.wait_for_observers
43
52
  end
53
+
54
+ result
44
55
  ensure
45
56
  # Restore original fiber storage (preserves parent context for nested swarms)
46
57
  restore_fiber_storage
@@ -51,9 +62,17 @@ module SwarmSDK
51
62
  parent = Async::Task.current
52
63
  raise ConfigurationError, "wait: false requires an async context. Use Sync { swarm.execute(..., wait: false) }" unless parent
53
64
 
54
- parent.async(finished: false) do
55
- execute_in_task(prompt, logs: logs, has_logging: has_logging) do |lead, current_prompt|
56
- Async(finished: false) { lead.ask(current_prompt) }.wait
65
+ # NOTE: The block receives |task| as the spawned Async::Task when arity > 0
66
+ parent.async(finished: false) do |task|
67
+ start_time = Time.now
68
+
69
+ if @swarm.execution_timeout
70
+ execute_with_execution_timeout(task, prompt, logs, has_logging, start_time)
71
+ else
72
+ execute_in_task(prompt, logs: logs, has_logging: has_logging) do |lead, current_prompt|
73
+ # Execute directly - no child task needed
74
+ lead.ask(current_prompt)
75
+ end
57
76
  end
58
77
  end
59
78
  end
@@ -77,8 +96,9 @@ module SwarmSDK
77
96
 
78
97
  result = execution_loop(current_prompt, logs, start_time, &block)
79
98
  swarm_stop_triggered = true
80
- rescue ConfigurationError, AgentNotFoundError
81
- # Re-raise configuration errors - these should be fixed, not caught
99
+ rescue ConfigurationError, AgentNotFoundError, ExecutionTimeoutError, TurnTimeoutError
100
+ # Re-raise configuration errors and timeouts - these should not be caught here
101
+ # Timeouts are handled by execute_with_execution_timeout wrapper
82
102
  raise
83
103
  rescue TypeError => e
84
104
  result = handle_type_error(e, logs, start_time)
@@ -208,6 +228,63 @@ module SwarmSDK
208
228
  LogCollector.reset!
209
229
  LogStream.reset!
210
230
  end
231
+
232
+ # Execute with execution timeout wrapper
233
+ def execute_with_execution_timeout(task, prompt, logs, has_logging, start_time)
234
+ # Use Async::Task.current to get the actual current task context
235
+ current_task = Async::Task.current || task
236
+
237
+ # Use barrier to track ALL child tasks spawned during execution
238
+ # This includes RubyLLM's async tool execution (when max_concurrent_tools is set)
239
+ barrier = Async::Barrier.new
240
+
241
+ begin
242
+ current_task.with_timeout(
243
+ @swarm.execution_timeout,
244
+ ExecutionTimeoutError,
245
+ "Swarm execution timed out after #{@swarm.execution_timeout}s",
246
+ ) do
247
+ # Execute inside barrier to track child tasks (tool executions)
248
+ barrier.async do
249
+ execute_in_task(prompt, logs: logs, has_logging: has_logging) do |lead, current_prompt|
250
+ lead.ask(current_prompt)
251
+ end
252
+ end.wait
253
+ end
254
+ rescue ExecutionTimeoutError => e
255
+ # Stop ALL child tasks (interrupts ongoing tool executions and delegations)
256
+ barrier.stop
257
+
258
+ emit_execution_timeout_event(@swarm.execution_timeout)
259
+ build_timeout_result(e, logs, Time.now - start_time)
260
+ ensure
261
+ # Cleanup barrier if not already stopped
262
+ barrier.stop unless barrier.empty?
263
+ end
264
+ end
265
+
266
+ # Emit execution timeout event
267
+ def emit_execution_timeout_event(limit)
268
+ LogStream.emit(
269
+ type: "execution_timeout",
270
+ swarm_id: @swarm.swarm_id,
271
+ parent_swarm_id: @swarm.parent_swarm_id,
272
+ limit: limit,
273
+ message: "Swarm execution timed out after #{limit}s",
274
+ )
275
+ end
276
+
277
+ # Build timeout result
278
+ def build_timeout_result(error, logs, duration)
279
+ Result.new(
280
+ content: nil,
281
+ agent: @swarm.lead_agent&.to_s || "unknown",
282
+ error: error,
283
+ logs: logs,
284
+ duration: duration,
285
+ metadata: { timeout: true },
286
+ )
287
+ end
211
288
  end
212
289
  end
213
290
  end
@@ -72,7 +72,7 @@ module SwarmSDK
72
72
  # Default tools available to all agents
73
73
  DEFAULT_TOOLS = ToolConfigurator::DEFAULT_TOOLS
74
74
 
75
- attr_reader :name, :agents, :lead_agent, :mcp_clients, :delegation_instances, :agent_definitions, :swarm_id, :parent_swarm_id, :swarm_registry, :scratchpad_storage, :allow_filesystem_tools, :hook_registry, :global_semaphore, :plugin_storages, :config_for_hooks, :observer_configs
75
+ attr_reader :name, :agents, :lead_agent, :mcp_clients, :delegation_instances, :agent_definitions, :swarm_id, :parent_swarm_id, :swarm_registry, :scratchpad_storage, :allow_filesystem_tools, :hook_registry, :global_semaphore, :plugin_storages, :config_for_hooks, :observer_configs, :execution_timeout
76
76
  attr_accessor :delegation_call_stack
77
77
 
78
78
  # Check if scratchpad tools are enabled
@@ -139,13 +139,25 @@ module SwarmSDK
139
139
  # @param scratchpad [Tools::Stores::Scratchpad, nil] Optional scratchpad instance (for testing/internal use)
140
140
  # @param scratchpad_mode [Symbol, String] Scratchpad mode (:enabled or :disabled). :per_node not allowed for non-node swarms.
141
141
  # @param allow_filesystem_tools [Boolean, nil] Whether to allow filesystem tools (nil uses global setting)
142
- def initialize(name:, swarm_id: nil, parent_swarm_id: nil, global_concurrency: nil, default_local_concurrency: nil, scratchpad: nil, scratchpad_mode: :enabled, allow_filesystem_tools: nil)
142
+ def initialize(name:, swarm_id: nil, parent_swarm_id: nil, global_concurrency: nil, default_local_concurrency: nil, scratchpad: nil, scratchpad_mode: :enabled, allow_filesystem_tools: nil, execution_timeout: :__use_default__)
143
143
  @name = name
144
144
  @swarm_id = swarm_id || generate_swarm_id(name)
145
145
  @parent_swarm_id = parent_swarm_id
146
146
  @global_concurrency = global_concurrency || SwarmSDK.config.global_concurrency_limit
147
147
  @default_local_concurrency = default_local_concurrency || SwarmSDK.config.local_concurrency_limit
148
148
 
149
+ # Use default from config unless explicitly set (including nil to disable)
150
+ @execution_timeout = if execution_timeout == :__use_default__
151
+ SwarmSDK.config.default_execution_timeout
152
+ else
153
+ execution_timeout # Could be a number OR nil (to disable)
154
+ end
155
+
156
+ # Validate execution_timeout is positive if set
157
+ if @execution_timeout && @execution_timeout <= 0
158
+ raise ConfigurationError, "execution_timeout must be positive (got #{@execution_timeout})"
159
+ end
160
+
149
161
  # Handle scratchpad_mode parameter
150
162
  # For Swarm: :enabled or :disabled (not :per_node - that's for nodes)
151
163
  @scratchpad_mode = validate_swarm_scratchpad_mode(scratchpad_mode)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SwarmSDK
4
- VERSION = "2.5.4"
4
+ VERSION = "2.6.0"
5
5
  end
data/lib/swarm_sdk.rb CHANGED
@@ -13,6 +13,7 @@ require "set"
13
13
  require "yaml"
14
14
 
15
15
  require "async"
16
+ require "async/barrier"
16
17
  require "async/semaphore"
17
18
  require "ruby_llm"
18
19
  require "ruby_llm/mcp"
@@ -61,6 +62,15 @@ module SwarmSDK
61
62
  class LLMError < Error; end
62
63
  class StateError < Error; end
63
64
 
65
+ # Base class for SwarmSDK timeout errors
66
+ class TimeoutError < Error; end
67
+
68
+ # Raised when swarm execution exceeds execution_timeout
69
+ class ExecutionTimeoutError < TimeoutError; end
70
+
71
+ # Raised when agent turn exceeds turn_timeout
72
+ class TurnTimeoutError < TimeoutError; end
73
+
64
74
  class << self
65
75
  # Get the global configuration instance
66
76
  #
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: swarm_sdk
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.5.4
4
+ version: 2.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Paulo Arruda