vectara-agentic 0.4.7__py3-none-any.whl → 0.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

tests/benchmark_models.py CHANGED
@@ -68,7 +68,7 @@ def validate_api_keys(models_to_test: List[Dict]) -> None:
             missing_keys.append(key)
 
     if missing_keys:
-        print("ERROR: Missing required API keys for benchmark execution:")
+        print("ERROR: Missing required API keys for benchmark execution:")
         print()
         for key in sorted(missing_keys):
             print(f" • {key}")
@@ -83,7 +83,7 @@ def validate_api_keys(models_to_test: List[Dict]) -> None:
 
         sys.exit(1)
 
-    print("All required API keys are present")
+    print("All required API keys are present")
     print(f"Found API keys for {len(required_keys)} required environment variables")
 
 
@@ -135,7 +135,7 @@ class ModelBenchmark:
         {"provider": ModelProvider.OPENAI, "model": "gpt-5-mini"},
         {"provider": ModelProvider.OPENAI, "model": "gpt-4o-mini"},
         {"provider": ModelProvider.OPENAI, "model": "gpt-4.1-mini"},
-        {"provider": ModelProvider.ANTHROPIC, "model": "claude-sonnet-4-20250514"},
+        {"provider": ModelProvider.ANTHROPIC, "model": "claude-sonnet-4-5"},
         {"provider": ModelProvider.TOGETHER, "model": "deepseek-ai/DeepSeek-V3"},
         {"provider": ModelProvider.GROQ, "model": "openai/gpt-oss-20b"},
         {"provider": ModelProvider.GEMINI, "model": "models/gemini-2.5-flash-lite"},
@@ -817,11 +817,11 @@ class ModelBenchmark:
         observability_setup = setup_observer(dummy_config, verbose=True)
         if observability_setup:
             print(
-                "Arize Phoenix observability enabled - LLM calls will be traced\n"
+                "Arize Phoenix observability enabled - LLM calls will be traced\n"
             )
             _observability_initialized = True
         else:
-            print("⚠️ Arize Phoenix observability setup failed\n")
+            print("Arize Phoenix observability setup failed\n")
 
         # Create semaphore to limit concurrent model testing
         model_semaphore = asyncio.Semaphore(self.max_concurrent_models)
@@ -835,7 +835,7 @@ class ModelBenchmark:
             tasks.append(task)
 
         # Execute all model benchmarks in parallel
-        print("🚀 Starting parallel benchmark execution...\n")
+        print("Starting parallel benchmark execution...\n")
         await asyncio.gather(*tasks, return_exceptions=True)
 
     async def _run_model_benchmark(
@@ -857,9 +857,9 @@ class ModelBenchmark:
                     provider, model_name, test_name, test_config
                 )
             except Exception as e:
-                print(f"Error in {model_name} - {test_name}: {e}")
+                print(f"Error in {model_name} - {test_name}: {e}")
 
-        print(f"Completed: {provider.value} - {model_name}")
+        print(f"Completed: {provider.value} - {model_name}")
 
     async def _run_scenario_benchmark(
         self,
@@ -892,18 +892,18 @@ class ModelBenchmark:
 
                 if result.error:
                     print(
-                        f"{model_name}/{test_name} Iteration {iteration_num}: {result.error}"
+                        f"{model_name}/{test_name} Iteration {iteration_num}: {result.error}"
                     )
                 else:
                     print(
-                        f"{model_name}/{test_name} Iteration {iteration_num}: "
+                        f"{model_name}/{test_name} Iteration {iteration_num}: "
                         f"{result.total_response_time:.2f}s, "
                         f"first token: {result.first_token_latency:.2f}s, "
                         f"{result.tokens_per_second:.1f} chars/sec"
                     )
 
             except Exception as e:
-                print(f"{model_name}/{test_name} Iteration {iteration_num}: {e}")
+                print(f"{model_name}/{test_name} Iteration {iteration_num}: {e}")
                 # Create error result
                 error_result = BenchmarkResult(
                     model_name=model_name,
@@ -929,7 +929,7 @@ class ModelBenchmark:
         successful = len([r for r in iteration_results if r.error is None])
         success_rate = (successful / len(iteration_results)) * 100
         print(
-            f" 📊 {model_name}/{test_name} complete: {successful}/{len(iteration_results)} successful ({success_rate:.1f}%)"
+            f"{model_name}/{test_name} complete: {successful}/{len(iteration_results)} successful ({success_rate:.1f}%)"
        )
 
         return iteration_results
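
The context lines in the last two hunks show the benchmark's concurrency scheme: a semaphore caps how many models run at once, while asyncio.gather collects every task without aborting on per-model failures. A minimal, runnable sketch of that pattern (illustrative names only, not the benchmark's actual code):

import asyncio

async def run_one(model: str, semaphore: asyncio.Semaphore) -> str:
    # At most max_concurrent coroutines get past this point at a time.
    async with semaphore:
        await asyncio.sleep(0.1)  # stand-in for a real benchmark run
        return f"{model} done"

async def run_all(models: list, max_concurrent: int = 2):
    semaphore = asyncio.Semaphore(max_concurrent)
    tasks = [run_one(m, semaphore) for m in models]
    # return_exceptions=True surfaces a failing model as an exception object
    # in the results instead of cancelling the remaining tasks.
    return await asyncio.gather(*tasks, return_exceptions=True)

print(asyncio.run(run_all(["gpt-4o-mini", "claude-sonnet-4-5", "deepseek-ai/DeepSeek-V3"])))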
tests/test_agent.py CHANGED
@@ -13,7 +13,6 @@ from vectara_agentic.agent_config import AgentConfig
 from vectara_agentic.types import ModelProvider, ObserverType
 from vectara_agentic.tools import ToolsFactory
 
-from vectara_agentic.agent_core.prompts import GENERAL_INSTRUCTIONS
 from conftest import mult, STANDARD_TEST_TOPIC, STANDARD_TEST_INSTRUCTIONS
 
 
@@ -54,9 +53,11 @@ class TestAgentPackage(unittest.TestCase):
             + date.today().strftime("%A, %B %d, %Y")
             + " with Always do as your mother tells you!"
         )
+        # Test format_prompt with dummy instructions since we're only testing template substitution
+        dummy_instructions = "Test instructions"
         self.assertEqual(
             format_prompt(
-                prompt_template, GENERAL_INSTRUCTIONS, topic, custom_instructions
+                prompt_template, dummy_instructions, topic, custom_instructions
             ),
             expected_output,
         )
@@ -83,7 +84,7 @@ class TestAgentPackage(unittest.TestCase):
         config = AgentConfig(
             agent_type=AgentType.REACT,
             main_llm_provider=ModelProvider.ANTHROPIC,
-            main_llm_model_name="claude-sonnet-4-20250514",
+            main_llm_model_name="claude-sonnet-4-5",
             tool_llm_provider=ModelProvider.TOGETHER,
             tool_llm_model_name="moonshotai/Kimi-K2-Instruct",
             observer=ObserverType.ARIZE_PHOENIX,
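
Switching to dummy instructions decouples the template assertion from the packaged GENERAL_INSTRUCTIONS text, which previously broke the test whenever the prompt wording changed. A hypothetical stand-in for the substitution being asserted (format_prompt's argument order is inferred from the call above; the placeholder keys are assumptions, not the package's real template):

def format_prompt_sketch(template: str, instructions: str, topic: str, custom: str) -> str:
    # Assumed placeholder names; the real template keys may differ.
    return (template.replace("{topic}", topic)
                    .replace("{INSTRUCTIONS}", instructions)
                    .replace("{custom_instructions}", custom))

out = format_prompt_sketch(
    "Talk about {topic}. {INSTRUCTIONS} {custom_instructions}",
    "Test instructions", "AI", "Always do as your mother tells you!",
)
assert "Test instructions" in out  # any fixed string keeps the test stable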
tests/test_bedrock.py CHANGED
@@ -8,6 +8,7 @@ import threading
 
 from vectara_agentic.agent import Agent
 from vectara_agentic.tools import ToolsFactory
+from vectara_agentic.tools_catalog import ToolsCatalog
 
 import nest_asyncio
 
@@ -15,6 +16,7 @@ nest_asyncio.apply()
 
 from conftest import (
     mult,
+    add,
     fc_config_bedrock,
     STANDARD_TEST_TOPIC,
     STANDARD_TEST_INSTRUCTIONS,
@@ -64,6 +66,105 @@ class TestBedrock(unittest.IsolatedAsyncioTestCase):
 
         self.assertEqual(response3.response, "1050")
 
+    async def test_claude_sonnet_4_multi_tool_chain(self):
+        """Test Claude Sonnet 4 with complex multi-step reasoning chain using multiple tools via Bedrock."""
+        with ARIZE_LOCK:
+            # Use Bedrock config (Claude Sonnet 4)
+            tools_catalog = ToolsCatalog(fc_config_bedrock)
+            tools = [
+                ToolsFactory().create_tool(mult),
+                ToolsFactory().create_tool(add),
+                ToolsFactory().create_tool(tools_catalog.summarize_text),
+                ToolsFactory().create_tool(tools_catalog.rephrase_text),
+            ]
+
+            agent = Agent(
+                agent_config=fc_config_bedrock,
+                tools=tools,
+                topic=STANDARD_TEST_TOPIC,
+                custom_instructions="You are a mathematical reasoning agent that explains your work step by step.",
+            )
+
+            # Complex multi-step reasoning task
+            complex_query = (
+                "Perform this calculation step by step: "
+                "First multiply 5 by 9, then add 13 to that result, "
+                "then multiply the new result by 2. "
+                "After getting the final number, summarize the entire mathematical process "
+                "with expertise in 'mathematics education', "
+                "then rephrase that summary as a 10-year-old would explain it."
+            )
+
+            print("\nStarting Claude Sonnet 4 multi-tool chain test (Bedrock)")
+            print(f"Query: {complex_query}")
+            print("Streaming response:\n" + "="*50)
+
+            stream = await agent.astream_chat(complex_query)
+
+            # Capture streaming deltas and tool calls
+            streaming_deltas = []
+            tool_calls_made = []
+            full_response = ""
+
+            async for chunk in stream.async_response_gen():
+                if chunk and chunk.strip():
+                    streaming_deltas.append(chunk)
+                    full_response += chunk
+                    # Display each streaming delta
+                    print(f"Delta: {repr(chunk)}")
+
+                    # Track tool calls in the stream
+                    if "mult" in chunk.lower():
+                        if "mult" not in [call["tool"] for call in tool_calls_made]:
+                            tool_calls_made.append({"tool": "mult", "order": len(tool_calls_made) + 1})
+                            print(f"Tool call detected: mult (#{len(tool_calls_made)})")
+                    if "add" in chunk.lower():
+                        if "add" not in [call["tool"] for call in tool_calls_made]:
+                            tool_calls_made.append({"tool": "add", "order": len(tool_calls_made) + 1})
+                            print(f"Tool call detected: add (#{len(tool_calls_made)})")
+                    if "summarize" in chunk.lower():
+                        if "summarize_text" not in [call["tool"] for call in tool_calls_made]:
+                            tool_calls_made.append({"tool": "summarize_text", "order": len(tool_calls_made) + 1})
+                            print(f"Tool call detected: summarize_text (#{len(tool_calls_made)})")
+                    if "rephrase" in chunk.lower():
+                        if "rephrase_text" not in [call["tool"] for call in tool_calls_made]:
+                            tool_calls_made.append({"tool": "rephrase_text", "order": len(tool_calls_made) + 1})
+                            print(f"Tool call detected: rephrase_text (#{len(tool_calls_made)})")
+
+            response = await stream.aget_response()
+
+            print("="*50)
+            print(f"Streaming completed. Total deltas: {len(streaming_deltas)}")
+            print(f"Tool calls made: {[call['tool'] for call in tool_calls_made]}")
+            print(f"📄 Final response length: {len(response.response)} chars")
+            print(f"Final response: {response.response}")
+
+            # Validate tool usage sequence
+            tools_used = [call["tool"] for call in tool_calls_made]
+            print(f"🧪 Tools used in order: {tools_used}")
+
+            # Check that at least multiplication happened (basic requirement)
+            self.assertIn("mult", tools_used, f"Expected multiplication tool to be used. Tools used: {tools_used}")
+
+            # Check for mathematical results in the full response or streaming deltas
+            # Expected: 5*9=45, 45+13=58, 58*2=116
+            expected_intermediate_results = ["45", "58", "116"]
+            all_text = (full_response + " " + response.response).lower()
+            math_results_found = sum(1 for result in expected_intermediate_results
+                                     if result in all_text)
+
+            print(f"🔢 Mathematical results found: {math_results_found}/3 expected")
+            print(f"Full text searched: {all_text[:200]}...")
+
+            # More lenient assertion - just check that some mathematical progress was made
+            self.assertGreaterEqual(math_results_found, 1,
+                                    f"Expected at least 1 mathematical result. Found {math_results_found}. "
+                                    f"Full text: {all_text}")
+
+            # Verify that streaming actually produced content
+            self.assertGreater(len(streaming_deltas), 0, "Expected streaming deltas to be produced")
+            self.assertGreater(len(response.response.strip()), 0, "Expected non-empty final response")
+
 
 if __name__ == "__main__":
     unittest.main()
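
The delta-scanning block above reappears nearly verbatim in the GROQ and OpenAI suites below. A hypothetical helper (not part of the package) that captures the shared logic:

from typing import Dict, List

# Maps the keyword searched for in each streaming chunk to the tool it signals.
KEYWORD_TO_TOOL = {
    "mult": "mult",
    "add": "add",
    "summarize": "summarize_text",
    "rephrase": "rephrase_text",
}

def record_tool_calls(chunk: str, tool_calls_made: List[Dict]) -> None:
    """Record the first sighting of each tool keyword, preserving call order."""
    lowered = chunk.lower()
    for keyword, tool in KEYWORD_TO_TOOL.items():
        if keyword in lowered and tool not in {c["tool"] for c in tool_calls_made}:
            tool_calls_made.append({"tool": tool, "order": len(tool_calls_made) + 1})

Note that the substring heuristic is deliberately loose: "mult" also matches prose such as "multiply", so a chunk of model narration can register as a tool call. That looseness is presumably why the assertions require only one intermediate result rather than a strict tool sequence.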
tests/test_gemini.py CHANGED
@@ -4,25 +4,48 @@ import warnings
 warnings.simplefilter("ignore", DeprecationWarning)
 
 import unittest
+import asyncio
+import gc
 
 from vectara_agentic.agent import Agent
 from vectara_agentic.tools import ToolsFactory
+from vectara_agentic.tools_catalog import ToolsCatalog
+from vectara_agentic.llm_utils import clear_llm_cache
 
 
 import nest_asyncio
 
 nest_asyncio.apply()
 
-from conftest import (
+from tests.conftest import (
     mult,
+    add,
     fc_config_gemini,
     STANDARD_TEST_TOPIC,
     STANDARD_TEST_INSTRUCTIONS,
 )
 
 
-class TestGEMINI(unittest.TestCase):
-    def test_gemini(self):
+class TestGEMINI(unittest.IsolatedAsyncioTestCase):
+    def setUp(self):
+        """Set up test fixtures."""
+        super().setUp()
+        # Clear any cached LLM instances before each test
+        clear_llm_cache()
+        # Force garbage collection to clean up any lingering resources
+        gc.collect()
+
+    async def asyncTearDown(self):
+        """Clean up after each test - async version."""
+        await super().asyncTearDown()
+        # Clear cached LLM instances after each test
+        clear_llm_cache()
+        # Force garbage collection
+        gc.collect()
+        # Small delay to allow cleanup
+        await asyncio.sleep(0.01)
+
+    async def test_gemini(self):
         tools = [ToolsFactory().create_tool(mult)]
 
         agent = Agent(
@@ -31,14 +54,14 @@ class TestGEMINI(unittest.TestCase):
             topic=STANDARD_TEST_TOPIC,
             custom_instructions=STANDARD_TEST_INSTRUCTIONS,
         )
-        _ = agent.chat("What is 5 times 10. Only give the answer, nothing else")
-        _ = agent.chat("what is 3 times 7. Only give the answer, nothing else")
-        res = agent.chat(
+        _ = await agent.achat("What is 5 times 10. Only give the answer, nothing else")
+        _ = await agent.achat("what is 3 times 7. Only give the answer, nothing else")
+        res = await agent.achat(
             "what is the result of multiplying the results of the last two multiplications. Only give the answer, nothing else."
         )
         self.assertIn("1050", res.response)
 
-    def test_gemini_single_prompt(self):
+    async def test_gemini_single_prompt(self):
         tools = [ToolsFactory().create_tool(mult)]
 
         agent = Agent(
@@ -47,11 +70,74 @@ class TestGEMINI(unittest.TestCase):
             topic=STANDARD_TEST_TOPIC,
             custom_instructions=STANDARD_TEST_INSTRUCTIONS,
         )
-        res = agent.chat(
+        res = await agent.achat(
             "First, multiply 5 by 10. Then, multiply 3 by 7. Finally, multiply the results of the first two calculations."
         )
         self.assertIn("1050", res.response)
 
+    async def test_gemini_25_flash_multi_tool_chain(self):
+        """Test Gemini 2.5 Flash with complex multi-step reasoning chain using multiple tools."""
+        # Use Gemini config (Gemini 2.5 Flash)
+        tools_catalog = ToolsCatalog(fc_config_gemini)
+        tools = [
+            ToolsFactory().create_tool(mult),
+            ToolsFactory().create_tool(add),
+            ToolsFactory().create_tool(tools_catalog.summarize_text),
+            ToolsFactory().create_tool(tools_catalog.rephrase_text),
+        ]
+
+        agent = Agent(
+            agent_config=fc_config_gemini,
+            tools=tools,
+            topic=STANDARD_TEST_TOPIC,
+            custom_instructions="You are a mathematical reasoning agent that explains your work step by step.",
+        )
+
+        # Complex multi-step reasoning task
+        complex_query = (
+            "Perform this calculation step by step: "
+            "First multiply 3 by 8, then add 14 to that result, "
+            "then multiply the new result by 3. "
+            "After getting the final number, create a text description of the entire mathematical process "
+            "(e.g., 'First I multiplied 3 by 8 to get 24, then added 14 to get 38, then multiplied by 3 to get 114'). "
+            "Then use the summarize_text tool to summarize that text description with expertise in 'mathematics education'. "
+            "Finally, use the rephrase_text tool to rephrase that summary as a 10-year-old would explain it."
+        )
+
+        print("\nStarting Gemini 2.5 Flash multi-tool chain test")
+        print(f"Query: {complex_query}")
+
+        # Note: Gemini tests now use async chat
+        response = await agent.achat(complex_query)
+
+        print(f"Final response: {response.response}")
+        print(f"📄 Final response length: {len(response.response)} chars")
+
+        # Check for mathematical results in the response
+        # Expected: 3*8=24, 24+14=38, 38*3=114
+        expected_intermediate_results = ["24", "38", "114"]
+        response_text = response.response.lower()
+        math_results_found = sum(1 for result in expected_intermediate_results
+                                 if result in response_text)
+
+        print(f"Mathematical results found: {math_results_found}/3 expected")
+        print(f"Response text searched: {response_text[:200]}...")
+
+        # More lenient assertion - just check that some mathematical progress was made
+        self.assertGreaterEqual(math_results_found, 1,
+                                f"Expected at least 1 mathematical result. Found {math_results_found}. "
+                                f"Response: {response.response}")
+
+        # Verify response has content and mentions math concepts
+        self.assertGreater(len(response.response.strip()), 50, "Expected substantial response content")
+
+        # Check for indications of multi-tool usage (math, summary, or explanation content)
+        multi_tool_indicators = ["calculate", "multipl", "add", "summary", "explain", "mathematical", "process"]
+        indicators_found = sum(1 for indicator in multi_tool_indicators
+                               if indicator in response_text)
+        self.assertGreaterEqual(indicators_found, 2,
+                                f"Expected multiple tool usage indicators. Found {indicators_found}: {response.response}")
+
 
 if __name__ == "__main__":
     unittest.main()
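
The setUp/asyncTearDown pair above gives each Gemini test a fresh event loop and clean client state. A standalone sketch of the same lifecycle pattern (a plain dict stands in for the package's cached LLM clients, since only the shape of the pattern is shown):

import asyncio
import gc
import unittest

_llm_cache = {}  # stand-in for the package's cached LLM instances

class IsolatedLLMTest(unittest.IsolatedAsyncioTestCase):
    def setUp(self):
        super().setUp()
        _llm_cache.clear()  # fresh clients for every test
        gc.collect()

    async def asyncTearDown(self):
        await super().asyncTearDown()
        _llm_cache.clear()
        gc.collect()
        await asyncio.sleep(0.01)  # give transports a moment to close

    async def test_cache_starts_empty(self):
        self.assertEqual(_llm_cache, {})

if __name__ == "__main__":
    unittest.main()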
tests/test_groq.py CHANGED
@@ -8,6 +8,7 @@ import threading
 
 from vectara_agentic.agent import Agent
 from vectara_agentic.tools import ToolsFactory
+from vectara_agentic.tools_catalog import ToolsCatalog
 from vectara_agentic.agent_config import AgentConfig
 from vectara_agentic.types import AgentType, ModelProvider
 
@@ -17,6 +18,7 @@ nest_asyncio.apply()
 
 from conftest import (
     mult,
+    add,
     fc_config_groq,
     STANDARD_TEST_TOPIC,
     STANDARD_TEST_INSTRUCTIONS,
@@ -66,37 +68,116 @@ class TestGROQ(unittest.IsolatedAsyncioTestCase):
 
         self.assertEqual(response3.response, "1050")
 
-    async def test_gpt_oss_120b(self):
-        """Test GPT-OSS-120B model with GROQ provider."""
+    # Skipping test_gpt_oss_120b due to model's internal tools conflicting with function calling
+    # GPT-OSS-120B has internal tools like repo_browser.open_file that cause validation errors
+
+    async def test_gpt_oss_20b(self):
+        """Test GPT-OSS-20B model with complex multi-step reasoning chain using multiple tools via GROQ."""
         with ARIZE_LOCK:
-            # Create config specifically for GPT-OSS-120B via GROQ
-            gpt_oss_config = AgentConfig(
+            # Create config for GPT-OSS-20B via GROQ
+            gpt_oss_20b_config = AgentConfig(
                 agent_type=AgentType.FUNCTION_CALLING,
                 main_llm_provider=ModelProvider.GROQ,
-                main_llm_model_name="openai/gpt-oss-120b",
+                main_llm_model_name="openai/gpt-oss-20b",
                 tool_llm_provider=ModelProvider.GROQ,
-                tool_llm_model_name="openai/gpt-oss-120b",
+                tool_llm_model_name="openai/gpt-oss-20b",
             )
 
-            tools = [ToolsFactory().create_tool(mult)]
+            # Create multiple tools for complex reasoning
+            tools_catalog = ToolsCatalog(gpt_oss_20b_config)
+            tools = [
+                ToolsFactory().create_tool(mult),
+                ToolsFactory().create_tool(add),
+                ToolsFactory().create_tool(tools_catalog.summarize_text),
+                ToolsFactory().create_tool(tools_catalog.rephrase_text),
+            ]
+
             agent = Agent(
-                agent_config=gpt_oss_config,
+                agent_config=gpt_oss_20b_config,
                 tools=tools,
                 topic=STANDARD_TEST_TOPIC,
-                custom_instructions=STANDARD_TEST_INSTRUCTIONS,
+                custom_instructions="You are a mathematical reasoning agent that explains your work step by step.",
             )
 
-            # Test simple multiplication: 8 * 6 = 48
-            stream = await agent.astream_chat(
-                "What is 8 times 6? Only give the answer, nothing else"
+            # Complex multi-step reasoning task
+            complex_query = (
+                "Perform this calculation step by step: "
+                "First multiply 6 by 9, then add 12 to that result, "
+                "then multiply the new result by 2. "
+                "After getting the final number, summarize the entire mathematical process "
+                "with expertise in 'mathematics education', "
+                "then rephrase that summary as a 10-year-old would explain it."
             )
-            # Consume the stream
+
+            print("\nStarting GPT-OSS-20B multi-tool chain test (GROQ)")
+            print(f"Query: {complex_query}")
+            print("Streaming response:\n" + "="*50)
+
+            stream = await agent.astream_chat(complex_query)
+
+            # Capture streaming deltas and tool calls
+            streaming_deltas = []
+            tool_calls_made = []
+            full_response = ""
+
             async for chunk in stream.async_response_gen():
-                pass
+                if chunk and chunk.strip():
+                    streaming_deltas.append(chunk)
+                    full_response += chunk
+                    # Display each streaming delta
+                    print(f"📡 Delta: {repr(chunk)}")
+
+                    # Track tool calls in the stream
+                    if "mult" in chunk.lower():
+                        if "mult" not in [call["tool"] for call in tool_calls_made]:
+                            tool_calls_made.append({"tool": "mult", "order": len(tool_calls_made) + 1})
+                            print(f"Tool call detected: mult (#{len(tool_calls_made)})")
+                    if "add" in chunk.lower():
+                        if "add" not in [call["tool"] for call in tool_calls_made]:
+                            tool_calls_made.append({"tool": "add", "order": len(tool_calls_made) + 1})
+                            print(f"Tool call detected: add (#{len(tool_calls_made)})")
+                    if "summarize" in chunk.lower():
+                        if "summarize_text" not in [call["tool"] for call in tool_calls_made]:
+                            tool_calls_made.append({"tool": "summarize_text", "order": len(tool_calls_made) + 1})
+                            print(f"Tool call detected: summarize_text (#{len(tool_calls_made)})")
+                    if "rephrase" in chunk.lower():
+                        if "rephrase_text" not in [call["tool"] for call in tool_calls_made]:
+                            tool_calls_made.append({"tool": "rephrase_text", "order": len(tool_calls_made) + 1})
+                            print(f"Tool call detected: rephrase_text (#{len(tool_calls_made)})")
+
             response = await stream.aget_response()
 
-            # Verify the response contains the correct answer
-            self.assertIn("48", response.response)
+            print("="*50)
+            print(f"Streaming completed. Total deltas: {len(streaming_deltas)}")
+            print(f"Tool calls made: {[call['tool'] for call in tool_calls_made]}")
+            print(f"📄 Final response length: {len(response.response)} chars")
+            print(f"Final response: {response.response}")
+
+            # Validate tool usage sequence
+            tools_used = [call["tool"] for call in tool_calls_made]
+            print(f"🧪 Tools used in order: {tools_used}")
+
+            # Check that at least multiplication happened (basic requirement)
+            self.assertIn("mult", tools_used, f"Expected multiplication tool to be used. Tools used: {tools_used}")
+
+            # Check for mathematical results in the full response or streaming deltas
+            # Expected: 6*9=54, 54+12=66, 66*2=132
+            expected_intermediate_results = ["54", "66", "132"]
+            all_text = (full_response + " " + response.response).lower()
+            math_results_found = sum(1 for result in expected_intermediate_results
                                     if result in all_text)
+
+            print(f"🔢 Mathematical results found: {math_results_found}/3 expected")
+            print(f"Full text searched: {all_text[:200]}...")
+
+            # More lenient assertion - just check that some mathematical progress was made
+            self.assertGreaterEqual(math_results_found, 1,
+                                    f"Expected at least 1 mathematical result. Found {math_results_found}. "
+                                    f"Full text: {all_text}")
+
+            # Verify that streaming actually produced content
+            self.assertGreater(len(streaming_deltas), 0, "Expected streaming deltas to be produced")
+            self.assertGreater(len(response.response.strip()), 0, "Expected non-empty final response")
 
 
 if __name__ == "__main__":
     unittest.main()
tests/test_openai.py CHANGED
@@ -8,6 +8,7 @@ import threading
 
 from vectara_agentic.agent import Agent
 from vectara_agentic.tools import ToolsFactory
+from vectara_agentic.tools_catalog import ToolsCatalog
 from vectara_agentic.agent_config import AgentConfig
 from vectara_agentic.types import AgentType, ModelProvider
 
@@ -18,6 +19,7 @@ nest_asyncio.apply()
 
 from conftest import (
     fc_config_openai,
     mult,
+    add,
     STANDARD_TEST_TOPIC,
     STANDARD_TEST_INSTRUCTIONS,
 )
@@ -155,6 +157,105 @@ class TestOpenAI(unittest.IsolatedAsyncioTestCase):
 
         self.assertIn("25", response.response)
 
+    async def test_gpt_41_mini_multi_tool_chain(self):
+        """Test GPT-4.1-mini with complex multi-step reasoning chain using multiple tools."""
+        with ARIZE_LOCK:
+            # Use default OpenAI config (gpt-4.1-mini)
+            tools_catalog = ToolsCatalog(fc_config_openai)
+            tools = [
+                ToolsFactory().create_tool(mult),
+                ToolsFactory().create_tool(add),
+                ToolsFactory().create_tool(tools_catalog.summarize_text),
+                ToolsFactory().create_tool(tools_catalog.rephrase_text),
+            ]
+
+            agent = Agent(
+                agent_config=fc_config_openai,
+                tools=tools,
+                topic=STANDARD_TEST_TOPIC,
+                custom_instructions="You are a mathematical reasoning agent that explains your work step by step.",
+            )
+
+            # Complex multi-step reasoning task
+            complex_query = (
+                "Perform this calculation step by step: "
+                "First multiply 4 by 7, then add 18 to that result, "
+                "then multiply the new result by 2. "
+                "After getting the final number, summarize the entire mathematical process "
+                "with expertise in 'mathematics education', "
+                "then rephrase that summary as a 10-year-old would explain it."
+            )
+
+            print("\nStarting GPT-4.1-mini multi-tool chain test (OpenAI)")
+            print(f"Query: {complex_query}")
+            print("Streaming response:\n" + "="*50)
+
+            stream = await agent.astream_chat(complex_query)
+
+            # Capture streaming deltas and tool calls
+            streaming_deltas = []
+            tool_calls_made = []
+            full_response = ""
+
+            async for chunk in stream.async_response_gen():
+                if chunk and chunk.strip():
+                    streaming_deltas.append(chunk)
+                    full_response += chunk
+                    # Display each streaming delta
+                    print(f"Delta: {repr(chunk)}")
+
+                    # Track tool calls in the stream
+                    if "mult" in chunk.lower():
+                        if "mult" not in [call["tool"] for call in tool_calls_made]:
+                            tool_calls_made.append({"tool": "mult", "order": len(tool_calls_made) + 1})
+                            print(f"Tool call detected: mult (#{len(tool_calls_made)})")
+                    if "add" in chunk.lower():
+                        if "add" not in [call["tool"] for call in tool_calls_made]:
+                            tool_calls_made.append({"tool": "add", "order": len(tool_calls_made) + 1})
+                            print(f"Tool call detected: add (#{len(tool_calls_made)})")
+                    if "summarize" in chunk.lower():
+                        if "summarize_text" not in [call["tool"] for call in tool_calls_made]:
+                            tool_calls_made.append({"tool": "summarize_text", "order": len(tool_calls_made) + 1})
+                            print(f"Tool call detected: summarize_text (#{len(tool_calls_made)})")
+                    if "rephrase" in chunk.lower():
+                        if "rephrase_text" not in [call["tool"] for call in tool_calls_made]:
+                            tool_calls_made.append({"tool": "rephrase_text", "order": len(tool_calls_made) + 1})
+                            print(f"Tool call detected: rephrase_text (#{len(tool_calls_made)})")
+
+            response = await stream.aget_response()
+
+            print("="*50)
+            print(f"Streaming completed. Total deltas: {len(streaming_deltas)}")
+            print(f"Tool calls made: {[call['tool'] for call in tool_calls_made]}")
+            print(f"📄 Final response length: {len(response.response)} chars")
+            print(f"Final response: {response.response}")
+
+            # Validate tool usage sequence
+            tools_used = [call["tool"] for call in tool_calls_made]
+            print(f"🧪 Tools used in order: {tools_used}")
+
+            # Check that at least multiplication happened (basic requirement)
+            self.assertIn("mult", tools_used, f"Expected multiplication tool to be used. Tools used: {tools_used}")
+
+            # Check for mathematical results in the full response or streaming deltas
+            # Expected: 4*7=28, 28+18=46, 46*2=92
+            expected_intermediate_results = ["28", "46", "92"]
+            all_text = (full_response + " " + response.response).lower()
+            math_results_found = sum(1 for result in expected_intermediate_results
+                                     if result in all_text)
+
+            print(f"Mathematical results found: {math_results_found}/3 expected")
+            print(f"Full text searched: {all_text[:200]}...")
+
+            # More lenient assertion - just check that some mathematical progress was made
+            self.assertGreaterEqual(math_results_found, 1,
+                                    f"Expected at least 1 mathematical result. Found {math_results_found}. "
+                                    f"Full text: {all_text}")
+
+            # Verify that streaming actually produced content
+            self.assertGreater(len(streaming_deltas), 0, "Expected streaming deltas to be produced")
+            self.assertGreater(len(response.response.strip()), 0, "Expected non-empty final response")
+
 
 if __name__ == "__main__":
     unittest.main()