vectara-agentic 0.4.6__py3-none-any.whl → 0.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of vectara-agentic might be problematic. Click here for more details.

tests/test_bedrock.py CHANGED
@@ -8,6 +8,7 @@ import threading
8
8
 
9
9
  from vectara_agentic.agent import Agent
10
10
  from vectara_agentic.tools import ToolsFactory
11
+ from vectara_agentic.tools_catalog import ToolsCatalog
11
12
 
12
13
  import nest_asyncio
13
14
 
@@ -15,6 +16,7 @@ nest_asyncio.apply()
15
16
 
16
17
  from conftest import (
17
18
  mult,
19
+ add,
18
20
  fc_config_bedrock,
19
21
  STANDARD_TEST_TOPIC,
20
22
  STANDARD_TEST_INSTRUCTIONS,
@@ -64,6 +66,105 @@ class TestBedrock(unittest.IsolatedAsyncioTestCase):
64
66
 
65
67
  self.assertEqual(response3.response, "1050")
66
68
 
69
+ async def test_claude_sonnet_4_multi_tool_chain(self):
70
+ """Test Claude Sonnet 4 with complex multi-step reasoning chain using multiple tools via Bedrock."""
71
+ with ARIZE_LOCK:
72
+ # Use Bedrock config (Claude Sonnet 4)
73
+ tools_catalog = ToolsCatalog(fc_config_bedrock)
74
+ tools = [
75
+ ToolsFactory().create_tool(mult),
76
+ ToolsFactory().create_tool(add),
77
+ ToolsFactory().create_tool(tools_catalog.summarize_text),
78
+ ToolsFactory().create_tool(tools_catalog.rephrase_text),
79
+ ]
80
+
81
+ agent = Agent(
82
+ agent_config=fc_config_bedrock,
83
+ tools=tools,
84
+ topic=STANDARD_TEST_TOPIC,
85
+ custom_instructions="You are a mathematical reasoning agent that explains your work step by step.",
86
+ )
87
+
88
+ # Complex multi-step reasoning task
89
+ complex_query = (
90
+ "Perform this calculation step by step: "
91
+ "First multiply 5 by 9, then add 13 to that result, "
92
+ "then multiply the new result by 2. "
93
+ "After getting the final number, summarize the entire mathematical process "
94
+ "with expertise in 'mathematics education', "
95
+ "then rephrase that summary as a 10-year-old would explain it."
96
+ )
97
+
98
+ print("\n🔍 Starting Claude Sonnet 4 multi-tool chain test (Bedrock)")
99
+ print(f"📝 Query: {complex_query}")
100
+ print("🌊 Streaming response:\n" + "="*50)
101
+
102
+ stream = await agent.astream_chat(complex_query)
103
+
104
+ # Capture streaming deltas and tool calls
105
+ streaming_deltas = []
106
+ tool_calls_made = []
107
+ full_response = ""
108
+
109
+ async for chunk in stream.async_response_gen():
110
+ if chunk and chunk.strip():
111
+ streaming_deltas.append(chunk)
112
+ full_response += chunk
113
+ # Display each streaming delta
114
+ print(f"📡 Delta: {repr(chunk)}")
115
+
116
+ # Track tool calls in the stream
117
+ if "mult" in chunk.lower():
118
+ if "mult" not in [call["tool"] for call in tool_calls_made]:
119
+ tool_calls_made.append({"tool": "mult", "order": len(tool_calls_made) + 1})
120
+ print(f"🔧 Tool call detected: mult (#{len(tool_calls_made)})")
121
+ if "add" in chunk.lower():
122
+ if "add" not in [call["tool"] for call in tool_calls_made]:
123
+ tool_calls_made.append({"tool": "add", "order": len(tool_calls_made) + 1})
124
+ print(f"🔧 Tool call detected: add (#{len(tool_calls_made)})")
125
+ if "summarize" in chunk.lower():
126
+ if "summarize_text" not in [call["tool"] for call in tool_calls_made]:
127
+ tool_calls_made.append({"tool": "summarize_text", "order": len(tool_calls_made) + 1})
128
+ print(f"🔧 Tool call detected: summarize_text (#{len(tool_calls_made)})")
129
+ if "rephrase" in chunk.lower():
130
+ if "rephrase_text" not in [call["tool"] for call in tool_calls_made]:
131
+ tool_calls_made.append({"tool": "rephrase_text", "order": len(tool_calls_made) + 1})
132
+ print(f"🔧 Tool call detected: rephrase_text (#{len(tool_calls_made)})")
133
+
134
+ response = await stream.aget_response()
135
+
136
+ print("="*50)
137
+ print(f"✅ Streaming completed. Total deltas: {len(streaming_deltas)}")
138
+ print(f"🔧 Tool calls made: {[call['tool'] for call in tool_calls_made]}")
139
+ print(f"📄 Final response length: {len(response.response)} chars")
140
+ print(f"🎯 Final response: {response.response}")
141
+
142
+ # Validate tool usage sequence
143
+ tools_used = [call["tool"] for call in tool_calls_made]
144
+ print(f"🧪 Tools used in order: {tools_used}")
145
+
146
+ # Check that at least multiplication happened (basic requirement)
147
+ self.assertIn("mult", tools_used, f"Expected multiplication tool to be used. Tools used: {tools_used}")
148
+
149
+ # Check for mathematical results in the full response or streaming deltas
150
+ # Expected: 5*9=45, 45+13=58, 58*2=116
151
+ expected_intermediate_results = ["45", "58", "116"]
152
+ all_text = (full_response + " " + response.response).lower()
153
+ math_results_found = sum(1 for result in expected_intermediate_results
154
+ if result in all_text)
155
+
156
+ print(f"🔢 Mathematical results found: {math_results_found}/3 expected")
157
+ print(f"🔍 Full text searched: {all_text[:200]}...")
158
+
159
+ # More lenient assertion - just check that some mathematical progress was made
160
+ self.assertGreaterEqual(math_results_found, 1,
161
+ f"Expected at least 1 mathematical result. Found {math_results_found}. "
162
+ f"Full text: {all_text}")
163
+
164
+ # Verify that streaming actually produced content
165
+ self.assertGreater(len(streaming_deltas), 0, "Expected streaming deltas to be produced")
166
+ self.assertGreater(len(response.response.strip()), 0, "Expected non-empty final response")
167
+
67
168
 
68
169
  if __name__ == "__main__":
69
170
  unittest.main()
tests/test_gemini.py CHANGED
@@ -7,6 +7,7 @@ import unittest
7
7
 
8
8
  from vectara_agentic.agent import Agent
9
9
  from vectara_agentic.tools import ToolsFactory
10
+ from vectara_agentic.tools_catalog import ToolsCatalog
10
11
 
11
12
 
12
13
  import nest_asyncio
@@ -15,6 +16,7 @@ nest_asyncio.apply()
15
16
 
16
17
  from conftest import (
17
18
  mult,
19
+ add,
18
20
  fc_config_gemini,
19
21
  STANDARD_TEST_TOPIC,
20
22
  STANDARD_TEST_INSTRUCTIONS,
@@ -52,6 +54,68 @@ class TestGEMINI(unittest.TestCase):
52
54
  )
53
55
  self.assertIn("1050", res.response)
54
56
 
57
+ def test_gemini_25_flash_multi_tool_chain(self):
58
+ """Test Gemini 2.5 Flash with complex multi-step reasoning chain using multiple tools."""
59
+ # Use Gemini config (Gemini 2.5 Flash)
60
+ tools_catalog = ToolsCatalog(fc_config_gemini)
61
+ tools = [
62
+ ToolsFactory().create_tool(mult),
63
+ ToolsFactory().create_tool(add),
64
+ ToolsFactory().create_tool(tools_catalog.summarize_text),
65
+ ToolsFactory().create_tool(tools_catalog.rephrase_text),
66
+ ]
67
+
68
+ agent = Agent(
69
+ agent_config=fc_config_gemini,
70
+ tools=tools,
71
+ topic=STANDARD_TEST_TOPIC,
72
+ custom_instructions="You are a mathematical reasoning agent that explains your work step by step.",
73
+ )
74
+
75
+ # Complex multi-step reasoning task
76
+ complex_query = (
77
+ "Perform this calculation step by step: "
78
+ "First multiply 3 by 8, then add 14 to that result, "
79
+ "then multiply the new result by 3. "
80
+ "After getting the final number, summarize the entire mathematical process "
81
+ "with expertise in 'mathematics education', "
82
+ "then rephrase that summary as a 10-year-old would explain it."
83
+ )
84
+
85
+ print("\n🔍 Starting Gemini 2.5 Flash multi-tool chain test")
86
+ print(f"📝 Query: {complex_query}")
87
+
88
+ # Note: Gemini tests use synchronous chat, not async streaming
89
+ response = agent.chat(complex_query)
90
+
91
+ print(f"🎯 Final response: {response.response}")
92
+ print(f"📄 Final response length: {len(response.response)} chars")
93
+
94
+ # Check for mathematical results in the response
95
+ # Expected: 3*8=24, 24+14=38, 38*3=114
96
+ expected_intermediate_results = ["24", "38", "114"]
97
+ response_text = response.response.lower()
98
+ math_results_found = sum(1 for result in expected_intermediate_results
99
+ if result in response_text)
100
+
101
+ print(f"🔢 Mathematical results found: {math_results_found}/3 expected")
102
+ print(f"🔍 Response text searched: {response_text[:200]}...")
103
+
104
+ # More lenient assertion - just check that some mathematical progress was made
105
+ self.assertGreaterEqual(math_results_found, 1,
106
+ f"Expected at least 1 mathematical result. Found {math_results_found}. "
107
+ f"Response: {response.response}")
108
+
109
+ # Verify response has content and mentions math concepts
110
+ self.assertGreater(len(response.response.strip()), 50, "Expected substantial response content")
111
+
112
+ # Check for indications of multi-tool usage (math, summary, or explanation content)
113
+ multi_tool_indicators = ["calculate", "multiply", "add", "summary", "explain", "mathematical", "process"]
114
+ indicators_found = sum(1 for indicator in multi_tool_indicators
115
+ if indicator in response_text)
116
+ self.assertGreaterEqual(indicators_found, 3,
117
+ f"Expected multiple tool usage indicators. Found {indicators_found}: {response.response}")
118
+
55
119
 
56
120
  if __name__ == "__main__":
57
121
  unittest.main()
tests/test_groq.py CHANGED
@@ -8,6 +8,7 @@ import threading
8
8
 
9
9
  from vectara_agentic.agent import Agent
10
10
  from vectara_agentic.tools import ToolsFactory
11
+ from vectara_agentic.tools_catalog import ToolsCatalog
11
12
  from vectara_agentic.agent_config import AgentConfig
12
13
  from vectara_agentic.types import AgentType, ModelProvider
13
14
 
@@ -17,6 +18,7 @@ nest_asyncio.apply()
17
18
 
18
19
  from conftest import (
19
20
  mult,
21
+ add,
20
22
  fc_config_groq,
21
23
  STANDARD_TEST_TOPIC,
22
24
  STANDARD_TEST_INSTRUCTIONS,
@@ -67,9 +69,9 @@ class TestGROQ(unittest.IsolatedAsyncioTestCase):
67
69
  self.assertEqual(response3.response, "1050")
68
70
 
69
71
  async def test_gpt_oss_120b(self):
70
- """Test GPT-OSS-120B model with GROQ provider."""
72
+ """Test GPT-OSS-120B model with complex multi-step reasoning chain using multiple tools via GROQ."""
71
73
  with ARIZE_LOCK:
72
- # Create config specifically for GPT-OSS-120B via GROQ
74
+ # Create config for GPT-OSS-120B via GROQ
73
75
  gpt_oss_config = AgentConfig(
74
76
  agent_type=AgentType.FUNCTION_CALLING,
75
77
  main_llm_provider=ModelProvider.GROQ,
@@ -78,25 +80,208 @@ class TestGROQ(unittest.IsolatedAsyncioTestCase):
78
80
  tool_llm_model_name="openai/gpt-oss-120b",
79
81
  )
80
82
 
81
- tools = [ToolsFactory().create_tool(mult)]
83
+ # Create multiple tools for complex reasoning
84
+ tools_catalog = ToolsCatalog(gpt_oss_config)
85
+ tools = [
86
+ ToolsFactory().create_tool(mult),
87
+ ToolsFactory().create_tool(add),
88
+ ToolsFactory().create_tool(tools_catalog.summarize_text),
89
+ ToolsFactory().create_tool(tools_catalog.rephrase_text),
90
+ ]
91
+
82
92
  agent = Agent(
83
93
  agent_config=gpt_oss_config,
84
94
  tools=tools,
85
95
  topic=STANDARD_TEST_TOPIC,
86
- custom_instructions=STANDARD_TEST_INSTRUCTIONS,
96
+ custom_instructions="You are a mathematical reasoning agent that explains your work step by step.",
87
97
  )
88
98
 
89
- # Test simple multiplication: 8 * 6 = 48
90
- stream = await agent.astream_chat(
91
- "What is 8 times 6? Only give the answer, nothing else"
99
+ # Complex multi-step reasoning task
100
+ complex_query = (
101
+ "Perform this calculation step by step: "
102
+ "First multiply 7 by 8, then add 15 to that result, "
103
+ "then multiply the new result by 3. "
104
+ "After getting the final number, summarize the entire mathematical process "
105
+ "with expertise in 'mathematics education', "
106
+ "then rephrase that summary as a 10-year-old would explain it."
92
107
  )
93
- # Consume the stream
108
+
109
+ print("\n🔍 Starting GPT-OSS-120B multi-tool chain test (GROQ)")
110
+ print(f"📝 Query: {complex_query}")
111
+ print("🌊 Streaming response:\n" + "="*50)
112
+
113
+ stream = await agent.astream_chat(complex_query)
114
+
115
+ # Capture streaming deltas and tool calls
116
+ streaming_deltas = []
117
+ tool_calls_made = []
118
+ full_response = ""
119
+
94
120
  async for chunk in stream.async_response_gen():
95
- pass
121
+ if chunk and chunk.strip():
122
+ streaming_deltas.append(chunk)
123
+ full_response += chunk
124
+ # Display each streaming delta
125
+ print(f"📡 Delta: {repr(chunk)}")
126
+
127
+ # Track tool calls in the stream
128
+ if "mult" in chunk.lower():
129
+ if "mult" not in [call["tool"] for call in tool_calls_made]:
130
+ tool_calls_made.append({"tool": "mult", "order": len(tool_calls_made) + 1})
131
+ print(f"🔧 Tool call detected: mult (#{len(tool_calls_made)})")
132
+ if "add" in chunk.lower():
133
+ if "add" not in [call["tool"] for call in tool_calls_made]:
134
+ tool_calls_made.append({"tool": "add", "order": len(tool_calls_made) + 1})
135
+ print(f"🔧 Tool call detected: add (#{len(tool_calls_made)})")
136
+ if "summarize" in chunk.lower():
137
+ if "summarize_text" not in [call["tool"] for call in tool_calls_made]:
138
+ tool_calls_made.append({"tool": "summarize_text", "order": len(tool_calls_made) + 1})
139
+ print(f"🔧 Tool call detected: summarize_text (#{len(tool_calls_made)})")
140
+ if "rephrase" in chunk.lower():
141
+ if "rephrase_text" not in [call["tool"] for call in tool_calls_made]:
142
+ tool_calls_made.append({"tool": "rephrase_text", "order": len(tool_calls_made) + 1})
143
+ print(f"🔧 Tool call detected: rephrase_text (#{len(tool_calls_made)})")
144
+
145
+ response = await stream.aget_response()
146
+
147
+ print("="*50)
148
+ print(f"✅ Streaming completed. Total deltas: {len(streaming_deltas)}")
149
+ print(f"🔧 Tool calls made: {[call['tool'] for call in tool_calls_made]}")
150
+ print(f"📄 Final response length: {len(response.response)} chars")
151
+ print(f"🎯 Final response: {response.response}")
152
+
153
+ # Validate tool usage sequence
154
+ tools_used = [call["tool"] for call in tool_calls_made]
155
+ print(f"🧪 Tools used in order: {tools_used}")
156
+
157
+ # Check that at least multiplication happened (basic requirement)
158
+ self.assertIn("mult", tools_used, f"Expected multiplication tool to be used. Tools used: {tools_used}")
159
+
160
+ # Check for mathematical results in the full response or streaming deltas
161
+ expected_intermediate_results = ["56", "71", "213"]
162
+ all_text = (full_response + " " + response.response).lower()
163
+ math_results_found = sum(1 for result in expected_intermediate_results
164
+ if result in all_text)
165
+
166
+ print(f"🔢 Mathematical results found: {math_results_found}/3 expected")
167
+ print(f"🔍 Full text searched: {all_text[:200]}...")
168
+
169
+ # More lenient assertion - just check that some mathematical progress was made
170
+ self.assertGreaterEqual(math_results_found, 1,
171
+ f"Expected at least 1 mathematical result. Found {math_results_found}. "
172
+ f"Full text: {all_text}")
173
+
174
+ # Verify that streaming actually produced content
175
+ self.assertGreater(len(streaming_deltas), 0, "Expected streaming deltas to be produced")
176
+ self.assertGreater(len(response.response.strip()), 0, "Expected non-empty final response")
177
+
178
+ async def test_gpt_oss_20b(self):
179
+ """Test GPT-OSS-20B model with complex multi-step reasoning chain using multiple tools via GROQ."""
180
+ with ARIZE_LOCK:
181
+ # Create config for GPT-OSS-20B via GROQ
182
+ gpt_oss_20b_config = AgentConfig(
183
+ agent_type=AgentType.FUNCTION_CALLING,
184
+ main_llm_provider=ModelProvider.GROQ,
185
+ main_llm_model_name="openai/gpt-oss-20b",
186
+ tool_llm_provider=ModelProvider.GROQ,
187
+ tool_llm_model_name="openai/gpt-oss-20b",
188
+ )
189
+
190
+ # Create multiple tools for complex reasoning
191
+ tools_catalog = ToolsCatalog(gpt_oss_20b_config)
192
+ tools = [
193
+ ToolsFactory().create_tool(mult),
194
+ ToolsFactory().create_tool(add),
195
+ ToolsFactory().create_tool(tools_catalog.summarize_text),
196
+ ToolsFactory().create_tool(tools_catalog.rephrase_text),
197
+ ]
198
+
199
+ agent = Agent(
200
+ agent_config=gpt_oss_20b_config,
201
+ tools=tools,
202
+ topic=STANDARD_TEST_TOPIC,
203
+ custom_instructions="You are a mathematical reasoning agent that explains your work step by step.",
204
+ )
205
+
206
+ # Complex multi-step reasoning task
207
+ complex_query = (
208
+ "Perform this calculation step by step: "
209
+ "First multiply 6 by 9, then add 12 to that result, "
210
+ "then multiply the new result by 2. "
211
+ "After getting the final number, summarize the entire mathematical process "
212
+ "with expertise in 'mathematics education', "
213
+ "then rephrase that summary as a 10-year-old would explain it."
214
+ )
215
+
216
+ print("\n🔍 Starting GPT-OSS-20B multi-tool chain test (GROQ)")
217
+ print(f"📝 Query: {complex_query}")
218
+ print("🌊 Streaming response:\n" + "="*50)
219
+
220
+ stream = await agent.astream_chat(complex_query)
221
+
222
+ # Capture streaming deltas and tool calls
223
+ streaming_deltas = []
224
+ tool_calls_made = []
225
+ full_response = ""
226
+
227
+ async for chunk in stream.async_response_gen():
228
+ if chunk and chunk.strip():
229
+ streaming_deltas.append(chunk)
230
+ full_response += chunk
231
+ # Display each streaming delta
232
+ print(f"📡 Delta: {repr(chunk)}")
233
+
234
+ # Track tool calls in the stream
235
+ if "mult" in chunk.lower():
236
+ if "mult" not in [call["tool"] for call in tool_calls_made]:
237
+ tool_calls_made.append({"tool": "mult", "order": len(tool_calls_made) + 1})
238
+ print(f"🔧 Tool call detected: mult (#{len(tool_calls_made)})")
239
+ if "add" in chunk.lower():
240
+ if "add" not in [call["tool"] for call in tool_calls_made]:
241
+ tool_calls_made.append({"tool": "add", "order": len(tool_calls_made) + 1})
242
+ print(f"🔧 Tool call detected: add (#{len(tool_calls_made)})")
243
+ if "summarize" in chunk.lower():
244
+ if "summarize_text" not in [call["tool"] for call in tool_calls_made]:
245
+ tool_calls_made.append({"tool": "summarize_text", "order": len(tool_calls_made) + 1})
246
+ print(f"🔧 Tool call detected: summarize_text (#{len(tool_calls_made)})")
247
+ if "rephrase" in chunk.lower():
248
+ if "rephrase_text" not in [call["tool"] for call in tool_calls_made]:
249
+ tool_calls_made.append({"tool": "rephrase_text", "order": len(tool_calls_made) + 1})
250
+ print(f"🔧 Tool call detected: rephrase_text (#{len(tool_calls_made)})")
251
+
96
252
  response = await stream.aget_response()
97
253
 
98
- # Verify the response contains the correct answer
99
- self.assertIn("48", response.response)
254
+ print("="*50)
255
+ print(f" Streaming completed. Total deltas: {len(streaming_deltas)}")
256
+ print(f"🔧 Tool calls made: {[call['tool'] for call in tool_calls_made]}")
257
+ print(f"📄 Final response length: {len(response.response)} chars")
258
+ print(f"🎯 Final response: {response.response}")
259
+
260
+ # Validate tool usage sequence
261
+ tools_used = [call["tool"] for call in tool_calls_made]
262
+ print(f"🧪 Tools used in order: {tools_used}")
263
+
264
+ # Check that at least multiplication happened (basic requirement)
265
+ self.assertIn("mult", tools_used, f"Expected multiplication tool to be used. Tools used: {tools_used}")
266
+
267
+ # Check for mathematical results in the full response or streaming deltas
268
+ # Expected: 6*9=54, 54+12=66, 66*2=132
269
+ expected_intermediate_results = ["54", "66", "132"]
270
+ all_text = (full_response + " " + response.response).lower()
271
+ math_results_found = sum(1 for result in expected_intermediate_results
272
+ if result in all_text)
273
+
274
+ print(f"🔢 Mathematical results found: {math_results_found}/3 expected")
275
+ print(f"🔍 Full text searched: {all_text[:200]}...")
276
+
277
+ # More lenient assertion - just check that some mathematical progress was made
278
+ self.assertGreaterEqual(math_results_found, 1,
279
+ f"Expected at least 1 mathematical result. Found {math_results_found}. "
280
+ f"Full text: {all_text}")
281
+
282
+ # Verify that streaming actually produced content
283
+ self.assertGreater(len(streaming_deltas), 0, "Expected streaming deltas to be produced")
284
+ self.assertGreater(len(response.response.strip()), 0, "Expected non-empty final response")
100
285
 
101
286
 
102
287
  if __name__ == "__main__":
tests/test_openai.py CHANGED
@@ -8,6 +8,7 @@ import threading
8
8
 
9
9
  from vectara_agentic.agent import Agent
10
10
  from vectara_agentic.tools import ToolsFactory
11
+ from vectara_agentic.tools_catalog import ToolsCatalog
11
12
  from vectara_agentic.agent_config import AgentConfig
12
13
  from vectara_agentic.types import AgentType, ModelProvider
13
14
 
@@ -18,6 +19,7 @@ nest_asyncio.apply()
18
19
  from conftest import (
19
20
  fc_config_openai,
20
21
  mult,
22
+ add,
21
23
  STANDARD_TEST_TOPIC,
22
24
  STANDARD_TEST_INSTRUCTIONS,
23
25
  )
@@ -155,6 +157,105 @@ class TestOpenAI(unittest.IsolatedAsyncioTestCase):
155
157
 
156
158
  self.assertIn("25", response.response)
157
159
 
160
+ async def test_gpt_41_mini_multi_tool_chain(self):
161
+ """Test GPT-4.1-mini with complex multi-step reasoning chain using multiple tools."""
162
+ with ARIZE_LOCK:
163
+ # Use default OpenAI config (gpt-4.1-mini)
164
+ tools_catalog = ToolsCatalog(fc_config_openai)
165
+ tools = [
166
+ ToolsFactory().create_tool(mult),
167
+ ToolsFactory().create_tool(add),
168
+ ToolsFactory().create_tool(tools_catalog.summarize_text),
169
+ ToolsFactory().create_tool(tools_catalog.rephrase_text),
170
+ ]
171
+
172
+ agent = Agent(
173
+ agent_config=fc_config_openai,
174
+ tools=tools,
175
+ topic=STANDARD_TEST_TOPIC,
176
+ custom_instructions="You are a mathematical reasoning agent that explains your work step by step.",
177
+ )
178
+
179
+ # Complex multi-step reasoning task
180
+ complex_query = (
181
+ "Perform this calculation step by step: "
182
+ "First multiply 4 by 7, then add 18 to that result, "
183
+ "then multiply the new result by 2. "
184
+ "After getting the final number, summarize the entire mathematical process "
185
+ "with expertise in 'mathematics education', "
186
+ "then rephrase that summary as a 10-year-old would explain it."
187
+ )
188
+
189
+ print("\n🔍 Starting GPT-4.1-mini multi-tool chain test (OpenAI)")
190
+ print(f"📝 Query: {complex_query}")
191
+ print("🌊 Streaming response:\n" + "="*50)
192
+
193
+ stream = await agent.astream_chat(complex_query)
194
+
195
+ # Capture streaming deltas and tool calls
196
+ streaming_deltas = []
197
+ tool_calls_made = []
198
+ full_response = ""
199
+
200
+ async for chunk in stream.async_response_gen():
201
+ if chunk and chunk.strip():
202
+ streaming_deltas.append(chunk)
203
+ full_response += chunk
204
+ # Display each streaming delta
205
+ print(f"📡 Delta: {repr(chunk)}")
206
+
207
+ # Track tool calls in the stream
208
+ if "mult" in chunk.lower():
209
+ if "mult" not in [call["tool"] for call in tool_calls_made]:
210
+ tool_calls_made.append({"tool": "mult", "order": len(tool_calls_made) + 1})
211
+ print(f"🔧 Tool call detected: mult (#{len(tool_calls_made)})")
212
+ if "add" in chunk.lower():
213
+ if "add" not in [call["tool"] for call in tool_calls_made]:
214
+ tool_calls_made.append({"tool": "add", "order": len(tool_calls_made) + 1})
215
+ print(f"🔧 Tool call detected: add (#{len(tool_calls_made)})")
216
+ if "summarize" in chunk.lower():
217
+ if "summarize_text" not in [call["tool"] for call in tool_calls_made]:
218
+ tool_calls_made.append({"tool": "summarize_text", "order": len(tool_calls_made) + 1})
219
+ print(f"🔧 Tool call detected: summarize_text (#{len(tool_calls_made)})")
220
+ if "rephrase" in chunk.lower():
221
+ if "rephrase_text" not in [call["tool"] for call in tool_calls_made]:
222
+ tool_calls_made.append({"tool": "rephrase_text", "order": len(tool_calls_made) + 1})
223
+ print(f"🔧 Tool call detected: rephrase_text (#{len(tool_calls_made)})")
224
+
225
+ response = await stream.aget_response()
226
+
227
+ print("="*50)
228
+ print(f"✅ Streaming completed. Total deltas: {len(streaming_deltas)}")
229
+ print(f"🔧 Tool calls made: {[call['tool'] for call in tool_calls_made]}")
230
+ print(f"📄 Final response length: {len(response.response)} chars")
231
+ print(f"🎯 Final response: {response.response}")
232
+
233
+ # Validate tool usage sequence
234
+ tools_used = [call["tool"] for call in tool_calls_made]
235
+ print(f"🧪 Tools used in order: {tools_used}")
236
+
237
+ # Check that at least multiplication happened (basic requirement)
238
+ self.assertIn("mult", tools_used, f"Expected multiplication tool to be used. Tools used: {tools_used}")
239
+
240
+ # Check for mathematical results in the full response or streaming deltas
241
+ # Expected: 4*7=28, 28+18=46, 46*2=92
242
+ expected_intermediate_results = ["28", "46", "92"]
243
+ all_text = (full_response + " " + response.response).lower()
244
+ math_results_found = sum(1 for result in expected_intermediate_results
245
+ if result in all_text)
246
+
247
+ print(f"🔢 Mathematical results found: {math_results_found}/3 expected")
248
+ print(f"🔍 Full text searched: {all_text[:200]}...")
249
+
250
+ # More lenient assertion - just check that some mathematical progress was made
251
+ self.assertGreaterEqual(math_results_found, 1,
252
+ f"Expected at least 1 mathematical result. Found {math_results_found}. "
253
+ f"Full text: {all_text}")
254
+
255
+ # Verify that streaming actually produced content
256
+ self.assertGreater(len(streaming_deltas), 0, "Expected streaming deltas to be produced")
257
+ self.assertGreater(len(response.response.strip()), 0, "Expected non-empty final response")
258
+
158
259
 
159
260
  if __name__ == "__main__":
160
261
  unittest.main()