vectara-agentic: vectara_agentic-0.4.7-py3-none-any.whl → vectara_agentic-0.4.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of vectara-agentic might be problematic. Click here for more details.
- tests/test_bedrock.py +101 -0
- tests/test_gemini.py +64 -0
- tests/test_groq.py +196 -11
- tests/test_openai.py +101 -0
- vectara_agentic/_version.py +1 -1
- vectara_agentic/agent.py +1 -1
- vectara_agentic/agent_core/prompts.py +1 -0
- vectara_agentic/agent_core/streaming.py +176 -194
- vectara_agentic/llm_utils.py +1 -1
- vectara_agentic/sub_query_workflow.py +31 -31
- vectara_agentic/tools.py +0 -2
- {vectara_agentic-0.4.7.dist-info → vectara_agentic-0.4.8.dist-info}/METADATA +31 -30
- {vectara_agentic-0.4.7.dist-info → vectara_agentic-0.4.8.dist-info}/RECORD +16 -16
- {vectara_agentic-0.4.7.dist-info → vectara_agentic-0.4.8.dist-info}/WHEEL +0 -0
- {vectara_agentic-0.4.7.dist-info → vectara_agentic-0.4.8.dist-info}/licenses/LICENSE +0 -0
- {vectara_agentic-0.4.7.dist-info → vectara_agentic-0.4.8.dist-info}/top_level.txt +0 -0
tests/test_bedrock.py
CHANGED
|
@@ -8,6 +8,7 @@ import threading
|
|
|
8
8
|
|
|
9
9
|
from vectara_agentic.agent import Agent
|
|
10
10
|
from vectara_agentic.tools import ToolsFactory
|
|
11
|
+
from vectara_agentic.tools_catalog import ToolsCatalog
|
|
11
12
|
|
|
12
13
|
import nest_asyncio
|
|
13
14
|
|
|
@@ -15,6 +16,7 @@ nest_asyncio.apply()
|
|
|
15
16
|
|
|
16
17
|
from conftest import (
|
|
17
18
|
mult,
|
|
19
|
+
add,
|
|
18
20
|
fc_config_bedrock,
|
|
19
21
|
STANDARD_TEST_TOPIC,
|
|
20
22
|
STANDARD_TEST_INSTRUCTIONS,
|
|
@@ -64,6 +66,105 @@ class TestBedrock(unittest.IsolatedAsyncioTestCase):
|
|
|
64
66
|
|
|
65
67
|
self.assertEqual(response3.response, "1050")
|
|
66
68
|
|
|
69
|
+
async def test_claude_sonnet_4_multi_tool_chain(self):
|
|
70
|
+
"""Test Claude Sonnet 4 with complex multi-step reasoning chain using multiple tools via Bedrock."""
|
|
71
|
+
with ARIZE_LOCK:
|
|
72
|
+
# Use Bedrock config (Claude Sonnet 4)
|
|
73
|
+
tools_catalog = ToolsCatalog(fc_config_bedrock)
|
|
74
|
+
tools = [
|
|
75
|
+
ToolsFactory().create_tool(mult),
|
|
76
|
+
ToolsFactory().create_tool(add),
|
|
77
|
+
ToolsFactory().create_tool(tools_catalog.summarize_text),
|
|
78
|
+
ToolsFactory().create_tool(tools_catalog.rephrase_text),
|
|
79
|
+
]
|
|
80
|
+
|
|
81
|
+
agent = Agent(
|
|
82
|
+
agent_config=fc_config_bedrock,
|
|
83
|
+
tools=tools,
|
|
84
|
+
topic=STANDARD_TEST_TOPIC,
|
|
85
|
+
custom_instructions="You are a mathematical reasoning agent that explains your work step by step.",
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
# Complex multi-step reasoning task
|
|
89
|
+
complex_query = (
|
|
90
|
+
"Perform this calculation step by step: "
|
|
91
|
+
"First multiply 5 by 9, then add 13 to that result, "
|
|
92
|
+
"then multiply the new result by 2. "
|
|
93
|
+
"After getting the final number, summarize the entire mathematical process "
|
|
94
|
+
"with expertise in 'mathematics education', "
|
|
95
|
+
"then rephrase that summary as a 10-year-old would explain it."
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
print("\n🔍 Starting Claude Sonnet 4 multi-tool chain test (Bedrock)")
|
|
99
|
+
print(f"📝 Query: {complex_query}")
|
|
100
|
+
print("🌊 Streaming response:\n" + "="*50)
|
|
101
|
+
|
|
102
|
+
stream = await agent.astream_chat(complex_query)
|
|
103
|
+
|
|
104
|
+
# Capture streaming deltas and tool calls
|
|
105
|
+
streaming_deltas = []
|
|
106
|
+
tool_calls_made = []
|
|
107
|
+
full_response = ""
|
|
108
|
+
|
|
109
|
+
async for chunk in stream.async_response_gen():
|
|
110
|
+
if chunk and chunk.strip():
|
|
111
|
+
streaming_deltas.append(chunk)
|
|
112
|
+
full_response += chunk
|
|
113
|
+
# Display each streaming delta
|
|
114
|
+
print(f"📡 Delta: {repr(chunk)}")
|
|
115
|
+
|
|
116
|
+
# Track tool calls in the stream
|
|
117
|
+
if "mult" in chunk.lower():
|
|
118
|
+
if "mult" not in [call["tool"] for call in tool_calls_made]:
|
|
119
|
+
tool_calls_made.append({"tool": "mult", "order": len(tool_calls_made) + 1})
|
|
120
|
+
print(f"🔧 Tool call detected: mult (#{len(tool_calls_made)})")
|
|
121
|
+
if "add" in chunk.lower():
|
|
122
|
+
if "add" not in [call["tool"] for call in tool_calls_made]:
|
|
123
|
+
tool_calls_made.append({"tool": "add", "order": len(tool_calls_made) + 1})
|
|
124
|
+
print(f"🔧 Tool call detected: add (#{len(tool_calls_made)})")
|
|
125
|
+
if "summarize" in chunk.lower():
|
|
126
|
+
if "summarize_text" not in [call["tool"] for call in tool_calls_made]:
|
|
127
|
+
tool_calls_made.append({"tool": "summarize_text", "order": len(tool_calls_made) + 1})
|
|
128
|
+
print(f"🔧 Tool call detected: summarize_text (#{len(tool_calls_made)})")
|
|
129
|
+
if "rephrase" in chunk.lower():
|
|
130
|
+
if "rephrase_text" not in [call["tool"] for call in tool_calls_made]:
|
|
131
|
+
tool_calls_made.append({"tool": "rephrase_text", "order": len(tool_calls_made) + 1})
|
|
132
|
+
print(f"🔧 Tool call detected: rephrase_text (#{len(tool_calls_made)})")
|
|
133
|
+
|
|
134
|
+
response = await stream.aget_response()
|
|
135
|
+
|
|
136
|
+
print("="*50)
|
|
137
|
+
print(f"✅ Streaming completed. Total deltas: {len(streaming_deltas)}")
|
|
138
|
+
print(f"🔧 Tool calls made: {[call['tool'] for call in tool_calls_made]}")
|
|
139
|
+
print(f"📄 Final response length: {len(response.response)} chars")
|
|
140
|
+
print(f"🎯 Final response: {response.response}")
|
|
141
|
+
|
|
142
|
+
# Validate tool usage sequence
|
|
143
|
+
tools_used = [call["tool"] for call in tool_calls_made]
|
|
144
|
+
print(f"🧪 Tools used in order: {tools_used}")
|
|
145
|
+
|
|
146
|
+
# Check that at least multiplication happened (basic requirement)
|
|
147
|
+
self.assertIn("mult", tools_used, f"Expected multiplication tool to be used. Tools used: {tools_used}")
|
|
148
|
+
|
|
149
|
+
# Check for mathematical results in the full response or streaming deltas
|
|
150
|
+
# Expected: 5*9=45, 45+13=58, 58*2=116
|
|
151
|
+
expected_intermediate_results = ["45", "58", "116"]
|
|
152
|
+
all_text = (full_response + " " + response.response).lower()
|
|
153
|
+
math_results_found = sum(1 for result in expected_intermediate_results
|
|
154
|
+
if result in all_text)
|
|
155
|
+
|
|
156
|
+
print(f"🔢 Mathematical results found: {math_results_found}/3 expected")
|
|
157
|
+
print(f"🔍 Full text searched: {all_text[:200]}...")
|
|
158
|
+
|
|
159
|
+
# More lenient assertion - just check that some mathematical progress was made
|
|
160
|
+
self.assertGreaterEqual(math_results_found, 1,
|
|
161
|
+
f"Expected at least 1 mathematical result. Found {math_results_found}. "
|
|
162
|
+
f"Full text: {all_text}")
|
|
163
|
+
|
|
164
|
+
# Verify that streaming actually produced content
|
|
165
|
+
self.assertGreater(len(streaming_deltas), 0, "Expected streaming deltas to be produced")
|
|
166
|
+
self.assertGreater(len(response.response.strip()), 0, "Expected non-empty final response")
|
|
167
|
+
|
|
67
168
|
|
|
68
169
|
if __name__ == "__main__":
|
|
69
170
|
unittest.main()
|
tests/test_gemini.py
CHANGED
|
@@ -7,6 +7,7 @@ import unittest
|
|
|
7
7
|
|
|
8
8
|
from vectara_agentic.agent import Agent
|
|
9
9
|
from vectara_agentic.tools import ToolsFactory
|
|
10
|
+
from vectara_agentic.tools_catalog import ToolsCatalog
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
import nest_asyncio
|
|
@@ -15,6 +16,7 @@ nest_asyncio.apply()
|
|
|
15
16
|
|
|
16
17
|
from conftest import (
|
|
17
18
|
mult,
|
|
19
|
+
add,
|
|
18
20
|
fc_config_gemini,
|
|
19
21
|
STANDARD_TEST_TOPIC,
|
|
20
22
|
STANDARD_TEST_INSTRUCTIONS,
|
|
@@ -52,6 +54,68 @@ class TestGEMINI(unittest.TestCase):
|
|
|
52
54
|
)
|
|
53
55
|
self.assertIn("1050", res.response)
|
|
54
56
|
|
|
57
|
+
def test_gemini_25_flash_multi_tool_chain(self):
|
|
58
|
+
"""Test Gemini 2.5 Flash with complex multi-step reasoning chain using multiple tools."""
|
|
59
|
+
# Use Gemini config (Gemini 2.5 Flash)
|
|
60
|
+
tools_catalog = ToolsCatalog(fc_config_gemini)
|
|
61
|
+
tools = [
|
|
62
|
+
ToolsFactory().create_tool(mult),
|
|
63
|
+
ToolsFactory().create_tool(add),
|
|
64
|
+
ToolsFactory().create_tool(tools_catalog.summarize_text),
|
|
65
|
+
ToolsFactory().create_tool(tools_catalog.rephrase_text),
|
|
66
|
+
]
|
|
67
|
+
|
|
68
|
+
agent = Agent(
|
|
69
|
+
agent_config=fc_config_gemini,
|
|
70
|
+
tools=tools,
|
|
71
|
+
topic=STANDARD_TEST_TOPIC,
|
|
72
|
+
custom_instructions="You are a mathematical reasoning agent that explains your work step by step.",
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
# Complex multi-step reasoning task
|
|
76
|
+
complex_query = (
|
|
77
|
+
"Perform this calculation step by step: "
|
|
78
|
+
"First multiply 3 by 8, then add 14 to that result, "
|
|
79
|
+
"then multiply the new result by 3. "
|
|
80
|
+
"After getting the final number, summarize the entire mathematical process "
|
|
81
|
+
"with expertise in 'mathematics education', "
|
|
82
|
+
"then rephrase that summary as a 10-year-old would explain it."
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
print("\n🔍 Starting Gemini 2.5 Flash multi-tool chain test")
|
|
86
|
+
print(f"📝 Query: {complex_query}")
|
|
87
|
+
|
|
88
|
+
# Note: Gemini tests use synchronous chat, not async streaming
|
|
89
|
+
response = agent.chat(complex_query)
|
|
90
|
+
|
|
91
|
+
print(f"🎯 Final response: {response.response}")
|
|
92
|
+
print(f"📄 Final response length: {len(response.response)} chars")
|
|
93
|
+
|
|
94
|
+
# Check for mathematical results in the response
|
|
95
|
+
# Expected: 3*8=24, 24+14=38, 38*3=114
|
|
96
|
+
expected_intermediate_results = ["24", "38", "114"]
|
|
97
|
+
response_text = response.response.lower()
|
|
98
|
+
math_results_found = sum(1 for result in expected_intermediate_results
|
|
99
|
+
if result in response_text)
|
|
100
|
+
|
|
101
|
+
print(f"🔢 Mathematical results found: {math_results_found}/3 expected")
|
|
102
|
+
print(f"🔍 Response text searched: {response_text[:200]}...")
|
|
103
|
+
|
|
104
|
+
# More lenient assertion - just check that some mathematical progress was made
|
|
105
|
+
self.assertGreaterEqual(math_results_found, 1,
|
|
106
|
+
f"Expected at least 1 mathematical result. Found {math_results_found}. "
|
|
107
|
+
f"Response: {response.response}")
|
|
108
|
+
|
|
109
|
+
# Verify response has content and mentions math concepts
|
|
110
|
+
self.assertGreater(len(response.response.strip()), 50, "Expected substantial response content")
|
|
111
|
+
|
|
112
|
+
# Check for indications of multi-tool usage (math, summary, or explanation content)
|
|
113
|
+
multi_tool_indicators = ["calculate", "multiply", "add", "summary", "explain", "mathematical", "process"]
|
|
114
|
+
indicators_found = sum(1 for indicator in multi_tool_indicators
|
|
115
|
+
if indicator in response_text)
|
|
116
|
+
self.assertGreaterEqual(indicators_found, 3,
|
|
117
|
+
f"Expected multiple tool usage indicators. Found {indicators_found}: {response.response}")
|
|
118
|
+
|
|
55
119
|
|
|
56
120
|
if __name__ == "__main__":
|
|
57
121
|
unittest.main()
|
tests/test_groq.py
CHANGED
|
@@ -8,6 +8,7 @@ import threading
|
|
|
8
8
|
|
|
9
9
|
from vectara_agentic.agent import Agent
|
|
10
10
|
from vectara_agentic.tools import ToolsFactory
|
|
11
|
+
from vectara_agentic.tools_catalog import ToolsCatalog
|
|
11
12
|
from vectara_agentic.agent_config import AgentConfig
|
|
12
13
|
from vectara_agentic.types import AgentType, ModelProvider
|
|
13
14
|
|
|
@@ -17,6 +18,7 @@ nest_asyncio.apply()
|
|
|
17
18
|
|
|
18
19
|
from conftest import (
|
|
19
20
|
mult,
|
|
21
|
+
add,
|
|
20
22
|
fc_config_groq,
|
|
21
23
|
STANDARD_TEST_TOPIC,
|
|
22
24
|
STANDARD_TEST_INSTRUCTIONS,
|
|
@@ -67,9 +69,9 @@ class TestGROQ(unittest.IsolatedAsyncioTestCase):
|
|
|
67
69
|
self.assertEqual(response3.response, "1050")
|
|
68
70
|
|
|
69
71
|
async def test_gpt_oss_120b(self):
|
|
70
|
-
"""Test GPT-OSS-120B model with GROQ
|
|
72
|
+
"""Test GPT-OSS-120B model with complex multi-step reasoning chain using multiple tools via GROQ."""
|
|
71
73
|
with ARIZE_LOCK:
|
|
72
|
-
# Create config
|
|
74
|
+
# Create config for GPT-OSS-120B via GROQ
|
|
73
75
|
gpt_oss_config = AgentConfig(
|
|
74
76
|
agent_type=AgentType.FUNCTION_CALLING,
|
|
75
77
|
main_llm_provider=ModelProvider.GROQ,
|
|
@@ -78,25 +80,208 @@ class TestGROQ(unittest.IsolatedAsyncioTestCase):
|
|
|
78
80
|
tool_llm_model_name="openai/gpt-oss-120b",
|
|
79
81
|
)
|
|
80
82
|
|
|
81
|
-
tools
|
|
83
|
+
# Create multiple tools for complex reasoning
|
|
84
|
+
tools_catalog = ToolsCatalog(gpt_oss_config)
|
|
85
|
+
tools = [
|
|
86
|
+
ToolsFactory().create_tool(mult),
|
|
87
|
+
ToolsFactory().create_tool(add),
|
|
88
|
+
ToolsFactory().create_tool(tools_catalog.summarize_text),
|
|
89
|
+
ToolsFactory().create_tool(tools_catalog.rephrase_text),
|
|
90
|
+
]
|
|
91
|
+
|
|
82
92
|
agent = Agent(
|
|
83
93
|
agent_config=gpt_oss_config,
|
|
84
94
|
tools=tools,
|
|
85
95
|
topic=STANDARD_TEST_TOPIC,
|
|
86
|
-
custom_instructions=
|
|
96
|
+
custom_instructions="You are a mathematical reasoning agent that explains your work step by step.",
|
|
87
97
|
)
|
|
88
98
|
|
|
89
|
-
#
|
|
90
|
-
|
|
91
|
-
"
|
|
99
|
+
# Complex multi-step reasoning task
|
|
100
|
+
complex_query = (
|
|
101
|
+
"Perform this calculation step by step: "
|
|
102
|
+
"First multiply 7 by 8, then add 15 to that result, "
|
|
103
|
+
"then multiply the new result by 3. "
|
|
104
|
+
"After getting the final number, summarize the entire mathematical process "
|
|
105
|
+
"with expertise in 'mathematics education', "
|
|
106
|
+
"then rephrase that summary as a 10-year-old would explain it."
|
|
92
107
|
)
|
|
93
|
-
|
|
108
|
+
|
|
109
|
+
print("\n🔍 Starting GPT-OSS-120B multi-tool chain test (GROQ)")
|
|
110
|
+
print(f"📝 Query: {complex_query}")
|
|
111
|
+
print("🌊 Streaming response:\n" + "="*50)
|
|
112
|
+
|
|
113
|
+
stream = await agent.astream_chat(complex_query)
|
|
114
|
+
|
|
115
|
+
# Capture streaming deltas and tool calls
|
|
116
|
+
streaming_deltas = []
|
|
117
|
+
tool_calls_made = []
|
|
118
|
+
full_response = ""
|
|
119
|
+
|
|
94
120
|
async for chunk in stream.async_response_gen():
|
|
95
|
-
|
|
121
|
+
if chunk and chunk.strip():
|
|
122
|
+
streaming_deltas.append(chunk)
|
|
123
|
+
full_response += chunk
|
|
124
|
+
# Display each streaming delta
|
|
125
|
+
print(f"📡 Delta: {repr(chunk)}")
|
|
126
|
+
|
|
127
|
+
# Track tool calls in the stream
|
|
128
|
+
if "mult" in chunk.lower():
|
|
129
|
+
if "mult" not in [call["tool"] for call in tool_calls_made]:
|
|
130
|
+
tool_calls_made.append({"tool": "mult", "order": len(tool_calls_made) + 1})
|
|
131
|
+
print(f"🔧 Tool call detected: mult (#{len(tool_calls_made)})")
|
|
132
|
+
if "add" in chunk.lower():
|
|
133
|
+
if "add" not in [call["tool"] for call in tool_calls_made]:
|
|
134
|
+
tool_calls_made.append({"tool": "add", "order": len(tool_calls_made) + 1})
|
|
135
|
+
print(f"🔧 Tool call detected: add (#{len(tool_calls_made)})")
|
|
136
|
+
if "summarize" in chunk.lower():
|
|
137
|
+
if "summarize_text" not in [call["tool"] for call in tool_calls_made]:
|
|
138
|
+
tool_calls_made.append({"tool": "summarize_text", "order": len(tool_calls_made) + 1})
|
|
139
|
+
print(f"🔧 Tool call detected: summarize_text (#{len(tool_calls_made)})")
|
|
140
|
+
if "rephrase" in chunk.lower():
|
|
141
|
+
if "rephrase_text" not in [call["tool"] for call in tool_calls_made]:
|
|
142
|
+
tool_calls_made.append({"tool": "rephrase_text", "order": len(tool_calls_made) + 1})
|
|
143
|
+
print(f"🔧 Tool call detected: rephrase_text (#{len(tool_calls_made)})")
|
|
144
|
+
|
|
145
|
+
response = await stream.aget_response()
|
|
146
|
+
|
|
147
|
+
print("="*50)
|
|
148
|
+
print(f"✅ Streaming completed. Total deltas: {len(streaming_deltas)}")
|
|
149
|
+
print(f"🔧 Tool calls made: {[call['tool'] for call in tool_calls_made]}")
|
|
150
|
+
print(f"📄 Final response length: {len(response.response)} chars")
|
|
151
|
+
print(f"🎯 Final response: {response.response}")
|
|
152
|
+
|
|
153
|
+
# Validate tool usage sequence
|
|
154
|
+
tools_used = [call["tool"] for call in tool_calls_made]
|
|
155
|
+
print(f"🧪 Tools used in order: {tools_used}")
|
|
156
|
+
|
|
157
|
+
# Check that at least multiplication happened (basic requirement)
|
|
158
|
+
self.assertIn("mult", tools_used, f"Expected multiplication tool to be used. Tools used: {tools_used}")
|
|
159
|
+
|
|
160
|
+
# Check for mathematical results in the full response or streaming deltas
|
|
161
|
+
expected_intermediate_results = ["56", "71", "213"]
|
|
162
|
+
all_text = (full_response + " " + response.response).lower()
|
|
163
|
+
math_results_found = sum(1 for result in expected_intermediate_results
|
|
164
|
+
if result in all_text)
|
|
165
|
+
|
|
166
|
+
print(f"🔢 Mathematical results found: {math_results_found}/3 expected")
|
|
167
|
+
print(f"🔍 Full text searched: {all_text[:200]}...")
|
|
168
|
+
|
|
169
|
+
# More lenient assertion - just check that some mathematical progress was made
|
|
170
|
+
self.assertGreaterEqual(math_results_found, 1,
|
|
171
|
+
f"Expected at least 1 mathematical result. Found {math_results_found}. "
|
|
172
|
+
f"Full text: {all_text}")
|
|
173
|
+
|
|
174
|
+
# Verify that streaming actually produced content
|
|
175
|
+
self.assertGreater(len(streaming_deltas), 0, "Expected streaming deltas to be produced")
|
|
176
|
+
self.assertGreater(len(response.response.strip()), 0, "Expected non-empty final response")
|
|
177
|
+
|
|
178
|
+
async def test_gpt_oss_20b(self):
|
|
179
|
+
"""Test GPT-OSS-20B model with complex multi-step reasoning chain using multiple tools via GROQ."""
|
|
180
|
+
with ARIZE_LOCK:
|
|
181
|
+
# Create config for GPT-OSS-20B via GROQ
|
|
182
|
+
gpt_oss_20b_config = AgentConfig(
|
|
183
|
+
agent_type=AgentType.FUNCTION_CALLING,
|
|
184
|
+
main_llm_provider=ModelProvider.GROQ,
|
|
185
|
+
main_llm_model_name="openai/gpt-oss-20b",
|
|
186
|
+
tool_llm_provider=ModelProvider.GROQ,
|
|
187
|
+
tool_llm_model_name="openai/gpt-oss-20b",
|
|
188
|
+
)
|
|
189
|
+
|
|
190
|
+
# Create multiple tools for complex reasoning
|
|
191
|
+
tools_catalog = ToolsCatalog(gpt_oss_20b_config)
|
|
192
|
+
tools = [
|
|
193
|
+
ToolsFactory().create_tool(mult),
|
|
194
|
+
ToolsFactory().create_tool(add),
|
|
195
|
+
ToolsFactory().create_tool(tools_catalog.summarize_text),
|
|
196
|
+
ToolsFactory().create_tool(tools_catalog.rephrase_text),
|
|
197
|
+
]
|
|
198
|
+
|
|
199
|
+
agent = Agent(
|
|
200
|
+
agent_config=gpt_oss_20b_config,
|
|
201
|
+
tools=tools,
|
|
202
|
+
topic=STANDARD_TEST_TOPIC,
|
|
203
|
+
custom_instructions="You are a mathematical reasoning agent that explains your work step by step.",
|
|
204
|
+
)
|
|
205
|
+
|
|
206
|
+
# Complex multi-step reasoning task
|
|
207
|
+
complex_query = (
|
|
208
|
+
"Perform this calculation step by step: "
|
|
209
|
+
"First multiply 6 by 9, then add 12 to that result, "
|
|
210
|
+
"then multiply the new result by 2. "
|
|
211
|
+
"After getting the final number, summarize the entire mathematical process "
|
|
212
|
+
"with expertise in 'mathematics education', "
|
|
213
|
+
"then rephrase that summary as a 10-year-old would explain it."
|
|
214
|
+
)
|
|
215
|
+
|
|
216
|
+
print("\n🔍 Starting GPT-OSS-20B multi-tool chain test (GROQ)")
|
|
217
|
+
print(f"📝 Query: {complex_query}")
|
|
218
|
+
print("🌊 Streaming response:\n" + "="*50)
|
|
219
|
+
|
|
220
|
+
stream = await agent.astream_chat(complex_query)
|
|
221
|
+
|
|
222
|
+
# Capture streaming deltas and tool calls
|
|
223
|
+
streaming_deltas = []
|
|
224
|
+
tool_calls_made = []
|
|
225
|
+
full_response = ""
|
|
226
|
+
|
|
227
|
+
async for chunk in stream.async_response_gen():
|
|
228
|
+
if chunk and chunk.strip():
|
|
229
|
+
streaming_deltas.append(chunk)
|
|
230
|
+
full_response += chunk
|
|
231
|
+
# Display each streaming delta
|
|
232
|
+
print(f"📡 Delta: {repr(chunk)}")
|
|
233
|
+
|
|
234
|
+
# Track tool calls in the stream
|
|
235
|
+
if "mult" in chunk.lower():
|
|
236
|
+
if "mult" not in [call["tool"] for call in tool_calls_made]:
|
|
237
|
+
tool_calls_made.append({"tool": "mult", "order": len(tool_calls_made) + 1})
|
|
238
|
+
print(f"🔧 Tool call detected: mult (#{len(tool_calls_made)})")
|
|
239
|
+
if "add" in chunk.lower():
|
|
240
|
+
if "add" not in [call["tool"] for call in tool_calls_made]:
|
|
241
|
+
tool_calls_made.append({"tool": "add", "order": len(tool_calls_made) + 1})
|
|
242
|
+
print(f"🔧 Tool call detected: add (#{len(tool_calls_made)})")
|
|
243
|
+
if "summarize" in chunk.lower():
|
|
244
|
+
if "summarize_text" not in [call["tool"] for call in tool_calls_made]:
|
|
245
|
+
tool_calls_made.append({"tool": "summarize_text", "order": len(tool_calls_made) + 1})
|
|
246
|
+
print(f"🔧 Tool call detected: summarize_text (#{len(tool_calls_made)})")
|
|
247
|
+
if "rephrase" in chunk.lower():
|
|
248
|
+
if "rephrase_text" not in [call["tool"] for call in tool_calls_made]:
|
|
249
|
+
tool_calls_made.append({"tool": "rephrase_text", "order": len(tool_calls_made) + 1})
|
|
250
|
+
print(f"🔧 Tool call detected: rephrase_text (#{len(tool_calls_made)})")
|
|
251
|
+
|
|
96
252
|
response = await stream.aget_response()
|
|
97
253
|
|
|
98
|
-
|
|
99
|
-
|
|
254
|
+
print("="*50)
|
|
255
|
+
print(f"✅ Streaming completed. Total deltas: {len(streaming_deltas)}")
|
|
256
|
+
print(f"🔧 Tool calls made: {[call['tool'] for call in tool_calls_made]}")
|
|
257
|
+
print(f"📄 Final response length: {len(response.response)} chars")
|
|
258
|
+
print(f"🎯 Final response: {response.response}")
|
|
259
|
+
|
|
260
|
+
# Validate tool usage sequence
|
|
261
|
+
tools_used = [call["tool"] for call in tool_calls_made]
|
|
262
|
+
print(f"🧪 Tools used in order: {tools_used}")
|
|
263
|
+
|
|
264
|
+
# Check that at least multiplication happened (basic requirement)
|
|
265
|
+
self.assertIn("mult", tools_used, f"Expected multiplication tool to be used. Tools used: {tools_used}")
|
|
266
|
+
|
|
267
|
+
# Check for mathematical results in the full response or streaming deltas
|
|
268
|
+
# Expected: 6*9=54, 54+12=66, 66*2=132
|
|
269
|
+
expected_intermediate_results = ["54", "66", "132"]
|
|
270
|
+
all_text = (full_response + " " + response.response).lower()
|
|
271
|
+
math_results_found = sum(1 for result in expected_intermediate_results
|
|
272
|
+
if result in all_text)
|
|
273
|
+
|
|
274
|
+
print(f"🔢 Mathematical results found: {math_results_found}/3 expected")
|
|
275
|
+
print(f"🔍 Full text searched: {all_text[:200]}...")
|
|
276
|
+
|
|
277
|
+
# More lenient assertion - just check that some mathematical progress was made
|
|
278
|
+
self.assertGreaterEqual(math_results_found, 1,
|
|
279
|
+
f"Expected at least 1 mathematical result. Found {math_results_found}. "
|
|
280
|
+
f"Full text: {all_text}")
|
|
281
|
+
|
|
282
|
+
# Verify that streaming actually produced content
|
|
283
|
+
self.assertGreater(len(streaming_deltas), 0, "Expected streaming deltas to be produced")
|
|
284
|
+
self.assertGreater(len(response.response.strip()), 0, "Expected non-empty final response")
|
|
100
285
|
|
|
101
286
|
|
|
102
287
|
if __name__ == "__main__":
|
tests/test_openai.py
CHANGED
|
@@ -8,6 +8,7 @@ import threading
|
|
|
8
8
|
|
|
9
9
|
from vectara_agentic.agent import Agent
|
|
10
10
|
from vectara_agentic.tools import ToolsFactory
|
|
11
|
+
from vectara_agentic.tools_catalog import ToolsCatalog
|
|
11
12
|
from vectara_agentic.agent_config import AgentConfig
|
|
12
13
|
from vectara_agentic.types import AgentType, ModelProvider
|
|
13
14
|
|
|
@@ -18,6 +19,7 @@ nest_asyncio.apply()
|
|
|
18
19
|
from conftest import (
|
|
19
20
|
fc_config_openai,
|
|
20
21
|
mult,
|
|
22
|
+
add,
|
|
21
23
|
STANDARD_TEST_TOPIC,
|
|
22
24
|
STANDARD_TEST_INSTRUCTIONS,
|
|
23
25
|
)
|
|
@@ -155,6 +157,105 @@ class TestOpenAI(unittest.IsolatedAsyncioTestCase):
|
|
|
155
157
|
|
|
156
158
|
self.assertIn("25", response.response)
|
|
157
159
|
|
|
160
|
+
async def test_gpt_41_mini_multi_tool_chain(self):
|
|
161
|
+
"""Test GPT-4.1-mini with complex multi-step reasoning chain using multiple tools."""
|
|
162
|
+
with ARIZE_LOCK:
|
|
163
|
+
# Use default OpenAI config (gpt-4.1-mini)
|
|
164
|
+
tools_catalog = ToolsCatalog(fc_config_openai)
|
|
165
|
+
tools = [
|
|
166
|
+
ToolsFactory().create_tool(mult),
|
|
167
|
+
ToolsFactory().create_tool(add),
|
|
168
|
+
ToolsFactory().create_tool(tools_catalog.summarize_text),
|
|
169
|
+
ToolsFactory().create_tool(tools_catalog.rephrase_text),
|
|
170
|
+
]
|
|
171
|
+
|
|
172
|
+
agent = Agent(
|
|
173
|
+
agent_config=fc_config_openai,
|
|
174
|
+
tools=tools,
|
|
175
|
+
topic=STANDARD_TEST_TOPIC,
|
|
176
|
+
custom_instructions="You are a mathematical reasoning agent that explains your work step by step.",
|
|
177
|
+
)
|
|
178
|
+
|
|
179
|
+
# Complex multi-step reasoning task
|
|
180
|
+
complex_query = (
|
|
181
|
+
"Perform this calculation step by step: "
|
|
182
|
+
"First multiply 4 by 7, then add 18 to that result, "
|
|
183
|
+
"then multiply the new result by 2. "
|
|
184
|
+
"After getting the final number, summarize the entire mathematical process "
|
|
185
|
+
"with expertise in 'mathematics education', "
|
|
186
|
+
"then rephrase that summary as a 10-year-old would explain it."
|
|
187
|
+
)
|
|
188
|
+
|
|
189
|
+
print("\n🔍 Starting GPT-4.1-mini multi-tool chain test (OpenAI)")
|
|
190
|
+
print(f"📝 Query: {complex_query}")
|
|
191
|
+
print("🌊 Streaming response:\n" + "="*50)
|
|
192
|
+
|
|
193
|
+
stream = await agent.astream_chat(complex_query)
|
|
194
|
+
|
|
195
|
+
# Capture streaming deltas and tool calls
|
|
196
|
+
streaming_deltas = []
|
|
197
|
+
tool_calls_made = []
|
|
198
|
+
full_response = ""
|
|
199
|
+
|
|
200
|
+
async for chunk in stream.async_response_gen():
|
|
201
|
+
if chunk and chunk.strip():
|
|
202
|
+
streaming_deltas.append(chunk)
|
|
203
|
+
full_response += chunk
|
|
204
|
+
# Display each streaming delta
|
|
205
|
+
print(f"📡 Delta: {repr(chunk)}")
|
|
206
|
+
|
|
207
|
+
# Track tool calls in the stream
|
|
208
|
+
if "mult" in chunk.lower():
|
|
209
|
+
if "mult" not in [call["tool"] for call in tool_calls_made]:
|
|
210
|
+
tool_calls_made.append({"tool": "mult", "order": len(tool_calls_made) + 1})
|
|
211
|
+
print(f"🔧 Tool call detected: mult (#{len(tool_calls_made)})")
|
|
212
|
+
if "add" in chunk.lower():
|
|
213
|
+
if "add" not in [call["tool"] for call in tool_calls_made]:
|
|
214
|
+
tool_calls_made.append({"tool": "add", "order": len(tool_calls_made) + 1})
|
|
215
|
+
print(f"🔧 Tool call detected: add (#{len(tool_calls_made)})")
|
|
216
|
+
if "summarize" in chunk.lower():
|
|
217
|
+
if "summarize_text" not in [call["tool"] for call in tool_calls_made]:
|
|
218
|
+
tool_calls_made.append({"tool": "summarize_text", "order": len(tool_calls_made) + 1})
|
|
219
|
+
print(f"🔧 Tool call detected: summarize_text (#{len(tool_calls_made)})")
|
|
220
|
+
if "rephrase" in chunk.lower():
|
|
221
|
+
if "rephrase_text" not in [call["tool"] for call in tool_calls_made]:
|
|
222
|
+
tool_calls_made.append({"tool": "rephrase_text", "order": len(tool_calls_made) + 1})
|
|
223
|
+
print(f"🔧 Tool call detected: rephrase_text (#{len(tool_calls_made)})")
|
|
224
|
+
|
|
225
|
+
response = await stream.aget_response()
|
|
226
|
+
|
|
227
|
+
print("="*50)
|
|
228
|
+
print(f"✅ Streaming completed. Total deltas: {len(streaming_deltas)}")
|
|
229
|
+
print(f"🔧 Tool calls made: {[call['tool'] for call in tool_calls_made]}")
|
|
230
|
+
print(f"📄 Final response length: {len(response.response)} chars")
|
|
231
|
+
print(f"🎯 Final response: {response.response}")
|
|
232
|
+
|
|
233
|
+
# Validate tool usage sequence
|
|
234
|
+
tools_used = [call["tool"] for call in tool_calls_made]
|
|
235
|
+
print(f"🧪 Tools used in order: {tools_used}")
|
|
236
|
+
|
|
237
|
+
# Check that at least multiplication happened (basic requirement)
|
|
238
|
+
self.assertIn("mult", tools_used, f"Expected multiplication tool to be used. Tools used: {tools_used}")
|
|
239
|
+
|
|
240
|
+
# Check for mathematical results in the full response or streaming deltas
|
|
241
|
+
# Expected: 4*7=28, 28+18=46, 46*2=92
|
|
242
|
+
expected_intermediate_results = ["28", "46", "92"]
|
|
243
|
+
all_text = (full_response + " " + response.response).lower()
|
|
244
|
+
math_results_found = sum(1 for result in expected_intermediate_results
|
|
245
|
+
if result in all_text)
|
|
246
|
+
|
|
247
|
+
print(f"🔢 Mathematical results found: {math_results_found}/3 expected")
|
|
248
|
+
print(f"🔍 Full text searched: {all_text[:200]}...")
|
|
249
|
+
|
|
250
|
+
# More lenient assertion - just check that some mathematical progress was made
|
|
251
|
+
self.assertGreaterEqual(math_results_found, 1,
|
|
252
|
+
f"Expected at least 1 mathematical result. Found {math_results_found}. "
|
|
253
|
+
f"Full text: {all_text}")
|
|
254
|
+
|
|
255
|
+
# Verify that streaming actually produced content
|
|
256
|
+
self.assertGreater(len(streaming_deltas), 0, "Expected streaming deltas to be produced")
|
|
257
|
+
self.assertGreater(len(response.response.strip()), 0, "Expected non-empty final response")
|
|
258
|
+
|
|
158
259
|
|
|
159
260
|
if __name__ == "__main__":
|
|
160
261
|
unittest.main()
|
vectara_agentic/_version.py
CHANGED
vectara_agentic/agent.py
CHANGED
|
@@ -1096,7 +1096,7 @@ class Agent:
|
|
|
1096
1096
|
model_fields = outputs_model_on_fail_cls.model_fields
|
|
1097
1097
|
input_dict = {}
|
|
1098
1098
|
for key in model_fields:
|
|
1099
|
-
value = await workflow_context.get(key, default=_missing)
|
|
1099
|
+
value = await workflow_context.store.get(key, default=_missing) # pylint: disable=no-member
|
|
1100
1100
|
if value is not _missing:
|
|
1101
1101
|
input_dict[key] = value
|
|
1102
1102
|
output = outputs_model_on_fail_cls.model_validate(input_dict)
|
vectara_agentic/agent_core/prompts.py
CHANGED
|
@@ -58,6 +58,7 @@ GENERAL_INSTRUCTIONS = """
|
|
|
58
58
|
- Always respond in the language of the question, and in text (no images, videos or code).
|
|
59
59
|
- If you are provided with database tools use them for analytical queries (such as counting, calculating max, min, average, sum, or other statistics).
|
|
60
60
|
For each database, the database tools include: x_list_tables, x_load_data, x_describe_tables, x_load_unique_values, and x_load_sample_data, where 'x' in the database name.
|
|
61
|
+
Do not call any database tool unless it is included in your list of available tools.
|
|
61
62
|
for example, if the database name is "ev", the tools are: ev_list_tables, ev_load_data, ev_describe_tables, ev_load_unique_values, and ev_load_sample_data.
|
|
62
63
|
Use ANSI SQL-92 syntax for the SQL queries, and do not use any other SQL dialect.
|
|
63
64
|
Before using the x_load_data with a SQL query, always follow these discovery steps:
|