vectara-agentic 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in the public registry.
Potentially problematic release.
This version of vectara-agentic might be problematic.
- tests/benchmark_models.py +945 -0
- tests/conftest.py +9 -5
- tests/run_tests.py +3 -0
- tests/test_agent.py +57 -29
- tests/test_agent_fallback_memory.py +270 -0
- tests/test_agent_memory_consistency.py +229 -0
- tests/test_agent_type.py +4 -0
- tests/test_bedrock.py +46 -31
- tests/test_fallback.py +1 -1
- tests/test_gemini.py +7 -22
- tests/test_groq.py +46 -31
- tests/test_private_llm.py +1 -1
- tests/test_serialization.py +3 -6
- tests/test_session_memory.py +252 -0
- tests/test_streaming.py +58 -37
- tests/test_together.py +62 -0
- tests/test_vhc.py +3 -2
- tests/test_workflow.py +9 -28
- vectara_agentic/_observability.py +19 -0
- vectara_agentic/_version.py +1 -1
- vectara_agentic/agent.py +246 -37
- vectara_agentic/agent_core/factory.py +34 -153
- vectara_agentic/agent_core/prompts.py +19 -13
- vectara_agentic/agent_core/serialization.py +17 -8
- vectara_agentic/agent_core/streaming.py +27 -43
- vectara_agentic/agent_core/utils/__init__.py +0 -5
- vectara_agentic/agent_core/utils/hallucination.py +54 -99
- vectara_agentic/llm_utils.py +4 -2
- vectara_agentic/sub_query_workflow.py +3 -2
- vectara_agentic/tools.py +0 -19
- vectara_agentic/types.py +9 -3
- {vectara_agentic-0.4.0.dist-info → vectara_agentic-0.4.2.dist-info}/METADATA +79 -39
- vectara_agentic-0.4.2.dist-info/RECORD +54 -0
- vectara_agentic/agent_core/utils/prompt_formatting.py +0 -56
- vectara_agentic-0.4.0.dist-info/RECORD +0 -50
- {vectara_agentic-0.4.0.dist-info → vectara_agentic-0.4.2.dist-info}/WHEEL +0 -0
- {vectara_agentic-0.4.0.dist-info → vectara_agentic-0.4.2.dist-info}/licenses/LICENSE +0 -0
- {vectara_agentic-0.4.0.dist-info → vectara_agentic-0.4.2.dist-info}/top_level.txt +0 -0
tests/benchmark_models.py
@@ -0,0 +1,945 @@
+#!/usr/bin/env python3
+"""
+Model Performance Benchmark Script
+
+This script benchmarks different LLM models for latency and performance
+in the context of Vectara Agentic framework.
+"""
+
+import asyncio
+import time
+import json
+import statistics
+import sys
+import os
+from typing import Dict, List, Tuple, Any
+from dataclasses import dataclass, asdict
+
+# Add the current directory to Python path to import vectara_agentic
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from vectara_agentic.agent import Agent
+from vectara_agentic.agent_config import AgentConfig
+from vectara_agentic.types import ModelProvider, ObserverType
+from vectara_agentic.tools import ToolsFactory
+from vectara_agentic._observability import setup_observer, shutdown_observer
+
+# Initialize observability once at startup to prevent repeated instrumentation
+_observability_initialized = False
+
+
+@dataclass
+class BenchmarkResult:
+    """Results from a single benchmark run."""
+
+    model_name: str
+    provider: str
+    test_type: str
+    first_token_latency: float
+    total_response_time: float
+    response_length: int
+    tokens_per_second: float
+    error: str = None
+
+
+@dataclass
+class BenchmarkStats:
+    """Aggregated statistics for multiple runs."""
+
+    model_name: str
+    provider: str
+    test_type: str
+    runs: int
+    avg_first_token_latency: float
+    avg_total_response_time: float
+    avg_tokens_per_second: float
+    median_first_token_latency: float
+    median_total_response_time: float
+    median_tokens_per_second: float
+    min_total_response_time: float
+    max_total_response_time: float
+    std_total_response_time: float
+    success_rate: float
+
+
+class ModelBenchmark:
+    """Benchmarking suite for different LLM models."""
+
+    def __init__(self, enable_observability: bool = False):
+        # Test configurations
+        self.enable_observability = enable_observability
+        self.models_to_test = [
+            # OpenAI models
+            {"provider": ModelProvider.OPENAI, "model": "gpt-5"},
+            {"provider": ModelProvider.OPENAI, "model": "gpt-5-mini"},
+            {"provider": ModelProvider.OPENAI, "model": "gpt-4o"},
+            {"provider": ModelProvider.OPENAI, "model": "gpt-4o-mini"},
+            {"provider": ModelProvider.OPENAI, "model": "gpt-4.1"},
+            {"provider": ModelProvider.OPENAI, "model": "gpt-4.1-mini"},
+            {"provider": ModelProvider.ANTHROPIC, "model": "claude-sonnet-4-20250514"},
+            {"provider": ModelProvider.TOGETHER, "model": "deepseek-ai/DeepSeek-V3"},
+            {"provider": ModelProvider.GROQ, "model": "openai/gpt-oss-20b"},
+            {"provider": ModelProvider.GEMINI, "model": "models/gemini-2.5-flash"},
+            {"provider": ModelProvider.GEMINI, "model": "models/gemini-2.5-pro"},
+        ]
+
+        # Test scenarios - focused on advanced tool calling only
+        self.test_scenarios = {
+            "financial_analysis": {
+                "prompt": "Analyze a $50,000 investment portfolio with 60% stocks (8% return), 30% bonds (4% return), and 10% cash (1% return). Calculate the expected annual return, then determine how the portfolio value would grow over 15 years with monthly contributions of $1,000. Create a summary report of the analysis.",
+                "description": "Multi-step financial analysis with calculations and reporting",
+                "needs_tools": True,
+            },
+            "data_processing": {
+                "prompt": "Generate a dataset of 100 customers with randomized demographics (age, income, location, purchase_history). Then analyze this data to find correlations between age groups and spending patterns. Create a statistical summary and export the results to a formatted report.",
+                "description": "Data generation, analysis, and reporting workflow",
+                "needs_tools": True,
+            },
+            "research_synthesis": {
+                "prompt": "Search for information about the latest developments in quantum computing, specifically focusing on error correction breakthroughs in 2024. Extract key findings from multiple sources, summarize the technical approaches, and create a structured research report with citations.",
+                "description": "Information retrieval, synthesis, and document generation",
+                "needs_tools": True,
+            },
+            "system_monitoring": {
+                "prompt": "Check system performance metrics including CPU usage, memory consumption, and disk space. If any metrics exceed safe thresholds (CPU > 80%, Memory > 90%, Disk > 85%), generate alerts and suggest optimization strategies. Create a monitoring report with recommendations.",
+                "description": "System monitoring with conditional logic and reporting",
+                "needs_tools": True,
+            },
+            "workflow_automation": {
+                "prompt": "Create a project task list with 10 software development tasks, assign priorities and estimated hours, then simulate a sprint planning session by organizing tasks into a 2-week sprint. Generate a sprint backlog with daily breakdowns and resource allocation recommendations.",
+                "description": "Complex workflow orchestration with multiple tool interactions",
+                "needs_tools": True,
+            },
+        }
+
+        self.iterations_per_test = 5
+        self.results: List[BenchmarkResult] = []
+
+    def create_agent_config(
+        self, provider: ModelProvider, model_name: str
+    ) -> AgentConfig:
+        """Create agent configuration for the specified model."""
+        return AgentConfig(
+            main_llm_provider=provider,
+            main_llm_model_name=model_name,
+            tool_llm_provider=provider,
+            tool_llm_model_name=model_name,
+            observer=(
+                ObserverType.ARIZE_PHOENIX
+                if self.enable_observability
+                else ObserverType.NO_OBSERVER
+            ),
+        )
+
+    def create_test_tools(self) -> List:
+        """Create an advanced set of tools for realistic agent testing."""
+        import random
+        import json
+        import psutil
+        from datetime import datetime
+
+        tools_factory = ToolsFactory()
+
+        # Financial Analysis Tools
+        def calculate_portfolio_return(
+            stocks_pct: float,
+            stocks_return: float,
+            bonds_pct: float,
+            bonds_return: float,
+            cash_pct: float,
+            cash_return: float,
+        ) -> dict:
+            """Calculate expected portfolio return and allocation details."""
+            total_allocation = stocks_pct + bonds_pct + cash_pct
+            if abs(total_allocation - 100) > 0.01:
+                raise ValueError(
+                    f"Portfolio allocation must sum to 100%, got {total_allocation}%"
+                )
+
+            expected_return = (
+                stocks_pct * stocks_return
+                + bonds_pct * bonds_return
+                + cash_pct * cash_return
+            ) / 100
+
+            return {
+                "expected_annual_return_pct": expected_return,
+                "allocation": {
+                    "stocks": {"percentage": stocks_pct, "return": stocks_return},
+                    "bonds": {"percentage": bonds_pct, "return": bonds_return},
+                    "cash": {"percentage": cash_pct, "return": cash_return},
+                },
+                "risk_profile": (
+                    "aggressive"
+                    if stocks_pct > 70
+                    else "moderate" if stocks_pct > 40 else "conservative"
+                ),
+            }
+
+        def project_investment_growth(
+            initial_amount: float,
+            annual_return: float,
+            years: int,
+            monthly_contribution: float = 0,
+        ) -> dict:
+            """Project investment growth with optional monthly contributions."""
+            monthly_rate = annual_return / 12 / 100
+            months = years * 12
+
+            # Calculate compound growth with monthly contributions
+            if monthly_contribution > 0:
+                # Future value of initial investment
+                fv_initial = initial_amount * ((1 + monthly_rate) ** months)
+                # Future value of monthly contributions (ordinary annuity)
+                fv_contributions = monthly_contribution * (
+                    ((1 + monthly_rate) ** months - 1) / monthly_rate
+                )
+                final_value = fv_initial + fv_contributions
+                total_contributions = monthly_contribution * months
+            else:
+                final_value = initial_amount * ((1 + annual_return / 100) ** years)
+                total_contributions = 0
+
+            total_invested = initial_amount + total_contributions
+            total_gains = final_value - total_invested
+
+            return {
+                "initial_investment": initial_amount,
+                "monthly_contribution": monthly_contribution,
+                "total_contributions": total_contributions,
+                "total_invested": total_invested,
+                "final_value": round(final_value, 2),
+                "total_gains": round(total_gains, 2),
+                "return_multiple": round(final_value / initial_amount, 2),
+                "years": years,
+                "annual_return_used": annual_return,
+            }
+
+        # Data Analysis Tools
+        def generate_customer_dataset(count: int) -> str:
+            """Generate randomized customer data for analysis."""
+            customers = []
+            locations = [
+                "New York",
+                "Los Angeles",
+                "Chicago",
+                "Houston",
+                "Phoenix",
+                "Philadelphia",
+                "San Antonio",
+                "San Diego",
+                "Dallas",
+                "San Jose",
+            ]
+
+            for i in range(count):
+                age = random.randint(18, 75)
+                income = random.randint(25000, 150000)
+                location = random.choice(locations)
+                purchase_history = random.randint(1, 50)
+
+                customers.append(
+                    {
+                        "customer_id": f"CUST_{i+1:04d}",
+                        "age": age,
+                        "income": income,
+                        "location": location,
+                        "purchase_history": purchase_history,
+                        "age_group": (
+                            "18-30"
+                            if age <= 30
+                            else (
+                                "31-45"
+                                if age <= 45
+                                else "46-60" if age <= 60 else "60+"
+                            )
+                        ),
+                    }
+                )
+
+            return json.dumps(customers, indent=2)
+
+        def analyze_customer_data(customer_data_json: str) -> dict:
+            """Analyze customer data for patterns and correlations."""
+            customers = json.loads(customer_data_json)
+
+            # Group by age groups
+            age_groups = {}
+            for customer in customers:
+                group = customer["age_group"]
+                if group not in age_groups:
+                    age_groups[group] = {
+                        "count": 0,
+                        "total_spending": 0,
+                        "total_income": 0,
+                    }
+
+                age_groups[group]["count"] += 1
+                age_groups[group]["total_spending"] += customer["purchase_history"]
+                age_groups[group]["total_income"] += customer["income"]
+
+            # Calculate averages
+            analysis = {}
+            for group, data in age_groups.items():
+                analysis[group] = {
+                    "count": data["count"],
+                    "avg_spending": round(data["total_spending"] / data["count"], 2),
+                    "avg_income": round(data["total_income"] / data["count"], 2),
+                    "spending_to_income_ratio": round(
+                        (data["total_spending"] / data["count"])
+                        / (data["total_income"] / data["count"])
+                        * 1000,
+                        4,
+                    ),
+                }
+
+            return {
+                "total_customers": len(customers),
+                "age_group_analysis": analysis,
+                "overall_avg_spending": round(
+                    sum(c["purchase_history"] for c in customers) / len(customers), 2
+                ),
+                "overall_avg_income": round(
+                    sum(c["income"] for c in customers) / len(customers), 2
+                ),
+            }
+
+        # System Monitoring Tools
+        def get_system_metrics() -> dict:
+            """Get current system performance metrics."""
+            try:
+                cpu_percent = psutil.cpu_percent(interval=1)
+                memory = psutil.virtual_memory()
+                disk = psutil.disk_usage("/")
+
+                return {
+                    "cpu_usage_percent": cpu_percent,
+                    "memory_usage_percent": memory.percent,
+                    "memory_available_gb": round(memory.available / (1024**3), 2),
+                    "disk_usage_percent": disk.percent,
+                    "disk_free_gb": round(disk.free / (1024**3), 2),
+                    "timestamp": datetime.now().isoformat(),
+                }
+            except Exception:
+                # Fallback with simulated data for testing
+                return {
+                    "cpu_usage_percent": random.randint(20, 95),
+                    "memory_usage_percent": random.randint(40, 95),
+                    "memory_available_gb": random.randint(1, 16),
+                    "disk_usage_percent": random.randint(30, 90),
+                    "disk_free_gb": random.randint(10, 500),
+                    "timestamp": datetime.now().isoformat(),
+                    "note": "Simulated data - psutil unavailable",
+                }
+
+        def check_system_health(
+            cpu_threshold: int = 80,
+            memory_threshold: int = 90,
+            disk_threshold: int = 85,
+        ) -> dict:
+            """Check system health against thresholds and generate alerts."""
+            metrics = get_system_metrics()
+            alerts = []
+            recommendations = []
+
+            if metrics["cpu_usage_percent"] > cpu_threshold:
+                alerts.append(
+                    f"HIGH CPU USAGE: {metrics['cpu_usage_percent']}% (threshold: {cpu_threshold}%)"
+                )
+                recommendations.append(
+                    "Consider closing unnecessary applications or upgrading CPU"
+                )
+
+            if metrics["memory_usage_percent"] > memory_threshold:
+                alerts.append(
+                    f"HIGH MEMORY USAGE: {metrics['memory_usage_percent']}% (threshold: {memory_threshold}%)"
+                )
+                recommendations.append(
+                    "Close memory-intensive applications or add more RAM"
+                )
+
+            if metrics["disk_usage_percent"] > disk_threshold:
+                alerts.append(
+                    f"LOW DISK SPACE: {metrics['disk_usage_percent']}% used (threshold: {disk_threshold}%)"
+                )
+                recommendations.append(
+                    "Clean up temporary files or expand disk storage"
+                )
+
+            health_status = (
+                "CRITICAL" if len(alerts) >= 2 else "WARNING" if alerts else "HEALTHY"
+            )
+
+            return {
+                "health_status": health_status,
+                "alerts": alerts,
+                "recommendations": recommendations,
+                "metrics": metrics,
+                "thresholds": {
+                    "cpu": cpu_threshold,
+                    "memory": memory_threshold,
+                    "disk": disk_threshold,
+                },
+            }
+
+        # Project Management Tools
+        def create_project_tasks(count: int = 10) -> str:
+            """Generate a list of software development tasks."""
+            task_types = [
+                "Implement user authentication system",
+                "Create REST API endpoints",
+                "Design database schema",
+                "Build responsive frontend components",
+                "Write unit tests",
+                "Set up CI/CD pipeline",
+                "Implement error handling",
+                "Create API documentation",
+                "Optimize database queries",
+                "Implement caching layer",
+                "Add logging and monitoring",
+                "Create user dashboard",
+                "Implement search functionality",
+                "Add data validation",
+                "Create admin panel",
+            ]
+
+            tasks = []
+            for i in range(count):
+                task = random.choice(task_types)
+                priority = random.choice(["High", "Medium", "Low"])
+                estimated_hours = random.randint(2, 24)
+
+                tasks.append(
+                    {
+                        "task_id": f"TASK-{i+1:03d}",
+                        "title": f"{task} #{i+1}",
+                        "priority": priority,
+                        "estimated_hours": estimated_hours,
+                        "status": "Backlog",
+                        "assigned_to": None,
+                    }
+                )
+
+            return json.dumps(tasks, indent=2)
+
+        def plan_sprint(tasks_json: str, sprint_capacity_hours: int = 80) -> dict:
+            """Organize tasks into a sprint with daily breakdowns."""
+            tasks = json.loads(tasks_json)
+
+            # Sort by priority and estimated hours
+            priority_order = {"High": 3, "Medium": 2, "Low": 1}
+            tasks.sort(
+                key=lambda x: (priority_order[x["priority"]], -x["estimated_hours"]),
+                reverse=True,
+            )
+
+            sprint_tasks = []
+            total_hours = 0
+
+            for task in tasks:
+                if total_hours + task["estimated_hours"] <= sprint_capacity_hours:
+                    sprint_tasks.append(task)
+                    total_hours += task["estimated_hours"]
+                else:
+                    break
+
+            # Distribute across 2 weeks (10 working days)
+            daily_breakdown = []
+            remaining_hours = total_hours
+            days_remaining = 10
+
+            for day in range(1, 11):
+                if days_remaining > 0:
+                    day_hours = min(
+                        8,
+                        remaining_hours // days_remaining
+                        + (1 if remaining_hours % days_remaining else 0),
+                    )
+                    daily_breakdown.append(
+                        {
+                            "day": day,
+                            "planned_hours": day_hours,
+                            "remaining_capacity": 8 - day_hours,
+                        }
+                    )
+                    remaining_hours -= day_hours
+                    days_remaining -= 1
+
+            return {
+                "sprint_summary": {
+                    "total_tasks": len(sprint_tasks),
+                    "total_planned_hours": total_hours,
+                    "sprint_capacity": sprint_capacity_hours,
+                    "utilization_percent": round(
+                        (total_hours / sprint_capacity_hours) * 100, 1
+                    ),
+                },
+                "selected_tasks": sprint_tasks,
+                "daily_breakdown": daily_breakdown,
+                "backlog_remaining": len(tasks) - len(sprint_tasks),
+            }
+
+        # Reporting Tools
+        def create_formatted_report(
+            title: str, data: dict, report_type: str = "summary"
+        ) -> str:
+            """Create a formatted text report from structured data."""
+            report_lines = []
+            report_lines.append("=" * 60)
+            report_lines.append(f"{title.upper()}")
+            report_lines.append("=" * 60)
+            report_lines.append(
+                f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
+            )
+            report_lines.append(f"Report Type: {report_type.title()}")
+            report_lines.append("")
+
+            def format_dict(d, indent=0):
+                lines = []
+                for key, value in d.items():
+                    prefix = " " * indent
+                    if isinstance(value, dict):
+                        lines.append(f"{prefix}{key.replace('_', ' ').title()}:")
+                        lines.extend(format_dict(value, indent + 1))
+                    elif isinstance(value, list):
+                        lines.append(f"{prefix}{key.replace('_', ' ').title()}:")
+                        for i, item in enumerate(value):
+                            if isinstance(item, dict):
+                                lines.append(f"{prefix} Item {i+1}:")
+                                lines.extend(format_dict(item, indent + 2))
+                            else:
+                                lines.append(f"{prefix} - {item}")
+                    else:
+                        lines.append(
+                            f"{prefix}{key.replace('_', ' ').title()}: {value}"
+                        )
+                return lines
+
+            report_lines.extend(format_dict(data))
+            report_lines.append("")
+            report_lines.append("=" * 60)
+
+            return "\n".join(report_lines)
+
+        # Research Tools
+        def search_information(query: str, max_results: int = 5) -> dict:
+            """Simulate information search with structured results."""
+            # Simulated search results for testing
+            simulated_results = [
+                {
+                    "title": f"Research Paper: {query} - Latest Developments",
+                    "source": "Journal of Advanced Computing",
+                    "summary": f"Recent breakthrough in {query} showing promising results in error reduction and scalability improvements.",
+                    "relevance_score": random.randint(80, 95),
+                    "publication_date": "2024-11-15",
+                },
+                {
+                    "title": f"Technical Review: {query} Implementation Challenges",
+                    "source": "Tech Innovation Quarterly",
+                    "summary": f"Comprehensive analysis of current {query} methodologies and their practical applications.",
+                    "relevance_score": random.randint(75, 90),
+                    "publication_date": "2024-10-22",
+                },
+                {
+                    "title": f"Industry Report: {query} Market Trends",
+                    "source": "Technology Research Institute",
+                    "summary": f"Market analysis and future projections for {query} adoption across industries.",
+                    "relevance_score": random.randint(70, 85),
+                    "publication_date": "2024-09-30",
+                },
+            ]
+
+            return {
+                "query": query,
+                "total_results": len(simulated_results),
+                "results": simulated_results[:max_results],
+                "search_timestamp": datetime.now().isoformat(),
+            }
+
+        def synthesize_research(search_results: dict) -> dict:
+            """Synthesize research findings into structured summary."""
+            results = search_results["results"]
+
+            key_findings = []
+            technical_approaches = []
+            citations = []
+
+            for i, result in enumerate(results, 1):
+                key_findings.append(f"Finding {i}: {result['summary']}")
+                technical_approaches.append(
+                    f"Approach {i}: Methodology described in '{result['title']}'"
+                )
+                citations.append(
+                    f"[{i}] {result['title']} - {result['source']} ({result['publication_date']})"
+                )
+
+            return {
+                "research_topic": search_results["query"],
+                "sources_analyzed": len(results),
+                "key_findings": key_findings,
+                "technical_approaches": technical_approaches,
+                "citations": citations,
+                "confidence_level": "High" if len(results) >= 3 else "Medium",
+                "synthesis_date": datetime.now().isoformat(),
+            }
+
+        # Create and return all tools
+        return [
+            # Financial Analysis
+            tools_factory.create_tool(calculate_portfolio_return, vhc_eligible=False),
+            tools_factory.create_tool(project_investment_growth, vhc_eligible=False),
+            # Data Analysis
+            tools_factory.create_tool(generate_customer_dataset, vhc_eligible=False),
+            tools_factory.create_tool(analyze_customer_data, vhc_eligible=False),
+            # System Monitoring
+            tools_factory.create_tool(get_system_metrics, vhc_eligible=False),
+            tools_factory.create_tool(check_system_health, vhc_eligible=False),
+            # Project Management
+            tools_factory.create_tool(create_project_tasks, vhc_eligible=False),
+            tools_factory.create_tool(plan_sprint, vhc_eligible=False),
+            # Reporting
+            tools_factory.create_tool(create_formatted_report, vhc_eligible=False),
+            # Research
+            tools_factory.create_tool(search_information, vhc_eligible=False),
+            tools_factory.create_tool(synthesize_research, vhc_eligible=False),
+        ]
+
+    async def measure_streaming_response(
+        self, agent: Agent, prompt: str
+    ) -> Tuple[float, float, int]:
+        """
+        Measure streaming response metrics.
+        Returns: (first_token_latency, total_time, response_length)
+        """
+        start_time = time.time()
+        first_token_time = None
+        response_text = ""
+
+        try:
+            streaming_response = await agent.astream_chat(prompt)
+
+            # Check if we have the async_response_gen method
+            if hasattr(streaming_response, "async_response_gen") and callable(
+                streaming_response.async_response_gen
+            ):
+                async for token in streaming_response.async_response_gen():
+                    if first_token_time is None:
+                        first_token_time = time.time()
+                    response_text += str(token)
+
+            # Get final response
+            final_response = await streaming_response.aget_response()
+            if hasattr(final_response, "response") and final_response.response:
+                response_text = final_response.response
+
+            end_time = time.time()
+            total_time = end_time - start_time
+            first_token_latency = (
+                (first_token_time - start_time) if first_token_time else total_time
+            )
+
+            return first_token_latency, total_time, len(response_text)
+
+        except Exception as e:
+            end_time = time.time()
+            print(f"Error during streaming: {e}")
+            return -1, end_time - start_time, 0
+
+    async def run_single_benchmark(
+        self,
+        provider: ModelProvider,
+        model_name: str,
+        test_name: str,
+        test_config: Dict[str, Any],
+    ) -> BenchmarkResult:
+        """Run a single benchmark iteration."""
+        try:
+            # Create agent configuration
+            config = self.create_agent_config(provider, model_name)
+
+            # Create tools if needed
+            tools = (
+                self.create_test_tools()
+                if test_config.get("needs_tools", False)
+                else []
+            )
+
+            # Create agent
+            agent = Agent.from_tools(
+                tools=tools,
+                topic="benchmark",
+                agent_config=config,
+                verbose=False,
+                session_id=f"benchmark_{model_name}_{test_name}_{int(time.time())}",
+            )
+
+            # Measure response
+            first_token_latency, total_time, response_length = (
+                await self.measure_streaming_response(agent, test_config["prompt"])
+            )
+
+            # Calculate tokens per second (approximate)
+            tokens_per_second = response_length / total_time if total_time > 0 else 0
+
+            # Note: Skip per-agent cleanup to avoid OpenTelemetry uninstrumentation warnings
+
+            return BenchmarkResult(
+                model_name=model_name,
+                provider=provider.value,
+                test_type=test_name,
+                first_token_latency=first_token_latency,
+                total_response_time=total_time,
+                response_length=response_length,
+                tokens_per_second=tokens_per_second,
+            )
+
+        except Exception as e:
+            return BenchmarkResult(
+                model_name=model_name,
+                provider=provider.value,
+                test_type=test_name,
+                first_token_latency=-1,
+                total_response_time=-1,
+                response_length=0,
+                tokens_per_second=0,
+                error=str(e),
+            )
+
+    async def run_benchmarks(self):
+        """Run all benchmark combinations."""
+        global _observability_initialized
+
+        print("Starting model performance benchmarks...")
+        print(
+            f"Testing {len(self.models_to_test)} models across {len(self.test_scenarios)} scenarios"
+        )
+        print(f"Running {self.iterations_per_test} iterations per combination\n")
+
+        # Setup observability once if enabled and not already initialized
+        if self.enable_observability and not _observability_initialized:
+            dummy_config = AgentConfig(observer=ObserverType.ARIZE_PHOENIX)
+            observability_setup = setup_observer(dummy_config, verbose=True)
+            if observability_setup:
+                print(
+                    "✅ Arize Phoenix observability enabled - LLM calls will be traced\n"
+                )
+                _observability_initialized = True
+            else:
+                print("⚠️ Arize Phoenix observability setup failed\n")
+
+        total_tests = (
+            len(self.models_to_test)
+            * len(self.test_scenarios)
+            * self.iterations_per_test
+        )
+        current_test = 0
+
+        for model_config in self.models_to_test:
+            provider = model_config["provider"]
+            model_name = model_config["model"]
+
+            print(f"\n{'='*60}")
+            print(f"Testing: {provider.value} - {model_name}")
+            print(f"{'='*60}")
+
+            for test_name, test_config in self.test_scenarios.items():
+                print(f"\nRunning {test_name}: {test_config['description']}")
+
+                for iteration in range(self.iterations_per_test):
+                    current_test += 1
+                    progress = (current_test / total_tests) * 100
+                    print(
+                        f" Iteration {iteration + 1}/{self.iterations_per_test} ({progress:.1f}% complete)"
+                    )
+
+                    result = await self.run_single_benchmark(
+                        provider, model_name, test_name, test_config
+                    )
+                    self.results.append(result)
+
+                    if result.error:
+                        print(f" ERROR: {result.error}")
+                    else:
+                        print(
+                            f" Time: {result.total_response_time:.2f}s, "
+                            f"First token: {result.first_token_latency:.2f}s, "
+                            f"Speed: {result.tokens_per_second:.1f} chars/sec"
+                        )
+
+                    # Small delay between tests
+                    await asyncio.sleep(1)
+
+    def calculate_statistics(self) -> List[BenchmarkStats]:
+        """Calculate aggregated statistics from results."""
+        stats = []
+
+        # Group results by model and test type
+        grouped = {}
+        for result in self.results:
+            key = (result.model_name, result.provider, result.test_type)
+            if key not in grouped:
+                grouped[key] = []
+            grouped[key].append(result)
+
+        # Calculate statistics for each group
+        for (model_name, provider, test_type), group_results in grouped.items():
+            successful_results = [
+                r
+                for r in group_results
+                if r.error is None and r.total_response_time > 0
+            ]
+
+            if not successful_results:
+                continue
+
+            response_times = [r.total_response_time for r in successful_results]
+            first_token_times = [r.first_token_latency for r in successful_results]
+            tokens_per_sec = [r.tokens_per_second for r in successful_results]
+
+            stats.append(
+                BenchmarkStats(
+                    model_name=model_name,
+                    provider=provider,
+                    test_type=test_type,
+                    runs=len(group_results),
+                    avg_first_token_latency=statistics.mean(first_token_times),
+                    avg_total_response_time=statistics.mean(response_times),
+                    avg_tokens_per_second=statistics.mean(tokens_per_sec),
+                    median_first_token_latency=statistics.median(first_token_times),
+                    median_total_response_time=statistics.median(response_times),
+                    median_tokens_per_second=statistics.median(tokens_per_sec),
+                    min_total_response_time=min(response_times),
+                    max_total_response_time=max(response_times),
+                    std_total_response_time=(
+                        statistics.stdev(response_times)
+                        if len(response_times) > 1
+                        else 0
+                    ),
+                    success_rate=(len(successful_results) / len(group_results)) * 100,
+                )
+            )
+
+        return stats
+
+    def generate_report(self, stats: List[BenchmarkStats]) -> str:
+        """Generate a comprehensive performance report."""
+        report = []
+        report.append("=" * 80)
+        report.append("MODEL PERFORMANCE BENCHMARK RESULTS")
+        report.append("=" * 80)
+        report.append("")
+
+        # Group by test type for easier comparison
+        by_test_type = {}
+        for stat in stats:
+            if stat.test_type not in by_test_type:
+                by_test_type[stat.test_type] = []
+            by_test_type[stat.test_type].append(stat)
+
+        for test_type, test_stats in by_test_type.items():
+            report.append(f"\n{test_type.upper().replace('_', ' ')} RESULTS")
+            report.append("-" * 50)
+
+            # Sort by average response time
+            test_stats.sort(key=lambda x: x.avg_total_response_time)
+
+            report.append(
+                f"{'Model':<25} {'Provider':<12} {'Avg Time':<10} {'First Token':<12} {'Chars/sec':<10} {'Success':<8}"
+            )
+            report.append("-" * 85)
+
+            for stat in test_stats:
+                report.append(
+                    f"{stat.model_name:<25} {stat.provider:<12} "
+                    f"{stat.avg_total_response_time:<10.2f} {stat.avg_first_token_latency:<12.2f} "
+                    f"{stat.avg_tokens_per_second:<10.1f} {stat.success_rate:<8.0f}%"
+                )
+
+        # Overall performance ranking
+        report.append("\n\nOVERALL PERFORMANCE RANKING")
+        report.append("-" * 40)
+
+        # Calculate overall average performance
+        overall_performance = {}
+        for stat in stats:
+            key = f"{stat.provider} - {stat.model_name}"
+            if key not in overall_performance:
+                overall_performance[key] = []
+            overall_performance[key].append(stat.avg_total_response_time)
+
+        # Calculate average across all test types
+        overall_rankings = []
+        for model, times in overall_performance.items():
+            avg_time = statistics.mean(times)
+            overall_rankings.append((model, avg_time))
+
+        overall_rankings.sort(key=lambda x: x[1])
+
+        report.append(f"{'Rank':<5} {'Model':<35} {'Avg Response Time':<18}")
+        report.append("-" * 60)
+
+        for i, (model, avg_time) in enumerate(overall_rankings, 1):
+            report.append(f"{i:<5} {model:<35} {avg_time:<18.2f}s")
+
+        return "\n".join(report)
+
+    def save_results(
+        self, stats: List[BenchmarkStats], filename: str = "benchmark_results.json"
+    ):
+        """Save detailed results to JSON file."""
+        output = {
+            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+            "configuration": {
+                "iterations_per_test": self.iterations_per_test,
+                "models_tested": [
+                    f"{m['provider'].value}-{m['model']}" for m in self.models_to_test
+                ],
+                "test_scenarios": list(self.test_scenarios.keys()),
+            },
+            "raw_results": [asdict(result) for result in self.results],
+            "statistics": [asdict(stat) for stat in stats],
+        }
+
+        with open(filename, "w") as f:
+            json.dump(output, f, indent=2)
+
+        print(f"\nDetailed results saved to: {filename}")
+
+
+async def main():
+    """Main benchmark execution."""
+    print("Vectara Agentic Model Performance Benchmark")
+    print("=" * 50)
+
+    # Check if observability should be enabled via environment variable
+    enable_observability = os.getenv("ENABLE_OBSERVABILITY", "false").lower() == "true"
+    benchmark = ModelBenchmark(enable_observability=enable_observability)
+
+    try:
+        await benchmark.run_benchmarks()
+
+        # Calculate and display results
+        stats = benchmark.calculate_statistics()
+        report = benchmark.generate_report(stats)
+
+        print("\n" + report)
+
+        # Save results
+        benchmark.save_results(stats)
+
+    except KeyboardInterrupt:
+        print("\nBenchmark interrupted by user")
+    except Exception as e:
+        print(f"\nBenchmark failed with error: {e}")
+        import traceback
+
+        traceback.print_exc()
+    finally:
+        # Cleanup observability
+        if enable_observability and _observability_initialized:
+            shutdown_observer()
+            print("\n🔄 Arize Phoenix observability shutdown complete")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
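The new script runs standalone: executing it directly calls asyncio.run(main()), and setting the ENABLE_OBSERVABILITY environment variable to "true" turns on Arize Phoenix tracing. The snippet below is a minimal sketch of driving the ModelBenchmark class programmatically instead, assuming the relevant provider API keys are set in the environment and that the script is importable from the tests directory; the reduced model list, single iteration, and output filename are illustrative choices, not part of the released file.

# Minimal sketch of a programmatic smoke run (assumes provider API keys are configured).
import asyncio

from benchmark_models import ModelBenchmark  # tests/benchmark_models.py added in 0.4.2


async def smoke_run():
    benchmark = ModelBenchmark(enable_observability=False)
    # Illustrative overrides to keep the run short: one model, one iteration per scenario.
    benchmark.models_to_test = benchmark.models_to_test[:1]
    benchmark.iterations_per_test = 1

    await benchmark.run_benchmarks()

    # Aggregate the per-run results, print the formatted report, and save raw data to JSON.
    stats = benchmark.calculate_statistics()
    print(benchmark.generate_report(stats))
    benchmark.save_results(stats, filename="smoke_benchmark_results.json")


if __name__ == "__main__":
    asyncio.run(smoke_run())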