vectara-agentic 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (42)
  1. tests/__init__.py +1 -0
  2. tests/benchmark_models.py +1120 -0
  3. tests/conftest.py +18 -16
  4. tests/endpoint.py +9 -5
  5. tests/run_tests.py +3 -0
  6. tests/test_agent.py +52 -8
  7. tests/test_agent_type.py +2 -0
  8. tests/test_api_endpoint.py +13 -13
  9. tests/test_bedrock.py +9 -1
  10. tests/test_fallback.py +19 -8
  11. tests/test_gemini.py +14 -40
  12. tests/test_groq.py +9 -1
  13. tests/test_private_llm.py +20 -7
  14. tests/test_react_error_handling.py +293 -0
  15. tests/test_react_memory.py +257 -0
  16. tests/test_react_streaming.py +135 -0
  17. tests/test_react_workflow_events.py +395 -0
  18. tests/test_return_direct.py +1 -0
  19. tests/test_serialization.py +58 -20
  20. tests/test_together.py +9 -1
  21. tests/test_tools.py +3 -1
  22. tests/test_vectara_llms.py +2 -2
  23. tests/test_vhc.py +7 -2
  24. tests/test_workflow.py +17 -11
  25. vectara_agentic/_callback.py +79 -21
  26. vectara_agentic/_observability.py +19 -0
  27. vectara_agentic/_version.py +1 -1
  28. vectara_agentic/agent.py +89 -21
  29. vectara_agentic/agent_core/factory.py +5 -6
  30. vectara_agentic/agent_core/prompts.py +3 -4
  31. vectara_agentic/agent_core/serialization.py +12 -10
  32. vectara_agentic/agent_core/streaming.py +245 -68
  33. vectara_agentic/agent_core/utils/schemas.py +2 -2
  34. vectara_agentic/llm_utils.py +6 -2
  35. vectara_agentic/sub_query_workflow.py +3 -2
  36. vectara_agentic/tools.py +0 -19
  37. {vectara_agentic-0.4.1.dist-info → vectara_agentic-0.4.3.dist-info}/METADATA +156 -61
  38. vectara_agentic-0.4.3.dist-info/RECORD +58 -0
  39. vectara_agentic-0.4.1.dist-info/RECORD +0 -53
  40. {vectara_agentic-0.4.1.dist-info → vectara_agentic-0.4.3.dist-info}/WHEEL +0 -0
  41. {vectara_agentic-0.4.1.dist-info → vectara_agentic-0.4.3.dist-info}/licenses/LICENSE +0 -0
  42. {vectara_agentic-0.4.1.dist-info → vectara_agentic-0.4.3.dist-info}/top_level.txt +0 -0
tests/benchmark_models.py (new file)
@@ -0,0 +1,1120 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Model Performance Benchmark Script
4
+
5
+ This script benchmarks different LLM models for latency and performance
6
+ in the context of Vectara Agentic framework.
7
+ """
8
+
9
+ import asyncio
10
+ import time
11
+ import json
12
+ import statistics
13
+ import sys
14
+ import os
15
+ import random
16
+ from typing import Dict, List, Tuple, Any, Set
17
+ from dataclasses import dataclass, asdict
18
+
19
+ # Add the current directory to Python path to import vectara_agentic
20
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
21
+
22
+ from vectara_agentic.agent import Agent
23
+ from vectara_agentic.agent_config import AgentConfig
24
+ from vectara_agentic.types import ModelProvider, ObserverType
25
+ from vectara_agentic.tools import ToolsFactory
26
+ from vectara_agentic._observability import setup_observer, shutdown_observer
27
+
28
+ # Initialize observability once at startup to prevent repeated instrumentation
29
+ _observability_initialized = False
30
+
31
+
32
+ def validate_api_keys(models_to_test: List[Dict]) -> None:
33
+ """
34
+ Validate that all required API keys are present for the models being tested.
35
+
36
+ Args:
37
+ models_to_test: List of model configurations with provider and model info
38
+
39
+ Raises:
40
+ SystemExit: If any required API keys are missing
41
+ """
42
+ # Map providers to their required environment variables
43
+ provider_api_keys = {
44
+ ModelProvider.OPENAI: "OPENAI_API_KEY",
45
+ ModelProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
46
+ ModelProvider.TOGETHER: "TOGETHER_API_KEY",
47
+ ModelProvider.GROQ: "GROQ_API_KEY",
48
+ ModelProvider.GEMINI: "GOOGLE_API_KEY",
49
+ }
50
+
51
+ required_keys = set()
52
+
53
+ # Collect unique providers from models to test
54
+ providers_in_use: Set[ModelProvider] = set()
55
+ for model_config in models_to_test:
56
+ providers_in_use.add(model_config["provider"])
57
+
58
+ # Add required API keys for each provider
59
+ for provider in providers_in_use:
60
+ api_key_name = provider_api_keys.get(provider)
61
+ if api_key_name: # Skip providers that don't use env var API keys
62
+ required_keys.add(api_key_name)
63
+
64
+ # Check for missing API keys
65
+ missing_keys = []
66
+ for key in required_keys:
67
+ if not os.getenv(key):
68
+ missing_keys.append(key)
69
+
70
+ if missing_keys:
71
+ print("❌ ERROR: Missing required API keys for benchmark execution:")
72
+ print()
73
+ for key in sorted(missing_keys):
74
+ print(f" • {key}")
75
+ print()
76
+ print("Please set these environment variables before running the benchmark.")
77
+ print("Providers being tested:")
78
+ for provider in sorted(providers_in_use, key=lambda p: p.value):
79
+ models_for_provider = [
80
+ m["model"] for m in models_to_test if m["provider"] == provider
81
+ ]
82
+ print(f" • {provider.value}: {', '.join(models_for_provider)}")
83
+
84
+ sys.exit(1)
85
+
86
+ print("✅ All required API keys are present")
87
+ print(f"Found API keys for {len(required_keys)} required environment variables")
88
+
89
+
90
+ @dataclass
91
+ class BenchmarkResult:
92
+ """Results from a single benchmark run."""
93
+
94
+ model_name: str
95
+ provider: str
96
+ test_type: str
97
+ first_token_latency: float
98
+ total_response_time: float
99
+ response_length: int
100
+ tokens_per_second: float
101
+ error: str = None
102
+
103
+
104
+ @dataclass
105
+ class BenchmarkStats:
106
+ """Aggregated statistics for multiple runs."""
107
+
108
+ model_name: str
109
+ provider: str
110
+ test_type: str
111
+ runs: int
112
+ avg_first_token_latency: float
113
+ avg_total_response_time: float
114
+ avg_tokens_per_second: float
115
+ median_first_token_latency: float
116
+ median_total_response_time: float
117
+ median_tokens_per_second: float
118
+ min_total_response_time: float
119
+ max_total_response_time: float
120
+ std_total_response_time: float
121
+ success_rate: float
122
+
123
+
124
+ class ModelBenchmark:
125
+ """Benchmarking suite for different LLM models."""
126
+
127
+ def __init__(
128
+ self, enable_observability: bool = False, max_concurrent_models: int = 2
129
+ ):
130
+ # Test configurations
131
+ self.enable_observability = enable_observability
132
+ self.max_concurrent_models = max_concurrent_models
133
+ self.models_to_test = [
134
+ # OpenAI models
135
+ {"provider": ModelProvider.OPENAI, "model": "gpt-5-mini"},
136
+ {"provider": ModelProvider.OPENAI, "model": "gpt-4o-mini"},
137
+ {"provider": ModelProvider.OPENAI, "model": "gpt-4.1-mini"},
138
+ {"provider": ModelProvider.ANTHROPIC, "model": "claude-sonnet-4-20250514"},
139
+ {"provider": ModelProvider.TOGETHER, "model": "deepseek-ai/DeepSeek-V3"},
140
+ {"provider": ModelProvider.GROQ, "model": "openai/gpt-oss-20b"},
141
+ {"provider": ModelProvider.GEMINI, "model": "models/gemini-2.5-flash-lite"},
142
+ ]
143
+
144
+ # Test scenarios - focused on advanced tool calling only
145
+ self.test_scenarios = {
146
+ "financial_analysis": {
147
+ "prompt": "Analyze a $50,000 investment portfolio with 60% stocks (8% return), 30% bonds (4% return), and 10% cash (1% return). Calculate the expected annual return, then determine how the portfolio value would grow over 15 years with monthly contributions of $1,000. Create a summary report of the analysis.",
148
+ "description": "Multi-step financial analysis with calculations and reporting",
149
+ "needs_tools": True,
150
+ },
151
+ "data_processing": {
152
+ "prompt": "Generate a dataset of 100 customers with randomized demographics (age, income, location, purchase_history). Then analyze this data to find correlations between age groups and spending patterns. Create a statistical summary and export the results to a formatted report.",
153
+ "description": "Data generation, analysis, and reporting workflow",
154
+ "needs_tools": True,
155
+ },
156
+ "research_synthesis": {
157
+ "prompt": "Search for information about the latest developments in quantum computing, specifically focusing on error correction breakthroughs in 2024. Extract key findings from multiple sources, summarize the technical approaches, and create a structured research report with citations.",
158
+ "description": "Information retrieval, synthesis, and document generation",
159
+ "needs_tools": True,
160
+ },
161
+ "system_monitoring": {
162
+ "prompt": "Check system performance metrics including CPU usage, memory consumption, and disk space. If any metrics exceed safe thresholds (CPU > 80%, Memory > 90%, Disk > 85%), generate alerts and suggest optimization strategies. Create a monitoring report with recommendations.",
163
+ "description": "System monitoring with conditional logic and reporting",
164
+ "needs_tools": True,
165
+ },
166
+ "workflow_automation": {
167
+ "prompt": "Create a project task list with 10 software development tasks, assign priorities and estimated hours, then simulate a sprint planning session by organizing tasks into a 2-week sprint. Generate a sprint backlog with daily breakdowns and resource allocation recommendations.",
168
+ "description": "Complex workflow orchestration with multiple tool interactions",
169
+ "needs_tools": True,
170
+ },
171
+ }
172
+
173
+ self.iterations_per_test = 5
174
+ self.results: List[BenchmarkResult] = []
175
+
176
+ # Provider-specific rate limits (requests per minute)
177
+ self.provider_rate_limits = {
178
+ ModelProvider.OPENAI: 100,
179
+ ModelProvider.ANTHROPIC: 100,
180
+ ModelProvider.TOGETHER: 80,
181
+ ModelProvider.GROQ: 50, # Conservative for GROQ
182
+ ModelProvider.GEMINI: 60,
183
+ }
184
+
185
+ def create_agent_config(
186
+ self, provider: ModelProvider, model_name: str
187
+ ) -> AgentConfig:
188
+ """Create agent configuration for the specified model."""
189
+ return AgentConfig(
190
+ main_llm_provider=provider,
191
+ main_llm_model_name=model_name,
192
+ tool_llm_provider=provider,
193
+ tool_llm_model_name=model_name,
194
+ observer=(
195
+ ObserverType.ARIZE_PHOENIX
196
+ if self.enable_observability
197
+ else ObserverType.NO_OBSERVER
198
+ ),
199
+ )
200
+
201
+ def analyze_customer_data(self, customer_data_json: str) -> dict:
202
+ """Analyze customer data for patterns and correlations."""
203
+ customers = json.loads(customer_data_json)
204
+
205
+ # Group by age groups
206
+ age_groups = {}
207
+ for customer in customers:
208
+ group = customer["age_group"]
209
+ if group not in age_groups:
210
+ age_groups[group] = {
211
+ "count": 0,
212
+ "total_spending": 0,
213
+ "total_income": 0,
214
+ }
215
+
216
+ age_groups[group]["count"] += 1
217
+ age_groups[group]["total_spending"] += customer["purchase_history"]
218
+ age_groups[group]["total_income"] += customer["income"]
219
+
220
+ # Calculate averages
221
+ analysis = {}
222
+ for group, data in age_groups.items():
223
+ analysis[group] = {
224
+ "count": data["count"],
225
+ "avg_spending": round(data["total_spending"] / data["count"], 2),
226
+ "avg_income": round(data["total_income"] / data["count"], 2),
227
+ "spending_to_income_ratio": round(
228
+ (data["total_spending"] / data["count"])
229
+ / (data["total_income"] / data["count"])
230
+ * 1000,
231
+ 4,
232
+ ),
233
+ }
234
+
235
+ return {
236
+ "total_customers": len(customers),
237
+ "age_group_analysis": analysis,
238
+ "overall_avg_spending": round(
239
+ sum(c["purchase_history"] for c in customers) / len(customers), 2
240
+ ),
241
+ "overall_avg_income": round(
242
+ sum(c["income"] for c in customers) / len(customers), 2
243
+ ),
244
+ }
245
+
246
+ def get_system_metrics(self) -> dict:
247
+ """Get current system performance metrics."""
248
+ import psutil
249
+ from datetime import datetime
250
+
251
+ try:
252
+ cpu_percent = psutil.cpu_percent(interval=1)
253
+ memory = psutil.virtual_memory()
254
+ disk = psutil.disk_usage("/")
255
+
256
+ return {
257
+ "cpu_usage_percent": cpu_percent,
258
+ "memory_usage_percent": memory.percent,
259
+ "memory_available_gb": round(memory.available / (1024**3), 2),
260
+ "disk_usage_percent": disk.percent,
261
+ "disk_free_gb": round(disk.free / (1024**3), 2),
262
+ "timestamp": datetime.now().isoformat(),
263
+ }
264
+ except Exception:
265
+ # Fallback with simulated data for testing
266
+ return {
267
+ "cpu_usage_percent": random.randint(20, 95),
268
+ "memory_usage_percent": random.randint(40, 95),
269
+ "memory_available_gb": random.randint(1, 16),
270
+ "disk_usage_percent": random.randint(30, 90),
271
+ "disk_free_gb": random.randint(10, 500),
272
+ "timestamp": datetime.now().isoformat(),
273
+ "note": "Simulated data - psutil unavailable",
274
+ }
275
+
276
+ def check_system_health(
277
+ self,
278
+ cpu_threshold: int = 80,
279
+ memory_threshold: int = 90,
280
+ disk_threshold: int = 85,
281
+ ) -> dict:
282
+ """Check system health against thresholds and generate alerts."""
283
+ metrics = self.get_system_metrics()
284
+ alerts = []
285
+ recommendations = []
286
+
287
+ if metrics["cpu_usage_percent"] > cpu_threshold:
288
+ alerts.append(
289
+ f"HIGH CPU USAGE: {metrics['cpu_usage_percent']}% (threshold: {cpu_threshold}%)"
290
+ )
291
+ recommendations.append(
292
+ "Consider closing unnecessary applications or upgrading CPU"
293
+ )
294
+
295
+ if metrics["memory_usage_percent"] > memory_threshold:
296
+ alerts.append(
297
+ f"HIGH MEMORY USAGE: {metrics['memory_usage_percent']}% (threshold: {memory_threshold}%)"
298
+ )
299
+ recommendations.append(
300
+ "Close memory-intensive applications or add more RAM"
301
+ )
302
+
303
+ if metrics["disk_usage_percent"] > disk_threshold:
304
+ alerts.append(
305
+ f"LOW DISK SPACE: {metrics['disk_usage_percent']}% used (threshold: {disk_threshold}%)"
306
+ )
307
+ recommendations.append("Clean up temporary files or expand disk storage")
308
+
309
+ health_status = (
310
+ "CRITICAL" if len(alerts) >= 2 else "WARNING" if alerts else "HEALTHY"
311
+ )
312
+
313
+ return {
314
+ "health_status": health_status,
315
+ "alerts": alerts,
316
+ "recommendations": recommendations,
317
+ "metrics": metrics,
318
+ "thresholds": {
319
+ "cpu": cpu_threshold,
320
+ "memory": memory_threshold,
321
+ "disk": disk_threshold,
322
+ },
323
+ }
324
+
325
+ def create_project_tasks(self, count: int = 10) -> str:
326
+ """Generate a list of software development tasks."""
327
+ task_types = [
328
+ "Implement user authentication system",
329
+ "Create REST API endpoints",
330
+ "Design database schema",
331
+ "Build responsive frontend components",
332
+ "Write unit tests",
333
+ "Set up CI/CD pipeline",
334
+ "Implement error handling",
335
+ "Create API documentation",
336
+ "Optimize database queries",
337
+ "Implement caching layer",
338
+ "Add logging and monitoring",
339
+ "Create user dashboard",
340
+ "Implement search functionality",
341
+ "Add data validation",
342
+ "Create admin panel",
343
+ ]
344
+
345
+ tasks = []
346
+ for i in range(count):
347
+ task = random.choice(task_types)
348
+ priority = random.choice(["High", "Medium", "Low"])
349
+ estimated_hours = random.randint(2, 24)
350
+
351
+ tasks.append(
352
+ {
353
+ "task_id": f"TASK-{i+1:03d}",
354
+ "title": f"{task} #{i+1}",
355
+ "priority": priority,
356
+ "estimated_hours": estimated_hours,
357
+ "status": "Backlog",
358
+ "assigned_to": None,
359
+ }
360
+ )
361
+
362
+ return json.dumps(tasks, indent=2)
363
+
364
+ def plan_sprint(self, tasks_json: str, sprint_capacity_hours: int = 80) -> dict:
365
+ """Organize tasks into a sprint with daily breakdowns."""
366
+ tasks = json.loads(tasks_json)
367
+
368
+ # Sort by priority and estimated hours
369
+ priority_order = {"High": 3, "Medium": 2, "Low": 1}
370
+ tasks.sort(
371
+ key=lambda x: (priority_order[x["priority"]], -x["estimated_hours"]),
372
+ reverse=True,
373
+ )
374
+
375
+ sprint_tasks = []
376
+ total_hours = 0
377
+
378
+ for task in tasks:
379
+ if total_hours + task["estimated_hours"] <= sprint_capacity_hours:
380
+ sprint_tasks.append(task)
381
+ total_hours += task["estimated_hours"]
382
+ else:
383
+ break
384
+
385
+ # Distribute across 2 weeks (10 working days)
386
+ daily_breakdown = []
387
+ remaining_hours = total_hours
388
+ days_remaining = 10
389
+
390
+ for day in range(1, 11):
391
+ if days_remaining > 0:
392
+ day_hours = min(
393
+ 8,
394
+ remaining_hours // days_remaining
395
+ + (1 if remaining_hours % days_remaining else 0),
396
+ )
397
+ daily_breakdown.append(
398
+ {
399
+ "day": day,
400
+ "planned_hours": day_hours,
401
+ "remaining_capacity": 8 - day_hours,
402
+ }
403
+ )
404
+ remaining_hours -= day_hours
405
+ days_remaining -= 1
406
+
407
+ return {
408
+ "sprint_summary": {
409
+ "total_tasks": len(sprint_tasks),
410
+ "total_planned_hours": total_hours,
411
+ "sprint_capacity": sprint_capacity_hours,
412
+ "utilization_percent": round(
413
+ (total_hours / sprint_capacity_hours) * 100, 1
414
+ ),
415
+ },
416
+ "selected_tasks": sprint_tasks,
417
+ "daily_breakdown": daily_breakdown,
418
+ "backlog_remaining": len(tasks) - len(sprint_tasks),
419
+ }
420
+
421
+ def create_formatted_report(
422
+ self, title: str, data: dict, report_type: str = "summary"
423
+ ) -> str:
424
+ """Create a formatted text report from structured data."""
425
+ from datetime import datetime
426
+
427
+ report_lines = []
428
+ report_lines.append("=" * 60)
429
+ report_lines.append(f"{title.upper()}")
430
+ report_lines.append("=" * 60)
431
+ report_lines.append(
432
+ f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
433
+ )
434
+ report_lines.append(f"Report Type: {report_type.title()}")
435
+ report_lines.append("")
436
+
437
+ def format_dict(d, indent=0):
438
+ lines = []
439
+ for key, value in d.items():
440
+ prefix = " " * indent
441
+ if isinstance(value, dict):
442
+ lines.append(f"{prefix}{key.replace('_', ' ').title()}:")
443
+ lines.extend(format_dict(value, indent + 1))
444
+ elif isinstance(value, list):
445
+ lines.append(f"{prefix}{key.replace('_', ' ').title()}:")
446
+ for i, item in enumerate(value):
447
+ if isinstance(item, dict):
448
+ lines.append(f"{prefix} Item {i+1}:")
449
+ lines.extend(format_dict(item, indent + 2))
450
+ else:
451
+ lines.append(f"{prefix} - {item}")
452
+ else:
453
+ lines.append(f"{prefix}{key.replace('_', ' ').title()}: {value}")
454
+ return lines
455
+
456
+ report_lines.extend(format_dict(data))
457
+ report_lines.append("")
458
+ report_lines.append("=" * 60)
459
+
460
+ return "\n".join(report_lines)
461
+
462
+ def search_information(self, query: str, max_results: int = 5) -> dict:
463
+ """Simulate information search with structured results."""
464
+ from datetime import datetime
465
+
466
+ # Simulated search results for testing
467
+ simulated_results = [
468
+ {
469
+ "title": f"Research Paper: {query} - Latest Developments",
470
+ "source": "Journal of Advanced Computing",
471
+ "summary": f"Recent breakthrough in {query} showing promising results in error reduction and scalability improvements.",
472
+ "relevance_score": random.randint(80, 95),
473
+ "publication_date": "2024-11-15",
474
+ },
475
+ {
476
+ "title": f"Technical Review: {query} Implementation Challenges",
477
+ "source": "Tech Innovation Quarterly",
478
+ "summary": f"Comprehensive analysis of current {query} methodologies and their practical applications.",
479
+ "relevance_score": random.randint(75, 90),
480
+ "publication_date": "2024-10-22",
481
+ },
482
+ {
483
+ "title": f"Industry Report: {query} Market Trends",
484
+ "source": "Technology Research Institute",
485
+ "summary": f"Market analysis and future projections for {query} adoption across industries.",
486
+ "relevance_score": random.randint(70, 85),
487
+ "publication_date": "2024-09-30",
488
+ },
489
+ ]
490
+
491
+ return {
492
+ "query": query,
493
+ "total_results": len(simulated_results),
494
+ "results": simulated_results[:max_results],
495
+ "search_timestamp": datetime.now().isoformat(),
496
+ }
497
+
498
+ def synthesize_research(self, search_results: dict) -> dict:
499
+ """Synthesize research findings into structured summary."""
500
+ from datetime import datetime
501
+
502
+ results = search_results["results"]
503
+
504
+ key_findings = []
505
+ technical_approaches = []
506
+ citations = []
507
+
508
+ for i, result in enumerate(results, 1):
509
+ key_findings.append(f"Finding {i}: {result['summary']}")
510
+ technical_approaches.append(
511
+ f"Approach {i}: Methodology described in '{result['title']}'"
512
+ )
513
+ citations.append(
514
+ f"[{i}] {result['title']} - {result['source']} ({result['publication_date']})"
515
+ )
516
+
517
+ return {
518
+ "research_topic": search_results["query"],
519
+ "sources_analyzed": len(results),
520
+ "key_findings": key_findings,
521
+ "technical_approaches": technical_approaches,
522
+ "citations": citations,
523
+ "confidence_level": "High" if len(results) >= 3 else "Medium",
524
+ "synthesis_date": datetime.now().isoformat(),
525
+ }
526
+
527
+ def create_test_tools(self) -> List:
528
+ """Create an advanced set of tools for realistic agent testing."""
529
+ tools_factory = ToolsFactory()
530
+
531
+ # Financial Analysis Tools
532
+ def calculate_portfolio_return(
533
+ stocks_pct: float,
534
+ stocks_return: float,
535
+ bonds_pct: float,
536
+ bonds_return: float,
537
+ cash_pct: float,
538
+ cash_return: float,
539
+ ) -> dict:
540
+ """Calculate expected portfolio return and allocation details."""
541
+ total_allocation = stocks_pct + bonds_pct + cash_pct
542
+ if abs(total_allocation - 100) > 0.01:
543
+ raise ValueError(
544
+ f"Portfolio allocation must sum to 100%, got {total_allocation}%"
545
+ )
546
+
547
+ expected_return = (
548
+ stocks_pct * stocks_return
549
+ + bonds_pct * bonds_return
550
+ + cash_pct * cash_return
551
+ ) / 100
552
+
553
+ return {
554
+ "expected_annual_return_pct": expected_return,
555
+ "allocation": {
556
+ "stocks": {"percentage": stocks_pct, "return": stocks_return},
557
+ "bonds": {"percentage": bonds_pct, "return": bonds_return},
558
+ "cash": {"percentage": cash_pct, "return": cash_return},
559
+ },
560
+ "risk_profile": (
561
+ "aggressive"
562
+ if stocks_pct > 70
563
+ else "moderate" if stocks_pct > 40 else "conservative"
564
+ ),
565
+ }
566
+
567
+ def project_investment_growth(
568
+ initial_amount: float,
569
+ annual_return: float,
570
+ years: int,
571
+ monthly_contribution: float = 0,
572
+ ) -> dict:
573
+ """Project investment growth with optional monthly contributions."""
574
+ monthly_rate = annual_return / 12 / 100
575
+ months = years * 12
576
+
577
+ # Calculate compound growth with monthly contributions
578
+ if monthly_contribution > 0:
579
+ # Future value of initial investment
580
+ fv_initial = initial_amount * ((1 + monthly_rate) ** months)
581
+ # Future value of monthly contributions (ordinary annuity)
582
+ fv_contributions = monthly_contribution * (
583
+ ((1 + monthly_rate) ** months - 1) / monthly_rate
584
+ )
585
+ final_value = fv_initial + fv_contributions
586
+ total_contributions = monthly_contribution * months
587
+ else:
588
+ final_value = initial_amount * ((1 + annual_return / 100) ** years)
589
+ total_contributions = 0
590
+
591
+ total_invested = initial_amount + total_contributions
592
+ total_gains = final_value - total_invested
593
+
594
+ return {
595
+ "initial_investment": initial_amount,
596
+ "monthly_contribution": monthly_contribution,
597
+ "total_contributions": total_contributions,
598
+ "total_invested": total_invested,
599
+ "final_value": round(final_value, 2),
600
+ "total_gains": round(total_gains, 2),
601
+ "return_multiple": round(final_value / initial_amount, 2),
602
+ "years": years,
603
+ "annual_return_used": annual_return,
604
+ }
605
+
606
+ # Data Analysis Tools
607
+ def generate_customer_dataset(count: int) -> str:
608
+ """Generate randomized customer data for analysis."""
609
+ customers = []
610
+ locations = [
611
+ "New York",
612
+ "Los Angeles",
613
+ "Chicago",
614
+ "Houston",
615
+ "Phoenix",
616
+ "Philadelphia",
617
+ "San Antonio",
618
+ "San Diego",
619
+ "Dallas",
620
+ "San Jose",
621
+ ]
622
+
623
+ for i in range(count):
624
+ age = random.randint(18, 75)
625
+ income = random.randint(25000, 150000)
626
+ location = random.choice(locations)
627
+ purchase_history = random.randint(1, 50)
628
+
629
+ customers.append(
630
+ {
631
+ "customer_id": f"CUST_{i+1:04d}",
632
+ "age": age,
633
+ "income": income,
634
+ "location": location,
635
+ "purchase_history": purchase_history,
636
+ "age_group": (
637
+ "18-30"
638
+ if age <= 30
639
+ else (
640
+ "31-45"
641
+ if age <= 45
642
+ else "46-60" if age <= 60 else "60+"
643
+ )
644
+ ),
645
+ }
646
+ )
647
+
648
+ return json.dumps(customers, indent=2)
649
+
650
+ # Create and return all tools
651
+ return [
652
+ # Financial Analysis
653
+ tools_factory.create_tool(calculate_portfolio_return, vhc_eligible=False),
654
+ tools_factory.create_tool(project_investment_growth, vhc_eligible=False),
655
+ # Data Analysis
656
+ tools_factory.create_tool(generate_customer_dataset, vhc_eligible=False),
657
+ tools_factory.create_tool(self.analyze_customer_data, vhc_eligible=False),
658
+ # System Monitoring
659
+ tools_factory.create_tool(self.get_system_metrics, vhc_eligible=False),
660
+ tools_factory.create_tool(self.check_system_health, vhc_eligible=False),
661
+ # Project Management
662
+ tools_factory.create_tool(self.create_project_tasks, vhc_eligible=False),
663
+ tools_factory.create_tool(self.plan_sprint, vhc_eligible=False),
664
+ # Reporting
665
+ tools_factory.create_tool(self.create_formatted_report, vhc_eligible=False),
666
+ # Research
667
+ tools_factory.create_tool(self.search_information, vhc_eligible=False),
668
+ tools_factory.create_tool(self.synthesize_research, vhc_eligible=False),
669
+ ]
670
+
671
+ def _calculate_provider_delay(self, provider: ModelProvider) -> float:
672
+ """Calculate appropriate delay based on provider rate limits."""
673
+ base_delay = 60.0 / self.provider_rate_limits.get(
674
+ provider, 60
675
+ ) # seconds between requests
676
+ # Add jitter to prevent thundering herd
677
+ jitter = random.uniform(0.5, 1.5)
678
+ return base_delay * jitter * 2 # Extra conservative multiplier
679
+
680
+ async def _retry_with_backoff(
681
+ self, func, max_retries: int = 3, base_delay: float = 1.0
682
+ ):
683
+ """Retry function with exponential backoff on rate limit errors."""
684
+ for attempt in range(max_retries):
685
+ try:
686
+ return await func()
687
+ except Exception as e:
688
+ error_str = str(e).lower()
689
+ if "rate limit" in error_str or "429" in error_str:
690
+ if attempt == max_retries - 1:
691
+ raise # Last attempt, re-raise the error
692
+
693
+ # Calculate backoff delay
694
+ delay = base_delay * (2**attempt) + random.uniform(0, 1)
695
+ print(
696
+ f" ⏳ Rate limit hit, retrying in {delay:.1f}s (attempt {attempt + 1}/{max_retries})"
697
+ )
698
+ await asyncio.sleep(delay)
699
+ else:
700
+ raise # Non-rate-limit error, don't retry
701
+
702
+ async def measure_streaming_response(
703
+ self, agent: Agent, prompt: str
704
+ ) -> Tuple[float, float, int]:
705
+ """
706
+ Measure streaming response metrics.
707
+ Returns: (first_token_latency, total_time, response_length)
708
+ """
709
+ start_time = time.time()
710
+ first_token_time = None
711
+ response_text = ""
712
+
713
+ try:
714
+ streaming_response = await agent.astream_chat(prompt)
715
+
716
+ # Check if we have the async_response_gen method
717
+ if hasattr(streaming_response, "async_response_gen") and callable(
718
+ streaming_response.async_response_gen
719
+ ):
720
+ async for token in streaming_response.async_response_gen():
721
+ if first_token_time is None:
722
+ first_token_time = time.time()
723
+ response_text += str(token)
724
+
725
+ # Get final response
726
+ final_response = await streaming_response.aget_response()
727
+ if hasattr(final_response, "response") and final_response.response:
728
+ response_text = final_response.response
729
+
730
+ end_time = time.time()
731
+ total_time = end_time - start_time
732
+ first_token_latency = (
733
+ (first_token_time - start_time) if first_token_time else total_time
734
+ )
735
+
736
+ return first_token_latency, total_time, len(response_text)
737
+
738
+ except Exception as e:
739
+ end_time = time.time()
740
+ print(f"Error during streaming: {e}")
741
+ return -1, end_time - start_time, 0
742
+
743
+ async def run_single_benchmark(
744
+ self,
745
+ provider: ModelProvider,
746
+ model_name: str,
747
+ test_name: str,
748
+ test_config: Dict[str, Any],
749
+ ) -> BenchmarkResult:
750
+ """Run a single benchmark iteration."""
751
+ try:
752
+ # Create agent configuration
753
+ config = self.create_agent_config(provider, model_name)
754
+
755
+ # Create tools if needed
756
+ tools = (
757
+ self.create_test_tools()
758
+ if test_config.get("needs_tools", False)
759
+ else []
760
+ )
761
+
762
+ # Create agent
763
+ agent = Agent.from_tools(
764
+ tools=tools,
765
+ topic="benchmark",
766
+ agent_config=config,
767
+ verbose=False,
768
+ session_id=f"benchmark_{model_name}_{test_name}_{int(time.time())}",
769
+ )
770
+
771
+ # Measure response
772
+ first_token_latency, total_time, response_length = (
773
+ await self.measure_streaming_response(agent, test_config["prompt"])
774
+ )
775
+
776
+ # Calculate tokens per second (approximate)
777
+ tokens_per_second = response_length / total_time if total_time > 0 else 0
778
+
779
+ # Note: Skip per-agent cleanup to avoid OpenTelemetry uninstrumentation warnings
780
+
781
+ return BenchmarkResult(
782
+ model_name=model_name,
783
+ provider=provider.value,
784
+ test_type=test_name,
785
+ first_token_latency=first_token_latency,
786
+ total_response_time=total_time,
787
+ response_length=response_length,
788
+ tokens_per_second=tokens_per_second,
789
+ )
790
+
791
+ except Exception as e:
792
+ return BenchmarkResult(
793
+ model_name=model_name,
794
+ provider=provider.value,
795
+ test_type=test_name,
796
+ first_token_latency=-1,
797
+ total_response_time=-1,
798
+ response_length=0,
799
+ tokens_per_second=0,
800
+ error=str(e),
801
+ )
802
+
803
+ async def run_benchmarks(self):
804
+ """Run all benchmark combinations with parallel execution."""
805
+ global _observability_initialized
806
+
807
+ print("Starting model performance benchmarks...")
808
+ print(
809
+ f"Testing {len(self.models_to_test)} models across {len(self.test_scenarios)} scenarios"
810
+ )
811
+ print(f"Running {self.iterations_per_test} iterations per combination")
812
+ print(f"Max concurrent models: {self.max_concurrent_models}\n")
813
+
814
+ # Setup observability once if enabled and not already initialized
815
+ if self.enable_observability and not _observability_initialized:
816
+ dummy_config = AgentConfig(observer=ObserverType.ARIZE_PHOENIX)
817
+ observability_setup = setup_observer(dummy_config, verbose=True)
818
+ if observability_setup:
819
+ print(
820
+ "✅ Arize Phoenix observability enabled - LLM calls will be traced\n"
821
+ )
822
+ _observability_initialized = True
823
+ else:
824
+ print("⚠️ Arize Phoenix observability setup failed\n")
825
+
826
+ # Create semaphore to limit concurrent model testing
827
+ model_semaphore = asyncio.Semaphore(self.max_concurrent_models)
828
+
829
+ # Create tasks for all model benchmarks
830
+ tasks = []
831
+ for model_config in self.models_to_test:
832
+ task = asyncio.create_task(
833
+ self._run_model_benchmark(model_config, model_semaphore)
834
+ )
835
+ tasks.append(task)
836
+
837
+ # Execute all model benchmarks in parallel
838
+ print("🚀 Starting parallel benchmark execution...\n")
839
+ await asyncio.gather(*tasks, return_exceptions=True)
840
+
841
+ async def _run_model_benchmark(
842
+ self, model_config: Dict, semaphore: asyncio.Semaphore
843
+ ):
844
+ """Run all benchmarks for a single model."""
845
+ async with semaphore:
846
+ provider = model_config["provider"]
847
+ model_name = model_config["model"]
848
+
849
+ print(f"\n{'='*60}")
850
+ print(f"Starting: {provider.value} - {model_name}")
851
+ print(f"{'='*60}")
852
+
853
+ # Run all scenarios for this model sequentially to avoid rate limits
854
+ for test_name, test_config in self.test_scenarios.items():
855
+ try:
856
+ await self._run_scenario_benchmark(
857
+ provider, model_name, test_name, test_config
858
+ )
859
+ except Exception as e:
860
+ print(f"❌ Error in {model_name} - {test_name}: {e}")
861
+
862
+ print(f"✅ Completed: {provider.value} - {model_name}")
863
+
864
+ async def _run_scenario_benchmark(
865
+ self,
866
+ provider: ModelProvider,
867
+ model_name: str,
868
+ test_name: str,
869
+ test_config: Dict[str, Any],
870
+ ):
871
+ """Run all iterations for a single test scenario sequentially."""
872
+ print(
873
+ f"\n🔄 Running {model_name}/{test_name}: {test_config['description']}"
874
+ )
875
+
876
+ iteration_results = []
877
+
878
+ # Run iterations sequentially to avoid rate limits
879
+ for iteration in range(self.iterations_per_test):
880
+ iteration_num = iteration + 1
881
+ try:
882
+ # Use retry with backoff for rate limit handling
883
+ async def run_benchmark():
884
+ return await self.run_single_benchmark(
885
+ provider, model_name, test_name, test_config
886
+ )
887
+
888
+ result = await self._retry_with_backoff(
889
+ run_benchmark, max_retries=3, base_delay=2.0
890
+ )
891
+ iteration_results.append(result)
892
+
893
+ if result.error:
894
+ print(
895
+ f" ❌ {model_name}/{test_name} Iteration {iteration_num}: {result.error}"
896
+ )
897
+ else:
898
+ print(
899
+ f" ✅ {model_name}/{test_name} Iteration {iteration_num}: "
900
+ f"{result.total_response_time:.2f}s, "
901
+ f"first token: {result.first_token_latency:.2f}s, "
902
+ f"{result.tokens_per_second:.1f} chars/sec"
903
+ )
904
+
905
+ except Exception as e:
906
+ print(f" ❌ {model_name}/{test_name} Iteration {iteration_num}: {e}")
907
+ # Create error result
908
+ error_result = BenchmarkResult(
909
+ model_name=model_name,
910
+ provider=provider.value,
911
+ test_type=test_name,
912
+ first_token_latency=-1,
913
+ total_response_time=-1,
914
+ response_length=0,
915
+ tokens_per_second=0,
916
+ error=str(e),
917
+ )
918
+ iteration_results.append(error_result)
919
+
920
+ # Add delay between iterations based on provider
921
+ if iteration_num < self.iterations_per_test:
922
+ delay = self._calculate_provider_delay(provider)
923
+ await asyncio.sleep(delay)
924
+
925
+ # Add all results to the main results list
926
+ self.results.extend(iteration_results)
927
+
928
+ # Calculate success rate for this scenario
929
+ successful = len([r for r in iteration_results if r.error is None])
930
+ success_rate = (successful / len(iteration_results)) * 100
931
+ print(
932
+ f" 📊 {model_name}/{test_name} complete: {successful}/{len(iteration_results)} successful ({success_rate:.1f}%)"
933
+ )
934
+
935
+ return iteration_results
936
+
937
+ def calculate_statistics(self) -> List[BenchmarkStats]:
938
+ """Calculate aggregated statistics from results."""
939
+ stats = []
940
+
941
+ # Group results by model and test type
942
+ grouped = {}
943
+ for result in self.results:
944
+ key = (result.model_name, result.provider, result.test_type)
945
+ if key not in grouped:
946
+ grouped[key] = []
947
+ grouped[key].append(result)
948
+
949
+ # Calculate statistics for each group
950
+ for (model_name, provider, test_type), group_results in grouped.items():
951
+ successful_results = [
952
+ r
953
+ for r in group_results
954
+ if r.error is None and r.total_response_time > 0
955
+ ]
956
+
957
+ if not successful_results:
958
+ continue
959
+
960
+ response_times = [r.total_response_time for r in successful_results]
961
+ first_token_times = [r.first_token_latency for r in successful_results]
962
+ tokens_per_sec = [r.tokens_per_second for r in successful_results]
963
+
964
+ stats.append(
965
+ BenchmarkStats(
966
+ model_name=model_name,
967
+ provider=provider,
968
+ test_type=test_type,
969
+ runs=len(group_results),
970
+ avg_first_token_latency=statistics.mean(first_token_times),
971
+ avg_total_response_time=statistics.mean(response_times),
972
+ avg_tokens_per_second=statistics.mean(tokens_per_sec),
973
+ median_first_token_latency=statistics.median(first_token_times),
974
+ median_total_response_time=statistics.median(response_times),
975
+ median_tokens_per_second=statistics.median(tokens_per_sec),
976
+ min_total_response_time=min(response_times),
977
+ max_total_response_time=max(response_times),
978
+ std_total_response_time=(
979
+ statistics.stdev(response_times)
980
+ if len(response_times) > 1
981
+ else 0
982
+ ),
983
+ success_rate=(len(successful_results) / len(group_results)) * 100,
984
+ )
985
+ )
986
+
987
+ return stats
988
+
989
+ def generate_report(self, stats: List[BenchmarkStats]) -> str:
990
+ """Generate a comprehensive performance report."""
991
+ report = []
992
+ report.append("=" * 80)
993
+ report.append("MODEL PERFORMANCE BENCHMARK RESULTS")
994
+ report.append("=" * 80)
995
+ report.append("")
996
+
997
+ # Group by test type for easier comparison
998
+ by_test_type = {}
999
+ for stat in stats:
1000
+ if stat.test_type not in by_test_type:
1001
+ by_test_type[stat.test_type] = []
1002
+ by_test_type[stat.test_type].append(stat)
1003
+
1004
+ for test_type, test_stats in by_test_type.items():
1005
+ report.append(f"\n{test_type.upper().replace('_', ' ')} RESULTS")
1006
+ report.append("-" * 50)
1007
+
1008
+ # Sort by average response time
1009
+ test_stats.sort(key=lambda x: x.avg_total_response_time)
1010
+
1011
+ report.append(
1012
+ f"{'Model':<25} {'Provider':<12} {'Avg Time':<10} {'First Token':<12} {'Chars/sec':<10} {'Success':<8}"
1013
+ )
1014
+ report.append("-" * 85)
1015
+
1016
+ for stat in test_stats:
1017
+ report.append(
1018
+ f"{stat.model_name:<25} {stat.provider:<12} "
1019
+ f"{stat.avg_total_response_time:<10.2f} {stat.avg_first_token_latency:<12.2f} "
1020
+ f"{stat.avg_tokens_per_second:<10.1f} {stat.success_rate:<8.0f}%"
1021
+ )
1022
+
1023
+ # Overall performance ranking
1024
+ report.append("\n\nOVERALL PERFORMANCE RANKING")
1025
+ report.append("-" * 40)
1026
+
1027
+ # Calculate overall average performance
1028
+ overall_performance = {}
1029
+ for stat in stats:
1030
+ key = f"{stat.provider} - {stat.model_name}"
1031
+ if key not in overall_performance:
1032
+ overall_performance[key] = []
1033
+ overall_performance[key].append(stat.avg_total_response_time)
1034
+
1035
+ # Calculate average across all test types
1036
+ overall_rankings = []
1037
+ for model, times in overall_performance.items():
1038
+ avg_time = statistics.mean(times)
1039
+ overall_rankings.append((model, avg_time))
1040
+
1041
+ overall_rankings.sort(key=lambda x: x[1])
1042
+
1043
+ report.append(f"{'Rank':<5} {'Model':<35} {'Avg Response Time':<18}")
1044
+ report.append("-" * 60)
1045
+
1046
+ for i, (model, avg_time) in enumerate(overall_rankings, 1):
1047
+ report.append(f"{i:<5} {model:<35} {avg_time:<18.2f}s")
1048
+
1049
+ return "\n".join(report)
1050
+
1051
+ def save_results(
1052
+ self, stats: List[BenchmarkStats], filename: str = "benchmark_results.json"
1053
+ ):
1054
+ """Save detailed results to JSON file."""
1055
+ output = {
1056
+ "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
1057
+ "configuration": {
1058
+ "iterations_per_test": self.iterations_per_test,
1059
+ "models_tested": [
1060
+ f"{m['provider'].value}-{m['model']}" for m in self.models_to_test
1061
+ ],
1062
+ "test_scenarios": list(self.test_scenarios.keys()),
1063
+ },
1064
+ "raw_results": [asdict(result) for result in self.results],
1065
+ "statistics": [asdict(stat) for stat in stats],
1066
+ }
1067
+
1068
+ with open(filename, "w") as f:
1069
+ json.dump(output, f, indent=2)
1070
+
1071
+ print(f"\nDetailed results saved to: {filename}")
1072
+
1073
+
1074
+ async def main():
1075
+ """Main benchmark execution."""
1076
+ print("Vectara Agentic Model Performance Benchmark")
1077
+ print("=" * 50)
1078
+
1079
+ # Check if observability should be enabled via environment variable
1080
+ enable_observability = os.getenv("ENABLE_OBSERVABILITY", "false").lower() == "true"
1081
+
1082
+ # Allow configuring concurrency via environment variable
1083
+ max_concurrent_models = int(os.getenv("MAX_CONCURRENT_MODELS", "5"))
1084
+
1085
+ benchmark = ModelBenchmark(
1086
+ enable_observability=enable_observability,
1087
+ max_concurrent_models=max_concurrent_models,
1088
+ )
1089
+
1090
+ # Validate that all required API keys are present before running benchmarks
1091
+ validate_api_keys(benchmark.models_to_test)
1092
+
1093
+ try:
1094
+ await benchmark.run_benchmarks()
1095
+
1096
+ # Calculate and display results
1097
+ stats = benchmark.calculate_statistics()
1098
+ report = benchmark.generate_report(stats)
1099
+
1100
+ print("\n" + report)
1101
+
1102
+ # Save results
1103
+ benchmark.save_results(stats)
1104
+
1105
+ except KeyboardInterrupt:
1106
+ print("\nBenchmark interrupted by user")
1107
+ except Exception as e:
1108
+ print(f"\nBenchmark failed with error: {e}")
1109
+ import traceback
1110
+
1111
+ traceback.print_exc()
1112
+ finally:
1113
+ # Cleanup observability
1114
+ if enable_observability and _observability_initialized:
1115
+ shutdown_observer()
1116
+ print("\n🔄 Arize Phoenix observability shutdown complete")
1117
+
1118
+
1119
+ if __name__ == "__main__":
1120
+ asyncio.run(main())
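
The new benchmark script is normally run directly (its __main__ block calls asyncio.run(main())). As a minimal sketch, it could also be driven programmatically with a reduced configuration; the import path and the reduced settings below are illustrative assumptions, not part of the release, but every name used (ModelBenchmark, validate_api_keys, iterations_per_test, models_to_test, run_benchmarks, calculate_statistics, generate_report, save_results) appears in the file shown above.

# Illustrative sketch only: run a reduced benchmark using the entry points
# defined in tests/benchmark_models.py (assumes tests/ is on sys.path so the
# module is importable as benchmark_models).
import asyncio

from benchmark_models import ModelBenchmark, validate_api_keys


async def quick_run() -> None:
    bench = ModelBenchmark(enable_observability=False, max_concurrent_models=1)
    bench.iterations_per_test = 1                      # one iteration per scenario keeps the run short
    bench.models_to_test = bench.models_to_test[:1]    # benchmark only the first configured model
    validate_api_keys(bench.models_to_test)            # exits with an error if required API keys are missing
    await bench.run_benchmarks()
    stats = bench.calculate_statistics()
    print(bench.generate_report(stats))
    bench.save_results(stats, filename="quick_benchmark_results.json")


if __name__ == "__main__":
    asyncio.run(quick_run())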