vectara-agentic 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective registries, and is provided for informational purposes only.

Potentially problematic release.


@@ -0,0 +1,945 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Model Performance Benchmark Script
4
+
5
+ This script benchmarks different LLM models for latency and performance
6
+ in the context of Vectara Agentic framework.
7
+ """
8
+
9
+ import asyncio
10
+ import time
11
+ import json
12
+ import statistics
13
+ import sys
14
+ import os
15
+ from typing import Dict, List, Optional, Tuple, Any
16
+ from dataclasses import dataclass, asdict
17
+
18
+ # Add the current directory to Python path to import vectara_agentic
19
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
20
+
21
+ from vectara_agentic.agent import Agent
22
+ from vectara_agentic.agent_config import AgentConfig
23
+ from vectara_agentic.types import ModelProvider, ObserverType
24
+ from vectara_agentic.tools import ToolsFactory
25
+ from vectara_agentic._observability import setup_observer, shutdown_observer
26
+
27
+ # Initialize observability once at startup to prevent repeated instrumentation
28
+ _observability_initialized = False
29
+
30
+
31
+ @dataclass
32
+ class BenchmarkResult:
33
+ """Results from a single benchmark run."""
34
+
35
+ model_name: str
36
+ provider: str
37
+ test_type: str
38
+ first_token_latency: float
39
+ total_response_time: float
40
+ response_length: int
41
+ tokens_per_second: float
42
+ error: Optional[str] = None
43
+
44
+
45
+ @dataclass
46
+ class BenchmarkStats:
47
+ """Aggregated statistics for multiple runs."""
48
+
49
+ model_name: str
50
+ provider: str
51
+ test_type: str
52
+ runs: int
53
+ avg_first_token_latency: float
54
+ avg_total_response_time: float
55
+ avg_tokens_per_second: float
56
+ median_first_token_latency: float
57
+ median_total_response_time: float
58
+ median_tokens_per_second: float
59
+ min_total_response_time: float
60
+ max_total_response_time: float
61
+ std_total_response_time: float
62
+ success_rate: float
63
+
64
+
65
+ class ModelBenchmark:
66
+ """Benchmarking suite for different LLM models."""
67
+
68
+ def __init__(self, enable_observability: bool = False):
69
+ # Test configurations
70
+ self.enable_observability = enable_observability
71
+ self.models_to_test = [
72
+ # OpenAI models
73
+ {"provider": ModelProvider.OPENAI, "model": "gpt-5"},
74
+ {"provider": ModelProvider.OPENAI, "model": "gpt-5-mini"},
75
+ {"provider": ModelProvider.OPENAI, "model": "gpt-4o"},
76
+ {"provider": ModelProvider.OPENAI, "model": "gpt-4o-mini"},
77
+ {"provider": ModelProvider.OPENAI, "model": "gpt-4.1"},
78
+ {"provider": ModelProvider.OPENAI, "model": "gpt-4.1-mini"},
79
+ {"provider": ModelProvider.ANTHROPIC, "model": "claude-sonnet-4-20250514"},
80
+ {"provider": ModelProvider.TOGETHER, "model": "deepseek-ai/DeepSeek-V3"},
81
+ {"provider": ModelProvider.GROQ, "model": "openai/gpt-oss-20b"},
82
+ {"provider": ModelProvider.GEMINI, "model": "models/gemini-2.5-flash"},
83
+ {"provider": ModelProvider.GEMINI, "model": "models/gemini-2.5-pro"},
84
+ ]
85
+
86
+ # Test scenarios - focused on advanced tool calling only
87
+ self.test_scenarios = {
88
+ "financial_analysis": {
89
+ "prompt": "Analyze a $50,000 investment portfolio with 60% stocks (8% return), 30% bonds (4% return), and 10% cash (1% return). Calculate the expected annual return, then determine how the portfolio value would grow over 15 years with monthly contributions of $1,000. Create a summary report of the analysis.",
90
+ "description": "Multi-step financial analysis with calculations and reporting",
91
+ "needs_tools": True,
92
+ },
93
+ "data_processing": {
94
+ "prompt": "Generate a dataset of 100 customers with randomized demographics (age, income, location, purchase_history). Then analyze this data to find correlations between age groups and spending patterns. Create a statistical summary and export the results to a formatted report.",
95
+ "description": "Data generation, analysis, and reporting workflow",
96
+ "needs_tools": True,
97
+ },
98
+ "research_synthesis": {
99
+ "prompt": "Search for information about the latest developments in quantum computing, specifically focusing on error correction breakthroughs in 2024. Extract key findings from multiple sources, summarize the technical approaches, and create a structured research report with citations.",
100
+ "description": "Information retrieval, synthesis, and document generation",
101
+ "needs_tools": True,
102
+ },
103
+ "system_monitoring": {
104
+ "prompt": "Check system performance metrics including CPU usage, memory consumption, and disk space. If any metrics exceed safe thresholds (CPU > 80%, Memory > 90%, Disk > 85%), generate alerts and suggest optimization strategies. Create a monitoring report with recommendations.",
105
+ "description": "System monitoring with conditional logic and reporting",
106
+ "needs_tools": True,
107
+ },
108
+ "workflow_automation": {
109
+ "prompt": "Create a project task list with 10 software development tasks, assign priorities and estimated hours, then simulate a sprint planning session by organizing tasks into a 2-week sprint. Generate a sprint backlog with daily breakdowns and resource allocation recommendations.",
110
+ "description": "Complex workflow orchestration with multiple tool interactions",
111
+ "needs_tools": True,
112
+ },
113
+ }
114
+
115
+ self.iterations_per_test = 5
116
+ self.results: List[BenchmarkResult] = []
117
+
118
+ def create_agent_config(
119
+ self, provider: ModelProvider, model_name: str
120
+ ) -> AgentConfig:
121
+ """Create agent configuration for the specified model."""
122
+ return AgentConfig(
123
+ main_llm_provider=provider,
124
+ main_llm_model_name=model_name,
125
+ tool_llm_provider=provider,
126
+ tool_llm_model_name=model_name,
127
+ observer=(
128
+ ObserverType.ARIZE_PHOENIX
129
+ if self.enable_observability
130
+ else ObserverType.NO_OBSERVER
131
+ ),
132
+ )
133
+
134
+ def create_test_tools(self) -> List:
135
+ """Create an advanced set of tools for realistic agent testing."""
136
+ import random
137
+ import json
138
+ import psutil
139
+ from datetime import datetime
140
+
141
+ tools_factory = ToolsFactory()
142
+
143
+ # Financial Analysis Tools
144
+ def calculate_portfolio_return(
145
+ stocks_pct: float,
146
+ stocks_return: float,
147
+ bonds_pct: float,
148
+ bonds_return: float,
149
+ cash_pct: float,
150
+ cash_return: float,
151
+ ) -> dict:
152
+ """Calculate expected portfolio return and allocation details."""
153
+ total_allocation = stocks_pct + bonds_pct + cash_pct
154
+ if abs(total_allocation - 100) > 0.01:
155
+ raise ValueError(
156
+ f"Portfolio allocation must sum to 100%, got {total_allocation}%"
157
+ )
158
+
159
+ expected_return = (
160
+ stocks_pct * stocks_return
161
+ + bonds_pct * bonds_return
162
+ + cash_pct * cash_return
163
+ ) / 100
164
+
165
+ return {
166
+ "expected_annual_return_pct": expected_return,
167
+ "allocation": {
168
+ "stocks": {"percentage": stocks_pct, "return": stocks_return},
169
+ "bonds": {"percentage": bonds_pct, "return": bonds_return},
170
+ "cash": {"percentage": cash_pct, "return": cash_return},
171
+ },
172
+ "risk_profile": (
173
+ "aggressive"
174
+ if stocks_pct > 70
175
+ else "moderate" if stocks_pct > 40 else "conservative"
176
+ ),
177
+ }
178
+
179
+ def project_investment_growth(
180
+ initial_amount: float,
181
+ annual_return: float,
182
+ years: int,
183
+ monthly_contribution: float = 0,
184
+ ) -> dict:
185
+ """Project investment growth with optional monthly contributions."""
186
+ monthly_rate = annual_return / 12 / 100
187
+ months = years * 12
188
+
189
+ # Calculate compound growth with monthly contributions
190
+ if monthly_contribution > 0:
191
+ # Future value of initial investment
192
+ fv_initial = initial_amount * ((1 + monthly_rate) ** months)
193
+ # Future value of monthly contributions (ordinary annuity)
194
+ fv_contributions = monthly_contribution * (
195
+ ((1 + monthly_rate) ** months - 1) / monthly_rate
196
+ )
197
+ final_value = fv_initial + fv_contributions
198
+ total_contributions = monthly_contribution * months
199
+ else:
200
+ final_value = initial_amount * ((1 + annual_return / 100) ** years)
201
+ total_contributions = 0
202
+
203
+ total_invested = initial_amount + total_contributions
204
+ total_gains = final_value - total_invested
205
+
206
+ return {
207
+ "initial_investment": initial_amount,
208
+ "monthly_contribution": monthly_contribution,
209
+ "total_contributions": total_contributions,
210
+ "total_invested": total_invested,
211
+ "final_value": round(final_value, 2),
212
+ "total_gains": round(total_gains, 2),
213
+ "return_multiple": round(final_value / initial_amount, 2),
214
+ "years": years,
215
+ "annual_return_used": annual_return,
216
+ }
217
+
218
+ # Data Analysis Tools
219
+ def generate_customer_dataset(count: int) -> str:
220
+ """Generate randomized customer data for analysis."""
221
+ customers = []
222
+ locations = [
223
+ "New York",
224
+ "Los Angeles",
225
+ "Chicago",
226
+ "Houston",
227
+ "Phoenix",
228
+ "Philadelphia",
229
+ "San Antonio",
230
+ "San Diego",
231
+ "Dallas",
232
+ "San Jose",
233
+ ]
234
+
235
+ for i in range(count):
236
+ age = random.randint(18, 75)
237
+ income = random.randint(25000, 150000)
238
+ location = random.choice(locations)
239
+ purchase_history = random.randint(1, 50)
240
+
241
+ customers.append(
242
+ {
243
+ "customer_id": f"CUST_{i+1:04d}",
244
+ "age": age,
245
+ "income": income,
246
+ "location": location,
247
+ "purchase_history": purchase_history,
248
+ "age_group": (
249
+ "18-30"
250
+ if age <= 30
251
+ else (
252
+ "31-45"
253
+ if age <= 45
254
+ else "46-60" if age <= 60 else "60+"
255
+ )
256
+ ),
257
+ }
258
+ )
259
+
260
+ return json.dumps(customers, indent=2)
261
+
262
+ def analyze_customer_data(customer_data_json: str) -> dict:
263
+ """Analyze customer data for patterns and correlations."""
264
+ customers = json.loads(customer_data_json)
265
+
266
+ # Group by age groups
267
+ age_groups = {}
268
+ for customer in customers:
269
+ group = customer["age_group"]
270
+ if group not in age_groups:
271
+ age_groups[group] = {
272
+ "count": 0,
273
+ "total_spending": 0,
274
+ "total_income": 0,
275
+ }
276
+
277
+ age_groups[group]["count"] += 1
278
+ age_groups[group]["total_spending"] += customer["purchase_history"]
279
+ age_groups[group]["total_income"] += customer["income"]
280
+
281
+ # Calculate averages
282
+ analysis = {}
283
+ for group, data in age_groups.items():
284
+ analysis[group] = {
285
+ "count": data["count"],
286
+ "avg_spending": round(data["total_spending"] / data["count"], 2),
287
+ "avg_income": round(data["total_income"] / data["count"], 2),
288
+ "spending_to_income_ratio": round(
289
+ (data["total_spending"] / data["count"])
290
+ / (data["total_income"] / data["count"])
291
+ * 1000,
292
+ 4,
293
+ ),
294
+ }
295
+
296
+ return {
297
+ "total_customers": len(customers),
298
+ "age_group_analysis": analysis,
299
+ "overall_avg_spending": round(
300
+ sum(c["purchase_history"] for c in customers) / len(customers), 2
301
+ ),
302
+ "overall_avg_income": round(
303
+ sum(c["income"] for c in customers) / len(customers), 2
304
+ ),
305
+ }
306
+
307
+ # System Monitoring Tools
308
+ def get_system_metrics() -> dict:
309
+ """Get current system performance metrics."""
310
+ try:
311
+ cpu_percent = psutil.cpu_percent(interval=1)
312
+ memory = psutil.virtual_memory()
313
+ disk = psutil.disk_usage("/")
314
+
315
+ return {
316
+ "cpu_usage_percent": cpu_percent,
317
+ "memory_usage_percent": memory.percent,
318
+ "memory_available_gb": round(memory.available / (1024**3), 2),
319
+ "disk_usage_percent": disk.percent,
320
+ "disk_free_gb": round(disk.free / (1024**3), 2),
321
+ "timestamp": datetime.now().isoformat(),
322
+ }
323
+ except Exception:
324
+ # Fallback with simulated data for testing
325
+ return {
326
+ "cpu_usage_percent": random.randint(20, 95),
327
+ "memory_usage_percent": random.randint(40, 95),
328
+ "memory_available_gb": random.randint(1, 16),
329
+ "disk_usage_percent": random.randint(30, 90),
330
+ "disk_free_gb": random.randint(10, 500),
331
+ "timestamp": datetime.now().isoformat(),
332
+ "note": "Simulated data - psutil unavailable",
333
+ }
334
+
335
+ def check_system_health(
336
+ cpu_threshold: int = 80,
337
+ memory_threshold: int = 90,
338
+ disk_threshold: int = 85,
339
+ ) -> dict:
340
+ """Check system health against thresholds and generate alerts."""
341
+ metrics = get_system_metrics()
342
+ alerts = []
343
+ recommendations = []
344
+
345
+ if metrics["cpu_usage_percent"] > cpu_threshold:
346
+ alerts.append(
347
+ f"HIGH CPU USAGE: {metrics['cpu_usage_percent']}% (threshold: {cpu_threshold}%)"
348
+ )
349
+ recommendations.append(
350
+ "Consider closing unnecessary applications or upgrading CPU"
351
+ )
352
+
353
+ if metrics["memory_usage_percent"] > memory_threshold:
354
+ alerts.append(
355
+ f"HIGH MEMORY USAGE: {metrics['memory_usage_percent']}% (threshold: {memory_threshold}%)"
356
+ )
357
+ recommendations.append(
358
+ "Close memory-intensive applications or add more RAM"
359
+ )
360
+
361
+ if metrics["disk_usage_percent"] > disk_threshold:
362
+ alerts.append(
363
+ f"LOW DISK SPACE: {metrics['disk_usage_percent']}% used (threshold: {disk_threshold}%)"
364
+ )
365
+ recommendations.append(
366
+ "Clean up temporary files or expand disk storage"
367
+ )
368
+
369
+ health_status = (
370
+ "CRITICAL" if len(alerts) >= 2 else "WARNING" if alerts else "HEALTHY"
371
+ )
372
+
373
+ return {
374
+ "health_status": health_status,
375
+ "alerts": alerts,
376
+ "recommendations": recommendations,
377
+ "metrics": metrics,
378
+ "thresholds": {
379
+ "cpu": cpu_threshold,
380
+ "memory": memory_threshold,
381
+ "disk": disk_threshold,
382
+ },
383
+ }
384
+
385
+ # Project Management Tools
386
+ def create_project_tasks(count: int = 10) -> str:
387
+ """Generate a list of software development tasks."""
388
+ task_types = [
389
+ "Implement user authentication system",
390
+ "Create REST API endpoints",
391
+ "Design database schema",
392
+ "Build responsive frontend components",
393
+ "Write unit tests",
394
+ "Set up CI/CD pipeline",
395
+ "Implement error handling",
396
+ "Create API documentation",
397
+ "Optimize database queries",
398
+ "Implement caching layer",
399
+ "Add logging and monitoring",
400
+ "Create user dashboard",
401
+ "Implement search functionality",
402
+ "Add data validation",
403
+ "Create admin panel",
404
+ ]
405
+
406
+ tasks = []
407
+ for i in range(count):
408
+ task = random.choice(task_types)
409
+ priority = random.choice(["High", "Medium", "Low"])
410
+ estimated_hours = random.randint(2, 24)
411
+
412
+ tasks.append(
413
+ {
414
+ "task_id": f"TASK-{i+1:03d}",
415
+ "title": f"{task} #{i+1}",
416
+ "priority": priority,
417
+ "estimated_hours": estimated_hours,
418
+ "status": "Backlog",
419
+ "assigned_to": None,
420
+ }
421
+ )
422
+
423
+ return json.dumps(tasks, indent=2)
424
+
425
+ def plan_sprint(tasks_json: str, sprint_capacity_hours: int = 80) -> dict:
426
+ """Organize tasks into a sprint with daily breakdowns."""
427
+ tasks = json.loads(tasks_json)
428
+
429
+ # Sort by priority and estimated hours
430
+ priority_order = {"High": 3, "Medium": 2, "Low": 1}
431
+ tasks.sort(
432
+ key=lambda x: (priority_order[x["priority"]], -x["estimated_hours"]),
433
+ reverse=True,
434
+ )
435
+
436
+ sprint_tasks = []
437
+ total_hours = 0
438
+
439
+ for task in tasks:
440
+ if total_hours + task["estimated_hours"] <= sprint_capacity_hours:
441
+ sprint_tasks.append(task)
442
+ total_hours += task["estimated_hours"]
443
+ else:
444
+ break
445
+
446
+ # Distribute across 2 weeks (10 working days)
447
+ daily_breakdown = []
448
+ remaining_hours = total_hours
449
+ days_remaining = 10
450
+
451
+ for day in range(1, 11):
452
+ if days_remaining > 0:
453
+ day_hours = min(
454
+ 8,
455
+ remaining_hours // days_remaining
456
+ + (1 if remaining_hours % days_remaining else 0),
457
+ )
458
+ daily_breakdown.append(
459
+ {
460
+ "day": day,
461
+ "planned_hours": day_hours,
462
+ "remaining_capacity": 8 - day_hours,
463
+ }
464
+ )
465
+ remaining_hours -= day_hours
466
+ days_remaining -= 1
467
+
468
+ return {
469
+ "sprint_summary": {
470
+ "total_tasks": len(sprint_tasks),
471
+ "total_planned_hours": total_hours,
472
+ "sprint_capacity": sprint_capacity_hours,
473
+ "utilization_percent": round(
474
+ (total_hours / sprint_capacity_hours) * 100, 1
475
+ ),
476
+ },
477
+ "selected_tasks": sprint_tasks,
478
+ "daily_breakdown": daily_breakdown,
479
+ "backlog_remaining": len(tasks) - len(sprint_tasks),
480
+ }
481
+
482
+ # Reporting Tools
483
+ def create_formatted_report(
484
+ title: str, data: dict, report_type: str = "summary"
485
+ ) -> str:
486
+ """Create a formatted text report from structured data."""
487
+ report_lines = []
488
+ report_lines.append("=" * 60)
489
+ report_lines.append(title.upper())
490
+ report_lines.append("=" * 60)
491
+ report_lines.append(
492
+ f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
493
+ )
494
+ report_lines.append(f"Report Type: {report_type.title()}")
495
+ report_lines.append("")
496
+
497
+ def format_dict(d, indent=0):
498
+ lines = []
499
+ for key, value in d.items():
500
+ prefix = " " * indent
501
+ if isinstance(value, dict):
502
+ lines.append(f"{prefix}{key.replace('_', ' ').title()}:")
503
+ lines.extend(format_dict(value, indent + 1))
504
+ elif isinstance(value, list):
505
+ lines.append(f"{prefix}{key.replace('_', ' ').title()}:")
506
+ for i, item in enumerate(value):
507
+ if isinstance(item, dict):
508
+ lines.append(f"{prefix} Item {i+1}:")
509
+ lines.extend(format_dict(item, indent + 2))
510
+ else:
511
+ lines.append(f"{prefix} - {item}")
512
+ else:
513
+ lines.append(
514
+ f"{prefix}{key.replace('_', ' ').title()}: {value}"
515
+ )
516
+ return lines
517
+
518
+ report_lines.extend(format_dict(data))
519
+ report_lines.append("")
520
+ report_lines.append("=" * 60)
521
+
522
+ return "\n".join(report_lines)
523
+
524
+ # Research Tools
525
+ def search_information(query: str, max_results: int = 5) -> dict:
526
+ """Simulate information search with structured results."""
527
+ # Simulated search results for testing
528
+ simulated_results = [
529
+ {
530
+ "title": f"Research Paper: {query} - Latest Developments",
531
+ "source": "Journal of Advanced Computing",
532
+ "summary": f"Recent breakthrough in {query} showing promising results in error reduction and scalability improvements.",
533
+ "relevance_score": random.randint(80, 95),
534
+ "publication_date": "2024-11-15",
535
+ },
536
+ {
537
+ "title": f"Technical Review: {query} Implementation Challenges",
538
+ "source": "Tech Innovation Quarterly",
539
+ "summary": f"Comprehensive analysis of current {query} methodologies and their practical applications.",
540
+ "relevance_score": random.randint(75, 90),
541
+ "publication_date": "2024-10-22",
542
+ },
543
+ {
544
+ "title": f"Industry Report: {query} Market Trends",
545
+ "source": "Technology Research Institute",
546
+ "summary": f"Market analysis and future projections for {query} adoption across industries.",
547
+ "relevance_score": random.randint(70, 85),
548
+ "publication_date": "2024-09-30",
549
+ },
550
+ ]
551
+
552
+ return {
553
+ "query": query,
554
+ "total_results": len(simulated_results),
555
+ "results": simulated_results[:max_results],
556
+ "search_timestamp": datetime.now().isoformat(),
557
+ }
558
+
559
+ def synthesize_research(search_results: dict) -> dict:
560
+ """Synthesize research findings into structured summary."""
561
+ results = search_results["results"]
562
+
563
+ key_findings = []
564
+ technical_approaches = []
565
+ citations = []
566
+
567
+ for i, result in enumerate(results, 1):
568
+ key_findings.append(f"Finding {i}: {result['summary']}")
569
+ technical_approaches.append(
570
+ f"Approach {i}: Methodology described in '{result['title']}'"
571
+ )
572
+ citations.append(
573
+ f"[{i}] {result['title']} - {result['source']} ({result['publication_date']})"
574
+ )
575
+
576
+ return {
577
+ "research_topic": search_results["query"],
578
+ "sources_analyzed": len(results),
579
+ "key_findings": key_findings,
580
+ "technical_approaches": technical_approaches,
581
+ "citations": citations,
582
+ "confidence_level": "High" if len(results) >= 3 else "Medium",
583
+ "synthesis_date": datetime.now().isoformat(),
584
+ }
585
+
586
+ # Create and return all tools
587
+ return [
588
+ # Financial Analysis
589
+ tools_factory.create_tool(calculate_portfolio_return, vhc_eligible=False),
590
+ tools_factory.create_tool(project_investment_growth, vhc_eligible=False),
591
+ # Data Analysis
592
+ tools_factory.create_tool(generate_customer_dataset, vhc_eligible=False),
593
+ tools_factory.create_tool(analyze_customer_data, vhc_eligible=False),
594
+ # System Monitoring
595
+ tools_factory.create_tool(get_system_metrics, vhc_eligible=False),
596
+ tools_factory.create_tool(check_system_health, vhc_eligible=False),
597
+ # Project Management
598
+ tools_factory.create_tool(create_project_tasks, vhc_eligible=False),
599
+ tools_factory.create_tool(plan_sprint, vhc_eligible=False),
600
+ # Reporting
601
+ tools_factory.create_tool(create_formatted_report, vhc_eligible=False),
602
+ # Research
603
+ tools_factory.create_tool(search_information, vhc_eligible=False),
604
+ tools_factory.create_tool(synthesize_research, vhc_eligible=False),
605
+ ]
606
+
607
+ async def measure_streaming_response(
608
+ self, agent: Agent, prompt: str
609
+ ) -> Tuple[float, float, int]:
610
+ """
611
+ Measure streaming response metrics.
612
+ Returns: (first_token_latency, total_time, response_length)
613
+ """
614
+ start_time = time.time()
615
+ first_token_time = None
616
+ response_text = ""
617
+
618
+ try:
619
+ streaming_response = await agent.astream_chat(prompt)
620
+
621
+ # Check if we have the async_response_gen method
622
+ if hasattr(streaming_response, "async_response_gen") and callable(
623
+ streaming_response.async_response_gen
624
+ ):
625
+ async for token in streaming_response.async_response_gen():
626
+ if first_token_time is None:
627
+ first_token_time = time.time()
628
+ response_text += str(token)
629
+
630
+ # Get final response
631
+ final_response = await streaming_response.aget_response()
632
+ if hasattr(final_response, "response") and final_response.response:
633
+ response_text = final_response.response
634
+
635
+ end_time = time.time()
636
+ total_time = end_time - start_time
637
+ first_token_latency = (
638
+ (first_token_time - start_time) if first_token_time else total_time
639
+ )
640
+
641
+ return first_token_latency, total_time, len(response_text)
642
+
643
+ except Exception as e:
644
+ end_time = time.time()
645
+ print(f"Error during streaming: {e}")
646
+ return -1, end_time - start_time, 0
647
+
648
+ async def run_single_benchmark(
649
+ self,
650
+ provider: ModelProvider,
651
+ model_name: str,
652
+ test_name: str,
653
+ test_config: Dict[str, Any],
654
+ ) -> BenchmarkResult:
655
+ """Run a single benchmark iteration."""
656
+ try:
657
+ # Create agent configuration
658
+ config = self.create_agent_config(provider, model_name)
659
+
660
+ # Create tools if needed
661
+ tools = (
662
+ self.create_test_tools()
663
+ if test_config.get("needs_tools", False)
664
+ else []
665
+ )
666
+
667
+ # Create agent
668
+ agent = Agent.from_tools(
669
+ tools=tools,
670
+ topic="benchmark",
671
+ agent_config=config,
672
+ verbose=False,
673
+ session_id=f"benchmark_{model_name}_{test_name}_{int(time.time())}",
674
+ )
675
+
676
+ # Measure response
677
+ first_token_latency, total_time, response_length = (
678
+ await self.measure_streaming_response(agent, test_config["prompt"])
679
+ )
680
+
681
+ # Calculate tokens per second (approximate)
682
+ tokens_per_second = response_length / total_time if total_time > 0 else 0
683
+
684
+ # Note: Skip per-agent cleanup to avoid OpenTelemetry uninstrumentation warnings
685
+
686
+ return BenchmarkResult(
687
+ model_name=model_name,
688
+ provider=provider.value,
689
+ test_type=test_name,
690
+ first_token_latency=first_token_latency,
691
+ total_response_time=total_time,
692
+ response_length=response_length,
693
+ tokens_per_second=tokens_per_second,
694
+ )
695
+
696
+ except Exception as e:
697
+ return BenchmarkResult(
698
+ model_name=model_name,
699
+ provider=provider.value,
700
+ test_type=test_name,
701
+ first_token_latency=-1,
702
+ total_response_time=-1,
703
+ response_length=0,
704
+ tokens_per_second=0,
705
+ error=str(e),
706
+ )
707
+
708
+ async def run_benchmarks(self):
709
+ """Run all benchmark combinations."""
710
+ global _observability_initialized
711
+
712
+ print("Starting model performance benchmarks...")
713
+ print(
714
+ f"Testing {len(self.models_to_test)} models across {len(self.test_scenarios)} scenarios"
715
+ )
716
+ print(f"Running {self.iterations_per_test} iterations per combination\n")
717
+
718
+ # Setup observability once if enabled and not already initialized
719
+ if self.enable_observability and not _observability_initialized:
720
+ dummy_config = AgentConfig(observer=ObserverType.ARIZE_PHOENIX)
721
+ observability_setup = setup_observer(dummy_config, verbose=True)
722
+ if observability_setup:
723
+ print(
724
+ "✅ Arize Phoenix observability enabled - LLM calls will be traced\n"
725
+ )
726
+ _observability_initialized = True
727
+ else:
728
+ print("⚠️ Arize Phoenix observability setup failed\n")
729
+
730
+ total_tests = (
731
+ len(self.models_to_test)
732
+ * len(self.test_scenarios)
733
+ * self.iterations_per_test
734
+ )
735
+ current_test = 0
736
+
737
+ for model_config in self.models_to_test:
738
+ provider = model_config["provider"]
739
+ model_name = model_config["model"]
740
+
741
+ print(f"\n{'='*60}")
742
+ print(f"Testing: {provider.value} - {model_name}")
743
+ print(f"{'='*60}")
744
+
745
+ for test_name, test_config in self.test_scenarios.items():
746
+ print(f"\nRunning {test_name}: {test_config['description']}")
747
+
748
+ for iteration in range(self.iterations_per_test):
749
+ current_test += 1
750
+ progress = (current_test / total_tests) * 100
751
+ print(
752
+ f" Iteration {iteration + 1}/{self.iterations_per_test} ({progress:.1f}% complete)"
753
+ )
754
+
755
+ result = await self.run_single_benchmark(
756
+ provider, model_name, test_name, test_config
757
+ )
758
+ self.results.append(result)
759
+
760
+ if result.error:
761
+ print(f" ERROR: {result.error}")
762
+ else:
763
+ print(
764
+ f" Time: {result.total_response_time:.2f}s, "
765
+ f"First token: {result.first_token_latency:.2f}s, "
766
+ f"Speed: {result.tokens_per_second:.1f} chars/sec"
767
+ )
768
+
769
+ # Small delay between tests
770
+ await asyncio.sleep(1)
771
+
772
+ def calculate_statistics(self) -> List[BenchmarkStats]:
773
+ """Calculate aggregated statistics from results."""
774
+ stats = []
775
+
776
+ # Group results by model and test type
777
+ grouped = {}
778
+ for result in self.results:
779
+ key = (result.model_name, result.provider, result.test_type)
780
+ if key not in grouped:
781
+ grouped[key] = []
782
+ grouped[key].append(result)
783
+
784
+ # Calculate statistics for each group
785
+ for (model_name, provider, test_type), group_results in grouped.items():
786
+ successful_results = [
787
+ r
788
+ for r in group_results
789
+ if r.error is None and r.total_response_time > 0
790
+ ]
791
+
792
+ if not successful_results:
793
+ continue
794
+
795
+ response_times = [r.total_response_time for r in successful_results]
796
+ first_token_times = [r.first_token_latency for r in successful_results]
797
+ tokens_per_sec = [r.tokens_per_second for r in successful_results]
798
+
799
+ stats.append(
800
+ BenchmarkStats(
801
+ model_name=model_name,
802
+ provider=provider,
803
+ test_type=test_type,
804
+ runs=len(group_results),
805
+ avg_first_token_latency=statistics.mean(first_token_times),
806
+ avg_total_response_time=statistics.mean(response_times),
807
+ avg_tokens_per_second=statistics.mean(tokens_per_sec),
808
+ median_first_token_latency=statistics.median(first_token_times),
809
+ median_total_response_time=statistics.median(response_times),
810
+ median_tokens_per_second=statistics.median(tokens_per_sec),
811
+ min_total_response_time=min(response_times),
812
+ max_total_response_time=max(response_times),
813
+ std_total_response_time=(
814
+ statistics.stdev(response_times)
815
+ if len(response_times) > 1
816
+ else 0
817
+ ),
818
+ success_rate=(len(successful_results) / len(group_results)) * 100,
819
+ )
820
+ )
821
+
822
+ return stats
823
+
824
+ def generate_report(self, stats: List[BenchmarkStats]) -> str:
825
+ """Generate a comprehensive performance report."""
826
+ report = []
827
+ report.append("=" * 80)
828
+ report.append("MODEL PERFORMANCE BENCHMARK RESULTS")
829
+ report.append("=" * 80)
830
+ report.append("")
831
+
832
+ # Group by test type for easier comparison
833
+ by_test_type = {}
834
+ for stat in stats:
835
+ if stat.test_type not in by_test_type:
836
+ by_test_type[stat.test_type] = []
837
+ by_test_type[stat.test_type].append(stat)
838
+
839
+ for test_type, test_stats in by_test_type.items():
840
+ report.append(f"\n{test_type.upper().replace('_', ' ')} RESULTS")
841
+ report.append("-" * 50)
842
+
843
+ # Sort by average response time
844
+ test_stats.sort(key=lambda x: x.avg_total_response_time)
845
+
846
+ report.append(
847
+ f"{'Model':<25} {'Provider':<12} {'Avg Time':<10} {'First Token':<12} {'Chars/sec':<10} {'Success':<8}"
848
+ )
849
+ report.append("-" * 85)
850
+
851
+ for stat in test_stats:
852
+ report.append(
853
+ f"{stat.model_name:<25} {stat.provider:<12} "
854
+ f"{stat.avg_total_response_time:<10.2f} {stat.avg_first_token_latency:<12.2f} "
855
+ f"{stat.avg_tokens_per_second:<10.1f} {stat.success_rate:<8.0f}%"
856
+ )
857
+
858
+ # Overall performance ranking
859
+ report.append("\n\nOVERALL PERFORMANCE RANKING")
860
+ report.append("-" * 40)
861
+
862
+ # Calculate overall average performance
863
+ overall_performance = {}
864
+ for stat in stats:
865
+ key = f"{stat.provider} - {stat.model_name}"
866
+ if key not in overall_performance:
867
+ overall_performance[key] = []
868
+ overall_performance[key].append(stat.avg_total_response_time)
869
+
870
+ # Calculate average across all test types
871
+ overall_rankings = []
872
+ for model, times in overall_performance.items():
873
+ avg_time = statistics.mean(times)
874
+ overall_rankings.append((model, avg_time))
875
+
876
+ overall_rankings.sort(key=lambda x: x[1])
877
+
878
+ report.append(f"{'Rank':<5} {'Model':<35} {'Avg Response Time':<18}")
879
+ report.append("-" * 60)
880
+
881
+ for i, (model, avg_time) in enumerate(overall_rankings, 1):
882
+ report.append(f"{i:<5} {model:<35} {avg_time:<18.2f}s")
883
+
884
+ return "\n".join(report)
885
+
886
+ def save_results(
887
+ self, stats: List[BenchmarkStats], filename: str = "benchmark_results.json"
888
+ ):
889
+ """Save detailed results to JSON file."""
890
+ output = {
891
+ "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
892
+ "configuration": {
893
+ "iterations_per_test": self.iterations_per_test,
894
+ "models_tested": [
895
+ f"{m['provider'].value}-{m['model']}" for m in self.models_to_test
896
+ ],
897
+ "test_scenarios": list(self.test_scenarios.keys()),
898
+ },
899
+ "raw_results": [asdict(result) for result in self.results],
900
+ "statistics": [asdict(stat) for stat in stats],
901
+ }
902
+
903
+ with open(filename, "w") as f:
904
+ json.dump(output, f, indent=2)
905
+
906
+ print(f"\nDetailed results saved to: {filename}")
907
+
908
+
909
+ async def main():
910
+ """Main benchmark execution."""
911
+ print("Vectara Agentic Model Performance Benchmark")
912
+ print("=" * 50)
913
+
914
+ # Check if observability should be enabled via environment variable
915
+ enable_observability = os.getenv("ENABLE_OBSERVABILITY", "false").lower() == "true"
916
+ benchmark = ModelBenchmark(enable_observability=enable_observability)
917
+
918
+ try:
919
+ await benchmark.run_benchmarks()
920
+
921
+ # Calculate and display results
922
+ stats = benchmark.calculate_statistics()
923
+ report = benchmark.generate_report(stats)
924
+
925
+ print("\n" + report)
926
+
927
+ # Save results
928
+ benchmark.save_results(stats)
929
+
930
+ except KeyboardInterrupt:
931
+ print("\nBenchmark interrupted by user")
932
+ except Exception as e:
933
+ print(f"\nBenchmark failed with error: {e}")
934
+ import traceback
935
+
936
+ traceback.print_exc()
937
+ finally:
938
+ # Cleanup observability
939
+ if enable_observability and _observability_initialized:
940
+ shutdown_observer()
941
+ print("\n🔄 Arize Phoenix observability shutdown complete")
942
+
943
+
944
+ if __name__ == "__main__":
945
+ asyncio.run(main())
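
For reference, here is a minimal smoke-test sketch of how the classes added in this file could be exercised directly. It is not part of the package; the module name `benchmark_models` and the output filename are assumptions (the diff does not show the file path), and the relevant provider API keys are expected to be set in the environment.

    # Sketch: run one model, one iteration per scenario, and print the report.
    import asyncio
    from benchmark_models import ModelBenchmark  # assumed module name

    async def smoke_run():
        bench = ModelBenchmark(enable_observability=False)
        bench.iterations_per_test = 1                       # keep the run short
        bench.models_to_test = bench.models_to_test[:1]     # first configured model only
        await bench.run_benchmarks()
        stats = bench.calculate_statistics()
        print(bench.generate_report(stats))
        bench.save_results(stats, filename="smoke_results.json")

    if __name__ == "__main__":
        asyncio.run(smoke_run())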