vectara-agentic 0.4.2__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (40)
  1. tests/__init__.py +1 -0
  2. tests/benchmark_models.py +547 -372
  3. tests/conftest.py +14 -12
  4. tests/endpoint.py +9 -5
  5. tests/run_tests.py +1 -0
  6. tests/test_agent.py +22 -9
  7. tests/test_agent_fallback_memory.py +4 -4
  8. tests/test_agent_memory_consistency.py +4 -4
  9. tests/test_agent_type.py +2 -0
  10. tests/test_api_endpoint.py +13 -13
  11. tests/test_bedrock.py +9 -1
  12. tests/test_fallback.py +18 -7
  13. tests/test_gemini.py +14 -40
  14. tests/test_groq.py +9 -1
  15. tests/test_private_llm.py +19 -6
  16. tests/test_react_error_handling.py +293 -0
  17. tests/test_react_memory.py +257 -0
  18. tests/test_react_streaming.py +135 -0
  19. tests/test_react_workflow_events.py +395 -0
  20. tests/test_return_direct.py +1 -0
  21. tests/test_serialization.py +58 -20
  22. tests/test_session_memory.py +11 -11
  23. tests/test_together.py +9 -1
  24. tests/test_tools.py +3 -1
  25. tests/test_vectara_llms.py +2 -2
  26. tests/test_vhc.py +7 -2
  27. tests/test_workflow.py +17 -11
  28. vectara_agentic/_callback.py +79 -21
  29. vectara_agentic/_version.py +1 -1
  30. vectara_agentic/agent.py +65 -27
  31. vectara_agentic/agent_core/serialization.py +5 -9
  32. vectara_agentic/agent_core/streaming.py +245 -64
  33. vectara_agentic/agent_core/utils/schemas.py +2 -2
  34. vectara_agentic/llm_utils.py +4 -2
  35. {vectara_agentic-0.4.2.dist-info → vectara_agentic-0.4.3.dist-info}/METADATA +127 -31
  36. vectara_agentic-0.4.3.dist-info/RECORD +58 -0
  37. vectara_agentic-0.4.2.dist-info/RECORD +0 -54
  38. {vectara_agentic-0.4.2.dist-info → vectara_agentic-0.4.3.dist-info}/WHEEL +0 -0
  39. {vectara_agentic-0.4.2.dist-info → vectara_agentic-0.4.3.dist-info}/licenses/LICENSE +0 -0
  40. {vectara_agentic-0.4.2.dist-info → vectara_agentic-0.4.3.dist-info}/top_level.txt +0 -0
tests/benchmark_models.py CHANGED
@@ -12,7 +12,8 @@ import json
12
12
  import statistics
13
13
  import sys
14
14
  import os
15
- from typing import Dict, List, Tuple, Any
15
+ import random
16
+ from typing import Dict, List, Tuple, Any, Set
16
17
  from dataclasses import dataclass, asdict
17
18
 
18
19
  # Add the current directory to Python path to import vectara_agentic
@@ -28,6 +29,64 @@ from vectara_agentic._observability import setup_observer, shutdown_observer
28
29
  _observability_initialized = False
29
30
 
30
31
 
32
+ def validate_api_keys(models_to_test: List[Dict]) -> None:
33
+ """
34
+ Validate that all required API keys are present for the models being tested.
35
+
36
+ Args:
37
+ models_to_test: List of model configurations with provider and model info
38
+
39
+ Raises:
40
+ SystemExit: If any required API keys are missing
41
+ """
42
+ # Map providers to their required environment variables
43
+ provider_api_keys = {
44
+ ModelProvider.OPENAI: "OPENAI_API_KEY",
45
+ ModelProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
46
+ ModelProvider.TOGETHER: "TOGETHER_API_KEY",
47
+ ModelProvider.GROQ: "GROQ_API_KEY",
48
+ ModelProvider.GEMINI: "GOOGLE_API_KEY",
49
+ }
50
+
51
+ required_keys = set()
52
+
53
+ # Collect unique providers from models to test
54
+ providers_in_use: Set[ModelProvider] = set()
55
+ for model_config in models_to_test:
56
+ providers_in_use.add(model_config["provider"])
57
+
58
+ # Add required API keys for each provider
59
+ for provider in providers_in_use:
60
+ api_key_name = provider_api_keys.get(provider)
61
+ if api_key_name: # Skip providers that don't use env var API keys
62
+ required_keys.add(api_key_name)
63
+
64
+ # Check for missing API keys
65
+ missing_keys = []
66
+ for key in required_keys:
67
+ if not os.getenv(key):
68
+ missing_keys.append(key)
69
+
70
+ if missing_keys:
71
+ print("❌ ERROR: Missing required API keys for benchmark execution:")
72
+ print()
73
+ for key in sorted(missing_keys):
74
+ print(f" • {key}")
75
+ print()
76
+ print("Please set these environment variables before running the benchmark.")
77
+ print("Providers being tested:")
78
+ for provider in sorted(providers_in_use, key=lambda p: p.value):
79
+ models_for_provider = [
80
+ m["model"] for m in models_to_test if m["provider"] == provider
81
+ ]
82
+ print(f" • {provider.value}: {', '.join(models_for_provider)}")
83
+
84
+ sys.exit(1)
85
+
86
+ print("✅ All required API keys are present")
87
+ print(f"Found API keys for {len(required_keys)} required environment variables")
88
+
89
+
31
90
  @dataclass
32
91
  class BenchmarkResult:
33
92
  """Results from a single benchmark run."""
@@ -65,22 +124,21 @@ class BenchmarkStats:
65
124
  class ModelBenchmark:
66
125
  """Benchmarking suite for different LLM models."""
67
126
 
68
- def __init__(self, enable_observability: bool = False):
127
+ def __init__(
128
+ self, enable_observability: bool = False, max_concurrent_models: int = 2
129
+ ):
69
130
  # Test configurations
70
131
  self.enable_observability = enable_observability
132
+ self.max_concurrent_models = max_concurrent_models
71
133
  self.models_to_test = [
72
134
  # OpenAI models
73
- {"provider": ModelProvider.OPENAI, "model": "gpt-5"},
74
135
  {"provider": ModelProvider.OPENAI, "model": "gpt-5-mini"},
75
- {"provider": ModelProvider.OPENAI, "model": "gpt-4o"},
76
136
  {"provider": ModelProvider.OPENAI, "model": "gpt-4o-mini"},
77
- {"provider": ModelProvider.OPENAI, "model": "gpt-4.1"},
78
137
  {"provider": ModelProvider.OPENAI, "model": "gpt-4.1-mini"},
79
138
  {"provider": ModelProvider.ANTHROPIC, "model": "claude-sonnet-4-20250514"},
80
139
  {"provider": ModelProvider.TOGETHER, "model": "deepseek-ai/DeepSeek-V3"},
81
140
  {"provider": ModelProvider.GROQ, "model": "openai/gpt-oss-20b"},
82
- {"provider": ModelProvider.GEMINI, "model": "models/gemini-2.5-flash"},
83
- {"provider": ModelProvider.GEMINI, "model": "models/gemini-2.5-pro"},
141
+ {"provider": ModelProvider.GEMINI, "model": "models/gemini-2.5-flash-lite"},
84
142
  ]
85
143
 
86
144
  # Test scenarios - focused on advanced tool calling only
@@ -115,6 +173,15 @@ class ModelBenchmark:
115
173
  self.iterations_per_test = 5
116
174
  self.results: List[BenchmarkResult] = []
117
175
 
176
+ # Provider-specific rate limits (requests per minute)
177
+ self.provider_rate_limits = {
178
+ ModelProvider.OPENAI: 100,
179
+ ModelProvider.ANTHROPIC: 100,
180
+ ModelProvider.TOGETHER: 80,
181
+ ModelProvider.GROQ: 50, # Conservative for GROQ
182
+ ModelProvider.GEMINI: 60,
183
+ }
184
+
118
185
  def create_agent_config(
119
186
  self, provider: ModelProvider, model_name: str
120
187
  ) -> AgentConfig:
@@ -131,13 +198,334 @@ class ModelBenchmark:
131
198
  ),
132
199
  )
133
200
 
134
- def create_test_tools(self) -> List:
135
- """Create an advanced set of tools for realistic agent testing."""
136
- import random
137
- import json
201
+ def analyze_customer_data(self, customer_data_json: str) -> dict:
202
+ """Analyze customer data for patterns and correlations."""
203
+ customers = json.loads(customer_data_json)
204
+
205
+ # Group by age groups
206
+ age_groups = {}
207
+ for customer in customers:
208
+ group = customer["age_group"]
209
+ if group not in age_groups:
210
+ age_groups[group] = {
211
+ "count": 0,
212
+ "total_spending": 0,
213
+ "total_income": 0,
214
+ }
215
+
216
+ age_groups[group]["count"] += 1
217
+ age_groups[group]["total_spending"] += customer["purchase_history"]
218
+ age_groups[group]["total_income"] += customer["income"]
219
+
220
+ # Calculate averages
221
+ analysis = {}
222
+ for group, data in age_groups.items():
223
+ analysis[group] = {
224
+ "count": data["count"],
225
+ "avg_spending": round(data["total_spending"] / data["count"], 2),
226
+ "avg_income": round(data["total_income"] / data["count"], 2),
227
+ "spending_to_income_ratio": round(
228
+ (data["total_spending"] / data["count"])
229
+ / (data["total_income"] / data["count"])
230
+ * 1000,
231
+ 4,
232
+ ),
233
+ }
234
+
235
+ return {
236
+ "total_customers": len(customers),
237
+ "age_group_analysis": analysis,
238
+ "overall_avg_spending": round(
239
+ sum(c["purchase_history"] for c in customers) / len(customers), 2
240
+ ),
241
+ "overall_avg_income": round(
242
+ sum(c["income"] for c in customers) / len(customers), 2
243
+ ),
244
+ }
245
+
246
+ def get_system_metrics(self) -> dict:
247
+ """Get current system performance metrics."""
138
248
  import psutil
139
249
  from datetime import datetime
140
250
 
251
+ try:
252
+ cpu_percent = psutil.cpu_percent(interval=1)
253
+ memory = psutil.virtual_memory()
254
+ disk = psutil.disk_usage("/")
255
+
256
+ return {
257
+ "cpu_usage_percent": cpu_percent,
258
+ "memory_usage_percent": memory.percent,
259
+ "memory_available_gb": round(memory.available / (1024**3), 2),
260
+ "disk_usage_percent": disk.percent,
261
+ "disk_free_gb": round(disk.free / (1024**3), 2),
262
+ "timestamp": datetime.now().isoformat(),
263
+ }
264
+ except Exception:
265
+ # Fallback with simulated data for testing
266
+ return {
267
+ "cpu_usage_percent": random.randint(20, 95),
268
+ "memory_usage_percent": random.randint(40, 95),
269
+ "memory_available_gb": random.randint(1, 16),
270
+ "disk_usage_percent": random.randint(30, 90),
271
+ "disk_free_gb": random.randint(10, 500),
272
+ "timestamp": datetime.now().isoformat(),
273
+ "note": "Simulated data - psutil unavailable",
274
+ }
275
+
276
+ def check_system_health(
277
+ self,
278
+ cpu_threshold: int = 80,
279
+ memory_threshold: int = 90,
280
+ disk_threshold: int = 85,
281
+ ) -> dict:
282
+ """Check system health against thresholds and generate alerts."""
283
+ metrics = self.get_system_metrics()
284
+ alerts = []
285
+ recommendations = []
286
+
287
+ if metrics["cpu_usage_percent"] > cpu_threshold:
288
+ alerts.append(
289
+ f"HIGH CPU USAGE: {metrics['cpu_usage_percent']}% (threshold: {cpu_threshold}%)"
290
+ )
291
+ recommendations.append(
292
+ "Consider closing unnecessary applications or upgrading CPU"
293
+ )
294
+
295
+ if metrics["memory_usage_percent"] > memory_threshold:
296
+ alerts.append(
297
+ f"HIGH MEMORY USAGE: {metrics['memory_usage_percent']}% (threshold: {memory_threshold}%)"
298
+ )
299
+ recommendations.append(
300
+ "Close memory-intensive applications or add more RAM"
301
+ )
302
+
303
+ if metrics["disk_usage_percent"] > disk_threshold:
304
+ alerts.append(
305
+ f"LOW DISK SPACE: {metrics['disk_usage_percent']}% used (threshold: {disk_threshold}%)"
306
+ )
307
+ recommendations.append("Clean up temporary files or expand disk storage")
308
+
309
+ health_status = (
310
+ "CRITICAL" if len(alerts) >= 2 else "WARNING" if alerts else "HEALTHY"
311
+ )
312
+
313
+ return {
314
+ "health_status": health_status,
315
+ "alerts": alerts,
316
+ "recommendations": recommendations,
317
+ "metrics": metrics,
318
+ "thresholds": {
319
+ "cpu": cpu_threshold,
320
+ "memory": memory_threshold,
321
+ "disk": disk_threshold,
322
+ },
323
+ }
324
+
325
+ def create_project_tasks(self, count: int = 10) -> str:
326
+ """Generate a list of software development tasks."""
327
+ task_types = [
328
+ "Implement user authentication system",
329
+ "Create REST API endpoints",
330
+ "Design database schema",
331
+ "Build responsive frontend components",
332
+ "Write unit tests",
333
+ "Set up CI/CD pipeline",
334
+ "Implement error handling",
335
+ "Create API documentation",
336
+ "Optimize database queries",
337
+ "Implement caching layer",
338
+ "Add logging and monitoring",
339
+ "Create user dashboard",
340
+ "Implement search functionality",
341
+ "Add data validation",
342
+ "Create admin panel",
343
+ ]
344
+
345
+ tasks = []
346
+ for i in range(count):
347
+ task = random.choice(task_types)
348
+ priority = random.choice(["High", "Medium", "Low"])
349
+ estimated_hours = random.randint(2, 24)
350
+
351
+ tasks.append(
352
+ {
353
+ "task_id": f"TASK-{i+1:03d}",
354
+ "title": f"{task} #{i+1}",
355
+ "priority": priority,
356
+ "estimated_hours": estimated_hours,
357
+ "status": "Backlog",
358
+ "assigned_to": None,
359
+ }
360
+ )
361
+
362
+ return json.dumps(tasks, indent=2)
363
+
364
+ def plan_sprint(self, tasks_json: str, sprint_capacity_hours: int = 80) -> dict:
365
+ """Organize tasks into a sprint with daily breakdowns."""
366
+ tasks = json.loads(tasks_json)
367
+
368
+ # Sort by priority and estimated hours
369
+ priority_order = {"High": 3, "Medium": 2, "Low": 1}
370
+ tasks.sort(
371
+ key=lambda x: (priority_order[x["priority"]], -x["estimated_hours"]),
372
+ reverse=True,
373
+ )
374
+
375
+ sprint_tasks = []
376
+ total_hours = 0
377
+
378
+ for task in tasks:
379
+ if total_hours + task["estimated_hours"] <= sprint_capacity_hours:
380
+ sprint_tasks.append(task)
381
+ total_hours += task["estimated_hours"]
382
+ else:
383
+ break
384
+
385
+ # Distribute across 2 weeks (10 working days)
386
+ daily_breakdown = []
387
+ remaining_hours = total_hours
388
+ days_remaining = 10
389
+
390
+ for day in range(1, 11):
391
+ if days_remaining > 0:
392
+ day_hours = min(
393
+ 8,
394
+ remaining_hours // days_remaining
395
+ + (1 if remaining_hours % days_remaining else 0),
396
+ )
397
+ daily_breakdown.append(
398
+ {
399
+ "day": day,
400
+ "planned_hours": day_hours,
401
+ "remaining_capacity": 8 - day_hours,
402
+ }
403
+ )
404
+ remaining_hours -= day_hours
405
+ days_remaining -= 1
406
+
407
+ return {
408
+ "sprint_summary": {
409
+ "total_tasks": len(sprint_tasks),
410
+ "total_planned_hours": total_hours,
411
+ "sprint_capacity": sprint_capacity_hours,
412
+ "utilization_percent": round(
413
+ (total_hours / sprint_capacity_hours) * 100, 1
414
+ ),
415
+ },
416
+ "selected_tasks": sprint_tasks,
417
+ "daily_breakdown": daily_breakdown,
418
+ "backlog_remaining": len(tasks) - len(sprint_tasks),
419
+ }
420
+
421
+ def create_formatted_report(
422
+ self, title: str, data: dict, report_type: str = "summary"
423
+ ) -> str:
424
+ """Create a formatted text report from structured data."""
425
+ from datetime import datetime
426
+
427
+ report_lines = []
428
+ report_lines.append("=" * 60)
429
+ report_lines.append(f"{title.upper()}")
430
+ report_lines.append("=" * 60)
431
+ report_lines.append(
432
+ f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
433
+ )
434
+ report_lines.append(f"Report Type: {report_type.title()}")
435
+ report_lines.append("")
436
+
437
+ def format_dict(d, indent=0):
438
+ lines = []
439
+ for key, value in d.items():
440
+ prefix = " " * indent
441
+ if isinstance(value, dict):
442
+ lines.append(f"{prefix}{key.replace('_', ' ').title()}:")
443
+ lines.extend(format_dict(value, indent + 1))
444
+ elif isinstance(value, list):
445
+ lines.append(f"{prefix}{key.replace('_', ' ').title()}:")
446
+ for i, item in enumerate(value):
447
+ if isinstance(item, dict):
448
+ lines.append(f"{prefix} Item {i+1}:")
449
+ lines.extend(format_dict(item, indent + 2))
450
+ else:
451
+ lines.append(f"{prefix} - {item}")
452
+ else:
453
+ lines.append(f"{prefix}{key.replace('_', ' ').title()}: {value}")
454
+ return lines
455
+
456
+ report_lines.extend(format_dict(data))
457
+ report_lines.append("")
458
+ report_lines.append("=" * 60)
459
+
460
+ return "\n".join(report_lines)
461
+
462
+ def search_information(self, query: str, max_results: int = 5) -> dict:
463
+ """Simulate information search with structured results."""
464
+ from datetime import datetime
465
+
466
+ # Simulated search results for testing
467
+ simulated_results = [
468
+ {
469
+ "title": f"Research Paper: {query} - Latest Developments",
470
+ "source": "Journal of Advanced Computing",
471
+ "summary": f"Recent breakthrough in {query} showing promising results in error reduction and scalability improvements.",
472
+ "relevance_score": random.randint(80, 95),
473
+ "publication_date": "2024-11-15",
474
+ },
475
+ {
476
+ "title": f"Technical Review: {query} Implementation Challenges",
477
+ "source": "Tech Innovation Quarterly",
478
+ "summary": f"Comprehensive analysis of current {query} methodologies and their practical applications.",
479
+ "relevance_score": random.randint(75, 90),
480
+ "publication_date": "2024-10-22",
481
+ },
482
+ {
483
+ "title": f"Industry Report: {query} Market Trends",
484
+ "source": "Technology Research Institute",
485
+ "summary": f"Market analysis and future projections for {query} adoption across industries.",
486
+ "relevance_score": random.randint(70, 85),
487
+ "publication_date": "2024-09-30",
488
+ },
489
+ ]
490
+
491
+ return {
492
+ "query": query,
493
+ "total_results": len(simulated_results),
494
+ "results": simulated_results[:max_results],
495
+ "search_timestamp": datetime.now().isoformat(),
496
+ }
497
+
498
+ def synthesize_research(self, search_results: dict) -> dict:
499
+ """Synthesize research findings into structured summary."""
500
+ from datetime import datetime
501
+
502
+ results = search_results["results"]
503
+
504
+ key_findings = []
505
+ technical_approaches = []
506
+ citations = []
507
+
508
+ for i, result in enumerate(results, 1):
509
+ key_findings.append(f"Finding {i}: {result['summary']}")
510
+ technical_approaches.append(
511
+ f"Approach {i}: Methodology described in '{result['title']}'"
512
+ )
513
+ citations.append(
514
+ f"[{i}] {result['title']} - {result['source']} ({result['publication_date']})"
515
+ )
516
+
517
+ return {
518
+ "research_topic": search_results["query"],
519
+ "sources_analyzed": len(results),
520
+ "key_findings": key_findings,
521
+ "technical_approaches": technical_approaches,
522
+ "citations": citations,
523
+ "confidence_level": "High" if len(results) >= 3 else "Medium",
524
+ "synthesis_date": datetime.now().isoformat(),
525
+ }
526
+
527
+ def create_test_tools(self) -> List:
528
+ """Create an advanced set of tools for realistic agent testing."""
141
529
  tools_factory = ToolsFactory()
142
530
 
143
531
  # Financial Analysis Tools
@@ -259,330 +647,6 @@ class ModelBenchmark:
259
647
 
260
648
  return json.dumps(customers, indent=2)
261
649
 
262
- def analyze_customer_data(customer_data_json: str) -> dict:
263
- """Analyze customer data for patterns and correlations."""
264
- customers = json.loads(customer_data_json)
265
-
266
- # Group by age groups
267
- age_groups = {}
268
- for customer in customers:
269
- group = customer["age_group"]
270
- if group not in age_groups:
271
- age_groups[group] = {
272
- "count": 0,
273
- "total_spending": 0,
274
- "total_income": 0,
275
- }
276
-
277
- age_groups[group]["count"] += 1
278
- age_groups[group]["total_spending"] += customer["purchase_history"]
279
- age_groups[group]["total_income"] += customer["income"]
280
-
281
- # Calculate averages
282
- analysis = {}
283
- for group, data in age_groups.items():
284
- analysis[group] = {
285
- "count": data["count"],
286
- "avg_spending": round(data["total_spending"] / data["count"], 2),
287
- "avg_income": round(data["total_income"] / data["count"], 2),
288
- "spending_to_income_ratio": round(
289
- (data["total_spending"] / data["count"])
290
- / (data["total_income"] / data["count"])
291
- * 1000,
292
- 4,
293
- ),
294
- }
295
-
296
- return {
297
- "total_customers": len(customers),
298
- "age_group_analysis": analysis,
299
- "overall_avg_spending": round(
300
- sum(c["purchase_history"] for c in customers) / len(customers), 2
301
- ),
302
- "overall_avg_income": round(
303
- sum(c["income"] for c in customers) / len(customers), 2
304
- ),
305
- }
306
-
307
- # System Monitoring Tools
308
- def get_system_metrics() -> dict:
309
- """Get current system performance metrics."""
310
- try:
311
- cpu_percent = psutil.cpu_percent(interval=1)
312
- memory = psutil.virtual_memory()
313
- disk = psutil.disk_usage("/")
314
-
315
- return {
316
- "cpu_usage_percent": cpu_percent,
317
- "memory_usage_percent": memory.percent,
318
- "memory_available_gb": round(memory.available / (1024**3), 2),
319
- "disk_usage_percent": disk.percent,
320
- "disk_free_gb": round(disk.free / (1024**3), 2),
321
- "timestamp": datetime.now().isoformat(),
322
- }
323
- except Exception:
324
- # Fallback with simulated data for testing
325
- return {
326
- "cpu_usage_percent": random.randint(20, 95),
327
- "memory_usage_percent": random.randint(40, 95),
328
- "memory_available_gb": random.randint(1, 16),
329
- "disk_usage_percent": random.randint(30, 90),
330
- "disk_free_gb": random.randint(10, 500),
331
- "timestamp": datetime.now().isoformat(),
332
- "note": "Simulated data - psutil unavailable",
333
- }
334
-
335
- def check_system_health(
336
- cpu_threshold: int = 80,
337
- memory_threshold: int = 90,
338
- disk_threshold: int = 85,
339
- ) -> dict:
340
- """Check system health against thresholds and generate alerts."""
341
- metrics = get_system_metrics()
342
- alerts = []
343
- recommendations = []
344
-
345
- if metrics["cpu_usage_percent"] > cpu_threshold:
346
- alerts.append(
347
- f"HIGH CPU USAGE: {metrics['cpu_usage_percent']}% (threshold: {cpu_threshold}%)"
348
- )
349
- recommendations.append(
350
- "Consider closing unnecessary applications or upgrading CPU"
351
- )
352
-
353
- if metrics["memory_usage_percent"] > memory_threshold:
354
- alerts.append(
355
- f"HIGH MEMORY USAGE: {metrics['memory_usage_percent']}% (threshold: {memory_threshold}%)"
356
- )
357
- recommendations.append(
358
- "Close memory-intensive applications or add more RAM"
359
- )
360
-
361
- if metrics["disk_usage_percent"] > disk_threshold:
362
- alerts.append(
363
- f"LOW DISK SPACE: {metrics['disk_usage_percent']}% used (threshold: {disk_threshold}%)"
364
- )
365
- recommendations.append(
366
- "Clean up temporary files or expand disk storage"
367
- )
368
-
369
- health_status = (
370
- "CRITICAL" if len(alerts) >= 2 else "WARNING" if alerts else "HEALTHY"
371
- )
372
-
373
- return {
374
- "health_status": health_status,
375
- "alerts": alerts,
376
- "recommendations": recommendations,
377
- "metrics": metrics,
378
- "thresholds": {
379
- "cpu": cpu_threshold,
380
- "memory": memory_threshold,
381
- "disk": disk_threshold,
382
- },
383
- }
384
-
385
- # Project Management Tools
386
- def create_project_tasks(count: int = 10) -> str:
387
- """Generate a list of software development tasks."""
388
- task_types = [
389
- "Implement user authentication system",
390
- "Create REST API endpoints",
391
- "Design database schema",
392
- "Build responsive frontend components",
393
- "Write unit tests",
394
- "Set up CI/CD pipeline",
395
- "Implement error handling",
396
- "Create API documentation",
397
- "Optimize database queries",
398
- "Implement caching layer",
399
- "Add logging and monitoring",
400
- "Create user dashboard",
401
- "Implement search functionality",
402
- "Add data validation",
403
- "Create admin panel",
404
- ]
405
-
406
- tasks = []
407
- for i in range(count):
408
- task = random.choice(task_types)
409
- priority = random.choice(["High", "Medium", "Low"])
410
- estimated_hours = random.randint(2, 24)
411
-
412
- tasks.append(
413
- {
414
- "task_id": f"TASK-{i+1:03d}",
415
- "title": f"{task} #{i+1}",
416
- "priority": priority,
417
- "estimated_hours": estimated_hours,
418
- "status": "Backlog",
419
- "assigned_to": None,
420
- }
421
- )
422
-
423
- return json.dumps(tasks, indent=2)
424
-
425
- def plan_sprint(tasks_json: str, sprint_capacity_hours: int = 80) -> dict:
426
- """Organize tasks into a sprint with daily breakdowns."""
427
- tasks = json.loads(tasks_json)
428
-
429
- # Sort by priority and estimated hours
430
- priority_order = {"High": 3, "Medium": 2, "Low": 1}
431
- tasks.sort(
432
- key=lambda x: (priority_order[x["priority"]], -x["estimated_hours"]),
433
- reverse=True,
434
- )
435
-
436
- sprint_tasks = []
437
- total_hours = 0
438
-
439
- for task in tasks:
440
- if total_hours + task["estimated_hours"] <= sprint_capacity_hours:
441
- sprint_tasks.append(task)
442
- total_hours += task["estimated_hours"]
443
- else:
444
- break
445
-
446
- # Distribute across 2 weeks (10 working days)
447
- daily_breakdown = []
448
- remaining_hours = total_hours
449
- days_remaining = 10
450
-
451
- for day in range(1, 11):
452
- if days_remaining > 0:
453
- day_hours = min(
454
- 8,
455
- remaining_hours // days_remaining
456
- + (1 if remaining_hours % days_remaining else 0),
457
- )
458
- daily_breakdown.append(
459
- {
460
- "day": day,
461
- "planned_hours": day_hours,
462
- "remaining_capacity": 8 - day_hours,
463
- }
464
- )
465
- remaining_hours -= day_hours
466
- days_remaining -= 1
467
-
468
- return {
469
- "sprint_summary": {
470
- "total_tasks": len(sprint_tasks),
471
- "total_planned_hours": total_hours,
472
- "sprint_capacity": sprint_capacity_hours,
473
- "utilization_percent": round(
474
- (total_hours / sprint_capacity_hours) * 100, 1
475
- ),
476
- },
477
- "selected_tasks": sprint_tasks,
478
- "daily_breakdown": daily_breakdown,
479
- "backlog_remaining": len(tasks) - len(sprint_tasks),
480
- }
481
-
482
- # Reporting Tools
483
- def create_formatted_report(
484
- title: str, data: dict, report_type: str = "summary"
485
- ) -> str:
486
- """Create a formatted text report from structured data."""
487
- report_lines = []
488
- report_lines.append("=" * 60)
489
- report_lines.append(f"{title.upper()}")
490
- report_lines.append("=" * 60)
491
- report_lines.append(
492
- f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
493
- )
494
- report_lines.append(f"Report Type: {report_type.title()}")
495
- report_lines.append("")
496
-
497
- def format_dict(d, indent=0):
498
- lines = []
499
- for key, value in d.items():
500
- prefix = " " * indent
501
- if isinstance(value, dict):
502
- lines.append(f"{prefix}{key.replace('_', ' ').title()}:")
503
- lines.extend(format_dict(value, indent + 1))
504
- elif isinstance(value, list):
505
- lines.append(f"{prefix}{key.replace('_', ' ').title()}:")
506
- for i, item in enumerate(value):
507
- if isinstance(item, dict):
508
- lines.append(f"{prefix} Item {i+1}:")
509
- lines.extend(format_dict(item, indent + 2))
510
- else:
511
- lines.append(f"{prefix} - {item}")
512
- else:
513
- lines.append(
514
- f"{prefix}{key.replace('_', ' ').title()}: {value}"
515
- )
516
- return lines
517
-
518
- report_lines.extend(format_dict(data))
519
- report_lines.append("")
520
- report_lines.append("=" * 60)
521
-
522
- return "\n".join(report_lines)
523
-
524
- # Research Tools
525
- def search_information(query: str, max_results: int = 5) -> dict:
526
- """Simulate information search with structured results."""
527
- # Simulated search results for testing
528
- simulated_results = [
529
- {
530
- "title": f"Research Paper: {query} - Latest Developments",
531
- "source": "Journal of Advanced Computing",
532
- "summary": f"Recent breakthrough in {query} showing promising results in error reduction and scalability improvements.",
533
- "relevance_score": random.randint(80, 95),
534
- "publication_date": "2024-11-15",
535
- },
536
- {
537
- "title": f"Technical Review: {query} Implementation Challenges",
538
- "source": "Tech Innovation Quarterly",
539
- "summary": f"Comprehensive analysis of current {query} methodologies and their practical applications.",
540
- "relevance_score": random.randint(75, 90),
541
- "publication_date": "2024-10-22",
542
- },
543
- {
544
- "title": f"Industry Report: {query} Market Trends",
545
- "source": "Technology Research Institute",
546
- "summary": f"Market analysis and future projections for {query} adoption across industries.",
547
- "relevance_score": random.randint(70, 85),
548
- "publication_date": "2024-09-30",
549
- },
550
- ]
551
-
552
- return {
553
- "query": query,
554
- "total_results": len(simulated_results),
555
- "results": simulated_results[:max_results],
556
- "search_timestamp": datetime.now().isoformat(),
557
- }
558
-
559
- def synthesize_research(search_results: dict) -> dict:
560
- """Synthesize research findings into structured summary."""
561
- results = search_results["results"]
562
-
563
- key_findings = []
564
- technical_approaches = []
565
- citations = []
566
-
567
- for i, result in enumerate(results, 1):
568
- key_findings.append(f"Finding {i}: {result['summary']}")
569
- technical_approaches.append(
570
- f"Approach {i}: Methodology described in '{result['title']}'"
571
- )
572
- citations.append(
573
- f"[{i}] {result['title']} - {result['source']} ({result['publication_date']})"
574
- )
575
-
576
- return {
577
- "research_topic": search_results["query"],
578
- "sources_analyzed": len(results),
579
- "key_findings": key_findings,
580
- "technical_approaches": technical_approaches,
581
- "citations": citations,
582
- "confidence_level": "High" if len(results) >= 3 else "Medium",
583
- "synthesis_date": datetime.now().isoformat(),
584
- }
585
-
586
650
  # Create and return all tools
587
651
  return [
588
652
  # Financial Analysis
@@ -590,20 +654,51 @@ class ModelBenchmark:
590
654
  tools_factory.create_tool(project_investment_growth, vhc_eligible=False),
591
655
  # Data Analysis
592
656
  tools_factory.create_tool(generate_customer_dataset, vhc_eligible=False),
593
- tools_factory.create_tool(analyze_customer_data, vhc_eligible=False),
657
+ tools_factory.create_tool(self.analyze_customer_data, vhc_eligible=False),
594
658
  # System Monitoring
595
- tools_factory.create_tool(get_system_metrics, vhc_eligible=False),
596
- tools_factory.create_tool(check_system_health, vhc_eligible=False),
659
+ tools_factory.create_tool(self.get_system_metrics, vhc_eligible=False),
660
+ tools_factory.create_tool(self.check_system_health, vhc_eligible=False),
597
661
  # Project Management
598
- tools_factory.create_tool(create_project_tasks, vhc_eligible=False),
599
- tools_factory.create_tool(plan_sprint, vhc_eligible=False),
662
+ tools_factory.create_tool(self.create_project_tasks, vhc_eligible=False),
663
+ tools_factory.create_tool(self.plan_sprint, vhc_eligible=False),
600
664
  # Reporting
601
- tools_factory.create_tool(create_formatted_report, vhc_eligible=False),
665
+ tools_factory.create_tool(self.create_formatted_report, vhc_eligible=False),
602
666
  # Research
603
- tools_factory.create_tool(search_information, vhc_eligible=False),
604
- tools_factory.create_tool(synthesize_research, vhc_eligible=False),
667
+ tools_factory.create_tool(self.search_information, vhc_eligible=False),
668
+ tools_factory.create_tool(self.synthesize_research, vhc_eligible=False),
605
669
  ]
606
670
 
671
+ def _calculate_provider_delay(self, provider: ModelProvider) -> float:
672
+ """Calculate appropriate delay based on provider rate limits."""
673
+ base_delay = 60.0 / self.provider_rate_limits.get(
674
+ provider, 60
675
+ ) # seconds between requests
676
+ # Add jitter to prevent thundering herd
677
+ jitter = random.uniform(0.5, 1.5)
678
+ return base_delay * jitter * 2 # Extra conservative multiplier
679
+
680
+ async def _retry_with_backoff(
681
+ self, func, max_retries: int = 3, base_delay: float = 1.0
682
+ ):
683
+ """Retry function with exponential backoff on rate limit errors."""
684
+ for attempt in range(max_retries):
685
+ try:
686
+ return await func()
687
+ except Exception as e:
688
+ error_str = str(e).lower()
689
+ if "rate limit" in error_str or "429" in error_str:
690
+ if attempt == max_retries - 1:
691
+ raise # Last attempt, re-raise the error
692
+
693
+ # Calculate backoff delay
694
+ delay = base_delay * (2**attempt) + random.uniform(0, 1)
695
+ print(
696
+ f" ⏳ Rate limit hit, retrying in {delay:.1f}s (attempt {attempt + 1}/{max_retries})"
697
+ )
698
+ await asyncio.sleep(delay)
699
+ else:
700
+ raise # Non-rate-limit error, don't retry
701
+
607
702
  async def measure_streaming_response(
608
703
  self, agent: Agent, prompt: str
609
704
  ) -> Tuple[float, float, int]:
@@ -706,14 +801,15 @@ class ModelBenchmark:
706
801
  )
707
802
 
708
803
  async def run_benchmarks(self):
709
- """Run all benchmark combinations."""
804
+ """Run all benchmark combinations with parallel execution."""
710
805
  global _observability_initialized
711
806
 
712
807
  print("Starting model performance benchmarks...")
713
808
  print(
714
809
  f"Testing {len(self.models_to_test)} models across {len(self.test_scenarios)} scenarios"
715
810
  )
716
- print(f"Running {self.iterations_per_test} iterations per combination\n")
811
+ print(f"Running {self.iterations_per_test} iterations per combination")
812
+ print(f"Max concurrent models: {self.max_concurrent_models}\n")
717
813
 
718
814
  # Setup observability once if enabled and not already initialized
719
815
  if self.enable_observability and not _observability_initialized:
@@ -727,47 +823,116 @@ class ModelBenchmark:
727
823
  else:
728
824
  print("⚠️ Arize Phoenix observability setup failed\n")
729
825
 
730
- total_tests = (
731
- len(self.models_to_test)
732
- * len(self.test_scenarios)
733
- * self.iterations_per_test
734
- )
735
- current_test = 0
826
+ # Create semaphore to limit concurrent model testing
827
+ model_semaphore = asyncio.Semaphore(self.max_concurrent_models)
736
828
 
829
+ # Create tasks for all model benchmarks
830
+ tasks = []
737
831
  for model_config in self.models_to_test:
832
+ task = asyncio.create_task(
833
+ self._run_model_benchmark(model_config, model_semaphore)
834
+ )
835
+ tasks.append(task)
836
+
837
+ # Execute all model benchmarks in parallel
838
+ print("🚀 Starting parallel benchmark execution...\n")
839
+ await asyncio.gather(*tasks, return_exceptions=True)
840
+
841
+ async def _run_model_benchmark(
842
+ self, model_config: Dict, semaphore: asyncio.Semaphore
843
+ ):
844
+ """Run all benchmarks for a single model."""
845
+ async with semaphore:
738
846
  provider = model_config["provider"]
739
847
  model_name = model_config["model"]
740
848
 
741
849
  print(f"\n{'='*60}")
742
- print(f"Testing: {provider.value} - {model_name}")
850
+ print(f"Starting: {provider.value} - {model_name}")
743
851
  print(f"{'='*60}")
744
852
 
853
+ # Run all scenarios for this model sequentially to avoid rate limits
745
854
  for test_name, test_config in self.test_scenarios.items():
746
- print(f"\nRunning {test_name}: {test_config['description']}")
747
-
748
- for iteration in range(self.iterations_per_test):
749
- current_test += 1
750
- progress = (current_test / total_tests) * 100
751
- print(
752
- f" Iteration {iteration + 1}/{self.iterations_per_test} ({progress:.1f}% complete)"
855
+ try:
856
+ await self._run_scenario_benchmark(
857
+ provider, model_name, test_name, test_config
753
858
  )
859
+ except Exception as e:
860
+ print(f"❌ Error in {model_name} - {test_name}: {e}")
861
+
862
+ print(f"✅ Completed: {provider.value} - {model_name}")
754
863
 
755
- result = await self.run_single_benchmark(
864
+ async def _run_scenario_benchmark(
865
+ self,
866
+ provider: ModelProvider,
867
+ model_name: str,
868
+ test_name: str,
869
+ test_config: Dict[str, Any],
870
+ ):
871
+ """Run all iterations for a single test scenario sequentially."""
872
+ print(
873
+ f"\n🔄 Running {model_name}/{test_name}: {test_config['description']}"
874
+ )
875
+
876
+ iteration_results = []
877
+
878
+ # Run iterations sequentially to avoid rate limits
879
+ for iteration in range(self.iterations_per_test):
880
+ iteration_num = iteration + 1
881
+ try:
882
+ # Use retry with backoff for rate limit handling
883
+ async def run_benchmark():
884
+ return await self.run_single_benchmark(
756
885
  provider, model_name, test_name, test_config
757
886
  )
758
- self.results.append(result)
759
887
 
760
- if result.error:
761
- print(f" ERROR: {result.error}")
762
- else:
763
- print(
764
- f" Time: {result.total_response_time:.2f}s, "
765
- f"First token: {result.first_token_latency:.2f}s, "
766
- f"Speed: {result.tokens_per_second:.1f} chars/sec"
767
- )
888
+ result = await self._retry_with_backoff(
889
+ run_benchmark, max_retries=3, base_delay=2.0
890
+ )
891
+ iteration_results.append(result)
768
892
 
769
- # Small delay between tests
770
- await asyncio.sleep(1)
893
+ if result.error:
894
+ print(
895
+ f" ❌ {model_name}/{test_name} Iteration {iteration_num}: {result.error}"
896
+ )
897
+ else:
898
+ print(
899
+ f" ✅ {model_name}/{test_name} Iteration {iteration_num}: "
900
+ f"{result.total_response_time:.2f}s, "
901
+ f"first token: {result.first_token_latency:.2f}s, "
902
+ f"{result.tokens_per_second:.1f} chars/sec"
903
+ )
904
+
905
+ except Exception as e:
906
+ print(f" ❌ {model_name}/{test_name} Iteration {iteration_num}: {e}")
907
+ # Create error result
908
+ error_result = BenchmarkResult(
909
+ model_name=model_name,
910
+ provider=provider.value,
911
+ test_type=test_name,
912
+ first_token_latency=-1,
913
+ total_response_time=-1,
914
+ response_length=0,
915
+ tokens_per_second=0,
916
+ error=str(e),
917
+ )
918
+ iteration_results.append(error_result)
919
+
920
+ # Add delay between iterations based on provider
921
+ if iteration_num < self.iterations_per_test:
922
+ delay = self._calculate_provider_delay(provider)
923
+ await asyncio.sleep(delay)
924
+
925
+ # Add all results to the main results list
926
+ self.results.extend(iteration_results)
927
+
928
+ # Calculate success rate for this scenario
929
+ successful = len([r for r in iteration_results if r.error is None])
930
+ success_rate = (successful / len(iteration_results)) * 100
931
+ print(
932
+ f" 📊 {model_name}/{test_name} complete: {successful}/{len(iteration_results)} successful ({success_rate:.1f}%)"
933
+ )
934
+
935
+ return iteration_results
771
936
 
772
937
  def calculate_statistics(self) -> List[BenchmarkStats]:
773
938
  """Calculate aggregated statistics from results."""
@@ -913,7 +1078,17 @@ async def main():
913
1078
 
914
1079
  # Check if observability should be enabled via environment variable
915
1080
  enable_observability = os.getenv("ENABLE_OBSERVABILITY", "false").lower() == "true"
916
- benchmark = ModelBenchmark(enable_observability=enable_observability)
1081
+
1082
+ # Allow configuring concurrency via environment variable
1083
+ max_concurrent_models = int(os.getenv("MAX_CONCURRENT_MODELS", "5"))
1084
+
1085
+ benchmark = ModelBenchmark(
1086
+ enable_observability=enable_observability,
1087
+ max_concurrent_models=max_concurrent_models,
1088
+ )
1089
+
1090
+ # Validate that all required API keys are present before running benchmarks
1091
+ validate_api_keys(benchmark.models_to_test)
917
1092
 
918
1093
  try:
919
1094
  await benchmark.run_benchmarks()