vectara-agentic 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that public registry.
Note: this version of vectara-agentic has been flagged as potentially problematic.
- tests/__init__.py +1 -0
- tests/benchmark_models.py +547 -372
- tests/conftest.py +14 -12
- tests/endpoint.py +9 -5
- tests/run_tests.py +1 -0
- tests/test_agent.py +22 -9
- tests/test_agent_fallback_memory.py +4 -4
- tests/test_agent_memory_consistency.py +4 -4
- tests/test_agent_type.py +2 -0
- tests/test_api_endpoint.py +13 -13
- tests/test_bedrock.py +9 -1
- tests/test_fallback.py +18 -7
- tests/test_gemini.py +14 -40
- tests/test_groq.py +43 -1
- tests/test_openai.py +160 -0
- tests/test_private_llm.py +19 -6
- tests/test_react_error_handling.py +293 -0
- tests/test_react_memory.py +257 -0
- tests/test_react_streaming.py +135 -0
- tests/test_react_workflow_events.py +395 -0
- tests/test_return_direct.py +1 -0
- tests/test_serialization.py +58 -20
- tests/test_session_memory.py +11 -11
- tests/test_streaming.py +0 -44
- tests/test_together.py +75 -1
- tests/test_tools.py +3 -1
- tests/test_vectara_llms.py +2 -2
- tests/test_vhc.py +7 -2
- tests/test_workflow.py +17 -11
- vectara_agentic/_callback.py +79 -21
- vectara_agentic/_version.py +1 -1
- vectara_agentic/agent.py +65 -27
- vectara_agentic/agent_core/serialization.py +5 -9
- vectara_agentic/agent_core/streaming.py +245 -64
- vectara_agentic/agent_core/utils/schemas.py +2 -2
- vectara_agentic/llm_utils.py +64 -15
- vectara_agentic/tools.py +88 -31
- {vectara_agentic-0.4.2.dist-info → vectara_agentic-0.4.4.dist-info}/METADATA +133 -36
- vectara_agentic-0.4.4.dist-info/RECORD +59 -0
- vectara_agentic-0.4.2.dist-info/RECORD +0 -54
- {vectara_agentic-0.4.2.dist-info → vectara_agentic-0.4.4.dist-info}/WHEEL +0 -0
- {vectara_agentic-0.4.2.dist-info → vectara_agentic-0.4.4.dist-info}/licenses/LICENSE +0 -0
- {vectara_agentic-0.4.2.dist-info → vectara_agentic-0.4.4.dist-info}/top_level.txt +0 -0
tests/benchmark_models.py
CHANGED
@@ -12,7 +12,8 @@ import json
 import statistics
 import sys
 import os
-
+import random
+from typing import Dict, List, Tuple, Any, Set
 from dataclasses import dataclass, asdict

 # Add the current directory to Python path to import vectara_agentic
@@ -28,6 +29,64 @@ from vectara_agentic._observability import setup_observer, shutdown_observer
 _observability_initialized = False


+def validate_api_keys(models_to_test: List[Dict]) -> None:
+    """
+    Validate that all required API keys are present for the models being tested.
+
+    Args:
+        models_to_test: List of model configurations with provider and model info
+
+    Raises:
+        SystemExit: If any required API keys are missing
+    """
+    # Map providers to their required environment variables
+    provider_api_keys = {
+        ModelProvider.OPENAI: "OPENAI_API_KEY",
+        ModelProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
+        ModelProvider.TOGETHER: "TOGETHER_API_KEY",
+        ModelProvider.GROQ: "GROQ_API_KEY",
+        ModelProvider.GEMINI: "GOOGLE_API_KEY",
+    }
+
+    required_keys = set()
+
+    # Collect unique providers from models to test
+    providers_in_use: Set[ModelProvider] = set()
+    for model_config in models_to_test:
+        providers_in_use.add(model_config["provider"])
+
+    # Add required API keys for each provider
+    for provider in providers_in_use:
+        api_key_name = provider_api_keys.get(provider)
+        if api_key_name:  # Skip providers that don't use env var API keys
+            required_keys.add(api_key_name)
+
+    # Check for missing API keys
+    missing_keys = []
+    for key in required_keys:
+        if not os.getenv(key):
+            missing_keys.append(key)
+
+    if missing_keys:
+        print("❌ ERROR: Missing required API keys for benchmark execution:")
+        print()
+        for key in sorted(missing_keys):
+            print(f" • {key}")
+        print()
+        print("Please set these environment variables before running the benchmark.")
+        print("Providers being tested:")
+        for provider in sorted(providers_in_use, key=lambda p: p.value):
+            models_for_provider = [
+                m["model"] for m in models_to_test if m["provider"] == provider
+            ]
+            print(f" • {provider.value}: {', '.join(models_for_provider)}")
+
+        sys.exit(1)
+
+    print("✅ All required API keys are present")
+    print(f"Found API keys for {len(required_keys)} required environment variables")
+
+
 @dataclass
 class BenchmarkResult:
     """Results from a single benchmark run."""
@@ -65,22 +124,21 @@ class BenchmarkStats:
 class ModelBenchmark:
     """Benchmarking suite for different LLM models."""

-    def __init__(
+    def __init__(
+        self, enable_observability: bool = False, max_concurrent_models: int = 2
+    ):
         # Test configurations
         self.enable_observability = enable_observability
+        self.max_concurrent_models = max_concurrent_models
         self.models_to_test = [
             # OpenAI models
-            {"provider": ModelProvider.OPENAI, "model": "gpt-5"},
             {"provider": ModelProvider.OPENAI, "model": "gpt-5-mini"},
-            {"provider": ModelProvider.OPENAI, "model": "gpt-4o"},
             {"provider": ModelProvider.OPENAI, "model": "gpt-4o-mini"},
-            {"provider": ModelProvider.OPENAI, "model": "gpt-4.1"},
             {"provider": ModelProvider.OPENAI, "model": "gpt-4.1-mini"},
             {"provider": ModelProvider.ANTHROPIC, "model": "claude-sonnet-4-20250514"},
             {"provider": ModelProvider.TOGETHER, "model": "deepseek-ai/DeepSeek-V3"},
             {"provider": ModelProvider.GROQ, "model": "openai/gpt-oss-20b"},
-            {"provider": ModelProvider.GEMINI, "model": "models/gemini-2.5-flash"},
-            {"provider": ModelProvider.GEMINI, "model": "models/gemini-2.5-pro"},
+            {"provider": ModelProvider.GEMINI, "model": "models/gemini-2.5-flash-lite"},
         ]

         # Test scenarios - focused on advanced tool calling only
@@ -115,6 +173,15 @@ class ModelBenchmark:
         self.iterations_per_test = 5
         self.results: List[BenchmarkResult] = []

+        # Provider-specific rate limits (requests per minute)
+        self.provider_rate_limits = {
+            ModelProvider.OPENAI: 100,
+            ModelProvider.ANTHROPIC: 100,
+            ModelProvider.TOGETHER: 80,
+            ModelProvider.GROQ: 50,  # Conservative for GROQ
+            ModelProvider.GEMINI: 60,
+        }
+
     def create_agent_config(
         self, provider: ModelProvider, model_name: str
     ) -> AgentConfig:
@@ -131,13 +198,334 @@ class ModelBenchmark:
             ),
         )

-    def
-        """
-
-
+    def analyze_customer_data(self, customer_data_json: str) -> dict:
+        """Analyze customer data for patterns and correlations."""
+        customers = json.loads(customer_data_json)
+
+        # Group by age groups
+        age_groups = {}
+        for customer in customers:
+            group = customer["age_group"]
+            if group not in age_groups:
+                age_groups[group] = {
+                    "count": 0,
+                    "total_spending": 0,
+                    "total_income": 0,
+                }
+
+            age_groups[group]["count"] += 1
+            age_groups[group]["total_spending"] += customer["purchase_history"]
+            age_groups[group]["total_income"] += customer["income"]
+
+        # Calculate averages
+        analysis = {}
+        for group, data in age_groups.items():
+            analysis[group] = {
+                "count": data["count"],
+                "avg_spending": round(data["total_spending"] / data["count"], 2),
+                "avg_income": round(data["total_income"] / data["count"], 2),
+                "spending_to_income_ratio": round(
+                    (data["total_spending"] / data["count"])
+                    / (data["total_income"] / data["count"])
+                    * 1000,
+                    4,
+                ),
+            }
+
+        return {
+            "total_customers": len(customers),
+            "age_group_analysis": analysis,
+            "overall_avg_spending": round(
+                sum(c["purchase_history"] for c in customers) / len(customers), 2
+            ),
+            "overall_avg_income": round(
+                sum(c["income"] for c in customers) / len(customers), 2
+            ),
+        }
+
+    def get_system_metrics(self) -> dict:
+        """Get current system performance metrics."""
         import psutil
         from datetime import datetime

+        try:
+            cpu_percent = psutil.cpu_percent(interval=1)
+            memory = psutil.virtual_memory()
+            disk = psutil.disk_usage("/")
+
+            return {
+                "cpu_usage_percent": cpu_percent,
+                "memory_usage_percent": memory.percent,
+                "memory_available_gb": round(memory.available / (1024**3), 2),
+                "disk_usage_percent": disk.percent,
+                "disk_free_gb": round(disk.free / (1024**3), 2),
+                "timestamp": datetime.now().isoformat(),
+            }
+        except Exception:
+            # Fallback with simulated data for testing
+            return {
+                "cpu_usage_percent": random.randint(20, 95),
+                "memory_usage_percent": random.randint(40, 95),
+                "memory_available_gb": random.randint(1, 16),
+                "disk_usage_percent": random.randint(30, 90),
+                "disk_free_gb": random.randint(10, 500),
+                "timestamp": datetime.now().isoformat(),
+                "note": "Simulated data - psutil unavailable",
+            }
+
+    def check_system_health(
+        self,
+        cpu_threshold: int = 80,
+        memory_threshold: int = 90,
+        disk_threshold: int = 85,
+    ) -> dict:
+        """Check system health against thresholds and generate alerts."""
+        metrics = self.get_system_metrics()
+        alerts = []
+        recommendations = []
+
+        if metrics["cpu_usage_percent"] > cpu_threshold:
+            alerts.append(
+                f"HIGH CPU USAGE: {metrics['cpu_usage_percent']}% (threshold: {cpu_threshold}%)"
+            )
+            recommendations.append(
+                "Consider closing unnecessary applications or upgrading CPU"
+            )
+
+        if metrics["memory_usage_percent"] > memory_threshold:
+            alerts.append(
+                f"HIGH MEMORY USAGE: {metrics['memory_usage_percent']}% (threshold: {memory_threshold}%)"
+            )
+            recommendations.append(
+                "Close memory-intensive applications or add more RAM"
+            )
+
+        if metrics["disk_usage_percent"] > disk_threshold:
+            alerts.append(
+                f"LOW DISK SPACE: {metrics['disk_usage_percent']}% used (threshold: {disk_threshold}%)"
+            )
+            recommendations.append("Clean up temporary files or expand disk storage")
+
+        health_status = (
+            "CRITICAL" if len(alerts) >= 2 else "WARNING" if alerts else "HEALTHY"
+        )
+
+        return {
+            "health_status": health_status,
+            "alerts": alerts,
+            "recommendations": recommendations,
+            "metrics": metrics,
+            "thresholds": {
+                "cpu": cpu_threshold,
+                "memory": memory_threshold,
+                "disk": disk_threshold,
+            },
+        }
+
+    def create_project_tasks(self, count: int = 10) -> str:
+        """Generate a list of software development tasks."""
+        task_types = [
+            "Implement user authentication system",
+            "Create REST API endpoints",
+            "Design database schema",
+            "Build responsive frontend components",
+            "Write unit tests",
+            "Set up CI/CD pipeline",
+            "Implement error handling",
+            "Create API documentation",
+            "Optimize database queries",
+            "Implement caching layer",
+            "Add logging and monitoring",
+            "Create user dashboard",
+            "Implement search functionality",
+            "Add data validation",
+            "Create admin panel",
+        ]
+
+        tasks = []
+        for i in range(count):
+            task = random.choice(task_types)
+            priority = random.choice(["High", "Medium", "Low"])
+            estimated_hours = random.randint(2, 24)
+
+            tasks.append(
+                {
+                    "task_id": f"TASK-{i+1:03d}",
+                    "title": f"{task} #{i+1}",
+                    "priority": priority,
+                    "estimated_hours": estimated_hours,
+                    "status": "Backlog",
+                    "assigned_to": None,
+                }
+            )
+
+        return json.dumps(tasks, indent=2)
+
+    def plan_sprint(self, tasks_json: str, sprint_capacity_hours: int = 80) -> dict:
+        """Organize tasks into a sprint with daily breakdowns."""
+        tasks = json.loads(tasks_json)
+
+        # Sort by priority and estimated hours
+        priority_order = {"High": 3, "Medium": 2, "Low": 1}
+        tasks.sort(
+            key=lambda x: (priority_order[x["priority"]], -x["estimated_hours"]),
+            reverse=True,
+        )
+
+        sprint_tasks = []
+        total_hours = 0
+
+        for task in tasks:
+            if total_hours + task["estimated_hours"] <= sprint_capacity_hours:
+                sprint_tasks.append(task)
+                total_hours += task["estimated_hours"]
+            else:
+                break
+
+        # Distribute across 2 weeks (10 working days)
+        daily_breakdown = []
+        remaining_hours = total_hours
+        days_remaining = 10
+
+        for day in range(1, 11):
+            if days_remaining > 0:
+                day_hours = min(
+                    8,
+                    remaining_hours // days_remaining
+                    + (1 if remaining_hours % days_remaining else 0),
+                )
+                daily_breakdown.append(
+                    {
+                        "day": day,
+                        "planned_hours": day_hours,
+                        "remaining_capacity": 8 - day_hours,
+                    }
+                )
+                remaining_hours -= day_hours
+                days_remaining -= 1
+
+        return {
+            "sprint_summary": {
+                "total_tasks": len(sprint_tasks),
+                "total_planned_hours": total_hours,
+                "sprint_capacity": sprint_capacity_hours,
+                "utilization_percent": round(
+                    (total_hours / sprint_capacity_hours) * 100, 1
+                ),
+            },
+            "selected_tasks": sprint_tasks,
+            "daily_breakdown": daily_breakdown,
+            "backlog_remaining": len(tasks) - len(sprint_tasks),
+        }
+
+    def create_formatted_report(
+        self, title: str, data: dict, report_type: str = "summary"
+    ) -> str:
+        """Create a formatted text report from structured data."""
+        from datetime import datetime
+
+        report_lines = []
+        report_lines.append("=" * 60)
+        report_lines.append(f"{title.upper()}")
+        report_lines.append("=" * 60)
+        report_lines.append(
+            f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
+        )
+        report_lines.append(f"Report Type: {report_type.title()}")
+        report_lines.append("")
+
+        def format_dict(d, indent=0):
+            lines = []
+            for key, value in d.items():
+                prefix = " " * indent
+                if isinstance(value, dict):
+                    lines.append(f"{prefix}{key.replace('_', ' ').title()}:")
+                    lines.extend(format_dict(value, indent + 1))
+                elif isinstance(value, list):
+                    lines.append(f"{prefix}{key.replace('_', ' ').title()}:")
+                    for i, item in enumerate(value):
+                        if isinstance(item, dict):
+                            lines.append(f"{prefix} Item {i+1}:")
+                            lines.extend(format_dict(item, indent + 2))
+                        else:
+                            lines.append(f"{prefix} - {item}")
+                else:
+                    lines.append(f"{prefix}{key.replace('_', ' ').title()}: {value}")
+            return lines
+
+        report_lines.extend(format_dict(data))
+        report_lines.append("")
+        report_lines.append("=" * 60)
+
+        return "\n".join(report_lines)
+
+    def search_information(self, query: str, max_results: int = 5) -> dict:
+        """Simulate information search with structured results."""
+        from datetime import datetime
+
+        # Simulated search results for testing
+        simulated_results = [
+            {
+                "title": f"Research Paper: {query} - Latest Developments",
+                "source": "Journal of Advanced Computing",
+                "summary": f"Recent breakthrough in {query} showing promising results in error reduction and scalability improvements.",
+                "relevance_score": random.randint(80, 95),
+                "publication_date": "2024-11-15",
+            },
+            {
+                "title": f"Technical Review: {query} Implementation Challenges",
+                "source": "Tech Innovation Quarterly",
+                "summary": f"Comprehensive analysis of current {query} methodologies and their practical applications.",
+                "relevance_score": random.randint(75, 90),
+                "publication_date": "2024-10-22",
+            },
+            {
+                "title": f"Industry Report: {query} Market Trends",
+                "source": "Technology Research Institute",
+                "summary": f"Market analysis and future projections for {query} adoption across industries.",
+                "relevance_score": random.randint(70, 85),
+                "publication_date": "2024-09-30",
+            },
+        ]
+
+        return {
+            "query": query,
+            "total_results": len(simulated_results),
+            "results": simulated_results[:max_results],
+            "search_timestamp": datetime.now().isoformat(),
+        }
+
+    def synthesize_research(self, search_results: dict) -> dict:
+        """Synthesize research findings into structured summary."""
+        from datetime import datetime
+
+        results = search_results["results"]
+
+        key_findings = []
+        technical_approaches = []
+        citations = []
+
+        for i, result in enumerate(results, 1):
+            key_findings.append(f"Finding {i}: {result['summary']}")
+            technical_approaches.append(
+                f"Approach {i}: Methodology described in '{result['title']}'"
+            )
+            citations.append(
+                f"[{i}] {result['title']} - {result['source']} ({result['publication_date']})"
+            )
+
+        return {
+            "research_topic": search_results["query"],
+            "sources_analyzed": len(results),
+            "key_findings": key_findings,
+            "technical_approaches": technical_approaches,
+            "citations": citations,
+            "confidence_level": "High" if len(results) >= 3 else "Medium",
+            "synthesis_date": datetime.now().isoformat(),
+        }
+
+    def create_test_tools(self) -> List:
+        """Create an advanced set of tools for realistic agent testing."""
         tools_factory = ToolsFactory()

         # Financial Analysis Tools
@@ -259,330 +647,6 @@ class ModelBenchmark:

             return json.dumps(customers, indent=2)

-        def analyze_customer_data(customer_data_json: str) -> dict:
-            """Analyze customer data for patterns and correlations."""
-            customers = json.loads(customer_data_json)
-
-            # Group by age groups
-            age_groups = {}
-            for customer in customers:
-                group = customer["age_group"]
-                if group not in age_groups:
-                    age_groups[group] = {
-                        "count": 0,
-                        "total_spending": 0,
-                        "total_income": 0,
-                    }
-
-                age_groups[group]["count"] += 1
-                age_groups[group]["total_spending"] += customer["purchase_history"]
-                age_groups[group]["total_income"] += customer["income"]
-
-            # Calculate averages
-            analysis = {}
-            for group, data in age_groups.items():
-                analysis[group] = {
-                    "count": data["count"],
-                    "avg_spending": round(data["total_spending"] / data["count"], 2),
-                    "avg_income": round(data["total_income"] / data["count"], 2),
-                    "spending_to_income_ratio": round(
-                        (data["total_spending"] / data["count"])
-                        / (data["total_income"] / data["count"])
-                        * 1000,
-                        4,
-                    ),
-                }
-
-            return {
-                "total_customers": len(customers),
-                "age_group_analysis": analysis,
-                "overall_avg_spending": round(
-                    sum(c["purchase_history"] for c in customers) / len(customers), 2
-                ),
-                "overall_avg_income": round(
-                    sum(c["income"] for c in customers) / len(customers), 2
-                ),
-            }
-
-        # System Monitoring Tools
-        def get_system_metrics() -> dict:
-            """Get current system performance metrics."""
-            try:
-                cpu_percent = psutil.cpu_percent(interval=1)
-                memory = psutil.virtual_memory()
-                disk = psutil.disk_usage("/")
-
-                return {
-                    "cpu_usage_percent": cpu_percent,
-                    "memory_usage_percent": memory.percent,
-                    "memory_available_gb": round(memory.available / (1024**3), 2),
-                    "disk_usage_percent": disk.percent,
-                    "disk_free_gb": round(disk.free / (1024**3), 2),
-                    "timestamp": datetime.now().isoformat(),
-                }
-            except Exception:
-                # Fallback with simulated data for testing
-                return {
-                    "cpu_usage_percent": random.randint(20, 95),
-                    "memory_usage_percent": random.randint(40, 95),
-                    "memory_available_gb": random.randint(1, 16),
-                    "disk_usage_percent": random.randint(30, 90),
-                    "disk_free_gb": random.randint(10, 500),
-                    "timestamp": datetime.now().isoformat(),
-                    "note": "Simulated data - psutil unavailable",
-                }
-
-        def check_system_health(
-            cpu_threshold: int = 80,
-            memory_threshold: int = 90,
-            disk_threshold: int = 85,
-        ) -> dict:
-            """Check system health against thresholds and generate alerts."""
-            metrics = get_system_metrics()
-            alerts = []
-            recommendations = []
-
-            if metrics["cpu_usage_percent"] > cpu_threshold:
-                alerts.append(
-                    f"HIGH CPU USAGE: {metrics['cpu_usage_percent']}% (threshold: {cpu_threshold}%)"
-                )
-                recommendations.append(
-                    "Consider closing unnecessary applications or upgrading CPU"
-                )
-
-            if metrics["memory_usage_percent"] > memory_threshold:
-                alerts.append(
-                    f"HIGH MEMORY USAGE: {metrics['memory_usage_percent']}% (threshold: {memory_threshold}%)"
-                )
-                recommendations.append(
-                    "Close memory-intensive applications or add more RAM"
-                )
-
-            if metrics["disk_usage_percent"] > disk_threshold:
-                alerts.append(
-                    f"LOW DISK SPACE: {metrics['disk_usage_percent']}% used (threshold: {disk_threshold}%)"
-                )
-                recommendations.append(
-                    "Clean up temporary files or expand disk storage"
-                )
-
-            health_status = (
-                "CRITICAL" if len(alerts) >= 2 else "WARNING" if alerts else "HEALTHY"
-            )
-
-            return {
-                "health_status": health_status,
-                "alerts": alerts,
-                "recommendations": recommendations,
-                "metrics": metrics,
-                "thresholds": {
-                    "cpu": cpu_threshold,
-                    "memory": memory_threshold,
-                    "disk": disk_threshold,
-                },
-            }
-
-        # Project Management Tools
-        def create_project_tasks(count: int = 10) -> str:
-            """Generate a list of software development tasks."""
-            task_types = [
-                "Implement user authentication system",
-                "Create REST API endpoints",
-                "Design database schema",
-                "Build responsive frontend components",
-                "Write unit tests",
-                "Set up CI/CD pipeline",
-                "Implement error handling",
-                "Create API documentation",
-                "Optimize database queries",
-                "Implement caching layer",
-                "Add logging and monitoring",
-                "Create user dashboard",
-                "Implement search functionality",
-                "Add data validation",
-                "Create admin panel",
-            ]
-
-            tasks = []
-            for i in range(count):
-                task = random.choice(task_types)
-                priority = random.choice(["High", "Medium", "Low"])
-                estimated_hours = random.randint(2, 24)
-
-                tasks.append(
-                    {
-                        "task_id": f"TASK-{i+1:03d}",
-                        "title": f"{task} #{i+1}",
-                        "priority": priority,
-                        "estimated_hours": estimated_hours,
-                        "status": "Backlog",
-                        "assigned_to": None,
-                    }
-                )
-
-            return json.dumps(tasks, indent=2)
-
-        def plan_sprint(tasks_json: str, sprint_capacity_hours: int = 80) -> dict:
-            """Organize tasks into a sprint with daily breakdowns."""
-            tasks = json.loads(tasks_json)
-
-            # Sort by priority and estimated hours
-            priority_order = {"High": 3, "Medium": 2, "Low": 1}
-            tasks.sort(
-                key=lambda x: (priority_order[x["priority"]], -x["estimated_hours"]),
-                reverse=True,
-            )
-
-            sprint_tasks = []
-            total_hours = 0
-
-            for task in tasks:
-                if total_hours + task["estimated_hours"] <= sprint_capacity_hours:
-                    sprint_tasks.append(task)
-                    total_hours += task["estimated_hours"]
-                else:
-                    break
-
-            # Distribute across 2 weeks (10 working days)
-            daily_breakdown = []
-            remaining_hours = total_hours
-            days_remaining = 10
-
-            for day in range(1, 11):
-                if days_remaining > 0:
-                    day_hours = min(
-                        8,
-                        remaining_hours // days_remaining
-                        + (1 if remaining_hours % days_remaining else 0),
-                    )
-                    daily_breakdown.append(
-                        {
-                            "day": day,
-                            "planned_hours": day_hours,
-                            "remaining_capacity": 8 - day_hours,
-                        }
-                    )
-                    remaining_hours -= day_hours
-                    days_remaining -= 1
-
-            return {
-                "sprint_summary": {
-                    "total_tasks": len(sprint_tasks),
-                    "total_planned_hours": total_hours,
-                    "sprint_capacity": sprint_capacity_hours,
-                    "utilization_percent": round(
-                        (total_hours / sprint_capacity_hours) * 100, 1
-                    ),
-                },
-                "selected_tasks": sprint_tasks,
-                "daily_breakdown": daily_breakdown,
-                "backlog_remaining": len(tasks) - len(sprint_tasks),
-            }
-
-        # Reporting Tools
-        def create_formatted_report(
-            title: str, data: dict, report_type: str = "summary"
-        ) -> str:
-            """Create a formatted text report from structured data."""
-            report_lines = []
-            report_lines.append("=" * 60)
-            report_lines.append(f"{title.upper()}")
-            report_lines.append("=" * 60)
-            report_lines.append(
-                f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
-            )
-            report_lines.append(f"Report Type: {report_type.title()}")
-            report_lines.append("")
-
-            def format_dict(d, indent=0):
-                lines = []
-                for key, value in d.items():
-                    prefix = " " * indent
-                    if isinstance(value, dict):
-                        lines.append(f"{prefix}{key.replace('_', ' ').title()}:")
-                        lines.extend(format_dict(value, indent + 1))
-                    elif isinstance(value, list):
-                        lines.append(f"{prefix}{key.replace('_', ' ').title()}:")
-                        for i, item in enumerate(value):
-                            if isinstance(item, dict):
-                                lines.append(f"{prefix} Item {i+1}:")
-                                lines.extend(format_dict(item, indent + 2))
-                            else:
-                                lines.append(f"{prefix} - {item}")
-                    else:
-                        lines.append(
-                            f"{prefix}{key.replace('_', ' ').title()}: {value}"
-                        )
-                return lines
-
-            report_lines.extend(format_dict(data))
-            report_lines.append("")
-            report_lines.append("=" * 60)
-
-            return "\n".join(report_lines)
-
-        # Research Tools
-        def search_information(query: str, max_results: int = 5) -> dict:
-            """Simulate information search with structured results."""
-            # Simulated search results for testing
-            simulated_results = [
-                {
-                    "title": f"Research Paper: {query} - Latest Developments",
-                    "source": "Journal of Advanced Computing",
-                    "summary": f"Recent breakthrough in {query} showing promising results in error reduction and scalability improvements.",
-                    "relevance_score": random.randint(80, 95),
-                    "publication_date": "2024-11-15",
-                },
-                {
-                    "title": f"Technical Review: {query} Implementation Challenges",
-                    "source": "Tech Innovation Quarterly",
-                    "summary": f"Comprehensive analysis of current {query} methodologies and their practical applications.",
-                    "relevance_score": random.randint(75, 90),
-                    "publication_date": "2024-10-22",
-                },
-                {
-                    "title": f"Industry Report: {query} Market Trends",
-                    "source": "Technology Research Institute",
-                    "summary": f"Market analysis and future projections for {query} adoption across industries.",
-                    "relevance_score": random.randint(70, 85),
-                    "publication_date": "2024-09-30",
-                },
-            ]
-
-            return {
-                "query": query,
-                "total_results": len(simulated_results),
-                "results": simulated_results[:max_results],
-                "search_timestamp": datetime.now().isoformat(),
-            }
-
-        def synthesize_research(search_results: dict) -> dict:
-            """Synthesize research findings into structured summary."""
-            results = search_results["results"]
-
-            key_findings = []
-            technical_approaches = []
-            citations = []
-
-            for i, result in enumerate(results, 1):
-                key_findings.append(f"Finding {i}: {result['summary']}")
-                technical_approaches.append(
-                    f"Approach {i}: Methodology described in '{result['title']}'"
-                )
-                citations.append(
-                    f"[{i}] {result['title']} - {result['source']} ({result['publication_date']})"
-                )
-
-            return {
-                "research_topic": search_results["query"],
-                "sources_analyzed": len(results),
-                "key_findings": key_findings,
-                "technical_approaches": technical_approaches,
-                "citations": citations,
-                "confidence_level": "High" if len(results) >= 3 else "Medium",
-                "synthesis_date": datetime.now().isoformat(),
-            }
-
         # Create and return all tools
         return [
             # Financial Analysis
@@ -590,20 +654,51 @@ class ModelBenchmark:
             tools_factory.create_tool(project_investment_growth, vhc_eligible=False),
             # Data Analysis
             tools_factory.create_tool(generate_customer_dataset, vhc_eligible=False),
-            tools_factory.create_tool(analyze_customer_data, vhc_eligible=False),
+            tools_factory.create_tool(self.analyze_customer_data, vhc_eligible=False),
             # System Monitoring
-            tools_factory.create_tool(get_system_metrics, vhc_eligible=False),
-            tools_factory.create_tool(check_system_health, vhc_eligible=False),
+            tools_factory.create_tool(self.get_system_metrics, vhc_eligible=False),
+            tools_factory.create_tool(self.check_system_health, vhc_eligible=False),
             # Project Management
-            tools_factory.create_tool(create_project_tasks, vhc_eligible=False),
-            tools_factory.create_tool(plan_sprint, vhc_eligible=False),
+            tools_factory.create_tool(self.create_project_tasks, vhc_eligible=False),
+            tools_factory.create_tool(self.plan_sprint, vhc_eligible=False),
             # Reporting
-            tools_factory.create_tool(create_formatted_report, vhc_eligible=False),
+            tools_factory.create_tool(self.create_formatted_report, vhc_eligible=False),
             # Research
-            tools_factory.create_tool(search_information, vhc_eligible=False),
-            tools_factory.create_tool(synthesize_research, vhc_eligible=False),
+            tools_factory.create_tool(self.search_information, vhc_eligible=False),
+            tools_factory.create_tool(self.synthesize_research, vhc_eligible=False),
         ]

+    def _calculate_provider_delay(self, provider: ModelProvider) -> float:
+        """Calculate appropriate delay based on provider rate limits."""
+        base_delay = 60.0 / self.provider_rate_limits.get(
+            provider, 60
+        )  # seconds between requests
+        # Add jitter to prevent thundering herd
+        jitter = random.uniform(0.5, 1.5)
+        return base_delay * jitter * 2  # Extra conservative multiplier
+
+    async def _retry_with_backoff(
+        self, func, max_retries: int = 3, base_delay: float = 1.0
+    ):
+        """Retry function with exponential backoff on rate limit errors."""
+        for attempt in range(max_retries):
+            try:
+                return await func()
+            except Exception as e:
+                error_str = str(e).lower()
+                if "rate limit" in error_str or "429" in error_str:
+                    if attempt == max_retries - 1:
+                        raise  # Last attempt, re-raise the error
+
+                    # Calculate backoff delay
+                    delay = base_delay * (2**attempt) + random.uniform(0, 1)
+                    print(
+                        f" ⏳ Rate limit hit, retrying in {delay:.1f}s (attempt {attempt + 1}/{max_retries})"
+                    )
+                    await asyncio.sleep(delay)
+                else:
+                    raise  # Non-rate-limit error, don't retry
+
     async def measure_streaming_response(
         self, agent: Agent, prompt: str
     ) -> Tuple[float, float, int]:
@@ -706,14 +801,15 @@ class ModelBenchmark:
         )

     async def run_benchmarks(self):
-        """Run all benchmark combinations."""
+        """Run all benchmark combinations with parallel execution."""
         global _observability_initialized

         print("Starting model performance benchmarks...")
         print(
             f"Testing {len(self.models_to_test)} models across {len(self.test_scenarios)} scenarios"
         )
-        print(f"Running {self.iterations_per_test} iterations per combination
+        print(f"Running {self.iterations_per_test} iterations per combination")
+        print(f"Max concurrent models: {self.max_concurrent_models}\n")

         # Setup observability once if enabled and not already initialized
         if self.enable_observability and not _observability_initialized:
@@ -727,47 +823,116 @@ class ModelBenchmark:
         else:
             print("⚠️ Arize Phoenix observability setup failed\n")

-
-
-            * len(self.test_scenarios)
-            * self.iterations_per_test
-        )
-        current_test = 0
+        # Create semaphore to limit concurrent model testing
+        model_semaphore = asyncio.Semaphore(self.max_concurrent_models)

+        # Create tasks for all model benchmarks
+        tasks = []
         for model_config in self.models_to_test:
+            task = asyncio.create_task(
+                self._run_model_benchmark(model_config, model_semaphore)
+            )
+            tasks.append(task)
+
+        # Execute all model benchmarks in parallel
+        print("🚀 Starting parallel benchmark execution...\n")
+        await asyncio.gather(*tasks, return_exceptions=True)
+
+    async def _run_model_benchmark(
+        self, model_config: Dict, semaphore: asyncio.Semaphore
+    ):
+        """Run all benchmarks for a single model."""
+        async with semaphore:
             provider = model_config["provider"]
             model_name = model_config["model"]

             print(f"\n{'='*60}")
-            print(f"
+            print(f"Starting: {provider.value} - {model_name}")
             print(f"{'='*60}")

+            # Run all scenarios for this model sequentially to avoid rate limits
             for test_name, test_config in self.test_scenarios.items():
-
-
-
-                    current_test += 1
-                    progress = (current_test / total_tests) * 100
-                    print(
-                        f" Iteration {iteration + 1}/{self.iterations_per_test} ({progress:.1f}% complete)"
+                try:
+                    await self._run_scenario_benchmark(
+                        provider, model_name, test_name, test_config
                     )
+                except Exception as e:
+                    print(f"❌ Error in {model_name} - {test_name}: {e}")
+
+        print(f"✅ Completed: {provider.value} - {model_name}")

-
+    async def _run_scenario_benchmark(
+        self,
+        provider: ModelProvider,
+        model_name: str,
+        test_name: str,
+        test_config: Dict[str, Any],
+    ):
+        """Run all iterations for a single test scenario sequentially."""
+        print(
+            f"\n🔄 Running {model_name}/{test_name}: {test_config['description']}"
+        )
+
+        iteration_results = []
+
+        # Run iterations sequentially to avoid rate limits
+        for iteration in range(self.iterations_per_test):
+            iteration_num = iteration + 1
+            try:
+                # Use retry with backoff for rate limit handling
+                async def run_benchmark():
+                    return await self.run_single_benchmark(
                         provider, model_name, test_name, test_config
                     )
-                    self.results.append(result)

-
-
-
-
-                        f" Time: {result.total_response_time:.2f}s, "
-                        f"First token: {result.first_token_latency:.2f}s, "
-                        f"Speed: {result.tokens_per_second:.1f} chars/sec"
-                    )
+                result = await self._retry_with_backoff(
+                    run_benchmark, max_retries=3, base_delay=2.0
+                )
+                iteration_results.append(result)

-
-
+                if result.error:
+                    print(
+                        f" ❌ {model_name}/{test_name} Iteration {iteration_num}: {result.error}"
+                    )
+                else:
+                    print(
+                        f" ✅ {model_name}/{test_name} Iteration {iteration_num}: "
+                        f"{result.total_response_time:.2f}s, "
+                        f"first token: {result.first_token_latency:.2f}s, "
+                        f"{result.tokens_per_second:.1f} chars/sec"
+                    )
+
+            except Exception as e:
+                print(f" ❌ {model_name}/{test_name} Iteration {iteration_num}: {e}")
+                # Create error result
+                error_result = BenchmarkResult(
+                    model_name=model_name,
+                    provider=provider.value,
+                    test_type=test_name,
+                    first_token_latency=-1,
+                    total_response_time=-1,
+                    response_length=0,
+                    tokens_per_second=0,
+                    error=str(e),
+                )
+                iteration_results.append(error_result)
+
+            # Add delay between iterations based on provider
+            if iteration_num < self.iterations_per_test:
+                delay = self._calculate_provider_delay(provider)
+                await asyncio.sleep(delay)
+
+        # Add all results to the main results list
+        self.results.extend(iteration_results)
+
+        # Calculate success rate for this scenario
+        successful = len([r for r in iteration_results if r.error is None])
+        success_rate = (successful / len(iteration_results)) * 100
+        print(
+            f" 📊 {model_name}/{test_name} complete: {successful}/{len(iteration_results)} successful ({success_rate:.1f}%)"
+        )
+
+        return iteration_results

     def calculate_statistics(self) -> List[BenchmarkStats]:
         """Calculate aggregated statistics from results."""
@@ -913,7 +1078,17 @@ async def main():

     # Check if observability should be enabled via environment variable
     enable_observability = os.getenv("ENABLE_OBSERVABILITY", "false").lower() == "true"
-
+
+    # Allow configuring concurrency via environment variable
+    max_concurrent_models = int(os.getenv("MAX_CONCURRENT_MODELS", "5"))
+
+    benchmark = ModelBenchmark(
+        enable_observability=enable_observability,
+        max_concurrent_models=max_concurrent_models,
+    )
+
+    # Validate that all required API keys are present before running benchmarks
+    validate_api_keys(benchmark.models_to_test)

     try:
         await benchmark.run_benchmarks()