isa-model 0.3.9__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/__init__.py +1 -1
- isa_model/client.py +732 -565
- isa_model/core/cache/redis_cache.py +401 -0
- isa_model/core/config/config_manager.py +53 -10
- isa_model/core/config.py +1 -1
- isa_model/core/database/__init__.py +1 -0
- isa_model/core/database/migrations.py +277 -0
- isa_model/core/database/supabase_client.py +123 -0
- isa_model/core/models/__init__.py +37 -0
- isa_model/core/models/model_billing_tracker.py +60 -88
- isa_model/core/models/model_manager.py +36 -18
- isa_model/core/models/model_repo.py +44 -38
- isa_model/core/models/model_statistics_tracker.py +234 -0
- isa_model/core/models/model_storage.py +0 -1
- isa_model/core/models/model_version_manager.py +959 -0
- isa_model/core/pricing_manager.py +2 -249
- isa_model/core/resilience/circuit_breaker.py +366 -0
- isa_model/core/security/secrets.py +358 -0
- isa_model/core/services/__init__.py +2 -4
- isa_model/core/services/intelligent_model_selector.py +101 -370
- isa_model/core/storage/hf_storage.py +1 -1
- isa_model/core/types.py +7 -0
- isa_model/deployment/cloud/modal/isa_audio_chatTTS_service.py +520 -0
- isa_model/deployment/cloud/modal/isa_audio_fish_service.py +0 -0
- isa_model/deployment/cloud/modal/isa_audio_openvoice_service.py +758 -0
- isa_model/deployment/cloud/modal/isa_audio_service_v2.py +1044 -0
- isa_model/deployment/cloud/modal/isa_embed_rerank_service.py +296 -0
- isa_model/deployment/cloud/modal/isa_video_hunyuan_service.py +423 -0
- isa_model/deployment/cloud/modal/isa_vision_ocr_service.py +519 -0
- isa_model/deployment/cloud/modal/isa_vision_qwen25_service.py +709 -0
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +467 -323
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +607 -180
- isa_model/deployment/cloud/modal/isa_vision_ui_service_optimized.py +660 -0
- isa_model/deployment/core/deployment_manager.py +6 -4
- isa_model/deployment/services/auto_hf_modal_deployer.py +894 -0
- isa_model/eval/benchmarks/__init__.py +27 -0
- isa_model/eval/benchmarks/multimodal_datasets.py +460 -0
- isa_model/eval/benchmarks.py +244 -12
- isa_model/eval/evaluators/__init__.py +8 -2
- isa_model/eval/evaluators/audio_evaluator.py +727 -0
- isa_model/eval/evaluators/embedding_evaluator.py +742 -0
- isa_model/eval/evaluators/vision_evaluator.py +564 -0
- isa_model/eval/example_evaluation.py +395 -0
- isa_model/eval/factory.py +272 -5
- isa_model/eval/isa_benchmarks.py +700 -0
- isa_model/eval/isa_integration.py +582 -0
- isa_model/eval/metrics.py +159 -6
- isa_model/eval/tests/unit/test_basic.py +396 -0
- isa_model/inference/ai_factory.py +44 -8
- isa_model/inference/services/audio/__init__.py +21 -0
- isa_model/inference/services/audio/base_realtime_service.py +225 -0
- isa_model/inference/services/audio/isa_tts_service.py +0 -0
- isa_model/inference/services/audio/openai_realtime_service.py +320 -124
- isa_model/inference/services/audio/openai_stt_service.py +32 -6
- isa_model/inference/services/base_service.py +17 -1
- isa_model/inference/services/embedding/__init__.py +13 -0
- isa_model/inference/services/embedding/base_embed_service.py +111 -8
- isa_model/inference/services/embedding/isa_embed_service.py +305 -0
- isa_model/inference/services/embedding/openai_embed_service.py +2 -4
- isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
- isa_model/inference/services/img/__init__.py +2 -2
- isa_model/inference/services/img/base_image_gen_service.py +24 -7
- isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
- isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
- isa_model/inference/services/img/services/replicate_flux.py +226 -0
- isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
- isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
- isa_model/inference/services/img/tests/test_img_client.py +297 -0
- isa_model/inference/services/llm/base_llm_service.py +30 -6
- isa_model/inference/services/llm/helpers/llm_adapter.py +63 -9
- isa_model/inference/services/llm/ollama_llm_service.py +2 -1
- isa_model/inference/services/llm/openai_llm_service.py +652 -55
- isa_model/inference/services/llm/yyds_llm_service.py +2 -1
- isa_model/inference/services/vision/__init__.py +5 -5
- isa_model/inference/services/vision/base_vision_service.py +118 -185
- isa_model/inference/services/vision/helpers/image_utils.py +11 -5
- isa_model/inference/services/vision/isa_vision_service.py +573 -0
- isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
- isa_model/serving/api/fastapi_server.py +88 -16
- isa_model/serving/api/middleware/auth.py +311 -0
- isa_model/serving/api/middleware/security.py +278 -0
- isa_model/serving/api/routes/analytics.py +486 -0
- isa_model/serving/api/routes/deployments.py +339 -0
- isa_model/serving/api/routes/evaluations.py +579 -0
- isa_model/serving/api/routes/logs.py +430 -0
- isa_model/serving/api/routes/settings.py +582 -0
- isa_model/serving/api/routes/unified.py +324 -165
- isa_model/serving/api/startup.py +304 -0
- isa_model/serving/modal_proxy_server.py +249 -0
- isa_model/training/__init__.py +100 -6
- isa_model/training/core/__init__.py +4 -1
- isa_model/training/examples/intelligent_training_example.py +281 -0
- isa_model/training/intelligent/__init__.py +25 -0
- isa_model/training/intelligent/decision_engine.py +643 -0
- isa_model/training/intelligent/intelligent_factory.py +888 -0
- isa_model/training/intelligent/knowledge_base.py +751 -0
- isa_model/training/intelligent/resource_optimizer.py +839 -0
- isa_model/training/intelligent/task_classifier.py +576 -0
- isa_model/training/storage/__init__.py +24 -0
- isa_model/training/storage/core_integration.py +439 -0
- isa_model/training/storage/training_repository.py +552 -0
- isa_model/training/storage/training_storage.py +628 -0
- {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/METADATA +13 -1
- isa_model-0.4.0.dist-info/RECORD +182 -0
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
- isa_model/deployment/cloud/modal/register_models.py +0 -321
- isa_model/inference/adapter/unified_api.py +0 -248
- isa_model/inference/services/helpers/stacked_config.py +0 -148
- isa_model/inference/services/img/flux_professional_service.py +0 -603
- isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/others/table_transformer_service.py +0 -61
- isa_model/inference/services/vision/doc_analysis_service.py +0 -640
- isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/vision/ui_analysis_service.py +0 -823
- isa_model/scripts/inference_tracker.py +0 -283
- isa_model/scripts/mlflow_manager.py +0 -379
- isa_model/scripts/model_registry.py +0 -465
- isa_model/scripts/register_models.py +0 -370
- isa_model/scripts/register_models_with_embeddings.py +0 -510
- isa_model/scripts/start_mlflow.py +0 -95
- isa_model/scripts/training_tracker.py +0 -257
- isa_model-0.3.9.dist-info/RECORD +0 -138
- {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/WHEEL +0 -0
- {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,700 @@
+"""
+ISA Model Service Benchmarks.
+
+Specialized benchmarks for evaluating ISA custom services:
+- Modal deployment performance
+- Cost-effectiveness analysis
+- GPU utilization testing
+- Service reliability and scalability
+- Cross-service comparison
+"""
+
+import asyncio
+import logging
+import time
+import statistics
+from typing import Dict, List, Any, Optional, Union
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+import json
+
+from .isa_integration import ISAModelInterface
+from .evaluators.base_evaluator import BaseEvaluator, EvaluationResult
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ServicePerformanceMetrics:
+    """Performance metrics for ISA services."""
+    service_name: str
+    total_requests: int
+    successful_requests: int
+    failed_requests: int
+    avg_latency_ms: float
+    p95_latency_ms: float
+    p99_latency_ms: float
+    throughput_rps: float  # Requests per second
+    total_cost_usd: float
+    cost_per_request_usd: float
+    gpu_utilization_percent: Optional[float] = None
+    memory_usage_mb: Optional[float] = None
+    error_rate: float = 0.0
+
+
+class ISAServiceBenchmark:
+    """
+    Comprehensive benchmark suite for ISA services.
+
+    Tests performance, cost, reliability, and scalability of:
+    - ISA OCR Service (Surya OCR)
+    - ISA Vision Services (Qwen2.5-VL, Table extraction)
+    - ISA Audio SOTA Service
+    - ISA Embedding & Reranking Service
+    - ISA Video Generation Service
+    """
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """Initialize ISA service benchmark."""
+        self.config = config or {}
+        self.interface = ISAModelInterface(config)
+
+        # Benchmark configuration
+        self.test_duration_seconds = self.config.get("test_duration_seconds", 60)
+        self.max_concurrent_requests = self.config.get("max_concurrent_requests", 10)
+        self.warmup_requests = self.config.get("warmup_requests", 5)
+
+        # Service configurations
+        self.services_to_test = self.config.get("services_to_test", [
+            "isa_ocr_service",
+            "isa_vision_qwen25_service",
+            "isa_audio_sota_service",
+            "isa_embedding_reranking_service"
+        ])
+
+        # Test data
+        self.test_samples = self._prepare_test_samples()
+
+    def _prepare_test_samples(self) -> Dict[str, List[Dict[str, Any]]]:
+        """Prepare test samples for different service types."""
+        samples = {
+            "ocr": [
+                {"text": "Sample OCR text for performance testing", "complexity": "simple"},
+                {"text": "More complex OCR text with special characters: éñ中文", "complexity": "medium"},
+                {"text": "Very complex OCR text with multiple languages and formatting", "complexity": "complex"}
+            ],
+            "vision_vqa": [
+                {"question": "What color is the object?", "complexity": "simple"},
+                {"question": "Describe the scene in detail", "complexity": "medium"},
+                {"question": "Analyze the complex relationships in this image", "complexity": "complex"}
+            ],
+            "audio_stt": [
+                {"duration": 5, "content": "Short audio clip", "complexity": "simple"},
+                {"duration": 30, "content": "Medium length audio", "complexity": "medium"},
+                {"duration": 120, "content": "Long audio clip", "complexity": "complex"}
+            ],
+            "embedding": [
+                {"text": "Short text for embedding", "length": "short"},
+                {"text": "Medium length text for embedding testing with more content", "length": "medium"},
+                {"text": "Very long text for embedding testing " * 20, "length": "long"}
+            ]
+        }
+        return samples
+
+    async def run_comprehensive_benchmark(self) -> Dict[str, Any]:
+        """Run comprehensive benchmark across all ISA services."""
+        logger.info("Starting comprehensive ISA service benchmark")
+
+        results = {
+            "benchmark_start_time": datetime.now().isoformat(),
+            "config": self.config,
+            "service_results": {},
+            "comparative_analysis": {},
+            "summary": {}
+        }
+
+        # Test each service
+        for service_name in self.services_to_test:
+            logger.info(f"Benchmarking {service_name}")
+            try:
+                service_results = await self._benchmark_service(service_name)
+                results["service_results"][service_name] = service_results
+            except Exception as e:
+                logger.error(f"Error benchmarking {service_name}: {e}")
+                results["service_results"][service_name] = {"error": str(e)}
+
+        # Comparative analysis
+        results["comparative_analysis"] = self._perform_comparative_analysis(
+            results["service_results"]
+        )
+
+        # Summary
+        results["summary"] = self._generate_summary(results["service_results"])
+
+        results["benchmark_end_time"] = datetime.now().isoformat()
+
+        logger.info("Comprehensive benchmark completed")
+        return results
+
+    async def _benchmark_service(self, service_name: str) -> Dict[str, Any]:
+        """Benchmark a specific ISA service."""
+        service_type = self._get_service_type(service_name)
+        test_samples = self.test_samples.get(service_type, [])
+
+        if not test_samples:
+            logger.warning(f"No test samples for service type: {service_type}")
+            return {"error": "No test samples available"}
+
+        # Warmup
+        await self._warmup_service(service_name, test_samples[:self.warmup_requests])
+
+        # Performance testing
+        performance_results = await self._run_performance_test(service_name, test_samples)
+
+        # Load testing
+        load_results = await self._run_load_test(service_name, test_samples)
+
+        # Reliability testing
+        reliability_results = await self._run_reliability_test(service_name, test_samples)
+
+        # Cost analysis
+        cost_analysis = self._analyze_costs(performance_results, load_results)
+
+        return {
+            "service_name": service_name,
+            "service_type": service_type,
+            "performance_test": performance_results,
+            "load_test": load_results,
+            "reliability_test": reliability_results,
+            "cost_analysis": cost_analysis,
+            "overall_metrics": self._calculate_overall_metrics(
+                performance_results, load_results, reliability_results
+            )
+        }
+
+    def _get_service_type(self, service_name: str) -> str:
+        """Map service name to service type."""
+        mapping = {
+            "isa_ocr_service": "ocr",
+            "isa_vision_qwen25_service": "vision_vqa",
+            "isa_audio_sota_service": "audio_stt",
+            "isa_embedding_reranking_service": "embedding"
+        }
+        return mapping.get(service_name, "unknown")
+
+    async def _warmup_service(self, service_name: str, samples: List[Dict[str, Any]]):
+        """Warm up the service with initial requests."""
+        logger.info(f"Warming up {service_name}")
+
+        for sample in samples:
+            try:
+                await self._make_service_request(service_name, sample)
+                await asyncio.sleep(0.5)  # Brief pause between warmup requests
+            except Exception as e:
+                logger.warning(f"Warmup request failed: {e}")
+
+    async def _run_performance_test(self, service_name: str, samples: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Run performance test measuring latency and accuracy."""
+        logger.info(f"Running performance test for {service_name}")
+
+        results = {
+            "test_type": "performance",
+            "requests": [],
+            "metrics": {}
+        }
+
+        # Test each sample type
+        for sample in samples:
+            for _ in range(5):  # 5 requests per sample type
+                start_time = time.time()
+                try:
+                    response = await self._make_service_request(service_name, sample)
+                    latency = (time.time() - start_time) * 1000  # Convert to milliseconds
+
+                    request_result = {
+                        "success": True,
+                        "latency_ms": latency,
+                        "sample_complexity": sample.get("complexity", "unknown"),
+                        "response_size": len(str(response)),
+                        "cost_estimate": response.get("cost_usd", 0.0)
+                    }
+
+                except Exception as e:
+                    request_result = {
+                        "success": False,
+                        "error": str(e),
+                        "latency_ms": (time.time() - start_time) * 1000,
+                        "sample_complexity": sample.get("complexity", "unknown")
+                    }
+
+                results["requests"].append(request_result)
+
+        # Calculate metrics
+        successful_requests = [r for r in results["requests"] if r["success"]]
+        failed_requests = [r for r in results["requests"] if not r["success"]]
+
+        if successful_requests:
+            latencies = [r["latency_ms"] for r in successful_requests]
+            costs = [r.get("cost_estimate", 0.0) for r in successful_requests]
+
+            results["metrics"] = {
+                "total_requests": len(results["requests"]),
+                "successful_requests": len(successful_requests),
+                "failed_requests": len(failed_requests),
+                "success_rate": len(successful_requests) / len(results["requests"]),
+                "avg_latency_ms": statistics.mean(latencies),
+                "median_latency_ms": statistics.median(latencies),
+                "p95_latency_ms": self._percentile(latencies, 95),
+                "p99_latency_ms": self._percentile(latencies, 99),
+                "min_latency_ms": min(latencies),
+                "max_latency_ms": max(latencies),
+                "total_cost_usd": sum(costs),
+                "avg_cost_per_request": statistics.mean(costs) if costs else 0.0
+            }
+        else:
+            results["metrics"] = {
+                "total_requests": len(results["requests"]),
+                "successful_requests": 0,
+                "failed_requests": len(failed_requests),
+                "success_rate": 0.0,
+                "error": "All requests failed"
+            }
+
+        return results
+
+    async def _run_load_test(self, service_name: str, samples: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Run load test to measure throughput and scalability."""
+        logger.info(f"Running load test for {service_name}")
+
+        results = {
+            "test_type": "load",
+            "test_duration_seconds": self.test_duration_seconds,
+            "max_concurrent_requests": self.max_concurrent_requests,
+            "requests": [],
+            "metrics": {}
+        }
+
+        # Create semaphore for concurrency control
+        semaphore = asyncio.Semaphore(self.max_concurrent_requests)
+
+        start_time = time.time()
+        end_time = start_time + self.test_duration_seconds
+
+        async def make_request():
+            async with semaphore:
+                sample = samples[len(results["requests"]) % len(samples)]
+                request_start = time.time()
+
+                try:
+                    response = await self._make_service_request(service_name, sample)
+                    latency = (time.time() - request_start) * 1000
+
+                    return {
+                        "success": True,
+                        "latency_ms": latency,
+                        "timestamp": request_start,
+                        "cost_estimate": response.get("cost_usd", 0.0)
+                    }
+                except Exception as e:
+                    return {
+                        "success": False,
+                        "error": str(e),
+                        "latency_ms": (time.time() - request_start) * 1000,
+                        "timestamp": request_start
+                    }
+
+        # Generate load
+        tasks = []
+        while time.time() < end_time:
+            if len(tasks) < self.max_concurrent_requests:
+                task = asyncio.create_task(make_request())
+                tasks.append(task)
+
+            # Collect completed tasks
+            done_tasks = [task for task in tasks if task.done()]
+            for task in done_tasks:
+                try:
+                    result = await task
+                    results["requests"].append(result)
+                except Exception as e:
+                    logger.error(f"Task error: {e}")
+                tasks.remove(task)
+
+            await asyncio.sleep(0.1)  # Brief pause
+
+        # Wait for remaining tasks
+        if tasks:
+            remaining_results = await asyncio.gather(*tasks, return_exceptions=True)
+            for result in remaining_results:
+                if isinstance(result, dict):
+                    results["requests"].append(result)
+
+        # Calculate load test metrics
+        if results["requests"]:
+            successful_requests = [r for r in results["requests"] if r["success"]]
+            total_time = time.time() - start_time
+
+            results["metrics"] = {
+                "total_requests": len(results["requests"]),
+                "successful_requests": len(successful_requests),
+                "failed_requests": len(results["requests"]) - len(successful_requests),
+                "success_rate": len(successful_requests) / len(results["requests"]),
+                "throughput_rps": len(results["requests"]) / total_time,
+                "successful_throughput_rps": len(successful_requests) / total_time,
+                "actual_test_duration": total_time,
+                "concurrent_requests_achieved": min(self.max_concurrent_requests, len(results["requests"]))
+            }
+
+            if successful_requests:
+                latencies = [r["latency_ms"] for r in successful_requests]
+                results["metrics"].update({
+                    "avg_latency_ms": statistics.mean(latencies),
+                    "p95_latency_ms": self._percentile(latencies, 95),
+                    "p99_latency_ms": self._percentile(latencies, 99)
+                })
+
+        return results
+
+    async def _run_reliability_test(self, service_name: str, samples: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Run reliability test to measure service stability."""
+        logger.info(f"Running reliability test for {service_name}")
+
+        results = {
+            "test_type": "reliability",
+            "test_scenarios": [],
+            "metrics": {}
+        }
+
+        # Test different reliability scenarios
+        scenarios = [
+            {"name": "consecutive_requests", "description": "100 consecutive requests"},
+            {"name": "burst_requests", "description": "Burst of 20 concurrent requests"},
+            {"name": "mixed_complexity", "description": "Mixed complexity requests"}
+        ]
+
+        for scenario in scenarios:
+            scenario_results = await self._run_reliability_scenario(service_name, samples, scenario)
+            results["test_scenarios"].append(scenario_results)
+
+        # Calculate overall reliability metrics
+        all_requests = []
+        for scenario in results["test_scenarios"]:
+            all_requests.extend(scenario.get("requests", []))
+
+        if all_requests:
+            successful = [r for r in all_requests if r["success"]]
+            results["metrics"] = {
+                "total_reliability_requests": len(all_requests),
+                "successful_reliability_requests": len(successful),
+                "overall_reliability_rate": len(successful) / len(all_requests),
+                "failure_types": self._analyze_failure_types(all_requests)
+            }
+
+        return results
+
+    async def _run_reliability_scenario(self, service_name: str, samples: List[Dict[str, Any]], scenario: Dict[str, Any]) -> Dict[str, Any]:
+        """Run a specific reliability scenario."""
+        scenario_results = {
+            "scenario": scenario,
+            "requests": [],
+            "metrics": {}
+        }
+
+        if scenario["name"] == "consecutive_requests":
+            # 100 consecutive requests
+            for i in range(100):
+                sample = samples[i % len(samples)]
+                try:
+                    start_time = time.time()
+                    response = await self._make_service_request(service_name, sample)
+                    latency = (time.time() - start_time) * 1000
+
+                    scenario_results["requests"].append({
+                        "success": True,
+                        "request_number": i,
+                        "latency_ms": latency
+                    })
+                except Exception as e:
+                    scenario_results["requests"].append({
+                        "success": False,
+                        "request_number": i,
+                        "error": str(e)
+                    })
+
+        elif scenario["name"] == "burst_requests":
+            # 20 concurrent requests
+            tasks = []
+            for i in range(20):
+                sample = samples[i % len(samples)]
+                task = asyncio.create_task(self._make_service_request(service_name, sample))
+                tasks.append(task)
+
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+            for i, result in enumerate(results):
+                if isinstance(result, Exception):
+                    scenario_results["requests"].append({
+                        "success": False,
+                        "request_number": i,
+                        "error": str(result)
+                    })
+                else:
+                    scenario_results["requests"].append({
+                        "success": True,
+                        "request_number": i,
+                        "response": result
+                    })
+
+        elif scenario["name"] == "mixed_complexity":
+            # Mix of different complexity samples
+            for _ in range(30):
+                for sample in samples:  # Test each complexity
+                    try:
+                        start_time = time.time()
+                        response = await self._make_service_request(service_name, sample)
+                        latency = (time.time() - start_time) * 1000
+
+                        scenario_results["requests"].append({
+                            "success": True,
+                            "complexity": sample.get("complexity", "unknown"),
+                            "latency_ms": latency
+                        })
+                    except Exception as e:
+                        scenario_results["requests"].append({
+                            "success": False,
+                            "complexity": sample.get("complexity", "unknown"),
+                            "error": str(e)
+                        })
+
+        # Calculate scenario metrics
+        successful = [r for r in scenario_results["requests"] if r["success"]]
+        scenario_results["metrics"] = {
+            "total_requests": len(scenario_results["requests"]),
+            "successful_requests": len(successful),
+            "success_rate": len(successful) / len(scenario_results["requests"]) if scenario_results["requests"] else 0
+        }
+
+        return scenario_results
+
+    async def _make_service_request(self, service_name: str, sample: Dict[str, Any]) -> Dict[str, Any]:
+        """Make a request to a specific ISA service."""
+        service_type = self._get_service_type(service_name)
+
+        if service_type == "ocr":
+            # Mock image for OCR testing
+            return await self.interface.vision_analysis(
+                image="mock_image_data",
+                task_type="ocr",
+                model_name="isa-surya-ocr-service"
+            )
+
+        elif service_type == "vision_vqa":
+            return await self.interface.vision_analysis(
+                image="mock_image_data",
+                prompt=sample["question"],
+                task_type="vqa",
+                model_name="isa-qwen25-vision-service"
+            )
+
+        elif service_type == "audio_stt":
+            return await self.interface.audio_processing(
+                audio="mock_audio_data",
+                task_type="stt",
+                model_name="isa_audio_sota_service"
+            )
+
+        elif service_type == "embedding":
+            return await self.interface.embedding_generation(
+                text=sample["text"],
+                model_name="isa-jina-reranker-v2-service"
+            )
+
+        else:
+            raise ValueError(f"Unknown service type: {service_type}")
+
+    def _analyze_costs(self, performance_results: Dict[str, Any], load_results: Dict[str, Any]) -> Dict[str, Any]:
+        """Analyze cost-effectiveness of the service."""
+        analysis = {
+            "cost_breakdown": {},
+            "cost_efficiency": {},
+            "recommendations": []
+        }
+
+        # Extract cost data
+        perf_costs = []
+        load_costs = []
+
+        for request in performance_results.get("requests", []):
+            if request.get("success") and "cost_estimate" in request:
+                perf_costs.append(request["cost_estimate"])
+
+        for request in load_results.get("requests", []):
+            if request.get("success") and "cost_estimate" in request:
+                load_costs.append(request["cost_estimate"])
+
+        all_costs = perf_costs + load_costs
+
+        if all_costs:
+            analysis["cost_breakdown"] = {
+                "total_estimated_cost": sum(all_costs),
+                "avg_cost_per_request": statistics.mean(all_costs),
+                "min_cost_per_request": min(all_costs),
+                "max_cost_per_request": max(all_costs),
+                "cost_variance": statistics.variance(all_costs) if len(all_costs) > 1 else 0
+            }
+
+            # Cost efficiency analysis
+            perf_metrics = performance_results.get("metrics", {})
+            load_metrics = load_results.get("metrics", {})
+
+            avg_latency = perf_metrics.get("avg_latency_ms", 0)
+            throughput = load_metrics.get("throughput_rps", 0)
+
+            if avg_latency > 0 and throughput > 0:
+                analysis["cost_efficiency"] = {
+                    "cost_per_second_latency": statistics.mean(all_costs) / (avg_latency / 1000),
+                    "cost_per_rps": statistics.mean(all_costs) * throughput,
+                    "efficiency_score": throughput / (statistics.mean(all_costs) * avg_latency) if avg_latency > 0 else 0
+                }
+
+        return analysis
+
+    def _calculate_overall_metrics(self, performance: Dict, load: Dict, reliability: Dict) -> ServicePerformanceMetrics:
+        """Calculate overall service performance metrics."""
+        perf_metrics = performance.get("metrics", {})
+        load_metrics = load.get("metrics", {})
+        reliability_metrics = reliability.get("metrics", {})
+
+        return ServicePerformanceMetrics(
+            service_name=performance.get("service_name", "unknown"),
+            total_requests=perf_metrics.get("total_requests", 0) + load_metrics.get("total_requests", 0),
+            successful_requests=perf_metrics.get("successful_requests", 0) + load_metrics.get("successful_requests", 0),
+            failed_requests=perf_metrics.get("failed_requests", 0) + load_metrics.get("failed_requests", 0),
+            avg_latency_ms=perf_metrics.get("avg_latency_ms", 0),
+            p95_latency_ms=perf_metrics.get("p95_latency_ms", 0),
+            p99_latency_ms=perf_metrics.get("p99_latency_ms", 0),
+            throughput_rps=load_metrics.get("throughput_rps", 0),
+            total_cost_usd=perf_metrics.get("total_cost_usd", 0),
+            cost_per_request_usd=perf_metrics.get("avg_cost_per_request", 0),
+            error_rate=1 - reliability_metrics.get("overall_reliability_rate", 1)
+        )
+
+    def _perform_comparative_analysis(self, service_results: Dict[str, Any]) -> Dict[str, Any]:
+        """Perform comparative analysis across services."""
+        analysis = {
+            "performance_comparison": {},
+            "cost_comparison": {},
+            "reliability_comparison": {},
+            "recommendations": []
+        }
+
+        services = list(service_results.keys())
+
+        # Performance comparison
+        performance_data = {}
+        for service in services:
+            if "error" not in service_results[service]:
+                metrics = service_results[service].get("overall_metrics")
+                if metrics:
+                    performance_data[service] = {
+                        "avg_latency_ms": metrics.avg_latency_ms,
+                        "throughput_rps": metrics.throughput_rps,
+                        "success_rate": 1 - metrics.error_rate
+                    }
+
+        analysis["performance_comparison"] = performance_data
+
+        # Cost comparison
+        cost_data = {}
+        for service in services:
+            if "error" not in service_results[service]:
+                metrics = service_results[service].get("overall_metrics")
+                if metrics:
+                    cost_data[service] = {
+                        "cost_per_request": metrics.cost_per_request_usd,
+                        "total_cost": metrics.total_cost_usd
+                    }
+
+        analysis["cost_comparison"] = cost_data
+
+        # Generate recommendations
+        if performance_data:
+            fastest_service = min(performance_data.keys(), key=lambda x: performance_data[x]["avg_latency_ms"])
+            highest_throughput = max(performance_data.keys(), key=lambda x: performance_data[x]["throughput_rps"])
+
+            analysis["recommendations"].extend([
+                f"Fastest response time: {fastest_service}",
+                f"Highest throughput: {highest_throughput}"
+            ])
+
+        if cost_data:
+            most_cost_effective = min(cost_data.keys(), key=lambda x: cost_data[x]["cost_per_request"])
+            analysis["recommendations"].append(f"Most cost-effective: {most_cost_effective}")
+
+        return analysis
+
+    def _generate_summary(self, service_results: Dict[str, Any]) -> Dict[str, Any]:
+        """Generate benchmark summary."""
+        summary = {
+            "total_services_tested": len(service_results),
+            "successful_services": len([s for s in service_results.values() if "error" not in s]),
+            "failed_services": len([s for s in service_results.values() if "error" in s]),
+            "overall_performance": {},
+            "key_findings": []
+        }
+
+        # Calculate overall performance across all services
+        all_latencies = []
+        all_throughputs = []
+        all_costs = []
+
+        for service_name, results in service_results.items():
+            if "error" not in results:
+                metrics = results.get("overall_metrics")
+                if metrics:
+                    all_latencies.append(metrics.avg_latency_ms)
+                    all_throughputs.append(metrics.throughput_rps)
+                    all_costs.append(metrics.cost_per_request_usd)
+
+        if all_latencies:
+            summary["overall_performance"] = {
+                "avg_latency_across_services": statistics.mean(all_latencies),
+                "avg_throughput_across_services": statistics.mean(all_throughputs),
+                "avg_cost_across_services": statistics.mean(all_costs) if all_costs else 0
+            }
+
+        return summary
+
+    def _percentile(self, data: List[float], percentile: int) -> float:
+        """Calculate percentile of data."""
+        if not data:
+            return 0.0
+        sorted_data = sorted(data)
+        index = int((percentile / 100) * len(sorted_data))
+        return sorted_data[min(index, len(sorted_data) - 1)]
+
+    def _analyze_failure_types(self, requests: List[Dict[str, Any]]) -> Dict[str, int]:
+        """Analyze types of failures."""
+        failure_types = {}
+        for request in requests:
+            if not request.get("success"):
+                error = request.get("error", "unknown_error")
+                # Categorize error types
+                if "timeout" in error.lower():
+                    error_type = "timeout"
+                elif "connection" in error.lower():
+                    error_type = "connection_error"
+                elif "rate limit" in error.lower():
+                    error_type = "rate_limit"
+                else:
+                    error_type = "other_error"
+
+                failure_types[error_type] = failure_types.get(error_type, 0) + 1
+
+        return failure_types
+
+
+# Convenience function for running ISA benchmarks
+async def run_isa_service_benchmark(config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+    """Run comprehensive ISA service benchmark."""
+    benchmark = ISAServiceBenchmark(config)
+    return await benchmark.run_comprehensive_benchmark()