isa-model 0.4.0-py3-none-any.whl → 0.4.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
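Because a wheel is just a zip archive, a comparison like the one below can be approximated locally with only the Python standard library. The sketch below is illustrative rather than authoritative: the wheel filenames are inferred from the dist-info directories in the listing, it assumes both files have already been downloaded (for example with `pip download isa-model==0.4.0 --no-deps`), and the per-file line counts it prints will not exactly match the registry's numbers, which also track renamed files (shown as `{old → new}` below).

```python
# Illustrative sketch only -- not part of isa-model. Compares the Python sources
# inside two locally downloaded wheels and prints rough per-file +/- counts.
import difflib
import zipfile

OLD = "isa_model-0.4.0-py3-none-any.whl"  # filenames inferred from the dist-info dirs below
NEW = "isa_model-0.4.3-py3-none-any.whl"

def python_sources(wheel_path: str) -> dict:
    """Map each .py member of a wheel (a zip archive) to its list of text lines."""
    with zipfile.ZipFile(wheel_path) as zf:
        return {
            name: zf.read(name).decode("utf-8", errors="replace").splitlines()
            for name in zf.namelist()
            if name.endswith(".py")
        }

old, new = python_sources(OLD), python_sources(NEW)
for name in sorted(set(old) | set(new)):
    a, b = old.get(name, []), new.get(name, [])
    if a == b:
        continue
    diff = list(difflib.ndiff(a, b))
    added = sum(1 for line in diff if line.startswith("+ "))
    removed = sum(1 for line in diff if line.startswith("- "))
    print(f"{name} +{added} -{removed}")  # roughly the per-file summary shown below
```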
- isa_model/client.py +466 -43
- isa_model/core/cache/redis_cache.py +12 -3
- isa_model/core/config/config_manager.py +230 -3
- isa_model/core/config.py +90 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +21 -1
- isa_model/core/database/supabase_client.py +154 -19
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +27 -18
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_manager.py +40 -17
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +174 -18
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/services/intelligent_model_selector.py +399 -21
- isa_model/core/storage/hf_storage.py +1 -1
- isa_model/core/types.py +1 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -370
- isa_model/deployment/local/__init__.py +31 -0
- isa_model/deployment/local/config.py +248 -0
- isa_model/deployment/local/gpu_gateway.py +607 -0
- isa_model/deployment/local/health_checker.py +428 -0
- isa_model/deployment/local/provider.py +586 -0
- isa_model/deployment/local/tensorrt_service.py +621 -0
- isa_model/deployment/local/transformers_service.py +644 -0
- isa_model/deployment/local/vllm_service.py +527 -0
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +137 -10
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/openai_stt_service.py +22 -6
- isa_model/inference/services/custom_model_manager.py +277 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +335 -24
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/local_llm_service.py +747 -0
- isa_model/inference/services/llm/ollama_llm_service.py +9 -2
- isa_model/inference/services/llm/openai_llm_service.py +33 -16
- isa_model/inference/services/llm/yyds_llm_service.py +8 -2
- isa_model/inference/services/vision/__init__.py +22 -1
- isa_model/inference/services/vision/blip_vision_service.py +359 -0
- isa_model/inference/services/vision/helpers/image_utils.py +8 -5
- isa_model/inference/services/vision/isa_vision_service.py +65 -4
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +172 -22
- isa_model/serving/api/middleware/auth.py +8 -2
- isa_model/serving/api/middleware/security.py +23 -33
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +4 -1
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +138 -2
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +680 -18
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +68 -54
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
- isa_model-0.4.3.dist-info/RECORD +193 -0
- isa_model/core/storage/minio_storage.py +0 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks/__init__.py +0 -27
- isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
- isa_model/eval/benchmarks.py +0 -701
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -24
- isa_model/eval/evaluators/audio_evaluator.py +0 -727
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/embedding_evaluator.py +0 -742
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/evaluators/vision_evaluator.py +0 -564
- isa_model/eval/example_evaluation.py +0 -395
- isa_model/eval/factory.py +0 -798
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/isa_benchmarks.py +0 -700
- isa_model/eval/isa_integration.py +0 -582
- isa_model/eval/metrics.py +0 -951
- isa_model/eval/tests/unit/test_basic.py +0 -396
- isa_model/serving/api/routes/evaluations.py +0 -579
- isa_model/training/__init__.py +0 -168
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -26
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/examples/intelligent_training_example.py +0 -281
- isa_model/training/factory.py +0 -424
- isa_model/training/intelligent/__init__.py +0 -25
- isa_model/training/intelligent/decision_engine.py +0 -643
- isa_model/training/intelligent/intelligent_factory.py +0 -888
- isa_model/training/intelligent/knowledge_base.py +0 -751
- isa_model/training/intelligent/resource_optimizer.py +0 -839
- isa_model/training/intelligent/task_classifier.py +0 -576
- isa_model/training/storage/__init__.py +0 -24
- isa_model/training/storage/core_integration.py +0 -439
- isa_model/training/storage/training_repository.py +0 -552
- isa_model/training/storage/training_storage.py +0 -628
- isa_model-0.4.0.dist-info/RECORD +0 -182
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
isa_model/training/intelligent/intelligent_factory.py (deleted)
@@ -1,888 +0,0 @@
-"""
-Intelligent Training Factory
-
-This module provides the main interface for intelligent AI training.
-It extends the existing TrainingFactory with AI-powered capabilities:
-- Natural language training request parsing
-- Intelligent model and resource selection
-- Automatic configuration optimization
-- Cost and performance prediction
-
-The IntelligentTrainingFactory maintains backward compatibility while
-adding advanced intelligence features.
-"""
-
-import logging
-from typing import Dict, List, Optional, Any, Union
-import os
-from datetime import datetime
-
-from ..factory import TrainingFactory
-from .decision_engine import IntelligentDecisionEngine, TrainingRequest, TrainingRecommendation
-from .task_classifier import TaskClassifier
-from .knowledge_base import KnowledgeBase
-from .resource_optimizer import ResourceOptimizer
-from ..core.config import TrainingConfig, LoRAConfig, DatasetConfig
-
-logger = logging.getLogger(__name__)
-
-
-class IntelligentTrainingFactory(TrainingFactory):
-    """
-    Intelligent Training Factory with AI-powered optimization.
-
-    This factory extends the base TrainingFactory with intelligent capabilities:
-    - Analyzes natural language training requests
-    - Automatically selects optimal models and configurations
-    - Provides cost and performance predictions
-    - Recommends best practices and alternatives
-
-    Maintains full backward compatibility with existing TrainingFactory API
-    while adding new intelligent features.
-
-    Example:
-        ```python
-        from isa_model.training.intelligent import IntelligentTrainingFactory
-
-        # Create intelligent factory
-        factory = IntelligentTrainingFactory()
-
-        # Traditional usage (backward compatible)
-        model_path = factory.train_model(
-            model_name="google/gemma-2-4b-it",
-            dataset_path="tatsu-lab/alpaca"
-        )
-
-        # Intelligent usage with natural language
-        recommendation = factory.analyze_training_request(
-            "Train a Chinese customer service chatbot with high quality",
-            dataset_path="my-chinese-dialogues.json",
-            budget_limit=500.0,
-            time_limit=12
-        )
-
-        # Train with intelligent recommendation
-        model_path = factory.train_with_recommendation(recommendation)
-        ```
-    """
-
-    def __init__(self,
-                 base_output_dir: Optional[str] = None,
-                 enable_intelligence: bool = True,
-                 knowledge_base_dir: Optional[str] = None,
-                 resource_data_dir: Optional[str] = None):
-        """
-        Initialize intelligent training factory.
-
-        Args:
-            base_output_dir: Base directory for training outputs
-            enable_intelligence: Enable intelligent features
-            knowledge_base_dir: Directory for knowledge base data
-            resource_data_dir: Directory for resource data
-        """
-        # Initialize base factory
-        super().__init__(base_output_dir)
-
-        self.enable_intelligence = enable_intelligence
-
-        if enable_intelligence:
-            try:
-                # Initialize intelligent components
-                self.knowledge_base = KnowledgeBase(knowledge_base_dir)
-                self.task_classifier = TaskClassifier()
-                self.resource_optimizer = ResourceOptimizer(resource_data_dir)
-                self.decision_engine = IntelligentDecisionEngine(self.knowledge_base)
-
-                # Initialize training data management
-                from ..storage import TrainingRepository, CoreModelIntegration
-                self.training_repository = TrainingRepository()
-                self.core_integration = self.training_repository.core_integration
-
-                # Store recommendations for learning
-                self.recent_recommendations: List[TrainingRecommendation] = []
-
-                logger.info("Intelligent Training Factory initialized with AI capabilities and data persistence")
-                self._print_welcome_message()
-
-            except Exception as e:
-                logger.warning(f"Failed to initialize intelligent components: {e}")
-                logger.warning("Falling back to standard training factory mode")
-                self.enable_intelligence = False
-        else:
-            logger.info("Intelligent Training Factory initialized in standard mode")
-
-    def _print_welcome_message(self) -> None:
-        """Print welcome message with intelligent capabilities."""
-        stats = self.knowledge_base.get_statistics()
-        resource_stats = self.resource_optimizer.get_statistics()
-
-        print("\n" + "="*60)
-        print("🧠 INTELLIGENT TRAINING FACTORY READY")
-        print("="*60)
-        print(f"📚 Knowledge Base: {stats['total_models']} models, {stats['best_practices']} best practices")
-        print(f"🖥️ Resource Pool: {resource_stats['total_gpus']} GPUs, {resource_stats['total_providers']} providers")
-        print(f"🎯 Task Support: {len(self.task_classifier.get_supported_tasks())} task types")
-        print(f"🌍 Domain Support: {len(self.task_classifier.get_supported_domains())} domains")
-        print("="*60)
-        print("New capabilities available:")
-        print(" • analyze_training_request() - Natural language analysis")
-        print(" • get_intelligent_recommendation() - Smart configuration")
-        print(" • train_with_recommendation() - Optimized training")
-        print(" • compare_training_options() - Cost/performance comparison")
-        print("="*60 + "\n")
-
-    def analyze_training_request(
-        self,
-        description: str,
-        dataset_source: str,
-        quality_target: str = "balanced",
-        budget_limit: Optional[float] = None,
-        time_limit: Optional[int] = None,
-        **preferences
-    ) -> TrainingRecommendation:
-        """
-        Analyze a natural language training request and generate recommendation.
-
-        Args:
-            description: Natural language description of the training task
-            dataset_source: Path to dataset or HuggingFace dataset name
-            quality_target: Quality target ("fast", "balanced", "high")
-            budget_limit: Maximum budget in USD
-            time_limit: Maximum time in hours
-            **preferences: Additional user preferences
-
-        Returns:
-            Complete training recommendation with configuration
-
-        Example:
-            ```python
-            recommendation = factory.analyze_training_request(
-                "Fine-tune a medical chatbot for patient Q&A in Chinese",
-                dataset_source="medical_qa_chinese.json",
-                quality_target="high",
-                budget_limit=300.0,
-                time_limit=8
-            )
-            ```
-        """
-        if not self.enable_intelligence:
-            raise ValueError("Intelligence features not available. Initialize with enable_intelligence=True")
-
-        logger.info(f"Analyzing training request: {description[:50]}...")
-
-        try:
-            # Create training request object
-            request = TrainingRequest(
-                description=description,
-                dataset_source=dataset_source,
-                quality_target=quality_target,
-                budget_limit=budget_limit,
-                time_limit=time_limit,
-                model_preferences=preferences.get("model_preferences"),
-                gpu_preferences=preferences.get("gpu_preferences"),
-                cloud_preferences=preferences.get("cloud_preferences"),
-                use_lora=preferences.get("use_lora"),
-                batch_size=preferences.get("batch_size"),
-                learning_rate=preferences.get("learning_rate"),
-                user_id=preferences.get("user_id"),
-                project_name=preferences.get("project_name"),
-                tags=preferences.get("tags", {})
-            )
-
-            # Generate intelligent recommendation
-            recommendation = self.decision_engine.analyze_and_recommend(request)
-
-            # Store for learning
-            self.recent_recommendations.append(recommendation)
-
-            # Print summary
-            self._print_recommendation_summary(recommendation)
-
-            return recommendation
-
-        except Exception as e:
-            logger.error(f"Failed to analyze training request: {e}")
-            raise
-
-    def get_intelligent_recommendation(
-        self,
-        task_type: str,
-        domain: str = "general",
-        dataset_size: int = 10000,
-        quality_target: str = "balanced",
-        **constraints
-    ) -> TrainingRecommendation:
-        """
-        Get intelligent recommendation for specific task parameters.
-
-        Args:
-            task_type: Type of task (chat, classification, etc.)
-            domain: Domain/industry
-            dataset_size: Size of training dataset
-            quality_target: Quality target ("fast", "balanced", "high")
-            **constraints: Additional constraints
-
-        Returns:
-            Training recommendation
-        """
-        if not self.enable_intelligence:
-            raise ValueError("Intelligence features not available")
-
-        # Create synthetic request
-        description = f"Train a {task_type} model for {domain} domain"
-
-        return self.analyze_training_request(
-            description=description,
-            dataset_source="synthetic_dataset",
-            quality_target=quality_target,
-            **constraints
-        )
-
-    def train_with_recommendation(
-        self,
-        recommendation: TrainingRecommendation,
-        dataset_path: Optional[str] = None,
-        output_dir: Optional[str] = None,
-        user_id: Optional[str] = None,
-        project_name: Optional[str] = None,
-        **overrides
-    ) -> str:
-        """
-        Train a model using an intelligent recommendation with full tracking.
-
-        Args:
-            recommendation: Training recommendation from analyze_training_request()
-            dataset_path: Override dataset path
-            output_dir: Override output directory
-            user_id: User identifier for tracking
-            project_name: Project name for organization
-            **overrides: Override specific configuration parameters
-
-        Returns:
-            Path to trained model
-
-        Example:
-            ```python
-            # Get recommendation
-            rec = factory.analyze_training_request(
-                "Train a customer service chatbot",
-                "customer_service_data.json"
-            )
-
-            # Train with recommendation and tracking
-            model_path = factory.train_with_recommendation(
-                rec,
-                user_id="user_123",
-                project_name="medical_chatbot"
-            )
-            ```
-        """
-        logger.info(f"Training with intelligent recommendation: {recommendation.model_name}")
-
-        job_id = None
-
-        try:
-            # Create training job record if repository is available
-            if hasattr(self, 'training_repository'):
-                job_id = self.training_repository.create_training_job(
-                    job_name=f"{recommendation.model_name.split('/')[-1]}_training",
-                    base_model=recommendation.model_name,
-                    task_type=recommendation.trainer_type,
-                    domain="general", # TODO: Extract from recommendation
-                    dataset_source=dataset_path or recommendation.training_config.dataset_config.dataset_path,
-                    training_config=recommendation.training_config.to_dict(),
-                    resource_config={
-                        "gpu": recommendation.recommended_gpu,
-                        "cloud_provider": recommendation.cloud_provider,
-                        "estimated_cost": recommendation.estimated_cost,
-                        "estimated_time": recommendation.estimated_time
-                    },
-                    user_id=user_id,
-                    project_name=project_name
-                )
-
-                # Update job status to running
-                self.training_repository.update_job_status(job_id, "running")
-
-            # Get configuration from recommendation
-            config = recommendation.training_config
-
-            # Apply overrides
-            if dataset_path:
-                config.dataset_config.dataset_path = dataset_path
-            if output_dir:
-                config.output_dir = output_dir
-
-            for key, value in overrides.items():
-                if hasattr(config, key):
-                    setattr(config, key, value)
-                elif config.lora_config and hasattr(config.lora_config, key):
-                    setattr(config.lora_config, key, value)
-                elif config.dataset_config and hasattr(config.dataset_config, key):
-                    setattr(config.dataset_config, key, value)
-
-            # Use base factory training with optimized config
-            result_path = self.train_model(
-                model_name=config.model_name,
-                dataset_path=config.dataset_config.dataset_path,
-                output_dir=config.output_dir,
-                training_type=config.training_type,
-                dataset_format=config.dataset_config.dataset_format,
-                use_lora=config.lora_config.use_lora if config.lora_config else False,
-                batch_size=config.batch_size,
-                num_epochs=config.num_epochs,
-                learning_rate=config.learning_rate,
-                max_length=config.dataset_config.max_length,
-                lora_rank=config.lora_config.lora_rank if config.lora_config else 8,
-                lora_alpha=config.lora_config.lora_alpha if config.lora_config else 16,
-                validation_split=config.dataset_config.validation_split
-            )
-
-            # Complete training and register model
-            if hasattr(self, 'training_repository') and job_id:
-                core_model_id = self.training_repository.complete_training(
-                    job_id=job_id,
-                    model_path=result_path,
-                    final_metrics={"training_completed": True}, # TODO: Extract real metrics
-                    cost_breakdown={"total": recommendation.estimated_cost}
-                )
-
-                if core_model_id:
-                    logger.info(f"Model registered in core system: {core_model_id}")
-
-            # Update knowledge base with results
-            if self.enable_intelligence:
-                self._update_knowledge_from_training(recommendation, result_path)
-
-            logger.info("Training completed with intelligent recommendation")
-            return result_path
-
-        except Exception as e:
-            # Mark job as failed if it was created
-            if hasattr(self, 'training_repository') and job_id:
-                self.training_repository.update_job_status(
-                    job_id,
-                    "failed",
-                    error_message=str(e)
-                )
-
-            logger.error(f"Training with recommendation failed: {e}")
-            raise
-
-    def train_on_runpod_intelligent(
-        self,
-        description: str,
-        dataset_path: str,
-        runpod_api_key: str,
-        template_id: str,
-        quality_target: str = "balanced",
-        budget_limit: Optional[float] = None,
-        time_limit: Optional[int] = None,
-        **preferences
-    ) -> Dict[str, Any]:
-        """
-        Intelligent cloud training on RunPod.
-
-        Combines natural language analysis with cloud training.
-
-        Args:
-            description: Natural language description
-            dataset_path: Dataset path
-            runpod_api_key: RunPod API key
-            template_id: RunPod template ID
-            quality_target: Quality target
-            budget_limit: Budget limit
-            time_limit: Time limit
-            **preferences: Additional preferences
-
-        Returns:
-            Training job results
-        """
-        if not self.enable_intelligence:
-            # Fallback to base implementation
-            return self.train_on_runpod(
-                model_name=preferences.get("model_name", "google/gemma-2-4b-it"),
-                dataset_path=dataset_path,
-                runpod_api_key=runpod_api_key,
-                template_id=template_id,
-                **preferences
-            )
-
-        logger.info("Starting intelligent cloud training on RunPod")
-
-        try:
-            # Get intelligent recommendation
-            recommendation = self.analyze_training_request(
-                description=description,
-                dataset_source=dataset_path,
-                quality_target=quality_target,
-                budget_limit=budget_limit,
-                time_limit=time_limit,
-                **preferences
-            )
-
-            # Extract configuration
-            config = recommendation.training_config
-
-            # Use base RunPod training with intelligent config
-            result = self.train_on_runpod(
-                model_name=config.model_name,
-                dataset_path=dataset_path,
-                runpod_api_key=runpod_api_key,
-                template_id=template_id,
-                gpu_type=recommendation.recommended_gpu,
-                use_lora=config.lora_config.use_lora if config.lora_config else True,
-                batch_size=config.batch_size,
-                num_epochs=config.num_epochs,
-                learning_rate=config.learning_rate,
-                max_length=config.dataset_config.max_length,
-                lora_rank=config.lora_config.lora_rank if config.lora_config else 8,
-                lora_alpha=config.lora_config.lora_alpha if config.lora_config else 16
-            )
-
-            # Add intelligent metadata to result
-            result["intelligent_recommendation"] = {
-                "model_name": recommendation.model_name,
-                "estimated_cost": recommendation.estimated_cost,
-                "estimated_time": recommendation.estimated_time,
-                "confidence": recommendation.confidence_score,
-                "decision_reasons": recommendation.decision_reasons
-            }
-
-            return result
-
-        except Exception as e:
-            logger.error(f"Intelligent cloud training failed: {e}")
-            raise
-
-    def compare_training_options(
-        self,
-        description: str,
-        dataset_source: str,
-        quality_targets: List[str] = ["fast", "balanced", "high"],
-        budget_limits: Optional[List[float]] = None
-    ) -> List[TrainingRecommendation]:
-        """
-        Compare multiple training options for the same task.
-
-        Args:
-            description: Training task description
-            dataset_source: Dataset source
-            quality_targets: List of quality targets to compare
-            budget_limits: Optional budget limits for each target
-
-        Returns:
-            List of recommendations for comparison
-        """
-        if not self.enable_intelligence:
-            raise ValueError("Intelligence features not available")
-
-        logger.info("Comparing training options...")
-
-        recommendations = []
-        budget_limits = budget_limits or [None] * len(quality_targets)
-
-        for i, quality_target in enumerate(quality_targets):
-            budget_limit = budget_limits[i] if i < len(budget_limits) else None
-
-            try:
-                rec = self.analyze_training_request(
-                    description=description,
-                    dataset_source=dataset_source,
-                    quality_target=quality_target,
-                    budget_limit=budget_limit
-                )
-                recommendations.append(rec)
-            except Exception as e:
-                logger.warning(f"Failed to generate recommendation for {quality_target}: {e}")
-
-        # Print comparison table
-        self._print_comparison_table(recommendations)
-
-        return recommendations
-
-    def get_best_practices(self, task_type: str, domain: str = "general") -> List[str]:
-        """
-        Get best practices for a specific task and domain.
-
-        Args:
-            task_type: Type of task
-            domain: Domain/industry
-
-        Returns:
-            List of best practice recommendations
-        """
-        if not self.enable_intelligence:
-            return ["Enable intelligence features to get best practices"]
-
-        practices = self.knowledge_base.get_best_practices(task_type, domain)
-        return [p.recommendation for p in practices]
-
-    def get_supported_capabilities(self) -> Dict[str, List[str]]:
-        """
-        Get supported capabilities of the intelligent training system.
-
-        Returns:
-            Dictionary of supported capabilities
-        """
-        if not self.enable_intelligence:
-            return {"status": "Intelligence features disabled"}
-
-        return {
-            "task_types": self.task_classifier.get_supported_tasks(),
-            "domains": self.task_classifier.get_supported_domains(),
-            "gpu_types": self.resource_optimizer.get_available_gpus(),
-            "cloud_providers": self.resource_optimizer.get_available_providers(),
-            "quality_targets": ["fast", "balanced", "high"]
-        }
-
-    def _print_recommendation_summary(self, recommendation: TrainingRecommendation) -> None:
-        """Print a summary of the recommendation."""
-        print("\n" + "="*50)
-        print("🎯 INTELLIGENT TRAINING RECOMMENDATION")
-        print("="*50)
-        print(f"📱 Model: {recommendation.model_name}")
-        print(f"🖥️ GPU: {recommendation.recommended_gpu}")
-        print(f"☁️ Cloud: {recommendation.cloud_provider}")
-        print(f"💰 Cost: ${recommendation.estimated_cost:.2f}")
-        print(f"⏱️ Time: {recommendation.estimated_time:.1f} hours")
-        print(f"🎨 Quality: {recommendation.predicted_quality}")
-        print(f"🎯 Confidence: {recommendation.confidence_score:.1%}")
-        print("\n📋 Key Decisions:")
-        for reason in recommendation.decision_reasons:
-            print(f" • {reason}")
-
-        if recommendation.alternatives:
-            print(f"\n🔄 {len(recommendation.alternatives)} alternatives available")
-
-        print("="*50 + "\n")
-
-    def _print_comparison_table(self, recommendations: List[TrainingRecommendation]) -> None:
-        """Print comparison table for multiple recommendations."""
-        print("\n" + "="*80)
-        print("📊 TRAINING OPTIONS COMPARISON")
-        print("="*80)
-
-        # Table header
-        print(f"{'Target':<10} {'Model':<25} {'GPU':<15} {'Cost':<8} {'Time':<6} {'Quality'}")
-        print("-" * 80)
-
-        # Table rows
-        for rec in recommendations:
-            quality_target = "unknown"
-            if rec.estimated_cost < 50:
-                quality_target = "fast"
-            elif rec.estimated_cost > 200:
-                quality_target = "high"
-            else:
-                quality_target = "balanced"
-
-            print(f"{quality_target:<10} {rec.model_name[:24]:<25} {rec.recommended_gpu[:14]:<15} "
-                  f"${rec.estimated_cost:<7.2f} {rec.estimated_time:<5.1f}h {rec.predicted_quality}")
-
-        print("="*80 + "\n")
-
-    def _update_knowledge_from_training(
-        self,
-        recommendation: TrainingRecommendation,
-        result_path: str
-    ) -> None:
-        """Update knowledge base with training results."""
-        try:
-            # Create training result record
-            training_result = {
-                "model_name": recommendation.model_name,
-                "task_type": recommendation.trainer_type,
-                "dataset_name": "user_dataset",
-                "training_cost": recommendation.estimated_cost,
-                "gpu_type": recommendation.recommended_gpu,
-                "config": recommendation.training_config.to_dict(),
-                "result_path": result_path,
-                "timestamp": datetime.now().isoformat()
-            }
-
-            # Update knowledge base
-            self.knowledge_base.update_from_training_result(training_result)
-
-            logger.info("Updated knowledge base with training results")
-
-        except Exception as e:
-            logger.warning(f"Failed to update knowledge base: {e}")
-
-    def get_intelligence_statistics(self) -> Dict[str, Any]:
-        """Get statistics about the intelligent training system."""
-        if not self.enable_intelligence:
-            return {"status": "Intelligence features disabled"}
-
-        kb_stats = self.knowledge_base.get_statistics()
-        resource_stats = self.resource_optimizer.get_statistics()
-
-        stats = {
-            "intelligence_enabled": True,
-            "knowledge_base": kb_stats,
-            "resource_optimizer": resource_stats,
-            "recent_recommendations": len(self.recent_recommendations),
-            "supported_tasks": len(self.task_classifier.get_supported_tasks()),
-            "supported_domains": len(self.task_classifier.get_supported_domains())
-        }
-
-        # Add training repository statistics if available
-        if hasattr(self, 'training_repository'):
-            try:
-                repo_stats = self.training_repository.get_repository_statistics()
-                stats["training_repository"] = repo_stats
-            except Exception as e:
-                stats["training_repository"] = {"error": str(e)}
-
-        return stats
-
-    def get_training_history(self, user_id: Optional[str] = None, limit: int = 50) -> List[Dict[str, Any]]:
-        """
-        Get training history with intelligent insights.
-
-        Args:
-            user_id: Filter by user ID
-            limit: Maximum number of jobs to return
-
-        Returns:
-            List of training job summaries with insights
-        """
-        if not hasattr(self, 'training_repository'):
-            return []
-
-        try:
-            jobs = self.training_repository.list_jobs(user_id=user_id, limit=limit)
-
-            history = []
-            for job in jobs:
-                job_summary = {
-                    "job_id": job.job_id,
-                    "job_name": job.job_name,
-                    "status": job.status,
-                    "base_model": job.base_model,
-                    "task_type": job.task_type,
-                    "domain": job.domain,
-                    "created_at": job.created_at.isoformat(),
-                    "user_id": job.user_id,
-                    "project_name": job.project_name
-                }
-
-                if job.completed_at:
-                    job_summary["completed_at"] = job.completed_at.isoformat()
-
-                if job.cost_breakdown:
-                    job_summary["total_cost"] = sum(job.cost_breakdown.values())
-
-                # Add progress information
-                progress = self.training_repository.get_job_progress(job.job_id)
-                if progress:
-                    job_summary["progress"] = progress
-
-                history.append(job_summary)
-
-            return history
-
-        except Exception as e:
-            logger.error(f"Failed to get training history: {e}")
-            return []
-
-    def get_user_insights(self, user_id: str) -> Dict[str, Any]:
-        """
-        Get intelligent insights for a specific user.
-
-        Args:
-            user_id: User identifier
-
-        Returns:
-            User insights and recommendations
-        """
-        if not hasattr(self, 'training_repository'):
-            return {"error": "Training repository not available"}
-
-        try:
-            # Get user statistics
-            user_stats = self.training_repository.get_user_statistics(user_id)
-
-            # Get user's training history
-            user_jobs = self.training_repository.list_jobs(user_id=user_id, limit=100)
-
-            # Analyze patterns
-            insights = {
-                "user_statistics": user_stats,
-                "patterns": self._analyze_user_patterns(user_jobs),
-                "recommendations": self._generate_user_recommendations(user_jobs),
-                "cost_optimization": self._analyze_cost_optimization(user_jobs)
-            }
-
-            return insights
-
-        except Exception as e:
-            logger.error(f"Failed to get user insights for {user_id}: {e}")
-            return {"error": str(e)}
-
-    def _analyze_user_patterns(self, jobs: List) -> Dict[str, Any]:
-        """Analyze user training patterns."""
-        if not jobs:
-            return {}
-
-        patterns = {
-            "most_used_models": {},
-            "preferred_tasks": {},
-            "preferred_domains": {},
-            "average_cost": 0.0,
-            "cost_trend": "stable"
-        }
-
-        total_cost = 0.0
-        recent_costs = []
-
-        for job in jobs:
-            # Count model usage
-            model = job.base_model
-            patterns["most_used_models"][model] = patterns["most_used_models"].get(model, 0) + 1
-
-            # Count task types
-            task = job.task_type
-            patterns["preferred_tasks"][task] = patterns["preferred_tasks"].get(task, 0) + 1
-
-            # Count domains
-            domain = job.domain
-            patterns["preferred_domains"][domain] = patterns["preferred_domains"].get(domain, 0) + 1
-
-            # Track costs
-            if job.cost_breakdown:
-                cost = sum(job.cost_breakdown.values())
-                total_cost += cost
-                recent_costs.append(cost)
-
-        patterns["average_cost"] = total_cost / len(jobs) if jobs else 0.0
-
-        # Analyze cost trend (simplified)
-        if len(recent_costs) > 1:
-            first_half = recent_costs[:len(recent_costs)//2]
-            second_half = recent_costs[len(recent_costs)//2:]
-
-            avg_first = sum(first_half) / len(first_half)
-            avg_second = sum(second_half) / len(second_half)
-
-            if avg_second > avg_first * 1.2:
-                patterns["cost_trend"] = "increasing"
-            elif avg_second < avg_first * 0.8:
-                patterns["cost_trend"] = "decreasing"
-
-        return patterns
-
-    def _generate_user_recommendations(self, jobs: List) -> List[str]:
-        """Generate recommendations for the user based on their history."""
-        if not jobs:
-            return ["Start with a simple chat model training to get familiar with the system"]
-
-        recommendations = []
-
-        # Analyze success rate
-        completed_jobs = [job for job in jobs if job.status == "completed"]
-        success_rate = len(completed_jobs) / len(jobs) if jobs else 0
-
-        if success_rate < 0.5:
-            recommendations.append("Consider using smaller models or LoRA training to improve success rate")
-
-        # Check for cost optimization opportunities
-        high_cost_jobs = [job for job in jobs if job.cost_breakdown and sum(job.cost_breakdown.values()) > 50]
-        if len(high_cost_jobs) > len(jobs) * 0.3:
-            recommendations.append("Consider using more cost-effective GPU options or shorter training times")
-
-        # Check for domain diversity
-        domains = set(job.domain for job in jobs)
-        if len(domains) == 1 and len(jobs) > 5:
-            recommendations.append("Try training models for different domains to expand your capabilities")
-
-        # Check for recent failures
-        recent_jobs = jobs[:5] # Last 5 jobs
-        recent_failures = [job for job in recent_jobs if job.status == "failed"]
-        if len(recent_failures) > 2:
-            recommendations.append("Recent training failures detected - consider using the intelligent recommendations for more reliable configurations")
-
-        return recommendations
-
-    def _analyze_cost_optimization(self, jobs: List) -> Dict[str, Any]:
-        """Analyze cost optimization opportunities."""
-        if not jobs:
-            return {}
-
-        total_cost = 0.0
-        potential_savings = 0.0
-
-        for job in jobs:
-            if job.cost_breakdown:
-                job_cost = sum(job.cost_breakdown.values())
-                total_cost += job_cost
-
-                # Estimate potential savings with intelligent optimization
-                # This is a simplified calculation
-                if job_cost > 10: # Only for jobs that cost more than $10
-                    potential_savings += job_cost * 0.3 # Assume 30% savings possible
-
-        return {
-            "total_spent": total_cost,
-            "potential_savings": potential_savings,
-            "optimization_percentage": (potential_savings / total_cost * 100) if total_cost > 0 else 0,
-            "recommendation": "Use intelligent training recommendations to optimize costs" if potential_savings > 5 else "Your costs are already well optimized"
-        }
-
-    def save_recommendation(self, recommendation: TrainingRecommendation, filename: str) -> None:
-        """
-        Save a training recommendation to file.
-
-        Args:
-            recommendation: Training recommendation to save
-            filename: Output filename
-        """
-        try:
-            import json
-            from dataclasses import asdict
-
-            # Convert recommendation to dict
-            rec_dict = asdict(recommendation)
-
-            # Convert datetime objects to strings
-            def convert_datetime(obj):
-                if isinstance(obj, datetime):
-                    return obj.isoformat()
-                return obj
-
-            # Save to file
-            with open(filename, 'w') as f:
-                json.dump(rec_dict, f, indent=2, default=convert_datetime)
-
-            logger.info(f"Recommendation saved to {filename}")
-
-        except Exception as e:
-            logger.error(f"Failed to save recommendation: {e}")
-            raise
-
-    def load_recommendation(self, filename: str) -> TrainingRecommendation:
-        """
-        Load a training recommendation from file.
-
-        Args:
-            filename: Input filename
-
-        Returns:
-            Loaded training recommendation
-        """
-        try:
-            import json
-
-            with open(filename, 'r') as f:
-                data = json.load(f)
-
-            # Convert back to TrainingRecommendation
-            # Note: This is a simplified version - would need proper deserialization
-            # for complex objects like TrainingConfig
-
-            logger.info(f"Recommendation loaded from {filename}")
-            return data # Return dict for now
-
-        except Exception as e:
-            logger.error(f"Failed to load recommendation: {e}")
-            raise