isa-model 0.4.0__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +35 -80
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/types.py +1 -0
  26. isa_model/deployment/__init__.py +5 -48
  27. isa_model/deployment/core/__init__.py +2 -31
  28. isa_model/deployment/core/deployment_manager.py +1278 -370
  29. isa_model/deployment/modal/__init__.py +8 -0
  30. isa_model/deployment/modal/config.py +136 -0
  31. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  32. isa_model/deployment/modal/services/__init__.py +3 -0
  33. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  34. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  35. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  36. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  37. isa_model/deployment/modal/services/video/__init__.py +1 -0
  38. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  39. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  40. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  41. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  42. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  43. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  44. isa_model/deployment/storage/__init__.py +5 -0
  45. isa_model/deployment/storage/deployment_repository.py +824 -0
  46. isa_model/deployment/triton/__init__.py +10 -0
  47. isa_model/deployment/triton/config.py +196 -0
  48. isa_model/deployment/triton/configs/__init__.py +1 -0
  49. isa_model/deployment/triton/provider.py +512 -0
  50. isa_model/deployment/triton/scripts/__init__.py +1 -0
  51. isa_model/deployment/triton/templates/__init__.py +1 -0
  52. isa_model/inference/__init__.py +47 -1
  53. isa_model/inference/ai_factory.py +137 -10
  54. isa_model/inference/legacy_services/__init__.py +21 -0
  55. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  56. isa_model/inference/legacy_services/model_service.py +573 -0
  57. isa_model/inference/legacy_services/model_serving.py +717 -0
  58. isa_model/inference/legacy_services/model_training.py +561 -0
  59. isa_model/inference/models/__init__.py +21 -0
  60. isa_model/inference/models/inference_config.py +551 -0
  61. isa_model/inference/models/inference_record.py +675 -0
  62. isa_model/inference/models/performance_models.py +714 -0
  63. isa_model/inference/repositories/__init__.py +9 -0
  64. isa_model/inference/repositories/inference_repository.py +828 -0
  65. isa_model/inference/services/audio/base_stt_service.py +184 -11
  66. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  67. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  68. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  69. isa_model/inference/services/llm/__init__.py +10 -2
  70. isa_model/inference/services/llm/base_llm_service.py +335 -24
  71. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  72. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  73. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  74. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  75. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  76. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  77. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  78. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  79. isa_model/inference/services/vision/__init__.py +22 -1
  80. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  81. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  82. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  83. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  84. isa_model/serving/api/cache_manager.py +245 -0
  85. isa_model/serving/api/dependencies/__init__.py +1 -0
  86. isa_model/serving/api/dependencies/auth.py +194 -0
  87. isa_model/serving/api/dependencies/database.py +139 -0
  88. isa_model/serving/api/error_handlers.py +284 -0
  89. isa_model/serving/api/fastapi_server.py +172 -22
  90. isa_model/serving/api/middleware/auth.py +8 -2
  91. isa_model/serving/api/middleware/security.py +23 -33
  92. isa_model/serving/api/middleware/tenant_context.py +414 -0
  93. isa_model/serving/api/routes/analytics.py +4 -1
  94. isa_model/serving/api/routes/config.py +645 -0
  95. isa_model/serving/api/routes/deployment_billing.py +315 -0
  96. isa_model/serving/api/routes/deployments.py +138 -2
  97. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  98. isa_model/serving/api/routes/health.py +32 -12
  99. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  100. isa_model/serving/api/routes/local_deployments.py +448 -0
  101. isa_model/serving/api/routes/tenants.py +575 -0
  102. isa_model/serving/api/routes/unified.py +680 -18
  103. isa_model/serving/api/routes/webhooks.py +479 -0
  104. isa_model/serving/api/startup.py +68 -54
  105. isa_model/utils/gpu_utils.py +311 -0
  106. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/METADATA +71 -24
  107. isa_model-0.4.4.dist-info/RECORD +180 -0
  108. isa_model/core/security/secrets.py +0 -358
  109. isa_model/core/storage/hf_storage.py +0 -419
  110. isa_model/core/storage/minio_storage.py +0 -0
  111. isa_model/deployment/cloud/__init__.py +0 -9
  112. isa_model/deployment/cloud/modal/__init__.py +0 -10
  113. isa_model/deployment/core/deployment_config.py +0 -356
  114. isa_model/deployment/core/isa_deployment_service.py +0 -401
  115. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  116. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  117. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  118. isa_model/deployment/runtime/deployed_service.py +0 -338
  119. isa_model/deployment/services/__init__.py +0 -9
  120. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  121. isa_model/deployment/services/model_service.py +0 -332
  122. isa_model/deployment/services/service_monitor.py +0 -356
  123. isa_model/deployment/services/service_registry.py +0 -527
  124. isa_model/eval/__init__.py +0 -92
  125. isa_model/eval/benchmarks/__init__.py +0 -27
  126. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  127. isa_model/eval/benchmarks.py +0 -701
  128. isa_model/eval/config/__init__.py +0 -10
  129. isa_model/eval/config/evaluation_config.py +0 -108
  130. isa_model/eval/evaluators/__init__.py +0 -24
  131. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  132. isa_model/eval/evaluators/base_evaluator.py +0 -503
  133. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  134. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  135. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  136. isa_model/eval/example_evaluation.py +0 -395
  137. isa_model/eval/factory.py +0 -798
  138. isa_model/eval/infrastructure/__init__.py +0 -24
  139. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  140. isa_model/eval/isa_benchmarks.py +0 -700
  141. isa_model/eval/isa_integration.py +0 -582
  142. isa_model/eval/metrics.py +0 -951
  143. isa_model/eval/tests/unit/test_basic.py +0 -396
  144. isa_model/serving/api/routes/evaluations.py +0 -579
  145. isa_model/training/__init__.py +0 -168
  146. isa_model/training/annotation/annotation_schema.py +0 -47
  147. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  148. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  149. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  150. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  151. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  152. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  153. isa_model/training/annotation/views/annotation_controller.py +0 -158
  154. isa_model/training/cloud/__init__.py +0 -22
  155. isa_model/training/cloud/job_orchestrator.py +0 -402
  156. isa_model/training/cloud/runpod_trainer.py +0 -454
  157. isa_model/training/cloud/storage_manager.py +0 -482
  158. isa_model/training/core/__init__.py +0 -26
  159. isa_model/training/core/config.py +0 -181
  160. isa_model/training/core/dataset.py +0 -222
  161. isa_model/training/core/trainer.py +0 -720
  162. isa_model/training/core/utils.py +0 -213
  163. isa_model/training/examples/intelligent_training_example.py +0 -281
  164. isa_model/training/factory.py +0 -424
  165. isa_model/training/intelligent/__init__.py +0 -25
  166. isa_model/training/intelligent/decision_engine.py +0 -643
  167. isa_model/training/intelligent/intelligent_factory.py +0 -888
  168. isa_model/training/intelligent/knowledge_base.py +0 -751
  169. isa_model/training/intelligent/resource_optimizer.py +0 -839
  170. isa_model/training/intelligent/task_classifier.py +0 -576
  171. isa_model/training/storage/__init__.py +0 -24
  172. isa_model/training/storage/core_integration.py +0 -439
  173. isa_model/training/storage/training_repository.py +0 -552
  174. isa_model/training/storage/training_storage.py +0 -628
  175. isa_model-0.4.0.dist-info/RECORD +0 -182
  176. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  177. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  178. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  179. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  180. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  181. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  182. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  183. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  184. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  185. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  186. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  187. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  188. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/WHEEL +0 -0
  189. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/top_level.txt +0 -0
isa_model/training/intelligent/intelligent_factory.py
@@ -1,888 +0,0 @@
- """
- Intelligent Training Factory
-
- This module provides the main interface for intelligent AI training.
- It extends the existing TrainingFactory with AI-powered capabilities:
- - Natural language training request parsing
- - Intelligent model and resource selection
- - Automatic configuration optimization
- - Cost and performance prediction
-
- The IntelligentTrainingFactory maintains backward compatibility while
- adding advanced intelligence features.
- """
-
- import logging
- from typing import Dict, List, Optional, Any, Union
- import os
- from datetime import datetime
-
- from ..factory import TrainingFactory
- from .decision_engine import IntelligentDecisionEngine, TrainingRequest, TrainingRecommendation
- from .task_classifier import TaskClassifier
- from .knowledge_base import KnowledgeBase
- from .resource_optimizer import ResourceOptimizer
- from ..core.config import TrainingConfig, LoRAConfig, DatasetConfig
-
- logger = logging.getLogger(__name__)
-
-
- class IntelligentTrainingFactory(TrainingFactory):
-     """
-     Intelligent Training Factory with AI-powered optimization.
-
-     This factory extends the base TrainingFactory with intelligent capabilities:
-     - Analyzes natural language training requests
-     - Automatically selects optimal models and configurations
-     - Provides cost and performance predictions
-     - Recommends best practices and alternatives
-
-     Maintains full backward compatibility with existing TrainingFactory API
-     while adding new intelligent features.
-
-     Example:
-         ```python
-         from isa_model.training.intelligent import IntelligentTrainingFactory
-
-         # Create intelligent factory
-         factory = IntelligentTrainingFactory()
-
-         # Traditional usage (backward compatible)
-         model_path = factory.train_model(
-             model_name="google/gemma-2-4b-it",
-             dataset_path="tatsu-lab/alpaca"
-         )
-
-         # Intelligent usage with natural language
-         recommendation = factory.analyze_training_request(
-             "Train a Chinese customer service chatbot with high quality",
-             dataset_path="my-chinese-dialogues.json",
-             budget_limit=500.0,
-             time_limit=12
-         )
-
-         # Train with intelligent recommendation
-         model_path = factory.train_with_recommendation(recommendation)
-         ```
-     """
-
-     def __init__(self,
-                  base_output_dir: Optional[str] = None,
-                  enable_intelligence: bool = True,
-                  knowledge_base_dir: Optional[str] = None,
-                  resource_data_dir: Optional[str] = None):
-         """
-         Initialize intelligent training factory.
-
-         Args:
-             base_output_dir: Base directory for training outputs
-             enable_intelligence: Enable intelligent features
-             knowledge_base_dir: Directory for knowledge base data
-             resource_data_dir: Directory for resource data
-         """
-         # Initialize base factory
-         super().__init__(base_output_dir)
-
-         self.enable_intelligence = enable_intelligence
-
-         if enable_intelligence:
-             try:
-                 # Initialize intelligent components
-                 self.knowledge_base = KnowledgeBase(knowledge_base_dir)
-                 self.task_classifier = TaskClassifier()
-                 self.resource_optimizer = ResourceOptimizer(resource_data_dir)
-                 self.decision_engine = IntelligentDecisionEngine(self.knowledge_base)
-
-                 # Initialize training data management
-                 from ..storage import TrainingRepository, CoreModelIntegration
-                 self.training_repository = TrainingRepository()
-                 self.core_integration = self.training_repository.core_integration
-
-                 # Store recommendations for learning
-                 self.recent_recommendations: List[TrainingRecommendation] = []
-
-                 logger.info("Intelligent Training Factory initialized with AI capabilities and data persistence")
-                 self._print_welcome_message()
-
-             except Exception as e:
-                 logger.warning(f"Failed to initialize intelligent components: {e}")
-                 logger.warning("Falling back to standard training factory mode")
-                 self.enable_intelligence = False
-         else:
-             logger.info("Intelligent Training Factory initialized in standard mode")
-
-     def _print_welcome_message(self) -> None:
-         """Print welcome message with intelligent capabilities."""
-         stats = self.knowledge_base.get_statistics()
-         resource_stats = self.resource_optimizer.get_statistics()
-
-         print("\n" + "="*60)
-         print("🧠 INTELLIGENT TRAINING FACTORY READY")
-         print("="*60)
-         print(f"📚 Knowledge Base: {stats['total_models']} models, {stats['best_practices']} best practices")
-         print(f"🖥️ Resource Pool: {resource_stats['total_gpus']} GPUs, {resource_stats['total_providers']} providers")
-         print(f"🎯 Task Support: {len(self.task_classifier.get_supported_tasks())} task types")
-         print(f"🌍 Domain Support: {len(self.task_classifier.get_supported_domains())} domains")
-         print("="*60)
-         print("New capabilities available:")
-         print(" • analyze_training_request() - Natural language analysis")
-         print(" • get_intelligent_recommendation() - Smart configuration")
-         print(" • train_with_recommendation() - Optimized training")
-         print(" • compare_training_options() - Cost/performance comparison")
-         print("="*60 + "\n")
-
-     def analyze_training_request(
-         self,
-         description: str,
-         dataset_source: str,
-         quality_target: str = "balanced",
-         budget_limit: Optional[float] = None,
-         time_limit: Optional[int] = None,
-         **preferences
-     ) -> TrainingRecommendation:
-         """
-         Analyze a natural language training request and generate recommendation.
-
-         Args:
-             description: Natural language description of the training task
-             dataset_source: Path to dataset or HuggingFace dataset name
-             quality_target: Quality target ("fast", "balanced", "high")
-             budget_limit: Maximum budget in USD
-             time_limit: Maximum time in hours
-             **preferences: Additional user preferences
-
-         Returns:
-             Complete training recommendation with configuration
-
-         Example:
-             ```python
-             recommendation = factory.analyze_training_request(
-                 "Fine-tune a medical chatbot for patient Q&A in Chinese",
-                 dataset_source="medical_qa_chinese.json",
-                 quality_target="high",
-                 budget_limit=300.0,
-                 time_limit=8
-             )
-             ```
-         """
-         if not self.enable_intelligence:
-             raise ValueError("Intelligence features not available. Initialize with enable_intelligence=True")
-
-         logger.info(f"Analyzing training request: {description[:50]}...")
-
-         try:
-             # Create training request object
-             request = TrainingRequest(
-                 description=description,
-                 dataset_source=dataset_source,
-                 quality_target=quality_target,
-                 budget_limit=budget_limit,
-                 time_limit=time_limit,
-                 model_preferences=preferences.get("model_preferences"),
-                 gpu_preferences=preferences.get("gpu_preferences"),
-                 cloud_preferences=preferences.get("cloud_preferences"),
-                 use_lora=preferences.get("use_lora"),
-                 batch_size=preferences.get("batch_size"),
-                 learning_rate=preferences.get("learning_rate"),
-                 user_id=preferences.get("user_id"),
-                 project_name=preferences.get("project_name"),
-                 tags=preferences.get("tags", {})
-             )
-
-             # Generate intelligent recommendation
-             recommendation = self.decision_engine.analyze_and_recommend(request)
-
-             # Store for learning
-             self.recent_recommendations.append(recommendation)
-
-             # Print summary
-             self._print_recommendation_summary(recommendation)
-
-             return recommendation
-
-         except Exception as e:
-             logger.error(f"Failed to analyze training request: {e}")
-             raise
-
-     def get_intelligent_recommendation(
-         self,
-         task_type: str,
-         domain: str = "general",
-         dataset_size: int = 10000,
-         quality_target: str = "balanced",
-         **constraints
-     ) -> TrainingRecommendation:
-         """
-         Get intelligent recommendation for specific task parameters.
-
-         Args:
-             task_type: Type of task (chat, classification, etc.)
-             domain: Domain/industry
-             dataset_size: Size of training dataset
-             quality_target: Quality target ("fast", "balanced", "high")
-             **constraints: Additional constraints
-
-         Returns:
-             Training recommendation
-         """
-         if not self.enable_intelligence:
-             raise ValueError("Intelligence features not available")
-
-         # Create synthetic request
-         description = f"Train a {task_type} model for {domain} domain"
-
-         return self.analyze_training_request(
-             description=description,
-             dataset_source="synthetic_dataset",
-             quality_target=quality_target,
-             **constraints
-         )
-
-     def train_with_recommendation(
-         self,
-         recommendation: TrainingRecommendation,
-         dataset_path: Optional[str] = None,
-         output_dir: Optional[str] = None,
-         user_id: Optional[str] = None,
-         project_name: Optional[str] = None,
-         **overrides
-     ) -> str:
-         """
-         Train a model using an intelligent recommendation with full tracking.
-
-         Args:
-             recommendation: Training recommendation from analyze_training_request()
-             dataset_path: Override dataset path
-             output_dir: Override output directory
-             user_id: User identifier for tracking
-             project_name: Project name for organization
-             **overrides: Override specific configuration parameters
-
-         Returns:
-             Path to trained model
-
-         Example:
-             ```python
-             # Get recommendation
-             rec = factory.analyze_training_request(
-                 "Train a customer service chatbot",
-                 "customer_service_data.json"
-             )
-
-             # Train with recommendation and tracking
-             model_path = factory.train_with_recommendation(
-                 rec,
-                 user_id="user_123",
-                 project_name="medical_chatbot"
-             )
-             ```
-         """
-         logger.info(f"Training with intelligent recommendation: {recommendation.model_name}")
-
-         job_id = None
-
-         try:
-             # Create training job record if repository is available
-             if hasattr(self, 'training_repository'):
-                 job_id = self.training_repository.create_training_job(
-                     job_name=f"{recommendation.model_name.split('/')[-1]}_training",
-                     base_model=recommendation.model_name,
-                     task_type=recommendation.trainer_type,
-                     domain="general", # TODO: Extract from recommendation
-                     dataset_source=dataset_path or recommendation.training_config.dataset_config.dataset_path,
-                     training_config=recommendation.training_config.to_dict(),
-                     resource_config={
-                         "gpu": recommendation.recommended_gpu,
-                         "cloud_provider": recommendation.cloud_provider,
-                         "estimated_cost": recommendation.estimated_cost,
-                         "estimated_time": recommendation.estimated_time
-                     },
-                     user_id=user_id,
-                     project_name=project_name
-                 )
-
-                 # Update job status to running
-                 self.training_repository.update_job_status(job_id, "running")
-
-             # Get configuration from recommendation
-             config = recommendation.training_config
-
-             # Apply overrides
-             if dataset_path:
-                 config.dataset_config.dataset_path = dataset_path
-             if output_dir:
-                 config.output_dir = output_dir
-
-             for key, value in overrides.items():
-                 if hasattr(config, key):
-                     setattr(config, key, value)
-                 elif config.lora_config and hasattr(config.lora_config, key):
-                     setattr(config.lora_config, key, value)
-                 elif config.dataset_config and hasattr(config.dataset_config, key):
-                     setattr(config.dataset_config, key, value)
-
-             # Use base factory training with optimized config
-             result_path = self.train_model(
-                 model_name=config.model_name,
-                 dataset_path=config.dataset_config.dataset_path,
-                 output_dir=config.output_dir,
-                 training_type=config.training_type,
-                 dataset_format=config.dataset_config.dataset_format,
-                 use_lora=config.lora_config.use_lora if config.lora_config else False,
-                 batch_size=config.batch_size,
-                 num_epochs=config.num_epochs,
-                 learning_rate=config.learning_rate,
-                 max_length=config.dataset_config.max_length,
-                 lora_rank=config.lora_config.lora_rank if config.lora_config else 8,
-                 lora_alpha=config.lora_config.lora_alpha if config.lora_config else 16,
-                 validation_split=config.dataset_config.validation_split
-             )
-
-             # Complete training and register model
-             if hasattr(self, 'training_repository') and job_id:
-                 core_model_id = self.training_repository.complete_training(
-                     job_id=job_id,
-                     model_path=result_path,
-                     final_metrics={"training_completed": True}, # TODO: Extract real metrics
-                     cost_breakdown={"total": recommendation.estimated_cost}
-                 )
-
-                 if core_model_id:
-                     logger.info(f"Model registered in core system: {core_model_id}")
-
-             # Update knowledge base with results
-             if self.enable_intelligence:
-                 self._update_knowledge_from_training(recommendation, result_path)
-
-             logger.info("Training completed with intelligent recommendation")
-             return result_path
-
-         except Exception as e:
-             # Mark job as failed if it was created
-             if hasattr(self, 'training_repository') and job_id:
-                 self.training_repository.update_job_status(
-                     job_id,
-                     "failed",
-                     error_message=str(e)
-                 )
-
-             logger.error(f"Training with recommendation failed: {e}")
-             raise
-
-     def train_on_runpod_intelligent(
-         self,
-         description: str,
-         dataset_path: str,
-         runpod_api_key: str,
-         template_id: str,
-         quality_target: str = "balanced",
-         budget_limit: Optional[float] = None,
-         time_limit: Optional[int] = None,
-         **preferences
-     ) -> Dict[str, Any]:
-         """
-         Intelligent cloud training on RunPod.
-
-         Combines natural language analysis with cloud training.
-
-         Args:
-             description: Natural language description
-             dataset_path: Dataset path
-             runpod_api_key: RunPod API key
-             template_id: RunPod template ID
-             quality_target: Quality target
-             budget_limit: Budget limit
-             time_limit: Time limit
-             **preferences: Additional preferences
-
-         Returns:
-             Training job results
-         """
-         if not self.enable_intelligence:
-             # Fallback to base implementation
-             return self.train_on_runpod(
-                 model_name=preferences.get("model_name", "google/gemma-2-4b-it"),
-                 dataset_path=dataset_path,
-                 runpod_api_key=runpod_api_key,
-                 template_id=template_id,
-                 **preferences
-             )
-
-         logger.info("Starting intelligent cloud training on RunPod")
-
-         try:
-             # Get intelligent recommendation
-             recommendation = self.analyze_training_request(
-                 description=description,
-                 dataset_source=dataset_path,
-                 quality_target=quality_target,
-                 budget_limit=budget_limit,
-                 time_limit=time_limit,
-                 **preferences
-             )
-
-             # Extract configuration
-             config = recommendation.training_config
-
-             # Use base RunPod training with intelligent config
-             result = self.train_on_runpod(
-                 model_name=config.model_name,
-                 dataset_path=dataset_path,
-                 runpod_api_key=runpod_api_key,
-                 template_id=template_id,
-                 gpu_type=recommendation.recommended_gpu,
-                 use_lora=config.lora_config.use_lora if config.lora_config else True,
-                 batch_size=config.batch_size,
-                 num_epochs=config.num_epochs,
-                 learning_rate=config.learning_rate,
-                 max_length=config.dataset_config.max_length,
-                 lora_rank=config.lora_config.lora_rank if config.lora_config else 8,
-                 lora_alpha=config.lora_config.lora_alpha if config.lora_config else 16
-             )
-
-             # Add intelligent metadata to result
-             result["intelligent_recommendation"] = {
-                 "model_name": recommendation.model_name,
-                 "estimated_cost": recommendation.estimated_cost,
-                 "estimated_time": recommendation.estimated_time,
-                 "confidence": recommendation.confidence_score,
-                 "decision_reasons": recommendation.decision_reasons
-             }
-
-             return result
-
-         except Exception as e:
-             logger.error(f"Intelligent cloud training failed: {e}")
-             raise
-
-     def compare_training_options(
-         self,
-         description: str,
-         dataset_source: str,
-         quality_targets: List[str] = ["fast", "balanced", "high"],
-         budget_limits: Optional[List[float]] = None
-     ) -> List[TrainingRecommendation]:
-         """
-         Compare multiple training options for the same task.
-
-         Args:
-             description: Training task description
-             dataset_source: Dataset source
-             quality_targets: List of quality targets to compare
-             budget_limits: Optional budget limits for each target
-
-         Returns:
-             List of recommendations for comparison
-         """
-         if not self.enable_intelligence:
-             raise ValueError("Intelligence features not available")
-
-         logger.info("Comparing training options...")
-
-         recommendations = []
-         budget_limits = budget_limits or [None] * len(quality_targets)
-
-         for i, quality_target in enumerate(quality_targets):
-             budget_limit = budget_limits[i] if i < len(budget_limits) else None
-
-             try:
-                 rec = self.analyze_training_request(
-                     description=description,
-                     dataset_source=dataset_source,
-                     quality_target=quality_target,
-                     budget_limit=budget_limit
-                 )
-                 recommendations.append(rec)
-             except Exception as e:
-                 logger.warning(f"Failed to generate recommendation for {quality_target}: {e}")
-
-         # Print comparison table
-         self._print_comparison_table(recommendations)
-
-         return recommendations
-
-     def get_best_practices(self, task_type: str, domain: str = "general") -> List[str]:
-         """
-         Get best practices for a specific task and domain.
-
-         Args:
-             task_type: Type of task
-             domain: Domain/industry
-
-         Returns:
-             List of best practice recommendations
-         """
-         if not self.enable_intelligence:
-             return ["Enable intelligence features to get best practices"]
-
-         practices = self.knowledge_base.get_best_practices(task_type, domain)
-         return [p.recommendation for p in practices]
-
-     def get_supported_capabilities(self) -> Dict[str, List[str]]:
-         """
-         Get supported capabilities of the intelligent training system.
-
-         Returns:
-             Dictionary of supported capabilities
-         """
-         if not self.enable_intelligence:
-             return {"status": "Intelligence features disabled"}
-
-         return {
-             "task_types": self.task_classifier.get_supported_tasks(),
-             "domains": self.task_classifier.get_supported_domains(),
-             "gpu_types": self.resource_optimizer.get_available_gpus(),
-             "cloud_providers": self.resource_optimizer.get_available_providers(),
-             "quality_targets": ["fast", "balanced", "high"]
-         }
-
-     def _print_recommendation_summary(self, recommendation: TrainingRecommendation) -> None:
-         """Print a summary of the recommendation."""
-         print("\n" + "="*50)
-         print("🎯 INTELLIGENT TRAINING RECOMMENDATION")
-         print("="*50)
-         print(f"📱 Model: {recommendation.model_name}")
-         print(f"🖥️ GPU: {recommendation.recommended_gpu}")
-         print(f"☁️ Cloud: {recommendation.cloud_provider}")
-         print(f"💰 Cost: ${recommendation.estimated_cost:.2f}")
-         print(f"⏱️ Time: {recommendation.estimated_time:.1f} hours")
-         print(f"🎨 Quality: {recommendation.predicted_quality}")
-         print(f"🎯 Confidence: {recommendation.confidence_score:.1%}")
-         print("\n📋 Key Decisions:")
-         for reason in recommendation.decision_reasons:
-             print(f" • {reason}")
-
-         if recommendation.alternatives:
-             print(f"\n🔄 {len(recommendation.alternatives)} alternatives available")
-
-         print("="*50 + "\n")
-
-     def _print_comparison_table(self, recommendations: List[TrainingRecommendation]) -> None:
-         """Print comparison table for multiple recommendations."""
-         print("\n" + "="*80)
-         print("📊 TRAINING OPTIONS COMPARISON")
-         print("="*80)
-
-         # Table header
-         print(f"{'Target':<10} {'Model':<25} {'GPU':<15} {'Cost':<8} {'Time':<6} {'Quality'}")
-         print("-" * 80)
-
-         # Table rows
-         for rec in recommendations:
-             quality_target = "unknown"
-             if rec.estimated_cost < 50:
-                 quality_target = "fast"
-             elif rec.estimated_cost > 200:
-                 quality_target = "high"
-             else:
-                 quality_target = "balanced"
-
-             print(f"{quality_target:<10} {rec.model_name[:24]:<25} {rec.recommended_gpu[:14]:<15} "
-                   f"${rec.estimated_cost:<7.2f} {rec.estimated_time:<5.1f}h {rec.predicted_quality}")
-
-         print("="*80 + "\n")
-
-     def _update_knowledge_from_training(
-         self,
-         recommendation: TrainingRecommendation,
-         result_path: str
-     ) -> None:
-         """Update knowledge base with training results."""
-         try:
-             # Create training result record
-             training_result = {
-                 "model_name": recommendation.model_name,
-                 "task_type": recommendation.trainer_type,
-                 "dataset_name": "user_dataset",
-                 "training_cost": recommendation.estimated_cost,
-                 "gpu_type": recommendation.recommended_gpu,
-                 "config": recommendation.training_config.to_dict(),
-                 "result_path": result_path,
-                 "timestamp": datetime.now().isoformat()
-             }
-
-             # Update knowledge base
-             self.knowledge_base.update_from_training_result(training_result)
-
-             logger.info("Updated knowledge base with training results")
-
-         except Exception as e:
-             logger.warning(f"Failed to update knowledge base: {e}")
-
-     def get_intelligence_statistics(self) -> Dict[str, Any]:
-         """Get statistics about the intelligent training system."""
-         if not self.enable_intelligence:
-             return {"status": "Intelligence features disabled"}
-
-         kb_stats = self.knowledge_base.get_statistics()
-         resource_stats = self.resource_optimizer.get_statistics()
-
-         stats = {
-             "intelligence_enabled": True,
-             "knowledge_base": kb_stats,
-             "resource_optimizer": resource_stats,
-             "recent_recommendations": len(self.recent_recommendations),
-             "supported_tasks": len(self.task_classifier.get_supported_tasks()),
-             "supported_domains": len(self.task_classifier.get_supported_domains())
-         }
-
-         # Add training repository statistics if available
-         if hasattr(self, 'training_repository'):
-             try:
-                 repo_stats = self.training_repository.get_repository_statistics()
-                 stats["training_repository"] = repo_stats
-             except Exception as e:
-                 stats["training_repository"] = {"error": str(e)}
-
-         return stats
-
-     def get_training_history(self, user_id: Optional[str] = None, limit: int = 50) -> List[Dict[str, Any]]:
-         """
-         Get training history with intelligent insights.
-
-         Args:
-             user_id: Filter by user ID
-             limit: Maximum number of jobs to return
-
-         Returns:
-             List of training job summaries with insights
-         """
-         if not hasattr(self, 'training_repository'):
-             return []
-
-         try:
-             jobs = self.training_repository.list_jobs(user_id=user_id, limit=limit)
-
-             history = []
-             for job in jobs:
-                 job_summary = {
-                     "job_id": job.job_id,
-                     "job_name": job.job_name,
-                     "status": job.status,
-                     "base_model": job.base_model,
-                     "task_type": job.task_type,
-                     "domain": job.domain,
-                     "created_at": job.created_at.isoformat(),
-                     "user_id": job.user_id,
-                     "project_name": job.project_name
-                 }
-
-                 if job.completed_at:
-                     job_summary["completed_at"] = job.completed_at.isoformat()
-
-                 if job.cost_breakdown:
-                     job_summary["total_cost"] = sum(job.cost_breakdown.values())
-
-                 # Add progress information
-                 progress = self.training_repository.get_job_progress(job.job_id)
-                 if progress:
-                     job_summary["progress"] = progress
-
-                 history.append(job_summary)
-
-             return history
-
-         except Exception as e:
-             logger.error(f"Failed to get training history: {e}")
-             return []
-
-     def get_user_insights(self, user_id: str) -> Dict[str, Any]:
-         """
-         Get intelligent insights for a specific user.
-
-         Args:
-             user_id: User identifier
-
-         Returns:
-             User insights and recommendations
-         """
-         if not hasattr(self, 'training_repository'):
-             return {"error": "Training repository not available"}
-
-         try:
-             # Get user statistics
-             user_stats = self.training_repository.get_user_statistics(user_id)
-
-             # Get user's training history
-             user_jobs = self.training_repository.list_jobs(user_id=user_id, limit=100)
-
-             # Analyze patterns
-             insights = {
-                 "user_statistics": user_stats,
-                 "patterns": self._analyze_user_patterns(user_jobs),
-                 "recommendations": self._generate_user_recommendations(user_jobs),
-                 "cost_optimization": self._analyze_cost_optimization(user_jobs)
-             }
-
-             return insights
-
-         except Exception as e:
-             logger.error(f"Failed to get user insights for {user_id}: {e}")
-             return {"error": str(e)}
-
-     def _analyze_user_patterns(self, jobs: List) -> Dict[str, Any]:
-         """Analyze user training patterns."""
-         if not jobs:
-             return {}
-
-         patterns = {
-             "most_used_models": {},
-             "preferred_tasks": {},
-             "preferred_domains": {},
-             "average_cost": 0.0,
-             "cost_trend": "stable"
-         }
-
-         total_cost = 0.0
-         recent_costs = []
-
-         for job in jobs:
-             # Count model usage
-             model = job.base_model
-             patterns["most_used_models"][model] = patterns["most_used_models"].get(model, 0) + 1
-
-             # Count task types
-             task = job.task_type
-             patterns["preferred_tasks"][task] = patterns["preferred_tasks"].get(task, 0) + 1
-
-             # Count domains
-             domain = job.domain
-             patterns["preferred_domains"][domain] = patterns["preferred_domains"].get(domain, 0) + 1
-
-             # Track costs
-             if job.cost_breakdown:
-                 cost = sum(job.cost_breakdown.values())
-                 total_cost += cost
-                 recent_costs.append(cost)
-
-         patterns["average_cost"] = total_cost / len(jobs) if jobs else 0.0
-
-         # Analyze cost trend (simplified)
-         if len(recent_costs) > 1:
-             first_half = recent_costs[:len(recent_costs)//2]
-             second_half = recent_costs[len(recent_costs)//2:]
-
-             avg_first = sum(first_half) / len(first_half)
-             avg_second = sum(second_half) / len(second_half)
-
-             if avg_second > avg_first * 1.2:
-                 patterns["cost_trend"] = "increasing"
-             elif avg_second < avg_first * 0.8:
-                 patterns["cost_trend"] = "decreasing"
-
-         return patterns
-
-     def _generate_user_recommendations(self, jobs: List) -> List[str]:
-         """Generate recommendations for the user based on their history."""
-         if not jobs:
-             return ["Start with a simple chat model training to get familiar with the system"]
-
-         recommendations = []
-
-         # Analyze success rate
-         completed_jobs = [job for job in jobs if job.status == "completed"]
-         success_rate = len(completed_jobs) / len(jobs) if jobs else 0
-
-         if success_rate < 0.5:
-             recommendations.append("Consider using smaller models or LoRA training to improve success rate")
-
-         # Check for cost optimization opportunities
-         high_cost_jobs = [job for job in jobs if job.cost_breakdown and sum(job.cost_breakdown.values()) > 50]
-         if len(high_cost_jobs) > len(jobs) * 0.3:
-             recommendations.append("Consider using more cost-effective GPU options or shorter training times")
-
-         # Check for domain diversity
-         domains = set(job.domain for job in jobs)
-         if len(domains) == 1 and len(jobs) > 5:
-             recommendations.append("Try training models for different domains to expand your capabilities")
-
-         # Check for recent failures
-         recent_jobs = jobs[:5] # Last 5 jobs
-         recent_failures = [job for job in recent_jobs if job.status == "failed"]
-         if len(recent_failures) > 2:
-             recommendations.append("Recent training failures detected - consider using the intelligent recommendations for more reliable configurations")
-
-         return recommendations
-
-     def _analyze_cost_optimization(self, jobs: List) -> Dict[str, Any]:
-         """Analyze cost optimization opportunities."""
-         if not jobs:
-             return {}
-
-         total_cost = 0.0
-         potential_savings = 0.0
-
-         for job in jobs:
-             if job.cost_breakdown:
-                 job_cost = sum(job.cost_breakdown.values())
-                 total_cost += job_cost
-
-                 # Estimate potential savings with intelligent optimization
-                 # This is a simplified calculation
-                 if job_cost > 10: # Only for jobs that cost more than $10
-                     potential_savings += job_cost * 0.3 # Assume 30% savings possible
-
-         return {
-             "total_spent": total_cost,
-             "potential_savings": potential_savings,
-             "optimization_percentage": (potential_savings / total_cost * 100) if total_cost > 0 else 0,
-             "recommendation": "Use intelligent training recommendations to optimize costs" if potential_savings > 5 else "Your costs are already well optimized"
-         }
-
-     def save_recommendation(self, recommendation: TrainingRecommendation, filename: str) -> None:
-         """
-         Save a training recommendation to file.
-
-         Args:
-             recommendation: Training recommendation to save
-             filename: Output filename
-         """
-         try:
-             import json
-             from dataclasses import asdict
-
-             # Convert recommendation to dict
-             rec_dict = asdict(recommendation)
-
-             # Convert datetime objects to strings
-             def convert_datetime(obj):
-                 if isinstance(obj, datetime):
-                     return obj.isoformat()
-                 return obj
-
-             # Save to file
-             with open(filename, 'w') as f:
-                 json.dump(rec_dict, f, indent=2, default=convert_datetime)
-
-             logger.info(f"Recommendation saved to {filename}")
-
-         except Exception as e:
-             logger.error(f"Failed to save recommendation: {e}")
-             raise
-
-     def load_recommendation(self, filename: str) -> TrainingRecommendation:
-         """
-         Load a training recommendation from file.
-
-         Args:
-             filename: Input filename
-
-         Returns:
-             Loaded training recommendation
-         """
-         try:
-             import json
-
-             with open(filename, 'r') as f:
-                 data = json.load(f)
-
-             # Convert back to TrainingRecommendation
-             # Note: This is a simplified version - would need proper deserialization
-             # for complex objects like TrainingConfig
-
-             logger.info(f"Recommendation loaded from {filename}")
-             return data # Return dict for now
-
-         except Exception as e:
-             logger.error(f"Failed to load recommendation: {e}")
-             raise