isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. isa_model/client.py +1166 -584
  2. isa_model/core/cache/redis_cache.py +410 -0
  3. isa_model/core/config/config_manager.py +282 -12
  4. isa_model/core/config.py +91 -1
  5. isa_model/core/database/__init__.py +1 -0
  6. isa_model/core/database/direct_db_client.py +114 -0
  7. isa_model/core/database/migration_manager.py +563 -0
  8. isa_model/core/database/migrations.py +297 -0
  9. isa_model/core/database/supabase_client.py +258 -0
  10. isa_model/core/dependencies.py +316 -0
  11. isa_model/core/discovery/__init__.py +19 -0
  12. isa_model/core/discovery/consul_discovery.py +190 -0
  13. isa_model/core/logging/__init__.py +54 -0
  14. isa_model/core/logging/influx_logger.py +523 -0
  15. isa_model/core/logging/loki_logger.py +160 -0
  16. isa_model/core/models/__init__.py +46 -0
  17. isa_model/core/models/config_models.py +625 -0
  18. isa_model/core/models/deployment_billing_tracker.py +430 -0
  19. isa_model/core/models/model_billing_tracker.py +60 -88
  20. isa_model/core/models/model_manager.py +66 -25
  21. isa_model/core/models/model_metadata.py +690 -0
  22. isa_model/core/models/model_repo.py +217 -55
  23. isa_model/core/models/model_statistics_tracker.py +234 -0
  24. isa_model/core/models/model_storage.py +0 -1
  25. isa_model/core/models/model_version_manager.py +959 -0
  26. isa_model/core/models/system_models.py +857 -0
  27. isa_model/core/pricing_manager.py +2 -249
  28. isa_model/core/repositories/__init__.py +9 -0
  29. isa_model/core/repositories/config_repository.py +912 -0
  30. isa_model/core/resilience/circuit_breaker.py +366 -0
  31. isa_model/core/security/secrets.py +358 -0
  32. isa_model/core/services/__init__.py +2 -4
  33. isa_model/core/services/intelligent_model_selector.py +479 -370
  34. isa_model/core/storage/hf_storage.py +2 -2
  35. isa_model/core/types.py +8 -0
  36. isa_model/deployment/__init__.py +5 -48
  37. isa_model/deployment/core/__init__.py +2 -31
  38. isa_model/deployment/core/deployment_manager.py +1278 -368
  39. isa_model/deployment/local/__init__.py +31 -0
  40. isa_model/deployment/local/config.py +248 -0
  41. isa_model/deployment/local/gpu_gateway.py +607 -0
  42. isa_model/deployment/local/health_checker.py +428 -0
  43. isa_model/deployment/local/provider.py +586 -0
  44. isa_model/deployment/local/tensorrt_service.py +621 -0
  45. isa_model/deployment/local/transformers_service.py +644 -0
  46. isa_model/deployment/local/vllm_service.py +527 -0
  47. isa_model/deployment/modal/__init__.py +8 -0
  48. isa_model/deployment/modal/config.py +136 -0
  49. isa_model/deployment/modal/deployer.py +894 -0
  50. isa_model/deployment/modal/services/__init__.py +3 -0
  51. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  52. isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
  53. isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
  54. isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
  55. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  56. isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
  57. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  58. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  59. isa_model/deployment/modal/services/video/__init__.py +1 -0
  60. isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
  61. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  62. isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
  63. isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
  64. isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
  65. isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
  66. isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
  67. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  68. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  69. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  70. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  71. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  72. isa_model/deployment/storage/__init__.py +5 -0
  73. isa_model/deployment/storage/deployment_repository.py +824 -0
  74. isa_model/deployment/triton/__init__.py +10 -0
  75. isa_model/deployment/triton/config.py +196 -0
  76. isa_model/deployment/triton/configs/__init__.py +1 -0
  77. isa_model/deployment/triton/provider.py +512 -0
  78. isa_model/deployment/triton/scripts/__init__.py +1 -0
  79. isa_model/deployment/triton/templates/__init__.py +1 -0
  80. isa_model/inference/__init__.py +47 -1
  81. isa_model/inference/ai_factory.py +179 -16
  82. isa_model/inference/legacy_services/__init__.py +21 -0
  83. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  84. isa_model/inference/legacy_services/model_service.py +573 -0
  85. isa_model/inference/legacy_services/model_serving.py +717 -0
  86. isa_model/inference/legacy_services/model_training.py +561 -0
  87. isa_model/inference/models/__init__.py +21 -0
  88. isa_model/inference/models/inference_config.py +551 -0
  89. isa_model/inference/models/inference_record.py +675 -0
  90. isa_model/inference/models/performance_models.py +714 -0
  91. isa_model/inference/repositories/__init__.py +9 -0
  92. isa_model/inference/repositories/inference_repository.py +828 -0
  93. isa_model/inference/services/audio/__init__.py +21 -0
  94. isa_model/inference/services/audio/base_realtime_service.py +225 -0
  95. isa_model/inference/services/audio/base_stt_service.py +184 -11
  96. isa_model/inference/services/audio/isa_tts_service.py +0 -0
  97. isa_model/inference/services/audio/openai_realtime_service.py +320 -124
  98. isa_model/inference/services/audio/openai_stt_service.py +53 -11
  99. isa_model/inference/services/base_service.py +17 -1
  100. isa_model/inference/services/custom_model_manager.py +277 -0
  101. isa_model/inference/services/embedding/__init__.py +13 -0
  102. isa_model/inference/services/embedding/base_embed_service.py +111 -8
  103. isa_model/inference/services/embedding/isa_embed_service.py +305 -0
  104. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  105. isa_model/inference/services/embedding/openai_embed_service.py +2 -4
  106. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  107. isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
  108. isa_model/inference/services/img/__init__.py +2 -2
  109. isa_model/inference/services/img/base_image_gen_service.py +24 -7
  110. isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
  111. isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
  112. isa_model/inference/services/img/services/replicate_flux.py +226 -0
  113. isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
  114. isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
  115. isa_model/inference/services/img/tests/test_img_client.py +297 -0
  116. isa_model/inference/services/llm/__init__.py +10 -2
  117. isa_model/inference/services/llm/base_llm_service.py +361 -26
  118. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  119. isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
  120. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  121. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  122. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  123. isa_model/inference/services/llm/local_llm_service.py +747 -0
  124. isa_model/inference/services/llm/ollama_llm_service.py +11 -3
  125. isa_model/inference/services/llm/openai_llm_service.py +670 -56
  126. isa_model/inference/services/llm/yyds_llm_service.py +10 -3
  127. isa_model/inference/services/vision/__init__.py +27 -6
  128. isa_model/inference/services/vision/base_vision_service.py +118 -185
  129. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  130. isa_model/inference/services/vision/helpers/image_utils.py +19 -10
  131. isa_model/inference/services/vision/isa_vision_service.py +634 -0
  132. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  133. isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
  134. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  135. isa_model/serving/api/cache_manager.py +245 -0
  136. isa_model/serving/api/dependencies/__init__.py +1 -0
  137. isa_model/serving/api/dependencies/auth.py +194 -0
  138. isa_model/serving/api/dependencies/database.py +139 -0
  139. isa_model/serving/api/error_handlers.py +284 -0
  140. isa_model/serving/api/fastapi_server.py +240 -18
  141. isa_model/serving/api/middleware/auth.py +317 -0
  142. isa_model/serving/api/middleware/security.py +268 -0
  143. isa_model/serving/api/middleware/tenant_context.py +414 -0
  144. isa_model/serving/api/routes/analytics.py +489 -0
  145. isa_model/serving/api/routes/config.py +645 -0
  146. isa_model/serving/api/routes/deployment_billing.py +315 -0
  147. isa_model/serving/api/routes/deployments.py +475 -0
  148. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  149. isa_model/serving/api/routes/health.py +32 -12
  150. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  151. isa_model/serving/api/routes/local_deployments.py +448 -0
  152. isa_model/serving/api/routes/logs.py +430 -0
  153. isa_model/serving/api/routes/settings.py +582 -0
  154. isa_model/serving/api/routes/tenants.py +575 -0
  155. isa_model/serving/api/routes/unified.py +992 -171
  156. isa_model/serving/api/routes/webhooks.py +479 -0
  157. isa_model/serving/api/startup.py +318 -0
  158. isa_model/serving/modal_proxy_server.py +249 -0
  159. isa_model/utils/gpu_utils.py +311 -0
  160. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
  161. isa_model-0.4.3.dist-info/RECORD +193 -0
  162. isa_model/deployment/cloud/__init__.py +0 -9
  163. isa_model/deployment/cloud/modal/__init__.py +0 -10
  164. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
  165. isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
  166. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
  167. isa_model/deployment/cloud/modal/register_models.py +0 -321
  168. isa_model/deployment/core/deployment_config.py +0 -356
  169. isa_model/deployment/core/isa_deployment_service.py +0 -401
  170. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  171. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  172. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  173. isa_model/deployment/runtime/deployed_service.py +0 -338
  174. isa_model/deployment/services/__init__.py +0 -9
  175. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  176. isa_model/deployment/services/model_service.py +0 -332
  177. isa_model/deployment/services/service_monitor.py +0 -356
  178. isa_model/deployment/services/service_registry.py +0 -527
  179. isa_model/eval/__init__.py +0 -92
  180. isa_model/eval/benchmarks.py +0 -469
  181. isa_model/eval/config/__init__.py +0 -10
  182. isa_model/eval/config/evaluation_config.py +0 -108
  183. isa_model/eval/evaluators/__init__.py +0 -18
  184. isa_model/eval/evaluators/base_evaluator.py +0 -503
  185. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  186. isa_model/eval/factory.py +0 -531
  187. isa_model/eval/infrastructure/__init__.py +0 -24
  188. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  189. isa_model/eval/metrics.py +0 -798
  190. isa_model/inference/adapter/unified_api.py +0 -248
  191. isa_model/inference/services/helpers/stacked_config.py +0 -148
  192. isa_model/inference/services/img/flux_professional_service.py +0 -603
  193. isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
  194. isa_model/inference/services/others/table_transformer_service.py +0 -61
  195. isa_model/inference/services/vision/doc_analysis_service.py +0 -640
  196. isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
  197. isa_model/inference/services/vision/ui_analysis_service.py +0 -823
  198. isa_model/scripts/inference_tracker.py +0 -283
  199. isa_model/scripts/mlflow_manager.py +0 -379
  200. isa_model/scripts/model_registry.py +0 -465
  201. isa_model/scripts/register_models.py +0 -370
  202. isa_model/scripts/register_models_with_embeddings.py +0 -510
  203. isa_model/scripts/start_mlflow.py +0 -95
  204. isa_model/scripts/training_tracker.py +0 -257
  205. isa_model/training/__init__.py +0 -74
  206. isa_model/training/annotation/annotation_schema.py +0 -47
  207. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  208. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  209. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  210. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  211. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  212. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  213. isa_model/training/annotation/views/annotation_controller.py +0 -158
  214. isa_model/training/cloud/__init__.py +0 -22
  215. isa_model/training/cloud/job_orchestrator.py +0 -402
  216. isa_model/training/cloud/runpod_trainer.py +0 -454
  217. isa_model/training/cloud/storage_manager.py +0 -482
  218. isa_model/training/core/__init__.py +0 -23
  219. isa_model/training/core/config.py +0 -181
  220. isa_model/training/core/dataset.py +0 -222
  221. isa_model/training/core/trainer.py +0 -720
  222. isa_model/training/core/utils.py +0 -213
  223. isa_model/training/factory.py +0 -424
  224. isa_model-0.3.91.dist-info/RECORD +0 -138
  225. /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
  226. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  227. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  228. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
@@ -1,402 +0,0 @@
1
- """
2
- Training Job Orchestrator
3
-
4
- This module orchestrates the complete training workflow:
5
- - Dataset preparation and validation
6
- - Job submission to cloud providers
7
- - Training monitoring and progress tracking
8
- - Model artifact collection and storage
9
- """
10
-
11
- import os
12
- import json
13
- import logging
14
- from typing import Dict, List, Optional, Any, Union
15
- from dataclasses import dataclass
16
- from pathlib import Path
17
- from datetime import datetime
18
-
19
- from .runpod_trainer import RunPodTrainer, RunPodConfig
20
- from .storage_manager import CloudStorageManager, StorageConfig
21
- # from ..engine.llama_factory.config import SFTConfig, DatasetFormat
22
- # Note: LlamaFactory integration is planned but not yet implemented
23
-
24
- logger = logging.getLogger(__name__)
25
-
26
-
27
@dataclass
class JobConfig:
    """Configuration for training job orchestration.

    Only ``model_name`` and ``dataset_source`` are required. When ``job_name``
    is not supplied, ``__post_init__`` derives one from the model's short
    name, the training type and the current local timestamp.

    Raises:
        ValueError: If ``model_name`` or ``dataset_source`` is empty.
    """

    # Model and dataset
    model_name: str  # e.g., "google/gemma-2-4b-it"
    dataset_source: str  # HuggingFace dataset name or local path

    # Training parameters
    training_type: str = "sft"  # "sft", "dpo", "rlhf"
    use_lora: bool = True
    batch_size: int = 4
    num_epochs: int = 3
    learning_rate: float = 2e-5
    max_length: int = 1024

    # LoRA parameters
    lora_rank: int = 8
    lora_alpha: int = 16
    lora_dropout: float = 0.05

    # Job settings
    job_name: Optional[str] = None  # auto-generated in __post_init__ when None
    description: Optional[str] = None
    tags: Optional[Dict[str, str]] = None

    # Storage settings
    save_model_to_storage: bool = True
    model_name_in_storage: Optional[str] = None  # orchestrator falls back to job_name when None

    def __post_init__(self):
        """Validate required fields and fill in a default job name."""
        if not self.model_name:
            raise ValueError("Model name is required")
        if not self.dataset_source:
            raise ValueError("Dataset source is required")

        if self.job_name is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            # Use the last path component of the model identifier. Trailing
            # slashes are stripped first so "/models/gemma/" yields "gemma"
            # rather than an empty short name; fall back to the full name if
            # nothing is left (e.g. model_name == "/").
            model_short = self.model_name.rstrip("/").rsplit("/", 1)[-1] or self.model_name
            self.job_name = f"{model_short}_{self.training_type}_{timestamp}"
68
-
69
-
70
class TrainingJobOrchestrator:
    """
    Orchestrates complete training workflows.

    This class drives the pipeline from dataset preparation through job
    submission, monitoring and artifact collection, delegating cloud work to
    a ``RunPodTrainer`` and (optionally) a ``CloudStorageManager``. Jobs
    started through this instance are tracked in ``self.active_jobs``, an
    in-memory dict keyed by job id.

    Example:
    ```python
    # Configure components
    runpod_config = RunPodConfig(
        api_key="your-runpod-key",
        template_id="your-template-id"
    )

    storage_config = StorageConfig(
        provider="s3",
        bucket_name="my-training-bucket"
    )

    # Initialize orchestrator
    orchestrator = TrainingJobOrchestrator(
        runpod_config=runpod_config,
        storage_config=storage_config
    )

    # Configure training job
    job_config = JobConfig(
        model_name="google/gemma-2-4b-it",
        dataset_source="tatsu-lab/alpaca",
        num_epochs=3,
        batch_size=4
    )

    # Execute training workflow
    result = orchestrator.execute_training_workflow(job_config)
    print(f"Training completed: {result['model_path']}")
    ```
    """

    def __init__(self,
                 runpod_config: "RunPodConfig",
                 storage_config: Optional["StorageConfig"] = None):
        """
        Initialize training job orchestrator.

        Args:
            runpod_config: RunPod configuration
            storage_config: Optional cloud storage configuration; when omitted
                no storage manager is created and upload steps are skipped
        """
        self.runpod_trainer = RunPodTrainer(runpod_config)
        self.storage_manager = CloudStorageManager(storage_config) if storage_config else None

        # job_id -> tracking record (config, timestamps, paths, status, ...)
        self.active_jobs: Dict[str, Dict[str, Any]] = {}

        logger.info("Training job orchestrator initialized")

    def prepare_dataset(self, dataset_source: str, local_cache_dir: str = "./dataset_cache") -> str:
        """
        Prepare and validate dataset for training.

        A source that starts with ``hf://`` or does not exist on the local
        filesystem is treated as a HuggingFace dataset name and converted to
        an Alpaca-style JSON file inside ``local_cache_dir``. Any other source
        is treated as a local JSON file and validated in place.

        Args:
            dataset_source: Dataset source (HuggingFace name or local path)
            local_cache_dir: Local directory to cache dataset

        Returns:
            Path to prepared dataset

        Raises:
            ValueError: If a local dataset is not a list of examples or one of
                its first five items lacks the required fields.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        os.makedirs(local_cache_dir, exist_ok=True)

        try:
            if dataset_source.startswith("hf://") or not os.path.exists(dataset_source):
                return self._prepare_hf_dataset(dataset_source, local_cache_dir)
            return self._validate_local_dataset(dataset_source)

        except Exception as e:
            logger.error(f"Failed to prepare dataset {dataset_source}: {e}")
            raise

    def _prepare_hf_dataset(self, dataset_source: str, local_cache_dir: str) -> str:
        """Load a HuggingFace dataset, convert its train split and cache it as JSON."""
        dataset_name = dataset_source.replace("hf://", "") if dataset_source.startswith("hf://") else dataset_source

        logger.info(f"Loading HuggingFace dataset: {dataset_name}")

        # Imported lazily so the `datasets` package is only needed on this path.
        from datasets import load_dataset

        dataset = load_dataset(dataset_name)
        train_data = []

        # Convert to Alpaca format; items matching neither shape are skipped.
        for item in dataset['train']:
            if 'instruction' in item and 'output' in item:
                train_data.append({
                    'instruction': item['instruction'],
                    'input': item.get('input', ''),
                    'output': item['output']
                })
            elif 'text' in item:
                # Handle raw text datasets: first 512 characters become the
                # input and the following 512 the output.
                # NOTE(review): texts of 512 characters or fewer produce an
                # empty output here - confirm whether such samples are wanted.
                train_data.append({
                    'instruction': "Continue the following text:",
                    'input': item['text'][:512],
                    'output': item['text'][512:1024]
                })

        if not train_data:
            # Surface the problem early instead of silently submitting an empty dataset.
            logger.warning(f"No usable training samples found in dataset: {dataset_name}")

        # Save prepared dataset
        dataset_path = os.path.join(local_cache_dir, f"{dataset_name.replace('/', '_')}.json")
        with open(dataset_path, 'w', encoding='utf-8') as f:
            json.dump(train_data, f, indent=2)

        logger.info(f"Prepared {len(train_data)} training samples")
        return dataset_path

    def _validate_local_dataset(self, dataset_path: str) -> str:
        """Check that a local JSON dataset is a list of instruction/output records."""
        # Explicit encoding so parsing does not depend on the platform default.
        with open(dataset_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if not isinstance(data, list):
            raise ValueError("Dataset must be a list of training examples")

        # Validate required fields on a small sample only (first 5 items).
        required_fields = {'instruction', 'output'}
        for i, item in enumerate(data[:5]):
            # The isinstance guard turns non-mapping items (numbers, strings)
            # into the same ValueError instead of a TypeError or a substring match.
            if not isinstance(item, dict) or not all(field in item for field in required_fields):
                raise ValueError(f"Item {i} missing required fields: {required_fields}")

        logger.info(f"Validated local dataset with {len(data)} samples")
        return dataset_path

    def execute_training_workflow(self, job_config: "JobConfig") -> Dict[str, Any]:
        """
        Execute complete training workflow.

        Steps: prepare the dataset, upload it when a storage manager is
        configured, start the RunPod job, monitor it, then download (and
        optionally upload) the trained model. This method does not raise for
        workflow errors: every exception is logged and reported through the
        returned dict.

        Args:
            job_config: Training job configuration

        Returns:
            On success: ``success=True`` plus ``job_id``, ``job_name``,
            ``model_path``, ``model_storage_url``, ``training_duration`` and
            ``final_status``. On failure: ``success=False`` plus ``error``,
            ``job_name`` and ``training_duration``.
        """
        workflow_start_time = datetime.now()
        # Stays None until the job is actually submitted, so the error handler
        # can tell whether there is a tracking record to update.
        job_id = None

        try:
            logger.info(f"Starting training workflow: {job_config.job_name}")

            # Step 1: Prepare dataset
            logger.info("Step 1: Preparing dataset...")
            dataset_path = self.prepare_dataset(job_config.dataset_source)

            # Step 2: Upload dataset to storage if configured
            dataset_url = dataset_path
            if self.storage_manager:
                logger.info("Step 2: Uploading dataset to cloud storage...")
                dataset_url = self.storage_manager.upload_dataset(
                    local_path=dataset_path,
                    dataset_name=f"{job_config.job_name}_dataset",
                    metadata={
                        "source": job_config.dataset_source,
                        "job_name": job_config.job_name,
                        "created_at": workflow_start_time.isoformat()
                    }
                )

            # Step 3: Start training job
            logger.info("Step 3: Starting RunPod training job...")
            training_params = {
                "use_lora": job_config.use_lora,
                "batch_size": job_config.batch_size,
                "num_epochs": job_config.num_epochs,
                "learning_rate": job_config.learning_rate,
                "max_length": job_config.max_length,
                "lora_rank": job_config.lora_rank,
                "lora_alpha": job_config.lora_alpha,
                "lora_dropout": job_config.lora_dropout,
                "dataset_name": dataset_url
            }

            job_id = self.runpod_trainer.start_training_job(
                model_name=job_config.model_name,
                dataset_path=dataset_url,
                training_params=training_params,
                job_name=job_config.job_name
            )

            # Track job
            self.active_jobs[job_id] = {
                "config": job_config,
                "start_time": workflow_start_time,
                "dataset_path": dataset_path,
                "dataset_url": dataset_url,
                "status": "running"
            }

            # Step 4: Monitor training
            # NOTE(review): presumably blocks until the job reaches a terminal
            # state - confirm against RunPodTrainer.monitor_job.
            logger.info("Step 4: Monitoring training progress...")
            final_status = self.runpod_trainer.monitor_job(job_id)

            # Step 5: Collect results
            logger.info("Step 5: Collecting training results...")
            if final_status["status"] != "COMPLETED":
                # Training failed: record it, then raise so the common handler
                # below builds the failure response.
                self.active_jobs[job_id].update({
                    "status": "failed",
                    "final_status": final_status,
                    "end_time": datetime.now()
                })

                raise RuntimeError(f"Training job failed with status: {final_status['status']}")

            # Download trained model
            local_model_path = self.runpod_trainer.get_trained_model(job_id)

            # Upload to storage if configured
            model_storage_url = None
            if self.storage_manager and job_config.save_model_to_storage:
                model_name = job_config.model_name_in_storage or job_config.job_name
                model_storage_url = self.storage_manager.upload_model(
                    local_model_dir=local_model_path,
                    model_name=model_name,
                    metadata={
                        "base_model": job_config.model_name,
                        "dataset_source": job_config.dataset_source,
                        "training_params": training_params,
                        "job_id": job_id,
                        "completed_at": datetime.now().isoformat(),
                        "training_duration": str(datetime.now() - workflow_start_time)
                    }
                )

            # Update job status
            self.active_jobs[job_id].update({
                "status": "completed",
                "local_model_path": local_model_path,
                "model_storage_url": model_storage_url,
                "final_status": final_status,
                "end_time": datetime.now()
            })

            logger.info(f"Training workflow completed successfully: {job_config.job_name}")

            return {
                "success": True,
                "job_id": job_id,
                "job_name": job_config.job_name,
                "model_path": local_model_path,
                "model_storage_url": model_storage_url,
                "training_duration": str(datetime.now() - workflow_start_time),
                "final_status": final_status
            }

        except Exception as e:
            logger.error(f"Training workflow failed: {e}")

            # Update the tracking record if the job got as far as being submitted.
            job_record = self.active_jobs.get(job_id) if job_id is not None else None
            if job_record is not None:
                # Keep the more specific "failed" status set above; only
                # unexpected exceptions are labelled "error".
                if job_record.get("status") != "failed":
                    job_record["status"] = "error"
                job_record.update({
                    "error": str(e),
                    "end_time": datetime.now()
                })

            return {
                "success": False,
                "error": str(e),
                "job_name": job_config.job_name,
                "training_duration": str(datetime.now() - workflow_start_time)
            }

    def get_job_status(self, job_id: str) -> Dict[str, Any]:
        """
        Get status of a training job.

        Args:
            job_id: Identifier returned when the job was started

        Returns:
            A shallow copy of the tracking record (plus ``runpod_status`` when
            the job is still running and the live lookup succeeds), or
            ``{"error": ...}`` when the job is not tracked.
        """
        if job_id not in self.active_jobs:
            return {"error": f"Job {job_id} not found"}

        # Copy so callers cannot mutate the tracking record.
        job_info = self.active_jobs[job_id].copy()

        # Get real-time status from RunPod if job is still running.
        # Best-effort: a failed lookup is logged and the cached record is
        # still returned, but interpreter-exit signals are no longer swallowed.
        if job_info["status"] == "running":
            try:
                runpod_status = self.runpod_trainer.monitor_job(job_id, check_interval=0)
                job_info["runpod_status"] = runpod_status
            except Exception as e:
                logger.warning(f"Could not fetch live status for job {job_id}: {e}")

        return job_info

    def list_active_jobs(self) -> List[Dict[str, Any]]:
        """List a summary of every tracked job, whatever its status."""
        return [
            {
                "job_id": job_id,
                "job_name": info["config"].job_name,
                "status": info["status"],
                "start_time": info["start_time"].isoformat(),
                "model_name": info["config"].model_name,
                "dataset_source": info["config"].dataset_source
            }
            for job_id, info in self.active_jobs.items()
        ]

    def stop_job(self, job_id: str) -> bool:
        """
        Stop a running training job.

        Args:
            job_id: Identifier of the job to stop

        Returns:
            True when the stop request succeeded, False when it raised
            (the error is logged, not propagated).
        """
        try:
            self.runpod_trainer.stop_job(job_id)

            if job_id in self.active_jobs:
                self.active_jobs[job_id].update({
                    "status": "stopped",
                    "end_time": datetime.now()
                })

            logger.info(f"Stopped training job: {job_id}")
            return True

        except Exception as e:
            logger.error(f"Failed to stop job {job_id}: {e}")
            return False

    def cleanup_job(self, job_id: str) -> None:
        """
        Clean up job resources and remove from tracking.

        Stops the job first when it is still marked as running. Errors are
        logged and never propagated.

        Args:
            job_id: Identifier of the job to clean up
        """
        try:
            # Stop job if still running
            if job_id in self.active_jobs and self.active_jobs[job_id]["status"] == "running":
                self.stop_job(job_id)

            # Remove from tracking
            if job_id in self.active_jobs:
                del self.active_jobs[job_id]

            logger.info(f"Cleaned up job: {job_id}")

        except Exception as e:
            logger.error(f"Failed to cleanup job {job_id}: {e}")