isa-model 0.4.0__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/client.py +466 -43
- isa_model/core/cache/redis_cache.py +12 -3
- isa_model/core/config/config_manager.py +230 -3
- isa_model/core/config.py +90 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +21 -1
- isa_model/core/database/supabase_client.py +154 -19
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +27 -18
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_manager.py +35 -80
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +174 -18
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/services/intelligent_model_selector.py +399 -21
- isa_model/core/types.py +1 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -370
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +137 -10
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/openai_stt_service.py +22 -6
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +335 -24
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/ollama_llm_service.py +9 -2
- isa_model/inference/services/llm/openai_llm_service.py +33 -16
- isa_model/inference/services/llm/yyds_llm_service.py +8 -2
- isa_model/inference/services/vision/__init__.py +22 -1
- isa_model/inference/services/vision/helpers/image_utils.py +8 -5
- isa_model/inference/services/vision/isa_vision_service.py +65 -4
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +172 -22
- isa_model/serving/api/middleware/auth.py +8 -2
- isa_model/serving/api/middleware/security.py +23 -33
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +4 -1
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +138 -2
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +680 -18
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +68 -54
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/METADATA +71 -24
- isa_model-0.4.4.dist-info/RECORD +180 -0
- isa_model/core/security/secrets.py +0 -358
- isa_model/core/storage/hf_storage.py +0 -419
- isa_model/core/storage/minio_storage.py +0 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks/__init__.py +0 -27
- isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
- isa_model/eval/benchmarks.py +0 -701
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -24
- isa_model/eval/evaluators/audio_evaluator.py +0 -727
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/embedding_evaluator.py +0 -742
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/evaluators/vision_evaluator.py +0 -564
- isa_model/eval/example_evaluation.py +0 -395
- isa_model/eval/factory.py +0 -798
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/isa_benchmarks.py +0 -700
- isa_model/eval/isa_integration.py +0 -582
- isa_model/eval/metrics.py +0 -951
- isa_model/eval/tests/unit/test_basic.py +0 -396
- isa_model/serving/api/routes/evaluations.py +0 -579
- isa_model/training/__init__.py +0 -168
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -26
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/examples/intelligent_training_example.py +0 -281
- isa_model/training/factory.py +0 -424
- isa_model/training/intelligent/__init__.py +0 -25
- isa_model/training/intelligent/decision_engine.py +0 -643
- isa_model/training/intelligent/intelligent_factory.py +0 -888
- isa_model/training/intelligent/knowledge_base.py +0 -751
- isa_model/training/intelligent/resource_optimizer.py +0 -839
- isa_model/training/intelligent/task_classifier.py +0 -576
- isa_model/training/storage/__init__.py +0 -24
- isa_model/training/storage/core_integration.py +0 -439
- isa_model/training/storage/training_repository.py +0 -552
- isa_model/training/storage/training_storage.py +0 -628
- isa_model-0.4.0.dist-info/RECORD +0 -182
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/WHEEL +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/top_level.txt +0 -0
@@ -1,402 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Training Job Orchestrator
|
3
|
-
|
4
|
-
This module orchestrates the complete training workflow:
|
5
|
-
- Dataset preparation and validation
|
6
|
-
- Job submission to cloud providers
|
7
|
-
- Training monitoring and progress tracking
|
8
|
-
- Model artifact collection and storage
|
9
|
-
"""
|
10
|
-
|
11
|
-
import os
|
12
|
-
import json
|
13
|
-
import logging
|
14
|
-
from typing import Dict, List, Optional, Any, Union
|
15
|
-
from dataclasses import dataclass
|
16
|
-
from pathlib import Path
|
17
|
-
from datetime import datetime
|
18
|
-
|
19
|
-
from .runpod_trainer import RunPodTrainer, RunPodConfig
|
20
|
-
from .storage_manager import CloudStorageManager, StorageConfig
|
21
|
-
# from ..engine.llama_factory.config import SFTConfig, DatasetFormat
|
22
|
-
# Note: LlamaFactory integration is planned but not yet implemented
|
23
|
-
|
24
|
-
logger = logging.getLogger(__name__)
|
25
|
-
|
26
|
-
|
27
|
-
@dataclass
class JobConfig:
    """Settings describing one orchestrated training job.

    ``model_name`` and ``dataset_source`` are mandatory; every other field
    has a default. When ``job_name`` is left as ``None`` a name is generated
    in ``__post_init__`` from the model's short name, the training type and
    the current local timestamp.

    Raises:
        ValueError: If ``model_name`` or ``dataset_source`` is empty.
    """

    # Model and dataset
    model_name: str  # e.g., "google/gemma-2-4b-it"
    dataset_source: str  # HuggingFace dataset name or local path

    # Training parameters
    training_type: str = "sft"  # "sft", "dpo", "rlhf"
    use_lora: bool = True
    batch_size: int = 4
    num_epochs: int = 3
    learning_rate: float = 2e-5
    max_length: int = 1024

    # LoRA parameters
    lora_rank: int = 8
    lora_alpha: int = 16
    lora_dropout: float = 0.05

    # Job settings
    job_name: Optional[str] = None  # auto-generated when left as None
    description: Optional[str] = None
    tags: Optional[Dict[str, str]] = None

    # Storage settings
    save_model_to_storage: bool = True
    model_name_in_storage: Optional[str] = None

    def __post_init__(self):
        """Reject empty required fields and fill in a default job name."""
        if not self.model_name:
            raise ValueError("Model name is required")
        if not self.dataset_source:
            raise ValueError("Dataset source is required")

        # An explicitly supplied job name is kept untouched.
        if self.job_name is not None:
            return

        # Only the last path segment of the model id goes into the name,
        # e.g. "google/gemma-2-4b-it" -> "gemma-2-4b-it".
        short_name = self.model_name.rsplit("/", 1)[-1]
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.job_name = f"{short_name}_{self.training_type}_{stamp}"
|
68
|
-
|
69
|
-
|
70
|
-
class TrainingJobOrchestrator:
    """
    Orchestrates complete training workflows.

    The orchestrator prepares a dataset, optionally uploads it to cloud
    storage, starts a RunPod training job, waits for it to finish, downloads
    the trained model and optionally uploads that model to cloud storage.
    Every started job is tracked in ``self.active_jobs`` keyed by job id.

    Example:
        ```python
        # Configure components
        runpod_config = RunPodConfig(
            api_key="your-runpod-key",
            template_id="your-template-id"
        )

        storage_config = StorageConfig(
            provider="s3",
            bucket_name="my-training-bucket"
        )

        # Initialize orchestrator
        orchestrator = TrainingJobOrchestrator(
            runpod_config=runpod_config,
            storage_config=storage_config
        )

        # Configure training job
        job_config = JobConfig(
            model_name="google/gemma-2-4b-it",
            dataset_source="tatsu-lab/alpaca",
            num_epochs=3,
            batch_size=4
        )

        # Execute training workflow
        result = orchestrator.execute_training_workflow(job_config)
        print(f"Training completed: {result['model_path']}")
        ```
    """

    def __init__(self,
                 runpod_config: RunPodConfig,
                 storage_config: Optional[StorageConfig] = None):
        """
        Initialize training job orchestrator.

        Args:
            runpod_config: RunPod configuration
            storage_config: Optional cloud storage configuration; when it is
                omitted no storage manager is created and upload steps are
                skipped.
        """
        self.runpod_trainer = RunPodTrainer(runpod_config)
        self.storage_manager = CloudStorageManager(storage_config) if storage_config else None

        # job_id -> tracking record (config, timestamps, paths, status, ...)
        self.active_jobs: Dict[str, Dict[str, Any]] = {}

        logger.info("Training job orchestrator initialized")

    def prepare_dataset(self, dataset_source: str, local_cache_dir: str = "./dataset_cache") -> str:
        """
        Prepare and validate dataset for training.

        A source that starts with ``hf://`` or that does not exist on the
        local filesystem is treated as a HuggingFace dataset name and is
        converted to an Alpaca-style JSON file inside ``local_cache_dir``.
        Any other source is treated as a local JSON file and validated.

        Args:
            dataset_source: Dataset source (HuggingFace name or local path)
            local_cache_dir: Local directory to cache dataset (created if
                missing)

        Returns:
            Path to prepared dataset

        Raises:
            ValueError: If a local dataset is not a list, or one of its first
                five items lacks the required fields.
            Exception: Any other error is logged and re-raised unchanged.
        """
        os.makedirs(local_cache_dir, exist_ok=True)

        try:
            has_hf_prefix = dataset_source.startswith("hf://")
            if has_hf_prefix or not os.path.exists(dataset_source):
                dataset_name = dataset_source.replace("hf://", "") if has_hf_prefix else dataset_source
                return self._export_hf_dataset(dataset_name, local_cache_dir)

            self._validate_local_dataset(dataset_source)
            return dataset_source

        except Exception as e:
            logger.error(f"Failed to prepare dataset {dataset_source}: {e}")
            raise

    @staticmethod
    def _export_hf_dataset(dataset_name: str, local_cache_dir: str) -> str:
        """Load the 'train' split of a HuggingFace dataset and write it as Alpaca-style JSON."""
        logger.info(f"Loading HuggingFace dataset: {dataset_name}")

        # Imported lazily so the module can be imported without `datasets`.
        from datasets import load_dataset

        dataset = load_dataset(dataset_name)
        train_data = []

        # Convert to Alpaca format; items matching neither shape are skipped.
        for item in dataset['train']:
            if 'instruction' in item and 'output' in item:
                train_data.append({
                    'instruction': item['instruction'],
                    'input': item.get('input', ''),
                    'output': item['output']
                })
            elif 'text' in item:
                # Handle raw text datasets: first 512 characters become the
                # input, the following 512 the output (which is empty for
                # texts of 512 characters or fewer).
                train_data.append({
                    'instruction': "Continue the following text:",
                    'input': item['text'][:512],
                    'output': item['text'][512:1024]
                })

        # Save prepared dataset; "/" in the dataset name would otherwise be
        # interpreted as a directory separator.
        dataset_path = os.path.join(local_cache_dir, f"{dataset_name.replace('/', '_')}.json")
        with open(dataset_path, 'w', encoding="utf-8") as f:
            json.dump(train_data, f, indent=2)

        logger.info(f"Prepared {len(train_data)} training samples")
        return dataset_path

    @staticmethod
    def _validate_local_dataset(dataset_path: str) -> None:
        """Check that a local JSON dataset is a list whose leading items carry the required fields."""
        # Explicit UTF-8 so the result does not depend on the platform locale.
        with open(dataset_path, 'r', encoding="utf-8") as f:
            data = json.load(f)

        if not isinstance(data, list):
            raise ValueError("Dataset must be a list of training examples")

        # Only the first 5 items are inspected to keep validation cheap.
        required_fields = {'instruction', 'output'}
        for i, item in enumerate(data[:5]):
            # A non-dict item can never carry the fields; reject it with the
            # same error instead of a substring match or a TypeError.
            if not isinstance(item, dict) or not required_fields.issubset(item):
                raise ValueError(f"Item {i} missing required fields: {required_fields}")

        logger.info(f"Validated local dataset with {len(data)} samples")

    def execute_training_workflow(self, job_config: JobConfig) -> Dict[str, Any]:
        """
        Execute complete training workflow.

        Steps: prepare the dataset, upload it when a storage manager is
        configured, start the RunPod job, monitor it, then download the model
        and optionally upload it to storage.

        Args:
            job_config: Training job configuration

        Returns:
            On success, a dict with ``success=True``, ``job_id``,
            ``job_name``, ``model_path``, ``model_storage_url``,
            ``training_duration`` and ``final_status``. On any failure the
            exception is logged (not raised) and a dict with
            ``success=False``, ``error``, ``job_name`` and
            ``training_duration`` is returned.
        """
        workflow_start_time = datetime.now()
        # Stays None until the trainer has actually accepted the job, so the
        # error handler knows whether there is a tracking record to update.
        job_id = None

        try:
            logger.info(f"Starting training workflow: {job_config.job_name}")

            # Step 1: Prepare dataset
            logger.info("Step 1: Preparing dataset...")
            dataset_path = self.prepare_dataset(job_config.dataset_source)

            # Step 2: Upload dataset to storage if configured
            dataset_url = dataset_path
            if self.storage_manager:
                logger.info("Step 2: Uploading dataset to cloud storage...")
                dataset_url = self.storage_manager.upload_dataset(
                    local_path=dataset_path,
                    dataset_name=f"{job_config.job_name}_dataset",
                    metadata={
                        "source": job_config.dataset_source,
                        "job_name": job_config.job_name,
                        "created_at": workflow_start_time.isoformat()
                    }
                )

            # Step 3: Start training job
            logger.info("Step 3: Starting RunPod training job...")
            training_params = {
                "use_lora": job_config.use_lora,
                "batch_size": job_config.batch_size,
                "num_epochs": job_config.num_epochs,
                "learning_rate": job_config.learning_rate,
                "max_length": job_config.max_length,
                "lora_rank": job_config.lora_rank,
                "lora_alpha": job_config.lora_alpha,
                "lora_dropout": job_config.lora_dropout,
                "dataset_name": dataset_url
            }

            job_id = self.runpod_trainer.start_training_job(
                model_name=job_config.model_name,
                dataset_path=dataset_url,
                training_params=training_params,
                job_name=job_config.job_name
            )

            # Track job
            self.active_jobs[job_id] = {
                "config": job_config,
                "start_time": workflow_start_time,
                "dataset_path": dataset_path,
                "dataset_url": dataset_url,
                "status": "running"
            }

            # Step 4: Monitor training
            logger.info("Step 4: Monitoring training progress...")
            final_status = self.runpod_trainer.monitor_job(job_id)

            # Step 5: Collect results
            logger.info("Step 5: Collecting training results...")
            if final_status["status"] != "COMPLETED":
                # Training failed: record it, then raise so the shared
                # handler below logs it and builds the failure result.
                self.active_jobs[job_id].update({
                    "status": "failed",
                    "final_status": final_status,
                    "end_time": datetime.now()
                })
                raise RuntimeError(f"Training job failed with status: {final_status['status']}")

            # Download trained model
            local_model_path = self.runpod_trainer.get_trained_model(job_id)

            # Upload to storage if configured
            model_storage_url = None
            if self.storage_manager and job_config.save_model_to_storage:
                model_name = job_config.model_name_in_storage or job_config.job_name
                model_storage_url = self.storage_manager.upload_model(
                    local_model_dir=local_model_path,
                    model_name=model_name,
                    metadata={
                        "base_model": job_config.model_name,
                        "dataset_source": job_config.dataset_source,
                        "training_params": training_params,
                        "job_id": job_id,
                        "completed_at": datetime.now().isoformat(),
                        "training_duration": str(datetime.now() - workflow_start_time)
                    }
                )

            # Update job status
            self.active_jobs[job_id].update({
                "status": "completed",
                "local_model_path": local_model_path,
                "model_storage_url": model_storage_url,
                "final_status": final_status,
                "end_time": datetime.now()
            })

            logger.info(f"Training workflow completed successfully: {job_config.job_name}")

            return {
                "success": True,
                "job_id": job_id,
                "job_name": job_config.job_name,
                "model_path": local_model_path,
                "model_storage_url": model_storage_url,
                "training_duration": str(datetime.now() - workflow_start_time),
                "final_status": final_status
            }

        except Exception as e:
            logger.error(f"Training workflow failed: {e}")

            # Update the tracking record if the job got far enough to have
            # one (it may also have been removed by cleanup_job meanwhile).
            job_record = self.active_jobs.get(job_id) if job_id is not None else None
            if job_record is not None:
                # Keep the more specific "failed" status set above rather
                # than overwriting it with the generic "error".
                if job_record.get("status") != "failed":
                    job_record["status"] = "error"
                job_record["error"] = str(e)
                job_record["end_time"] = datetime.now()

            return {
                "success": False,
                "error": str(e),
                "job_name": job_config.job_name,
                "training_duration": str(datetime.now() - workflow_start_time)
            }

    def get_job_status(self, job_id: str) -> Dict[str, Any]:
        """
        Get status of a training job.

        Returns a shallow copy of the tracking record. For a job still marked
        "running", the trainer is additionally queried and its answer stored
        under ``runpod_status``; a failing query is ignored (best effort).
        Unknown job ids yield ``{"error": "Job <id> not found"}``.
        """
        if job_id not in self.active_jobs:
            return {"error": f"Job {job_id} not found"}

        job_info = self.active_jobs[job_id].copy()

        # Get real-time status from RunPod if job is still running
        if job_info["status"] == "running":
            try:
                job_info["runpod_status"] = self.runpod_trainer.monitor_job(job_id, check_interval=0)
            except Exception as e:
                # Best effort only, but do not swallow KeyboardInterrupt or
                # SystemExit the way a bare ``except`` would.
                logger.debug(f"Could not fetch RunPod status for job {job_id}: {e}")

        return job_info

    def list_active_jobs(self) -> List[Dict[str, Any]]:
        """List a summary of every tracked training job, whatever its status."""
        return [
            {
                "job_id": job_id,
                "job_name": info["config"].job_name,
                "status": info["status"],
                "start_time": info["start_time"].isoformat(),
                "model_name": info["config"].model_name,
                "dataset_source": info["config"].dataset_source
            }
            for job_id, info in self.active_jobs.items()
        ]

    def stop_job(self, job_id: str) -> bool:
        """
        Stop a running training job.

        Returns:
            True when the trainer accepted the stop request, False when
            stopping raised an exception (which is logged, not propagated).
        """
        try:
            self.runpod_trainer.stop_job(job_id)

            if job_id in self.active_jobs:
                self.active_jobs[job_id].update({
                    "status": "stopped",
                    "end_time": datetime.now()
                })

            logger.info(f"Stopped training job: {job_id}")
            return True

        except Exception as e:
            logger.error(f"Failed to stop job {job_id}: {e}")
            return False

    def cleanup_job(self, job_id: str) -> None:
        """
        Clean up job resources and remove from tracking.

        A job still marked "running" is stopped first. The job is removed
        from tracking even if stopping it failed. Errors are logged, never
        raised.
        """
        try:
            # Stop job if still running
            job_record = self.active_jobs.get(job_id)
            if job_record is not None and job_record["status"] == "running":
                self.stop_job(job_id)

            # Remove from tracking
            self.active_jobs.pop(job_id, None)

            logger.info(f"Cleaned up job: {job_id}")

        except Exception as e:
            logger.error(f"Failed to cleanup job {job_id}: {e}")
|