isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/client.py +466 -43
- isa_model/core/cache/redis_cache.py +12 -3
- isa_model/core/config/config_manager.py +230 -3
- isa_model/core/config.py +90 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +21 -1
- isa_model/core/database/supabase_client.py +154 -19
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +27 -18
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_manager.py +40 -17
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +174 -18
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/services/intelligent_model_selector.py +399 -21
- isa_model/core/storage/hf_storage.py +1 -1
- isa_model/core/types.py +1 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -370
- isa_model/deployment/local/__init__.py +31 -0
- isa_model/deployment/local/config.py +248 -0
- isa_model/deployment/local/gpu_gateway.py +607 -0
- isa_model/deployment/local/health_checker.py +428 -0
- isa_model/deployment/local/provider.py +586 -0
- isa_model/deployment/local/tensorrt_service.py +621 -0
- isa_model/deployment/local/transformers_service.py +644 -0
- isa_model/deployment/local/vllm_service.py +527 -0
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +137 -10
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/openai_stt_service.py +22 -6
- isa_model/inference/services/custom_model_manager.py +277 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +335 -24
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/local_llm_service.py +747 -0
- isa_model/inference/services/llm/ollama_llm_service.py +9 -2
- isa_model/inference/services/llm/openai_llm_service.py +33 -16
- isa_model/inference/services/llm/yyds_llm_service.py +8 -2
- isa_model/inference/services/vision/__init__.py +22 -1
- isa_model/inference/services/vision/blip_vision_service.py +359 -0
- isa_model/inference/services/vision/helpers/image_utils.py +8 -5
- isa_model/inference/services/vision/isa_vision_service.py +65 -4
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +172 -22
- isa_model/serving/api/middleware/auth.py +8 -2
- isa_model/serving/api/middleware/security.py +23 -33
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +4 -1
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +138 -2
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +680 -18
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +68 -54
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
- isa_model-0.4.3.dist-info/RECORD +193 -0
- isa_model/core/storage/minio_storage.py +0 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks/__init__.py +0 -27
- isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
- isa_model/eval/benchmarks.py +0 -701
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -24
- isa_model/eval/evaluators/audio_evaluator.py +0 -727
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/embedding_evaluator.py +0 -742
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/evaluators/vision_evaluator.py +0 -564
- isa_model/eval/example_evaluation.py +0 -395
- isa_model/eval/factory.py +0 -798
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/isa_benchmarks.py +0 -700
- isa_model/eval/isa_integration.py +0 -582
- isa_model/eval/metrics.py +0 -951
- isa_model/eval/tests/unit/test_basic.py +0 -396
- isa_model/serving/api/routes/evaluations.py +0 -579
- isa_model/training/__init__.py +0 -168
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -26
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/examples/intelligent_training_example.py +0 -281
- isa_model/training/factory.py +0 -424
- isa_model/training/intelligent/__init__.py +0 -25
- isa_model/training/intelligent/decision_engine.py +0 -643
- isa_model/training/intelligent/intelligent_factory.py +0 -888
- isa_model/training/intelligent/knowledge_base.py +0 -751
- isa_model/training/intelligent/resource_optimizer.py +0 -839
- isa_model/training/intelligent/task_classifier.py +0 -576
- isa_model/training/storage/__init__.py +0 -24
- isa_model/training/storage/core_integration.py +0 -439
- isa_model/training/storage/training_repository.py +0 -552
- isa_model/training/storage/training_storage.py +0 -628
- isa_model-0.4.0.dist-info/RECORD +0 -182
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
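Note on the module relocations above: the Modal deployment services move out of `isa_model/deployment/cloud/modal/` and `isa_model/deployment/services/` into the new `isa_model/deployment/modal/` package (for example, `services/auto_hf_modal_deployer.py` becomes `modal/deployer.py`). A minimal sketch of how downstream code could track that rename, assuming imports mirror the file paths shown in this diff; the package's actual public API may re-export these modules differently:

```python
# Hedged sketch: module paths are taken from the file listing in this diff;
# the alias name `modal_deployer` is only for illustration.
try:
    # 0.4.3 layout: isa_model/deployment/modal/deployer.py
    from isa_model.deployment.modal import deployer as modal_deployer
except ImportError:
    # 0.4.0 layout: isa_model/deployment/services/auto_hf_modal_deployer.py
    from isa_model.deployment.services import auto_hf_modal_deployer as modal_deployer
```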
isa_model/training/factory.py
DELETED
@@ -1,424 +0,0 @@
-"""
-ISA Model Training Factory
-
-A clean, simplified training factory that uses HuggingFace Transformers directly
-without external dependencies like LlamaFactory.
-"""
-
-import os
-import logging
-from typing import Optional, Dict, Any, Union, List
-from pathlib import Path
-import datetime
-
-from .core import (
-    TrainingConfig,
-    LoRAConfig,
-    DatasetConfig,
-    BaseTrainer,
-    SFTTrainer,
-    TrainingUtils,
-    DatasetManager,
-)
-from .cloud import TrainingJobOrchestrator
-
-logger = logging.getLogger(__name__)
-
-
-class TrainingFactory:
-    """
-    Unified Training Factory for ISA Model SDK
-
-    Provides a clean interface for:
-    - Local training with SFT (Supervised Fine-Tuning)
-    - Cloud training on RunPod
-    - Model evaluation and management
-
-    Example usage:
-    ```python
-    from isa_model.training import TrainingFactory
-
-    factory = TrainingFactory()
-
-    # Local training
-    model_path = factory.train_model(
-        model_name="google/gemma-2-4b-it",
-        dataset_path="tatsu-lab/alpaca",
-        use_lora=True,
-        num_epochs=3
-    )
-
-    # Cloud training on RunPod
-    result = factory.train_on_runpod(
-        model_name="google/gemma-2-4b-it",
-        dataset_path="tatsu-lab/alpaca",
-        runpod_api_key="your-api-key",
-        template_id="your-template-id"
-    )
-    ```
-    """
-
-    def __init__(self, base_output_dir: Optional[str] = None):
-        """
-        Initialize the training factory.
-
-        Args:
-            base_output_dir: Base directory for training outputs
-        """
-        self.base_output_dir = base_output_dir or os.path.join(os.getcwd(), "training_outputs")
-        os.makedirs(self.base_output_dir, exist_ok=True)
-
-        logger.info(f"TrainingFactory initialized with output dir: {self.base_output_dir}")
-
-    def train_model(
-        self,
-        model_name: str,
-        dataset_path: str,
-        output_dir: Optional[str] = None,
-        training_type: str = "sft",
-        dataset_format: str = "alpaca",
-        use_lora: bool = True,
-        batch_size: int = 4,
-        num_epochs: int = 3,
-        learning_rate: float = 2e-5,
-        max_length: int = 1024,
-        lora_rank: int = 8,
-        lora_alpha: int = 16,
-        validation_split: float = 0.1,
-        **kwargs
-    ) -> str:
-        """
-        Train a model locally.
-
-        Args:
-            model_name: Model identifier (e.g., "google/gemma-2-4b-it")
-            dataset_path: Path to dataset or HuggingFace dataset name
-            output_dir: Custom output directory
-            training_type: Type of training ("sft" supported)
-            dataset_format: Dataset format ("alpaca", "sharegpt", "custom")
-            use_lora: Whether to use LoRA for efficient training
-            batch_size: Training batch size
-            num_epochs: Number of training epochs
-            learning_rate: Learning rate
-            max_length: Maximum sequence length
-            lora_rank: LoRA rank parameter
-            lora_alpha: LoRA alpha parameter
-            validation_split: Fraction of data for validation
-            **kwargs: Additional training parameters
-
-        Returns:
-            Path to the trained model
-
-        Example:
-            ```python
-            model_path = factory.train_model(
-                model_name="google/gemma-2-4b-it",
-                dataset_path="tatsu-lab/alpaca",
-                use_lora=True,
-                num_epochs=3,
-                batch_size=4
-            )
-            ```
-        """
-        # Generate output directory if not provided
-        if not output_dir:
-            output_dir = TrainingUtils.generate_output_dir(
-                model_name, training_type, self.base_output_dir
-            )
-
-        # Create configurations
-        lora_config = LoRAConfig(
-            use_lora=use_lora,
-            lora_rank=lora_rank,
-            lora_alpha=lora_alpha
-        ) if use_lora else None
-
-        dataset_config = DatasetConfig(
-            dataset_path=dataset_path,
-            dataset_format=dataset_format,
-            max_length=max_length,
-            validation_split=validation_split
-        )
-
-        training_config = TrainingConfig(
-            model_name=model_name,
-            output_dir=output_dir,
-            training_type=training_type,
-            num_epochs=num_epochs,
-            batch_size=batch_size,
-            learning_rate=learning_rate,
-            lora_config=lora_config,
-            dataset_config=dataset_config,
-            **kwargs
-        )
-
-        # Print training summary
-        model_info = TrainingUtils.get_model_info(model_name)
-        memory_estimate = TrainingUtils.estimate_memory_usage(
-            model_name, batch_size, max_length, use_lora
-        )
-
-        summary = TrainingUtils.format_training_summary(
-            training_config.to_dict(), model_info, memory_estimate
-        )
-        print(summary)
-
-        # Validate configuration
-        issues = TrainingUtils.validate_training_config(training_config.to_dict())
-        if issues:
-            raise ValueError(f"Training configuration issues: {issues}")
-
-        # Initialize trainer based on training type
-        if training_type.lower() == "sft":
-            trainer = SFTTrainer(training_config)
-        else:
-            raise ValueError(f"Training type '{training_type}' not supported yet")
-
-        # Execute training
-        logger.info(f"Starting {training_type.upper()} training...")
-        result_path = trainer.train()
-
-        logger.info(f"Training completed! Model saved to: {result_path}")
-        return result_path
-
-    def train_on_runpod(
-        self,
-        model_name: str,
-        dataset_path: str,
-        runpod_api_key: str,
-        template_id: str,
-        gpu_type: str = "NVIDIA RTX A6000",
-        storage_config: Optional[Dict[str, Any]] = None,
-        job_name: Optional[str] = None,
-        **training_params
-    ) -> Dict[str, Any]:
-        """
-        Train a model on RunPod cloud infrastructure.
-
-        Args:
-            model_name: Model identifier
-            dataset_path: Dataset path or HuggingFace dataset name
-            runpod_api_key: RunPod API key
-            template_id: RunPod template ID
-            gpu_type: GPU type to use
-            storage_config: Optional cloud storage configuration
-            job_name: Optional job name
-            **training_params: Additional training parameters
-
-        Returns:
-            Training job results
-
-        Example:
-            ```python
-            result = factory.train_on_runpod(
-                model_name="google/gemma-2-4b-it",
-                dataset_path="tatsu-lab/alpaca",
-                runpod_api_key="your-api-key",
-                template_id="your-template-id",
-                use_lora=True,
-                num_epochs=3
-            )
-            ```
-        """
-        # Import cloud components
-        from .cloud import TrainingJobOrchestrator
-        from .cloud.runpod_trainer import RunPodConfig
-        from .cloud.storage_manager import StorageConfig
-        from .cloud.job_orchestrator import JobConfig
-
-        # Create RunPod configuration
-        runpod_config = RunPodConfig(
-            api_key=runpod_api_key,
-            template_id=template_id,
-            gpu_type=gpu_type
-        )
-
-        # Create storage configuration if provided
-        storage_cfg = None
-        if storage_config:
-            storage_cfg = StorageConfig(**storage_config)
-
-        # Create job configuration
-        job_config = JobConfig(
-            model_name=model_name,
-            dataset_source=dataset_path,
-            job_name=job_name or f"gemma-training-{int(datetime.datetime.now().timestamp())}",
-            **training_params
-        )
-
-        # Initialize orchestrator and execute training
-        orchestrator = TrainingJobOrchestrator(
-            runpod_config=runpod_config,
-            storage_config=storage_cfg
-        )
-
-        logger.info(f"Starting RunPod training for {model_name}")
-        result = orchestrator.execute_training_workflow(job_config)
-
-        return result
-
-    async def upload_to_huggingface(
-        self,
-        model_path: str,
-        hf_model_name: str,
-        hf_token: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None
-    ) -> str:
-        """
-        Upload a trained model to HuggingFace Hub using HuggingFaceStorage.
-
-        Args:
-            model_path: Path to the trained model
-            hf_model_name: Name for the model on HuggingFace Hub
-            hf_token: HuggingFace token
-            metadata: Additional metadata for the model
-
-        Returns:
-            URL of the uploaded model
-        """
-        try:
-            from ..core.storage.hf_storage import HuggingFaceStorage
-
-            logger.info(f"Uploading model to HuggingFace: {hf_model_name}")
-
-            # Initialize HuggingFace storage
-            storage = HuggingFaceStorage(
-                username="xenobordom",
-                token=hf_token
-            )
-
-            # Prepare metadata
-            upload_metadata = metadata or {}
-            upload_metadata.update({
-                "description": f"Fine-tuned model: {hf_model_name}",
-                "training_framework": "ISA Model SDK",
-                "uploaded_from": "training_factory"
-            })
-
-            # Upload model
-            success = await storage.save_model(
-                model_id=hf_model_name,
-                model_path=model_path,
-                metadata=upload_metadata
-            )
-
-            if success:
-                model_url = storage.get_public_url(hf_model_name)
-                logger.info(f"Model uploaded successfully: {model_url}")
-                return model_url
-            else:
-                raise Exception("Failed to upload model")
-
-        except Exception as e:
-            logger.error(f"Failed to upload to HuggingFace: {e}")
-            raise
-
-    def get_training_status(self, output_dir: str) -> Dict[str, Any]:
-        """
-        Get training status from output directory.
-
-        Args:
-            output_dir: Training output directory
-
-        Returns:
-            Dictionary with training status information
-        """
-        status = {
-            "output_dir": output_dir,
-            "exists": os.path.exists(output_dir),
-            "files": []
-        }
-
-        if status["exists"]:
-            status["files"] = os.listdir(output_dir)
-
-            # Check for specific files
-            config_path = os.path.join(output_dir, "training_config.json")
-            metrics_path = os.path.join(output_dir, "training_metrics.json")
-            model_path = os.path.join(output_dir, "pytorch_model.bin")
-
-            status["has_config"] = os.path.exists(config_path)
-            status["has_metrics"] = os.path.exists(metrics_path)
-            status["has_model"] = os.path.exists(model_path) or os.path.exists(os.path.join(output_dir, "adapter_model.bin"))
-
-            if status["has_config"]:
-                try:
-                    status["config"] = TrainingUtils.load_training_args(output_dir)
-                except:
-                    pass
-
-        return status
-
-    def list_trained_models(self) -> List[Dict[str, Any]]:
-        """
-        List all trained models in the output directory.
-
-        Returns:
-            List of model information dictionaries
-        """
-        models = []
-
-        if os.path.exists(self.base_output_dir):
-            for item in os.listdir(self.base_output_dir):
-                item_path = os.path.join(self.base_output_dir, item)
-                if os.path.isdir(item_path):
-                    status = self.get_training_status(item_path)
-                    models.append({
-                        "name": item,
-                        "path": item_path,
-                        "created": datetime.datetime.fromtimestamp(
-                            os.path.getctime(item_path)
-                        ).isoformat(),
-                        "status": status
-                    })
-
-        return sorted(models, key=lambda x: x["created"], reverse=True)
-
-
-# Convenience functions for quick access
-def train_gemma(
-    dataset_path: str,
-    model_size: str = "4b",
-    output_dir: Optional[str] = None,
-    **kwargs
-) -> str:
-    """
-    Quick function to train Gemma models.
-
-    Args:
-        dataset_path: Path to training dataset
-        model_size: Model size ("2b", "4b", "7b")
-        output_dir: Output directory
-        **kwargs: Additional training parameters
-
-    Returns:
-        Path to trained model
-
-    Example:
-        ```python
-        from isa_model.training import train_gemma
-
-        model_path = train_gemma(
-            dataset_path="tatsu-lab/alpaca",
-            model_size="4b",
-            num_epochs=3,
-            batch_size=4
-        )
-        ```
-    """
-    factory = TrainingFactory()
-
-    model_map = {
-        "2b": "google/gemma-2-2b-it",
-        "4b": "google/gemma-2-4b-it",
-        "7b": "google/gemma-2-7b-it"
-    }
-
-    model_name = model_map.get(model_size, "google/gemma-2-4b-it")
-
-    return factory.train_model(
-        model_name=model_name,
-        dataset_path=dataset_path,
-        output_dir=output_dir,
-        **kwargs
-    )
isa_model/training/intelligent/__init__.py
DELETED
@@ -1,25 +0,0 @@
-"""
-Intelligent Training Service Components
-
-This module provides AI-powered training optimization and automation:
-- Intelligent decision engine for configuration recommendations
-- Task classification and model selection
-- Resource optimization and cost estimation
-- Natural language interface for training requests
-"""
-
-from .decision_engine import IntelligentDecisionEngine, TrainingRequest, TrainingRecommendation
-from .task_classifier import TaskClassifier
-from .knowledge_base import KnowledgeBase
-from .resource_optimizer import ResourceOptimizer
-from .intelligent_factory import IntelligentTrainingFactory
-
-__all__ = [
-    'IntelligentDecisionEngine',
-    'TaskClassifier',
-    'KnowledgeBase',
-    'ResourceOptimizer',
-    'IntelligentTrainingFactory',
-    'TrainingRequest',
-    'TrainingRecommendation'
-]