isa-model 0.1.1__py3-none-any.whl → 0.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/__init__.py +1 -1
- isa_model/core/storage/hf_storage.py +419 -0
- isa_model/deployment/__init__.py +52 -0
- isa_model/deployment/core/__init__.py +34 -0
- isa_model/deployment/core/deployment_config.py +356 -0
- isa_model/deployment/core/deployment_manager.py +549 -0
- isa_model/deployment/core/isa_deployment_service.py +401 -0
- isa_model/eval/factory.py +381 -140
- isa_model/inference/ai_factory.py +142 -240
- isa_model/inference/providers/ml_provider.py +50 -0
- isa_model/inference/services/audio/openai_tts_service.py +104 -3
- isa_model/inference/services/embedding/base_embed_service.py +112 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +28 -2
- isa_model/inference/services/llm/__init__.py +2 -0
- isa_model/inference/services/llm/base_llm_service.py +111 -1
- isa_model/inference/services/llm/ollama_llm_service.py +234 -26
- isa_model/inference/services/llm/openai_llm_service.py +225 -28
- isa_model/inference/services/llm/triton_llm_service.py +481 -0
- isa_model/inference/services/ml/base_ml_service.py +78 -0
- isa_model/inference/services/ml/sklearn_ml_service.py +140 -0
- isa_model/inference/services/vision/__init__.py +3 -3
- isa_model/inference/services/vision/base_image_gen_service.py +161 -0
- isa_model/inference/services/vision/base_vision_service.py +177 -0
- isa_model/inference/services/vision/ollama_vision_service.py +143 -17
- isa_model/inference/services/vision/replicate_image_gen_service.py +139 -7
- isa_model/training/__init__.py +62 -32
- isa_model/training/cloud/__init__.py +22 -0
- isa_model/training/cloud/job_orchestrator.py +402 -0
- isa_model/training/cloud/runpod_trainer.py +454 -0
- isa_model/training/cloud/storage_manager.py +482 -0
- isa_model/training/core/__init__.py +23 -0
- isa_model/training/core/config.py +181 -0
- isa_model/training/core/dataset.py +222 -0
- isa_model/training/core/trainer.py +720 -0
- isa_model/training/core/utils.py +213 -0
- isa_model/training/factory.py +229 -198
- isa_model-0.2.8.dist-info/METADATA +465 -0
- isa_model-0.2.8.dist-info/RECORD +86 -0
- isa_model/core/model_router.py +0 -226
- isa_model/core/model_version.py +0 -0
- isa_model/core/resource_manager.py +0 -202
- isa_model/deployment/gpu_fp16_ds8/models/deepseek_r1/1/model.py +0 -120
- isa_model/deployment/gpu_fp16_ds8/scripts/download_model.py +0 -18
- isa_model/training/engine/llama_factory/__init__.py +0 -39
- isa_model/training/engine/llama_factory/config.py +0 -115
- isa_model/training/engine/llama_factory/data_adapter.py +0 -284
- isa_model/training/engine/llama_factory/examples/__init__.py +0 -6
- isa_model/training/engine/llama_factory/examples/finetune_with_tracking.py +0 -185
- isa_model/training/engine/llama_factory/examples/rlhf_with_tracking.py +0 -163
- isa_model/training/engine/llama_factory/factory.py +0 -331
- isa_model/training/engine/llama_factory/rl.py +0 -254
- isa_model/training/engine/llama_factory/trainer.py +0 -171
- isa_model/training/image_model/configs/create_config.py +0 -37
- isa_model/training/image_model/configs/create_flux_config.py +0 -26
- isa_model/training/image_model/configs/create_lora_config.py +0 -21
- isa_model/training/image_model/prepare_massed_compute.py +0 -97
- isa_model/training/image_model/prepare_upload.py +0 -17
- isa_model/training/image_model/raw_data/create_captions.py +0 -16
- isa_model/training/image_model/raw_data/create_lora_captions.py +0 -20
- isa_model/training/image_model/raw_data/pre_processing.py +0 -200
- isa_model/training/image_model/train/train.py +0 -42
- isa_model/training/image_model/train/train_flux.py +0 -41
- isa_model/training/image_model/train/train_lora.py +0 -57
- isa_model/training/image_model/train_main.py +0 -25
- isa_model-0.1.1.dist-info/METADATA +0 -327
- isa_model-0.1.1.dist-info/RECORD +0 -92
- isa_model-0.1.1.dist-info/licenses/LICENSE +0 -21
- /isa_model/training/{llm_model/annotation → annotation}/annotation_schema.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/processors/annotation_processor.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/storage/dataset_manager.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/storage/dataset_schema.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/tests/test_annotation_flow.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/tests/test_minio copy.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/tests/test_minio_upload.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/views/annotation_controller.py +0 -0
- {isa_model-0.1.1.dist-info → isa_model-0.2.8.dist-info}/WHEEL +0 -0
- {isa_model-0.1.1.dist-info → isa_model-0.2.8.dist-info}/top_level.txt +0 -0
isa_model/training/core/utils.py (new file)

@@ -0,0 +1,213 @@

```python
"""
Training Utilities

Helper functions and utilities for training operations.
"""

import os
import json
import logging
import datetime
from typing import Dict, Any, Optional, List
from pathlib import Path

logger = logging.getLogger(__name__)


class TrainingUtils:
    """Utility functions for training operations."""

    @staticmethod
    def generate_output_dir(model_name: str, training_type: str, base_dir: str = "training_outputs") -> str:
        """Generate a timestamped output directory."""
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        safe_model_name = model_name.replace("/", "_").replace(":", "_")
        output_dir = os.path.join(base_dir, f"{safe_model_name}_{training_type}_{timestamp}")
        return output_dir

    @staticmethod
    def save_training_args(args: Dict[str, Any], output_dir: str) -> None:
        """Save training arguments to file."""
        args_path = Path(output_dir) / "training_args.json"
        args_path.parent.mkdir(parents=True, exist_ok=True)

        with open(args_path, 'w') as f:
            json.dump(args, f, indent=2, default=str)

        logger.info(f"Training arguments saved to: {args_path}")

    @staticmethod
    def load_training_args(output_dir: str) -> Dict[str, Any]:
        """Load training arguments from file."""
        args_path = Path(output_dir) / "training_args.json"

        if not args_path.exists():
            raise FileNotFoundError(f"Training args not found: {args_path}")

        with open(args_path, 'r') as f:
            args = json.load(f)

        return args

    @staticmethod
    def get_model_info(model_name: str) -> Dict[str, Any]:
        """Get information about a model."""
        try:
            from transformers import AutoConfig

            config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)

            model_info = {
                "model_name": model_name,
                "model_type": config.model_type,
                "vocab_size": getattr(config, 'vocab_size', None),
                "hidden_size": getattr(config, 'hidden_size', None),
                "num_layers": getattr(config, 'num_hidden_layers', None),
                "num_attention_heads": getattr(config, 'num_attention_heads', None),
                "max_position_embeddings": getattr(config, 'max_position_embeddings', None),
            }

            return model_info

        except Exception as e:
            logger.warning(f"Could not get model info for {model_name}: {e}")
            return {"model_name": model_name, "error": str(e)}

    @staticmethod
    def estimate_memory_usage(
        model_name: str,
        batch_size: int = 1,
        max_length: int = 1024,
        use_lora: bool = True
    ) -> Dict[str, Any]:
        """Estimate memory usage for training."""
        try:
            model_info = TrainingUtils.get_model_info(model_name)

            # Rough estimation based on model parameters
            hidden_size = model_info.get('hidden_size', 4096)
            num_layers = model_info.get('num_layers', 32)
            vocab_size = model_info.get('vocab_size', 32000)

            # Estimate model parameters (in millions)
            param_count = (hidden_size * hidden_size * 12 * num_layers + vocab_size * hidden_size) / 1e6

            # Base memory for model (assuming fp16)
            model_memory_gb = param_count * 2 / 1024  # 2 bytes per parameter

            # Training memory overhead (gradients, optimizer states, activations)
            if use_lora:
                training_overhead = 2.0  # LoRA reduces memory usage significantly
            else:
                training_overhead = 4.0  # Full fine-tuning needs more memory

            # Batch and sequence length impact
            sequence_memory = batch_size * max_length * hidden_size * 2 / (1024**3)  # Activation memory

            total_memory_gb = model_memory_gb * training_overhead + sequence_memory

            return {
                "estimated_params_millions": param_count,
                "model_memory_gb": model_memory_gb,
                "total_training_memory_gb": total_memory_gb,
                "recommended_gpu": TrainingUtils._recommend_gpu(total_memory_gb),
                "use_lora": use_lora,
                "batch_size": batch_size,
                "max_length": max_length
            }

        except Exception as e:
            logger.warning(f"Could not estimate memory usage: {e}")
            return {"error": str(e)}

    @staticmethod
    def _recommend_gpu(memory_gb: float) -> str:
        """Recommend GPU based on memory requirements."""
        if memory_gb <= 8:
            return "RTX 3080/4070 (8-12GB)"
        elif memory_gb <= 16:
            return "RTX 4080/4090 (16GB)"
        elif memory_gb <= 24:
            return "RTX A6000/4090 (24GB)"
        elif memory_gb <= 40:
            return "A100 40GB"
        elif memory_gb <= 80:
            return "A100 80GB"
        else:
            return "Multiple A100 80GB (Multi-GPU required)"

    @staticmethod
    def validate_training_config(config: Dict[str, Any]) -> List[str]:
        """Validate training configuration and return any issues."""
        issues = []

        # Check required fields
        required_fields = ["model_name", "output_dir"]
        for field in required_fields:
            if field not in config:
                issues.append(f"Missing required field: {field}")

        # Check batch size
        if config.get("batch_size", 0) <= 0:
            issues.append("batch_size must be positive")

        # Check learning rate
        lr = config.get("learning_rate", 0)
        if lr <= 0 or lr > 1:
            issues.append("learning_rate should be between 0 and 1")

        # Check epochs
        if config.get("num_epochs", 0) <= 0:
            issues.append("num_epochs must be positive")

        # Check LoRA config
        if config.get("use_lora", False):
            lora_rank = config.get("lora_rank", 8)
            if lora_rank <= 0 or lora_rank > 256:
                issues.append("lora_rank should be between 1 and 256")

        return issues

    @staticmethod
    def format_training_summary(
        config: Dict[str, Any],
        model_info: Dict[str, Any],
        memory_estimate: Dict[str, Any]
    ) -> str:
        """Format a training summary for display."""
        summary = []
        summary.append("=" * 60)
        summary.append("TRAINING CONFIGURATION SUMMARY")
        summary.append("=" * 60)

        # Model information
        summary.append(f"Model: {config.get('model_name', 'Unknown')}")
        summary.append(f"Model Type: {model_info.get('model_type', 'Unknown')}")
        summary.append(f"Parameters: ~{memory_estimate.get('estimated_params_millions', 0):.1f}M")

        # Training configuration
        summary.append(f"\nTraining Configuration:")
        summary.append(f"  Training Type: {config.get('training_type', 'sft')}")
        summary.append(f"  Epochs: {config.get('num_epochs', 3)}")
        summary.append(f"  Batch Size: {config.get('batch_size', 4)}")
        summary.append(f"  Learning Rate: {config.get('learning_rate', 2e-5)}")
        summary.append(f"  Max Length: {config.get('max_length', 1024)}")

        # LoRA configuration
        if config.get('use_lora', True):
            summary.append(f"\nLoRA Configuration:")
            summary.append(f"  LoRA Rank: {config.get('lora_rank', 8)}")
            summary.append(f"  LoRA Alpha: {config.get('lora_alpha', 16)}")
            summary.append(f"  LoRA Dropout: {config.get('lora_dropout', 0.05)}")

        # Memory estimation
        summary.append(f"\nMemory Estimation:")
        summary.append(f"  Estimated Memory: ~{memory_estimate.get('total_training_memory_gb', 0):.1f}GB")
        summary.append(f"  Recommended GPU: {memory_estimate.get('recommended_gpu', 'Unknown')}")

        # Output
        summary.append(f"\nOutput Directory: {config.get('output_dir', 'Unknown')}")

        summary.append("=" * 60)

        return "\n".join(summary)
```
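For reference, a minimal usage sketch of the new `TrainingUtils` helpers. It assumes the class is importable from `isa_model.training.core.utils` (matching the wheel path above); the model id and config values are illustrative, not taken from the package:

```python
# Hedged usage sketch: chains the helpers added in 0.2.8 together.
# Assumption: the module path below matches isa_model/training/core/utils.py.
from isa_model.training.core.utils import TrainingUtils

model_name = "meta-llama/Llama-2-7b-hf"  # hypothetical model id for illustration

config = {
    "model_name": model_name,
    "training_type": "sft",
    "output_dir": TrainingUtils.generate_output_dir(model_name, "sft"),
    "batch_size": 4,
    "learning_rate": 2e-5,
    "num_epochs": 3,
    "max_length": 1024,
    "use_lora": True,
    "lora_rank": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
}

# Validate before launching a run; an empty list means the config passed.
issues = TrainingUtils.validate_training_config(config)
if issues:
    raise ValueError(f"Invalid training config: {issues}")

# Inspect the model, estimate GPU memory, and print a readable summary.
model_info = TrainingUtils.get_model_info(model_name)
memory_estimate = TrainingUtils.estimate_memory_usage(
    model_name,
    batch_size=config["batch_size"],
    max_length=config["max_length"],
    use_lora=config["use_lora"],
)
print(TrainingUtils.format_training_summary(config, model_info, memory_estimate))

# Persist the arguments alongside the run outputs for reproducibility.
TrainingUtils.save_training_args(config, config["output_dir"])
```

Note that the memory figure is deliberately rough: parameters are approximated as 12 · hidden_size² · num_layers plus the embedding table, converted to fp16 bytes, then scaled by a fixed overhead factor (2x for LoRA, 4x for full fine-tuning) plus an activation term for the batch and sequence length.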