isa-model 0.2.0__py3-none-any.whl → 0.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/__init__.py +1 -1
- isa_model/core/storage/hf_storage.py +419 -0
- isa_model/deployment/__init__.py +52 -0
- isa_model/deployment/core/__init__.py +34 -0
- isa_model/deployment/core/deployment_config.py +356 -0
- isa_model/deployment/core/deployment_manager.py +549 -0
- isa_model/deployment/core/isa_deployment_service.py +401 -0
- isa_model/eval/factory.py +381 -140
- isa_model/inference/ai_factory.py +142 -240
- isa_model/inference/providers/ml_provider.py +50 -0
- isa_model/inference/services/audio/openai_tts_service.py +104 -3
- isa_model/inference/services/embedding/base_embed_service.py +112 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +28 -2
- isa_model/inference/services/llm/__init__.py +2 -0
- isa_model/inference/services/llm/base_llm_service.py +111 -1
- isa_model/inference/services/llm/ollama_llm_service.py +234 -26
- isa_model/inference/services/llm/openai_llm_service.py +225 -28
- isa_model/inference/services/llm/triton_llm_service.py +481 -0
- isa_model/inference/services/ml/base_ml_service.py +78 -0
- isa_model/inference/services/ml/sklearn_ml_service.py +140 -0
- isa_model/inference/services/vision/__init__.py +3 -3
- isa_model/inference/services/vision/base_image_gen_service.py +161 -0
- isa_model/inference/services/vision/base_vision_service.py +177 -0
- isa_model/inference/services/vision/ollama_vision_service.py +143 -17
- isa_model/inference/services/vision/replicate_image_gen_service.py +139 -7
- isa_model/training/__init__.py +62 -32
- isa_model/training/cloud/__init__.py +22 -0
- isa_model/training/cloud/job_orchestrator.py +402 -0
- isa_model/training/cloud/runpod_trainer.py +454 -0
- isa_model/training/cloud/storage_manager.py +482 -0
- isa_model/training/core/__init__.py +23 -0
- isa_model/training/core/config.py +181 -0
- isa_model/training/core/dataset.py +222 -0
- isa_model/training/core/trainer.py +720 -0
- isa_model/training/core/utils.py +213 -0
- isa_model/training/factory.py +229 -198
- isa_model-0.2.8.dist-info/METADATA +465 -0
- isa_model-0.2.8.dist-info/RECORD +86 -0
- isa_model/core/model_router.py +0 -226
- isa_model/core/model_version.py +0 -0
- isa_model/core/resource_manager.py +0 -202
- isa_model/deployment/gpu_fp16_ds8/models/deepseek_r1/1/model.py +0 -120
- isa_model/deployment/gpu_fp16_ds8/scripts/download_model.py +0 -18
- isa_model/training/engine/llama_factory/__init__.py +0 -39
- isa_model/training/engine/llama_factory/config.py +0 -115
- isa_model/training/engine/llama_factory/data_adapter.py +0 -284
- isa_model/training/engine/llama_factory/examples/__init__.py +0 -6
- isa_model/training/engine/llama_factory/examples/finetune_with_tracking.py +0 -185
- isa_model/training/engine/llama_factory/examples/rlhf_with_tracking.py +0 -163
- isa_model/training/engine/llama_factory/factory.py +0 -331
- isa_model/training/engine/llama_factory/rl.py +0 -254
- isa_model/training/engine/llama_factory/trainer.py +0 -171
- isa_model/training/image_model/configs/create_config.py +0 -37
- isa_model/training/image_model/configs/create_flux_config.py +0 -26
- isa_model/training/image_model/configs/create_lora_config.py +0 -21
- isa_model/training/image_model/prepare_massed_compute.py +0 -97
- isa_model/training/image_model/prepare_upload.py +0 -17
- isa_model/training/image_model/raw_data/create_captions.py +0 -16
- isa_model/training/image_model/raw_data/create_lora_captions.py +0 -20
- isa_model/training/image_model/raw_data/pre_processing.py +0 -200
- isa_model/training/image_model/train/train.py +0 -42
- isa_model/training/image_model/train/train_flux.py +0 -41
- isa_model/training/image_model/train/train_lora.py +0 -57
- isa_model/training/image_model/train_main.py +0 -25
- isa_model-0.2.0.dist-info/METADATA +0 -327
- isa_model-0.2.0.dist-info/RECORD +0 -92
- isa_model-0.2.0.dist-info/licenses/LICENSE +0 -21
- /isa_model/training/{llm_model/annotation → annotation}/annotation_schema.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/processors/annotation_processor.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/storage/dataset_manager.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/storage/dataset_schema.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/tests/test_annotation_flow.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/tests/test_minio copy.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/tests/test_minio_upload.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/views/annotation_controller.py +0 -0
- {isa_model-0.2.0.dist-info → isa_model-0.2.8.dist-info}/WHEEL +0 -0
- {isa_model-0.2.0.dist-info → isa_model-0.2.8.dist-info}/top_level.txt +0 -0
isa_model/training/cloud/storage_manager.py (new file, all 482 lines added)
@@ -0,0 +1,482 @@

"""
Cloud Storage Manager for Training Assets

This module handles storage of datasets, models, and training artifacts
across different cloud storage providers (S3, GCS, Azure, etc.).
"""

import os
import json
import logging
from typing import Dict, List, Optional, Any, Union
from dataclasses import dataclass
from pathlib import Path
from urllib.parse import urlparse

try:
    import boto3
    from botocore.exceptions import ClientError
    S3_AVAILABLE = True
except ImportError:
    S3_AVAILABLE = False
    boto3 = None

try:
    from google.cloud import storage as gcs
    GCS_AVAILABLE = True
except ImportError:
    GCS_AVAILABLE = False
    gcs = None

logger = logging.getLogger(__name__)


@dataclass
class StorageConfig:
    """Configuration for cloud storage."""

    provider: str  # "s3", "gcs", "azure"
    bucket_name: str
    region: Optional[str] = None

    # Authentication
    access_key: Optional[str] = None
    secret_key: Optional[str] = None
    service_account_path: Optional[str] = None

    # Paths
    datasets_prefix: str = "datasets/"
    models_prefix: str = "models/"
    logs_prefix: str = "logs/"

    def __post_init__(self):
        """Validate configuration."""
        if self.provider not in ["s3", "gcs", "azure"]:
            raise ValueError(f"Unsupported storage provider: {self.provider}")

        if not self.bucket_name:
            raise ValueError("Bucket name is required")


class CloudStorageManager:
    """
    Cloud storage manager for training assets.

    Handles upload/download of datasets, models, and training artifacts
    across different cloud storage providers.

    Example:
        ```python
        # Configure S3 storage
        storage_config = StorageConfig(
            provider="s3",
            bucket_name="my-training-bucket",
            region="us-west-2",
            access_key="your-access-key",
            secret_key="your-secret-key"
        )

        # Initialize storage manager
        storage = CloudStorageManager(storage_config)

        # Upload dataset
        dataset_url = storage.upload_dataset("local_data.json", "my-dataset")

        # Upload trained model
        model_url = storage.upload_model("./trained_model/", "gemma-finetuned-v1")

        # Download model
        local_path = storage.download_model("gemma-finetuned-v1", "./downloaded_model/")
        ```
    """

    def __init__(self, config: StorageConfig):
        """
        Initialize cloud storage manager.

        Args:
            config: Storage configuration
        """
        self.config = config
        self._client = None
        self._initialize_client()

        logger.info(f"Storage manager initialized for {config.provider}://{config.bucket_name}")

    def _initialize_client(self) -> None:
        """Initialize storage client based on provider."""
        if self.config.provider == "s3":
            if not S3_AVAILABLE:
                raise ImportError("boto3 is required for S3 storage. Install with: pip install boto3")

            self._client = boto3.client(
                's3',
                aws_access_key_id=self.config.access_key,
                aws_secret_access_key=self.config.secret_key,
                region_name=self.config.region
            )

        elif self.config.provider == "gcs":
            if not GCS_AVAILABLE:
                raise ImportError("google-cloud-storage is required for GCS. Install with: pip install google-cloud-storage")

            if self.config.service_account_path:
                os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self.config.service_account_path

            self._client = gcs.Client()

        else:
            raise NotImplementedError(f"Provider {self.config.provider} not implemented yet")

    def upload_dataset(self, local_path: str, dataset_name: str,
                       metadata: Optional[Dict[str, Any]] = None) -> str:
        """
        Upload dataset to cloud storage.

        Args:
            local_path: Local path to dataset file
            dataset_name: Name for the dataset
            metadata: Optional metadata to store with dataset

        Returns:
            Cloud storage URL for the dataset
        """
        remote_path = f"{self.config.datasets_prefix}{dataset_name}.json"

        try:
            # Upload dataset file
            self._upload_file(local_path, remote_path)

            # Upload metadata if provided
            if metadata:
                metadata_path = f"{self.config.datasets_prefix}{dataset_name}_metadata.json"
                metadata_content = json.dumps(metadata, indent=2)
                self._upload_content(metadata_content, metadata_path)

            dataset_url = self._get_public_url(remote_path)
            logger.info(f"Dataset uploaded: {dataset_url}")
            return dataset_url

        except Exception as e:
            logger.error(f"Failed to upload dataset {dataset_name}: {e}")
            raise

    def download_dataset(self, dataset_name: str, local_path: str) -> str:
        """
        Download dataset from cloud storage.

        Args:
            dataset_name: Name of the dataset
            local_path: Local path to save dataset

        Returns:
            Local path to downloaded dataset
        """
        remote_path = f"{self.config.datasets_prefix}{dataset_name}.json"

        try:
            os.makedirs(os.path.dirname(local_path), exist_ok=True)
            self._download_file(remote_path, local_path)

            logger.info(f"Dataset downloaded to: {local_path}")
            return local_path

        except Exception as e:
            logger.error(f"Failed to download dataset {dataset_name}: {e}")
            raise

    def upload_model(self, local_model_dir: str, model_name: str,
                     metadata: Optional[Dict[str, Any]] = None) -> str:
        """
        Upload trained model to cloud storage.

        Args:
            local_model_dir: Local directory containing model files
            model_name: Name for the model
            metadata: Optional model metadata

        Returns:
            Cloud storage URL for the model
        """
        model_prefix = f"{self.config.models_prefix}{model_name}/"

        try:
            # Upload all model files
            model_files = []
            for root, dirs, files in os.walk(local_model_dir):
                for file in files:
                    local_file_path = os.path.join(root, file)
                    relative_path = os.path.relpath(local_file_path, local_model_dir)
                    remote_path = f"{model_prefix}{relative_path}"

                    self._upload_file(local_file_path, remote_path)
                    model_files.append(relative_path)

            # Upload model metadata
            if metadata is None:
                metadata = {}

            metadata.update({
                "model_name": model_name,
                "files": model_files,
                "upload_timestamp": self._get_timestamp()
            })

            metadata_path = f"{model_prefix}model_metadata.json"
            metadata_content = json.dumps(metadata, indent=2)
            self._upload_content(metadata_content, metadata_path)

            model_url = self._get_public_url(model_prefix)
            logger.info(f"Model uploaded: {model_url}")
            return model_url

        except Exception as e:
            logger.error(f"Failed to upload model {model_name}: {e}")
            raise

    def download_model(self, model_name: str, local_dir: str) -> str:
        """
        Download model from cloud storage.

        Args:
            model_name: Name of the model
            local_dir: Local directory to save model

        Returns:
            Local path to downloaded model
        """
        model_prefix = f"{self.config.models_prefix}{model_name}/"

        try:
            os.makedirs(local_dir, exist_ok=True)

            # First, get model metadata to know which files to download
            metadata_path = f"{model_prefix}model_metadata.json"
            metadata_content = self._download_content(metadata_path)
            metadata = json.loads(metadata_content)

            # Download all model files
            for file_path in metadata.get("files", []):
                remote_path = f"{model_prefix}{file_path}"
                local_file_path = os.path.join(local_dir, file_path)

                os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
                self._download_file(remote_path, local_file_path)

            logger.info(f"Model downloaded to: {local_dir}")
            return local_dir

        except Exception as e:
            logger.error(f"Failed to download model {model_name}: {e}")
            raise

    def upload_training_logs(self, local_log_dir: str, job_id: str) -> str:
        """Upload training logs to cloud storage."""
        logs_prefix = f"{self.config.logs_prefix}{job_id}/"

        try:
            for root, dirs, files in os.walk(local_log_dir):
                for file in files:
                    local_file_path = os.path.join(root, file)
                    relative_path = os.path.relpath(local_file_path, local_log_dir)
                    remote_path = f"{logs_prefix}{relative_path}"

                    self._upload_file(local_file_path, remote_path)

            logs_url = self._get_public_url(logs_prefix)
            logger.info(f"Training logs uploaded: {logs_url}")
            return logs_url

        except Exception as e:
            logger.error(f"Failed to upload training logs for job {job_id}: {e}")
            raise

    def list_datasets(self) -> List[Dict[str, Any]]:
        """List all datasets in storage."""
        try:
            datasets = []
            objects = self._list_objects(self.config.datasets_prefix)

            for obj in objects:
                if obj.endswith('.json') and not obj.endswith('_metadata.json'):
                    dataset_name = os.path.basename(obj).replace('.json', '')
                    datasets.append({
                        "name": dataset_name,
                        "path": obj,
                        "url": self._get_public_url(obj),
                        "size": self._get_object_size(obj),
                        "modified": self._get_object_modified_time(obj)
                    })

            return datasets

        except Exception as e:
            logger.error(f"Failed to list datasets: {e}")
            return []

    def list_models(self) -> List[Dict[str, Any]]:
        """List all models in storage."""
        try:
            models = []
            prefixes = self._list_prefixes(self.config.models_prefix)

            for prefix in prefixes:
                model_name = prefix.rstrip('/').split('/')[-1]
                metadata_path = f"{prefix}model_metadata.json"

                try:
                    metadata_content = self._download_content(metadata_path)
                    metadata = json.loads(metadata_content)

                    models.append({
                        "name": model_name,
                        "path": prefix,
                        "url": self._get_public_url(prefix),
                        "metadata": metadata,
                        "files_count": len(metadata.get("files", [])),
                        "upload_time": metadata.get("upload_timestamp", "")
                    })
                except:
                    # If metadata doesn't exist, add basic info
                    models.append({
                        "name": model_name,
                        "path": prefix,
                        "url": self._get_public_url(prefix),
                        "metadata": {},
                        "files_count": 0,
                        "upload_time": ""
                    })

            return models

        except Exception as e:
            logger.error(f"Failed to list models: {e}")
            return []

    def _upload_file(self, local_path: str, remote_path: str) -> None:
        """Upload file to storage."""
        if self.config.provider == "s3":
            self._client.upload_file(local_path, self.config.bucket_name, remote_path)
        elif self.config.provider == "gcs":
            bucket = self._client.bucket(self.config.bucket_name)
            blob = bucket.blob(remote_path)
            blob.upload_from_filename(local_path)
        else:
            raise NotImplementedError(f"Upload not implemented for {self.config.provider}")

    def _upload_content(self, content: str, remote_path: str) -> None:
        """Upload string content to storage."""
        if self.config.provider == "s3":
            self._client.put_object(
                Bucket=self.config.bucket_name,
                Key=remote_path,
                Body=content.encode('utf-8')
            )
        elif self.config.provider == "gcs":
            bucket = self._client.bucket(self.config.bucket_name)
            blob = bucket.blob(remote_path)
            blob.upload_from_string(content)
        else:
            raise NotImplementedError(f"Upload not implemented for {self.config.provider}")

    def _download_file(self, remote_path: str, local_path: str) -> None:
        """Download file from storage."""
        if self.config.provider == "s3":
            self._client.download_file(self.config.bucket_name, remote_path, local_path)
        elif self.config.provider == "gcs":
            bucket = self._client.bucket(self.config.bucket_name)
            blob = bucket.blob(remote_path)
            blob.download_to_filename(local_path)
        else:
            raise NotImplementedError(f"Download not implemented for {self.config.provider}")

    def _download_content(self, remote_path: str) -> str:
        """Download content as string."""
        if self.config.provider == "s3":
            response = self._client.get_object(Bucket=self.config.bucket_name, Key=remote_path)
            return response['Body'].read().decode('utf-8')
        elif self.config.provider == "gcs":
            bucket = self._client.bucket(self.config.bucket_name)
            blob = bucket.blob(remote_path)
            return blob.download_as_text()
        else:
            raise NotImplementedError(f"Download not implemented for {self.config.provider}")

    def _list_objects(self, prefix: str) -> List[str]:
        """List objects with given prefix."""
        if self.config.provider == "s3":
            response = self._client.list_objects_v2(
                Bucket=self.config.bucket_name,
                Prefix=prefix
            )
            return [obj['Key'] for obj in response.get('Contents', [])]
        elif self.config.provider == "gcs":
            bucket = self._client.bucket(self.config.bucket_name)
            blobs = bucket.list_blobs(prefix=prefix)
            return [blob.name for blob in blobs]
        else:
            raise NotImplementedError(f"List objects not implemented for {self.config.provider}")

    def _list_prefixes(self, prefix: str) -> List[str]:
        """List prefixes (directories) under given prefix."""
        if self.config.provider == "s3":
            response = self._client.list_objects_v2(
                Bucket=self.config.bucket_name,
                Prefix=prefix,
                Delimiter='/'
            )
            return [cp['Prefix'] for cp in response.get('CommonPrefixes', [])]
        elif self.config.provider == "gcs":
            # GCS doesn't have true directories, so we simulate by grouping by prefix
            bucket = self._client.bucket(self.config.bucket_name)
            blobs = bucket.list_blobs(prefix=prefix)
            prefixes = set()
            for blob in blobs:
                parts = blob.name[len(prefix):].split('/')
                if len(parts) > 1:
                    prefixes.add(f"{prefix}{parts[0]}/")
            return list(prefixes)
        else:
            raise NotImplementedError(f"List prefixes not implemented for {self.config.provider}")

    def _get_public_url(self, remote_path: str) -> str:
        """Get public URL for object."""
        if self.config.provider == "s3":
            return f"https://{self.config.bucket_name}.s3.{self.config.region}.amazonaws.com/{remote_path}"
        elif self.config.provider == "gcs":
            return f"https://storage.googleapis.com/{self.config.bucket_name}/{remote_path}"
        else:
            return f"{self.config.provider}://{self.config.bucket_name}/{remote_path}"

    def _get_object_size(self, remote_path: str) -> int:
        """Get object size in bytes."""
        try:
            if self.config.provider == "s3":
                response = self._client.head_object(Bucket=self.config.bucket_name, Key=remote_path)
                return response['ContentLength']
            elif self.config.provider == "gcs":
                bucket = self._client.bucket(self.config.bucket_name)
                blob = bucket.blob(remote_path)
                blob.reload()
                return blob.size
        except:
            return 0

    def _get_object_modified_time(self, remote_path: str) -> str:
        """Get object last modified time."""
        try:
            if self.config.provider == "s3":
                response = self._client.head_object(Bucket=self.config.bucket_name, Key=remote_path)
                return response['LastModified'].isoformat()
            elif self.config.provider == "gcs":
                bucket = self._client.bucket(self.config.bucket_name)
                blob = bucket.blob(remote_path)
                blob.reload()
                return blob.time_created.isoformat()
        except:
            return ""

    def _get_timestamp(self) -> str:
        """Get current timestamp."""
        from datetime import datetime
        return datetime.utcnow().isoformat()
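For orientation, here is a minimal sketch of how the new storage module could be driven end to end with the GCS backend. The bucket name, service-account path, local directories, and metadata values are placeholders, not values from the package; only the StorageConfig/CloudStorageManager calls shown in the file above are assumed, and the GCS branch additionally requires google-cloud-storage to be installed.

# Sketch only: bucket, credentials path, and local paths are placeholders.
from isa_model.training.cloud.storage_manager import StorageConfig, CloudStorageManager

config = StorageConfig(
    provider="gcs",                               # or "s3" with access_key/secret_key/region
    bucket_name="example-training-bucket",        # placeholder bucket
    service_account_path="/secrets/gcs-sa.json",  # placeholder credentials file
)
storage = CloudStorageManager(config)

# Upload a finished training run together with custom metadata.
model_url = storage.upload_model(
    "./outputs/gemma-finetuned-v1/",
    "gemma-finetuned-v1",
    metadata={"base_model": "google/gemma-2b", "training_type": "sft"},
)
print(f"Model available at: {model_url}")

# Later (or on another machine), pull the same files back down.
local_dir = storage.download_model("gemma-finetuned-v1", "./downloaded/gemma-finetuned-v1/")

# Inspect what else is stored under the models/ prefix.
for model in storage.list_models():
    print(model["name"], model["files_count"], model["upload_time"])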
isa_model/training/core/__init__.py (new file, all 23 lines added)
@@ -0,0 +1,23 @@

"""
Core Training Components for ISA Model SDK

This module provides the core training functionality:
- Base training classes and interfaces
- Configuration management
- Training utilities
"""

from .trainer import BaseTrainer, SFTTrainer
from .config import TrainingConfig, LoRAConfig, DatasetConfig
from .dataset import DatasetManager
from .utils import TrainingUtils

__all__ = [
    'BaseTrainer',
    'SFTTrainer',
    'TrainingConfig',
    'LoRAConfig',
    'DatasetConfig',
    'DatasetManager',
    'TrainingUtils'
]
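As a quick illustration of the flattened import surface this __init__ exposes, the sketch below imports the config classes at package level and builds a TrainingConfig. The model id and output directory are placeholders; SFTTrainer is only referenced because its constructor is defined in trainer.py (+720 lines), which is not reproduced in this section.

# Sketch of the package-level imports enabled by isa_model/training/core/__init__.py.
from isa_model.training.core import TrainingConfig, LoRAConfig, SFTTrainer

config = TrainingConfig(
    model_name="google/gemma-2b",     # placeholder base model
    output_dir="./outputs/sft-demo",  # placeholder output directory
    lora_config=LoRAConfig(lora_rank=16),
)
# SFTTrainer is re-exported here as well, but its arguments live in trainer.py
# (not shown in this diff), so it is not instantiated in this sketch.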
isa_model/training/core/config.py (new file, all 181 lines added)
@@ -0,0 +1,181 @@

"""
Training Configuration Classes

Defines configuration classes for different training scenarios.
"""

from dataclasses import dataclass, field
from typing import Optional, Dict, Any, List
from pathlib import Path


@dataclass
class LoRAConfig:
    """LoRA (Low-Rank Adaptation) configuration."""

    use_lora: bool = True
    lora_rank: int = 8
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_target_modules: Optional[List[str]] = None

    def __post_init__(self):
        if self.lora_target_modules is None:
            # Default target modules for most transformer models
            self.lora_target_modules = ["q_proj", "v_proj", "k_proj", "o_proj"]


@dataclass
class DatasetConfig:
    """Dataset configuration."""

    dataset_path: str
    dataset_format: str = "alpaca"  # alpaca, sharegpt, custom
    max_length: int = 1024
    validation_split: float = 0.1
    preprocessing_num_workers: int = 4

    def __post_init__(self):
        if not Path(self.dataset_path).exists() and not self.dataset_path.startswith("http"):
            # Assume it's a HuggingFace dataset name
            pass


@dataclass
class TrainingConfig:
    """Main training configuration."""

    # Model configuration
    model_name: str
    output_dir: str

    # Training hyperparameters
    num_epochs: int = 3
    batch_size: int = 4
    learning_rate: float = 2e-5
    warmup_steps: int = 100
    weight_decay: float = 0.01
    gradient_accumulation_steps: int = 1
    max_grad_norm: float = 1.0

    # Training strategy
    training_type: str = "sft"  # sft, dpo, rlhf
    fp16: bool = True
    bf16: bool = False
    gradient_checkpointing: bool = True

    # Saving and logging
    save_steps: int = 500
    logging_steps: int = 10
    eval_steps: int = 500
    save_total_limit: int = 3

    # LoRA configuration
    lora_config: Optional[LoRAConfig] = field(default_factory=LoRAConfig)

    # Dataset configuration
    dataset_config: Optional[DatasetConfig] = None

    # Additional parameters
    extra_params: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        # Create output directory if it doesn't exist
        Path(self.output_dir).mkdir(parents=True, exist_ok=True)

        # Set BF16 for newer GPUs, FP16 for older ones
        if self.bf16:
            self.fp16 = False

    def to_dict(self) -> Dict[str, Any]:
        """Convert config to dictionary."""
        config_dict = {}

        for key, value in self.__dict__.items():
            if key in ['lora_config', 'dataset_config']:
                if value is not None:
                    config_dict[key] = value.__dict__
                else:
                    config_dict[key] = None
            else:
                config_dict[key] = value

        return config_dict

    @classmethod
    def from_dict(cls, config_dict: Dict[str, Any]) -> 'TrainingConfig':
        """Create config from dictionary."""
        # Handle nested configs
        if 'lora_config' in config_dict and config_dict['lora_config'] is not None:
            config_dict['lora_config'] = LoRAConfig(**config_dict['lora_config'])

        if 'dataset_config' in config_dict and config_dict['dataset_config'] is not None:
            config_dict['dataset_config'] = DatasetConfig(**config_dict['dataset_config'])

        return cls(**config_dict)


@dataclass
class RunPodConfig:
    """RunPod cloud training configuration."""

    api_key: str
    template_id: str
    gpu_type: str = "NVIDIA RTX A6000"
    gpu_count: int = 1
    container_disk_in_gb: int = 50
    volume_in_gb: int = 100
    max_runtime_hours: int = 24

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return self.__dict__.copy()


@dataclass
class StorageConfig:
    """Cloud storage configuration."""

    provider: str  # s3, gcs, local
    bucket_name: Optional[str] = None
    region: Optional[str] = None
    access_key: Optional[str] = None
    secret_key: Optional[str] = None
    service_account_path: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return self.__dict__.copy()


@dataclass
class JobConfig:
    """Training job configuration for cloud training."""

    model_name: str
    dataset_source: str
    job_name: Optional[str] = None
    description: Optional[str] = None

    # Training parameters
    training_type: str = "sft"
    num_epochs: int = 3
    batch_size: int = 4
    learning_rate: float = 2e-5
    max_length: int = 1024

    # LoRA parameters
    use_lora: bool = True
    lora_rank: int = 8
    lora_alpha: int = 16
    lora_dropout: float = 0.05

    # Storage parameters
    save_model_to_storage: bool = True
    model_name_in_storage: Optional[str] = None
    upload_to_hf: bool = False
    hf_model_name: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return self.__dict__.copy()
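To show how the nested dataclasses above compose, here is a small sketch that builds a TrainingConfig with an explicit LoRAConfig and DatasetConfig, serialises it with to_dict(), and reconstructs it with from_dict(), the round trip a cloud job submission would rely on. The model id and file paths are placeholders; only the classes and methods defined in config.py above are assumed.

import json

from isa_model.training.core.config import TrainingConfig, LoRAConfig, DatasetConfig

# Placeholder model id and paths; point these at a real base model and dataset.
config = TrainingConfig(
    model_name="google/gemma-2b",
    output_dir="./outputs/gemma-sft",
    num_epochs=1,
    batch_size=2,
    bf16=True,  # __post_init__ turns fp16 off when bf16 is set
    lora_config=LoRAConfig(lora_rank=16, lora_alpha=32),
    dataset_config=DatasetConfig(dataset_path="./data/train.json", dataset_format="alpaca"),
)

# Round-trip through a plain dict, e.g. to ship the config to a remote trainer.
payload = json.dumps(config.to_dict(), indent=2)
restored = TrainingConfig.from_dict(json.loads(payload))

assert restored.lora_config.lora_rank == 16
assert restored.bf16 is True and restored.fp16 is False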