isa-model 0.0.2__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. isa_model/__init__.py +1 -1
  2. isa_model/core/model_manager.py +69 -4
  3. isa_model/core/model_registry.py +273 -46
  4. isa_model/core/storage/hf_storage.py +419 -0
  5. isa_model/deployment/__init__.py +52 -0
  6. isa_model/deployment/core/__init__.py +34 -0
  7. isa_model/deployment/core/deployment_config.py +356 -0
  8. isa_model/deployment/core/deployment_manager.py +549 -0
  9. isa_model/deployment/core/isa_deployment_service.py +401 -0
  10. isa_model/eval/factory.py +381 -140
  11. isa_model/inference/ai_factory.py +427 -236
  12. isa_model/inference/billing_tracker.py +406 -0
  13. isa_model/inference/providers/base_provider.py +51 -4
  14. isa_model/inference/providers/ml_provider.py +50 -0
  15. isa_model/inference/providers/ollama_provider.py +37 -18
  16. isa_model/inference/providers/openai_provider.py +65 -36
  17. isa_model/inference/providers/replicate_provider.py +42 -30
  18. isa_model/inference/services/audio/base_stt_service.py +21 -2
  19. isa_model/inference/services/audio/openai_realtime_service.py +353 -0
  20. isa_model/inference/services/audio/openai_stt_service.py +252 -0
  21. isa_model/inference/services/audio/openai_tts_service.py +149 -9
  22. isa_model/inference/services/audio/replicate_tts_service.py +239 -0
  23. isa_model/inference/services/base_service.py +36 -1
  24. isa_model/inference/services/embedding/base_embed_service.py +112 -0
  25. isa_model/inference/services/embedding/ollama_embed_service.py +28 -2
  26. isa_model/inference/services/embedding/openai_embed_service.py +223 -0
  27. isa_model/inference/services/llm/__init__.py +2 -0
  28. isa_model/inference/services/llm/base_llm_service.py +158 -86
  29. isa_model/inference/services/llm/llm_adapter.py +414 -0
  30. isa_model/inference/services/llm/ollama_llm_service.py +252 -63
  31. isa_model/inference/services/llm/openai_llm_service.py +231 -93
  32. isa_model/inference/services/llm/triton_llm_service.py +481 -0
  33. isa_model/inference/services/ml/base_ml_service.py +78 -0
  34. isa_model/inference/services/ml/sklearn_ml_service.py +140 -0
  35. isa_model/inference/services/vision/__init__.py +3 -3
  36. isa_model/inference/services/vision/base_image_gen_service.py +161 -0
  37. isa_model/inference/services/vision/base_vision_service.py +177 -0
  38. isa_model/inference/services/vision/helpers/image_utils.py +4 -3
  39. isa_model/inference/services/vision/ollama_vision_service.py +151 -17
  40. isa_model/inference/services/vision/openai_vision_service.py +275 -41
  41. isa_model/inference/services/vision/replicate_image_gen_service.py +278 -118
  42. isa_model/training/__init__.py +62 -32
  43. isa_model/training/cloud/__init__.py +22 -0
  44. isa_model/training/cloud/job_orchestrator.py +402 -0
  45. isa_model/training/cloud/runpod_trainer.py +454 -0
  46. isa_model/training/cloud/storage_manager.py +482 -0
  47. isa_model/training/core/__init__.py +23 -0
  48. isa_model/training/core/config.py +181 -0
  49. isa_model/training/core/dataset.py +222 -0
  50. isa_model/training/core/trainer.py +720 -0
  51. isa_model/training/core/utils.py +213 -0
  52. isa_model/training/factory.py +229 -198
  53. isa_model-0.3.1.dist-info/METADATA +465 -0
  54. isa_model-0.3.1.dist-info/RECORD +91 -0
  55. isa_model/core/model_router.py +0 -226
  56. isa_model/core/model_version.py +0 -0
  57. isa_model/core/resource_manager.py +0 -202
  58. isa_model/deployment/gpu_fp16_ds8/models/deepseek_r1/1/model.py +0 -120
  59. isa_model/deployment/gpu_fp16_ds8/scripts/download_model.py +0 -18
  60. isa_model/training/engine/llama_factory/__init__.py +0 -39
  61. isa_model/training/engine/llama_factory/config.py +0 -115
  62. isa_model/training/engine/llama_factory/data_adapter.py +0 -284
  63. isa_model/training/engine/llama_factory/examples/__init__.py +0 -6
  64. isa_model/training/engine/llama_factory/examples/finetune_with_tracking.py +0 -185
  65. isa_model/training/engine/llama_factory/examples/rlhf_with_tracking.py +0 -163
  66. isa_model/training/engine/llama_factory/factory.py +0 -331
  67. isa_model/training/engine/llama_factory/rl.py +0 -254
  68. isa_model/training/engine/llama_factory/trainer.py +0 -171
  69. isa_model/training/image_model/configs/create_config.py +0 -37
  70. isa_model/training/image_model/configs/create_flux_config.py +0 -26
  71. isa_model/training/image_model/configs/create_lora_config.py +0 -21
  72. isa_model/training/image_model/prepare_massed_compute.py +0 -97
  73. isa_model/training/image_model/prepare_upload.py +0 -17
  74. isa_model/training/image_model/raw_data/create_captions.py +0 -16
  75. isa_model/training/image_model/raw_data/create_lora_captions.py +0 -20
  76. isa_model/training/image_model/raw_data/pre_processing.py +0 -200
  77. isa_model/training/image_model/train/train.py +0 -42
  78. isa_model/training/image_model/train/train_flux.py +0 -41
  79. isa_model/training/image_model/train/train_lora.py +0 -57
  80. isa_model/training/image_model/train_main.py +0 -25
  81. isa_model-0.0.2.dist-info/METADATA +0 -327
  82. isa_model-0.0.2.dist-info/RECORD +0 -92
  83. isa_model-0.0.2.dist-info/licenses/LICENSE +0 -21
  84. /isa_model/training/{llm_model/annotation → annotation}/annotation_schema.py +0 -0
  85. /isa_model/training/{llm_model/annotation → annotation}/processors/annotation_processor.py +0 -0
  86. /isa_model/training/{llm_model/annotation → annotation}/storage/dataset_manager.py +0 -0
  87. /isa_model/training/{llm_model/annotation → annotation}/storage/dataset_schema.py +0 -0
  88. /isa_model/training/{llm_model/annotation → annotation}/tests/test_annotation_flow.py +0 -0
  89. /isa_model/training/{llm_model/annotation → annotation}/tests/test_minio copy.py +0 -0
  90. /isa_model/training/{llm_model/annotation → annotation}/tests/test_minio_upload.py +0 -0
  91. /isa_model/training/{llm_model/annotation → annotation}/views/annotation_controller.py +0 -0
  92. {isa_model-0.0.2.dist-info → isa_model-0.3.1.dist-info}/WHEEL +0 -0
  93. {isa_model-0.0.2.dist-info → isa_model-0.3.1.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/isa_model/training/cloud/storage_manager.py
@@ -0,0 +1,482 @@
+"""
+Cloud Storage Manager for Training Assets
+
+This module handles storage of datasets, models, and training artifacts
+across different cloud storage providers (S3, GCS, Azure, etc.).
+"""
+
+import os
+import json
+import logging
+from typing import Dict, List, Optional, Any, Union
+from dataclasses import dataclass
+from pathlib import Path
+from urllib.parse import urlparse
+
+try:
+    import boto3
+    from botocore.exceptions import ClientError
+    S3_AVAILABLE = True
+except ImportError:
+    S3_AVAILABLE = False
+    boto3 = None
+
+try:
+    from google.cloud import storage as gcs
+    GCS_AVAILABLE = True
+except ImportError:
+    GCS_AVAILABLE = False
+    gcs = None
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class StorageConfig:
+    """Configuration for cloud storage."""
+
+    provider: str  # "s3", "gcs", "azure"
+    bucket_name: str
+    region: Optional[str] = None
+
+    # Authentication
+    access_key: Optional[str] = None
+    secret_key: Optional[str] = None
+    service_account_path: Optional[str] = None
+
+    # Paths
+    datasets_prefix: str = "datasets/"
+    models_prefix: str = "models/"
+    logs_prefix: str = "logs/"
+
+    def __post_init__(self):
+        """Validate configuration."""
+        if self.provider not in ["s3", "gcs", "azure"]:
+            raise ValueError(f"Unsupported storage provider: {self.provider}")
+
+        if not self.bucket_name:
+            raise ValueError("Bucket name is required")
+
+
+class CloudStorageManager:
+    """
+    Cloud storage manager for training assets.
+
+    Handles upload/download of datasets, models, and training artifacts
+    across different cloud storage providers.
+
+    Example:
+        ```python
+        # Configure S3 storage
+        storage_config = StorageConfig(
+            provider="s3",
+            bucket_name="my-training-bucket",
+            region="us-west-2",
+            access_key="your-access-key",
+            secret_key="your-secret-key"
+        )
+
+        # Initialize storage manager
+        storage = CloudStorageManager(storage_config)
+
+        # Upload dataset
+        dataset_url = storage.upload_dataset("local_data.json", "my-dataset")
+
+        # Upload trained model
+        model_url = storage.upload_model("./trained_model/", "gemma-finetuned-v1")
+
+        # Download model
+        local_path = storage.download_model("gemma-finetuned-v1", "./downloaded_model/")
+        ```
+    """
+
+    def __init__(self, config: StorageConfig):
+        """
+        Initialize cloud storage manager.
+
+        Args:
+            config: Storage configuration
+        """
+        self.config = config
+        self._client = None
+        self._initialize_client()
+
+        logger.info(f"Storage manager initialized for {config.provider}://{config.bucket_name}")
+
+    def _initialize_client(self) -> None:
+        """Initialize storage client based on provider."""
+        if self.config.provider == "s3":
+            if not S3_AVAILABLE:
+                raise ImportError("boto3 is required for S3 storage. Install with: pip install boto3")
+
+            self._client = boto3.client(
+                's3',
+                aws_access_key_id=self.config.access_key,
+                aws_secret_access_key=self.config.secret_key,
+                region_name=self.config.region
+            )
+
+        elif self.config.provider == "gcs":
+            if not GCS_AVAILABLE:
+                raise ImportError("google-cloud-storage is required for GCS. Install with: pip install google-cloud-storage")
+
+            if self.config.service_account_path:
+                os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self.config.service_account_path
+
+            self._client = gcs.Client()
+
+        else:
+            raise NotImplementedError(f"Provider {self.config.provider} not implemented yet")
+
+    def upload_dataset(self, local_path: str, dataset_name: str,
+                       metadata: Optional[Dict[str, Any]] = None) -> str:
+        """
+        Upload dataset to cloud storage.
+
+        Args:
+            local_path: Local path to dataset file
+            dataset_name: Name for the dataset
+            metadata: Optional metadata to store with dataset
+
+        Returns:
+            Cloud storage URL for the dataset
+        """
+        remote_path = f"{self.config.datasets_prefix}{dataset_name}.json"
+
+        try:
+            # Upload dataset file
+            self._upload_file(local_path, remote_path)
+
+            # Upload metadata if provided
+            if metadata:
+                metadata_path = f"{self.config.datasets_prefix}{dataset_name}_metadata.json"
+                metadata_content = json.dumps(metadata, indent=2)
+                self._upload_content(metadata_content, metadata_path)
+
+            dataset_url = self._get_public_url(remote_path)
+            logger.info(f"Dataset uploaded: {dataset_url}")
+            return dataset_url
+
+        except Exception as e:
+            logger.error(f"Failed to upload dataset {dataset_name}: {e}")
+            raise
+
+    def download_dataset(self, dataset_name: str, local_path: str) -> str:
+        """
+        Download dataset from cloud storage.
+
+        Args:
+            dataset_name: Name of the dataset
+            local_path: Local path to save dataset
+
+        Returns:
+            Local path to downloaded dataset
+        """
+        remote_path = f"{self.config.datasets_prefix}{dataset_name}.json"
+
+        try:
+            # Guard against a bare filename, whose dirname is the empty string
+            os.makedirs(os.path.dirname(local_path) or ".", exist_ok=True)
+            self._download_file(remote_path, local_path)
+
+            logger.info(f"Dataset downloaded to: {local_path}")
+            return local_path
+
+        except Exception as e:
+            logger.error(f"Failed to download dataset {dataset_name}: {e}")
+            raise
+
+    def upload_model(self, local_model_dir: str, model_name: str,
+                     metadata: Optional[Dict[str, Any]] = None) -> str:
+        """
+        Upload trained model to cloud storage.
+
+        Args:
+            local_model_dir: Local directory containing model files
+            model_name: Name for the model
+            metadata: Optional model metadata
+
+        Returns:
+            Cloud storage URL for the model
+        """
+        model_prefix = f"{self.config.models_prefix}{model_name}/"
+
+        try:
+            # Upload all model files
+            model_files = []
+            for root, dirs, files in os.walk(local_model_dir):
+                for file in files:
+                    local_file_path = os.path.join(root, file)
+                    relative_path = os.path.relpath(local_file_path, local_model_dir)
+                    remote_path = f"{model_prefix}{relative_path}"
+
+                    self._upload_file(local_file_path, remote_path)
+                    model_files.append(relative_path)
+
+            # Upload model metadata
+            if metadata is None:
+                metadata = {}
+
+            metadata.update({
+                "model_name": model_name,
+                "files": model_files,
+                "upload_timestamp": self._get_timestamp()
+            })
+
+            metadata_path = f"{model_prefix}model_metadata.json"
+            metadata_content = json.dumps(metadata, indent=2)
+            self._upload_content(metadata_content, metadata_path)
+
+            model_url = self._get_public_url(model_prefix)
+            logger.info(f"Model uploaded: {model_url}")
+            return model_url
+
+        except Exception as e:
+            logger.error(f"Failed to upload model {model_name}: {e}")
+            raise
+
+    def download_model(self, model_name: str, local_dir: str) -> str:
+        """
+        Download model from cloud storage.
+
+        Args:
+            model_name: Name of the model
+            local_dir: Local directory to save model
+
+        Returns:
+            Local path to downloaded model
+        """
+        model_prefix = f"{self.config.models_prefix}{model_name}/"
+
+        try:
+            os.makedirs(local_dir, exist_ok=True)
+
+            # First, get model metadata to know which files to download
+            metadata_path = f"{model_prefix}model_metadata.json"
+            metadata_content = self._download_content(metadata_path)
+            metadata = json.loads(metadata_content)
+
+            # Download all model files
+            for file_path in metadata.get("files", []):
+                remote_path = f"{model_prefix}{file_path}"
+                local_file_path = os.path.join(local_dir, file_path)
+
+                os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
+                self._download_file(remote_path, local_file_path)
+
+            logger.info(f"Model downloaded to: {local_dir}")
+            return local_dir
+
+        except Exception as e:
+            logger.error(f"Failed to download model {model_name}: {e}")
+            raise
+
+    def upload_training_logs(self, local_log_dir: str, job_id: str) -> str:
+        """Upload training logs to cloud storage."""
+        logs_prefix = f"{self.config.logs_prefix}{job_id}/"
+
+        try:
+            for root, dirs, files in os.walk(local_log_dir):
+                for file in files:
+                    local_file_path = os.path.join(root, file)
+                    relative_path = os.path.relpath(local_file_path, local_log_dir)
+                    remote_path = f"{logs_prefix}{relative_path}"
+
+                    self._upload_file(local_file_path, remote_path)
+
+            logs_url = self._get_public_url(logs_prefix)
+            logger.info(f"Training logs uploaded: {logs_url}")
+            return logs_url
+
+        except Exception as e:
+            logger.error(f"Failed to upload training logs for job {job_id}: {e}")
+            raise
+
+    def list_datasets(self) -> List[Dict[str, Any]]:
+        """List all datasets in storage."""
+        try:
+            datasets = []
+            objects = self._list_objects(self.config.datasets_prefix)
+
+            for obj in objects:
+                if obj.endswith('.json') and not obj.endswith('_metadata.json'):
+                    dataset_name = os.path.basename(obj).replace('.json', '')
+                    datasets.append({
+                        "name": dataset_name,
+                        "path": obj,
+                        "url": self._get_public_url(obj),
+                        "size": self._get_object_size(obj),
+                        "modified": self._get_object_modified_time(obj)
+                    })
+
+            return datasets
+
+        except Exception as e:
+            logger.error(f"Failed to list datasets: {e}")
+            return []
+
+    def list_models(self) -> List[Dict[str, Any]]:
+        """List all models in storage."""
+        try:
+            models = []
+            prefixes = self._list_prefixes(self.config.models_prefix)
+
+            for prefix in prefixes:
+                model_name = prefix.rstrip('/').split('/')[-1]
+                metadata_path = f"{prefix}model_metadata.json"
+
+                try:
+                    metadata_content = self._download_content(metadata_path)
+                    metadata = json.loads(metadata_content)
+
+                    models.append({
+                        "name": model_name,
+                        "path": prefix,
+                        "url": self._get_public_url(prefix),
+                        "metadata": metadata,
+                        "files_count": len(metadata.get("files", [])),
+                        "upload_time": metadata.get("upload_timestamp", "")
+                    })
+                except Exception:
+                    # If metadata doesn't exist, add basic info
+                    models.append({
+                        "name": model_name,
+                        "path": prefix,
+                        "url": self._get_public_url(prefix),
+                        "metadata": {},
+                        "files_count": 0,
+                        "upload_time": ""
+                    })
+
+            return models
+
+        except Exception as e:
+            logger.error(f"Failed to list models: {e}")
+            return []
+
+    def _upload_file(self, local_path: str, remote_path: str) -> None:
+        """Upload file to storage."""
+        if self.config.provider == "s3":
+            self._client.upload_file(local_path, self.config.bucket_name, remote_path)
+        elif self.config.provider == "gcs":
+            bucket = self._client.bucket(self.config.bucket_name)
+            blob = bucket.blob(remote_path)
+            blob.upload_from_filename(local_path)
+        else:
+            raise NotImplementedError(f"Upload not implemented for {self.config.provider}")
+
+    def _upload_content(self, content: str, remote_path: str) -> None:
+        """Upload string content to storage."""
+        if self.config.provider == "s3":
+            self._client.put_object(
+                Bucket=self.config.bucket_name,
+                Key=remote_path,
+                Body=content.encode('utf-8')
+            )
+        elif self.config.provider == "gcs":
+            bucket = self._client.bucket(self.config.bucket_name)
+            blob = bucket.blob(remote_path)
+            blob.upload_from_string(content)
+        else:
+            raise NotImplementedError(f"Upload not implemented for {self.config.provider}")
+
+    def _download_file(self, remote_path: str, local_path: str) -> None:
+        """Download file from storage."""
+        if self.config.provider == "s3":
+            self._client.download_file(self.config.bucket_name, remote_path, local_path)
+        elif self.config.provider == "gcs":
+            bucket = self._client.bucket(self.config.bucket_name)
+            blob = bucket.blob(remote_path)
+            blob.download_to_filename(local_path)
+        else:
+            raise NotImplementedError(f"Download not implemented for {self.config.provider}")
+
+    def _download_content(self, remote_path: str) -> str:
+        """Download content as string."""
+        if self.config.provider == "s3":
+            response = self._client.get_object(Bucket=self.config.bucket_name, Key=remote_path)
+            return response['Body'].read().decode('utf-8')
+        elif self.config.provider == "gcs":
+            bucket = self._client.bucket(self.config.bucket_name)
+            blob = bucket.blob(remote_path)
+            return blob.download_as_text()
+        else:
+            raise NotImplementedError(f"Download not implemented for {self.config.provider}")
+
+    def _list_objects(self, prefix: str) -> List[str]:
+        """List objects with given prefix."""
+        if self.config.provider == "s3":
+            response = self._client.list_objects_v2(
+                Bucket=self.config.bucket_name,
+                Prefix=prefix
+            )
+            return [obj['Key'] for obj in response.get('Contents', [])]
+        elif self.config.provider == "gcs":
+            bucket = self._client.bucket(self.config.bucket_name)
+            blobs = bucket.list_blobs(prefix=prefix)
+            return [blob.name for blob in blobs]
+        else:
+            raise NotImplementedError(f"List objects not implemented for {self.config.provider}")
+
+    def _list_prefixes(self, prefix: str) -> List[str]:
+        """List prefixes (directories) under given prefix."""
+        if self.config.provider == "s3":
+            response = self._client.list_objects_v2(
+                Bucket=self.config.bucket_name,
+                Prefix=prefix,
+                Delimiter='/'
+            )
+            return [cp['Prefix'] for cp in response.get('CommonPrefixes', [])]
+        elif self.config.provider == "gcs":
+            # GCS doesn't have true directories, so we simulate by grouping by prefix
+            bucket = self._client.bucket(self.config.bucket_name)
+            blobs = bucket.list_blobs(prefix=prefix)
+            prefixes = set()
+            for blob in blobs:
+                parts = blob.name[len(prefix):].split('/')
+                if len(parts) > 1:
+                    prefixes.add(f"{prefix}{parts[0]}/")
+            return list(prefixes)
+        else:
+            raise NotImplementedError(f"List prefixes not implemented for {self.config.provider}")
+
+    def _get_public_url(self, remote_path: str) -> str:
+        """Get public URL for object."""
+        if self.config.provider == "s3":
+            return f"https://{self.config.bucket_name}.s3.{self.config.region}.amazonaws.com/{remote_path}"
+        elif self.config.provider == "gcs":
+            return f"https://storage.googleapis.com/{self.config.bucket_name}/{remote_path}"
+        else:
+            return f"{self.config.provider}://{self.config.bucket_name}/{remote_path}"
+
+    def _get_object_size(self, remote_path: str) -> int:
+        """Get object size in bytes."""
+        try:
+            if self.config.provider == "s3":
+                response = self._client.head_object(Bucket=self.config.bucket_name, Key=remote_path)
+                return response['ContentLength']
+            elif self.config.provider == "gcs":
+                bucket = self._client.bucket(self.config.bucket_name)
+                blob = bucket.blob(remote_path)
+                blob.reload()
+                return blob.size
+            return 0
+        except Exception:
+            return 0
+
+    def _get_object_modified_time(self, remote_path: str) -> str:
+        """Get object last modified time."""
+        try:
+            if self.config.provider == "s3":
+                response = self._client.head_object(Bucket=self.config.bucket_name, Key=remote_path)
+                return response['LastModified'].isoformat()
+            elif self.config.provider == "gcs":
+                bucket = self._client.bucket(self.config.bucket_name)
+                blob = bucket.blob(remote_path)
+                blob.reload()
+                # "updated" is the last-modified time; "time_created" is creation time
+                return blob.updated.isoformat()
+            return ""
+        except Exception:
+            return ""
+
+    def _get_timestamp(self) -> str:
+        """Get current timestamp."""
+        from datetime import datetime
+        return datetime.utcnow().isoformat()
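As a quick orientation to the new module, here is a minimal usage sketch against GCS (the class docstring above shows the S3 variant). This is not part of the package diff: the bucket name, credential path, and dataset file are hypothetical, and `google-cloud-storage` must be installed.

```python
from isa_model.training.cloud.storage_manager import StorageConfig, CloudStorageManager

# Hypothetical bucket and credential path, for illustration only.
config = StorageConfig(
    provider="gcs",
    bucket_name="my-training-bucket",
    service_account_path="/secrets/gcs-service-account.json",
)
storage = CloudStorageManager(config)

# Upload a local JSON dataset; optional metadata lands next to it
# as <name>_metadata.json under the datasets/ prefix.
url = storage.upload_dataset(
    "data/alpaca_subset.json",
    "alpaca-subset-v1",
    metadata={"num_examples": 1000, "format": "alpaca"},
)
print(url)  # https://storage.googleapis.com/my-training-bucket/datasets/alpaca-subset-v1.json

# Enumerate everything stored under the datasets/ prefix.
for ds in storage.list_datasets():
    print(ds["name"], ds["size"], "bytes")
```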
--- /dev/null
+++ b/isa_model/training/core/__init__.py
@@ -0,0 +1,23 @@
+"""
+Core Training Components for ISA Model SDK
+
+This module provides the core training functionality:
+- Base training classes and interfaces
+- Configuration management
+- Training utilities
+"""
+
+from .trainer import BaseTrainer, SFTTrainer
+from .config import TrainingConfig, LoRAConfig, DatasetConfig
+from .dataset import DatasetManager
+from .utils import TrainingUtils
+
+__all__ = [
+    'BaseTrainer',
+    'SFTTrainer',
+    'TrainingConfig',
+    'LoRAConfig',
+    'DatasetConfig',
+    'DatasetManager',
+    'TrainingUtils'
+]
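The `__all__` list defines the public surface of the new `isa_model.training.core` package, so consumers import the training classes from one place rather than from individual submodules. A minimal sketch (the hyperparameter values are placeholders):

```python
from isa_model.training.core import LoRAConfig

# __post_init__ fills in default attention-projection targets when none are given.
lora = LoRAConfig(lora_rank=16, lora_alpha=32)
print(lora.lora_target_modules)  # ['q_proj', 'v_proj', 'k_proj', 'o_proj']
```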
--- /dev/null
+++ b/isa_model/training/core/config.py
@@ -0,0 +1,181 @@
+"""
+Training Configuration Classes
+
+Defines configuration classes for different training scenarios.
+"""
+
+from dataclasses import dataclass, field
+from typing import Optional, Dict, Any, List
+from pathlib import Path
+
+
+@dataclass
+class LoRAConfig:
+    """LoRA (Low-Rank Adaptation) configuration."""
+
+    use_lora: bool = True
+    lora_rank: int = 8
+    lora_alpha: int = 16
+    lora_dropout: float = 0.05
+    lora_target_modules: Optional[List[str]] = None
+
+    def __post_init__(self):
+        if self.lora_target_modules is None:
+            # Default target modules for most transformer models
+            self.lora_target_modules = ["q_proj", "v_proj", "k_proj", "o_proj"]
+
+
+@dataclass
+class DatasetConfig:
+    """Dataset configuration."""
+
+    dataset_path: str
+    dataset_format: str = "alpaca"  # alpaca, sharegpt, custom
+    max_length: int = 1024
+    validation_split: float = 0.1
+    preprocessing_num_workers: int = 4
+
+    def __post_init__(self):
+        if not Path(self.dataset_path).exists() and not self.dataset_path.startswith("http"):
+            # Not a local file or URL; assume it's a HuggingFace dataset name
+            pass
+
+
+@dataclass
+class TrainingConfig:
+    """Main training configuration."""
+
+    # Model configuration
+    model_name: str
+    output_dir: str
+
+    # Training hyperparameters
+    num_epochs: int = 3
+    batch_size: int = 4
+    learning_rate: float = 2e-5
+    warmup_steps: int = 100
+    weight_decay: float = 0.01
+    gradient_accumulation_steps: int = 1
+    max_grad_norm: float = 1.0
+
+    # Training strategy
+    training_type: str = "sft"  # sft, dpo, rlhf
+    fp16: bool = True
+    bf16: bool = False
+    gradient_checkpointing: bool = True
+
+    # Saving and logging
+    save_steps: int = 500
+    logging_steps: int = 10
+    eval_steps: int = 500
+    save_total_limit: int = 3
+
+    # LoRA configuration
+    lora_config: Optional[LoRAConfig] = field(default_factory=LoRAConfig)
+
+    # Dataset configuration
+    dataset_config: Optional[DatasetConfig] = None
+
+    # Additional parameters
+    extra_params: Dict[str, Any] = field(default_factory=dict)
+
+    def __post_init__(self):
+        # Create output directory if it doesn't exist
+        Path(self.output_dir).mkdir(parents=True, exist_ok=True)
+
+        # fp16 and bf16 are mutually exclusive; prefer bf16 when it is requested
+        if self.bf16:
+            self.fp16 = False
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert config to dictionary."""
+        config_dict = {}
+
+        for key, value in self.__dict__.items():
+            if key in ['lora_config', 'dataset_config']:
+                if value is not None:
+                    config_dict[key] = value.__dict__
+                else:
+                    config_dict[key] = None
+            else:
+                config_dict[key] = value
+
+        return config_dict
+
+    @classmethod
+    def from_dict(cls, config_dict: Dict[str, Any]) -> 'TrainingConfig':
+        """Create config from dictionary."""
+        # Copy so the caller's dictionary is not mutated
+        config_dict = dict(config_dict)
+
+        # Handle nested configs
+        if 'lora_config' in config_dict and config_dict['lora_config'] is not None:
+            config_dict['lora_config'] = LoRAConfig(**config_dict['lora_config'])
+
+        if 'dataset_config' in config_dict and config_dict['dataset_config'] is not None:
+            config_dict['dataset_config'] = DatasetConfig(**config_dict['dataset_config'])
+
+        return cls(**config_dict)
+
+
+@dataclass
+class RunPodConfig:
+    """RunPod cloud training configuration."""
+
+    api_key: str
+    template_id: str
+    gpu_type: str = "NVIDIA RTX A6000"
+    gpu_count: int = 1
+    container_disk_in_gb: int = 50
+    volume_in_gb: int = 100
+    max_runtime_hours: int = 24
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary."""
+        return self.__dict__.copy()
+
+
+@dataclass
+class StorageConfig:
+    """Cloud storage configuration."""
+
+    provider: str  # s3, gcs, local
+    bucket_name: Optional[str] = None
+    region: Optional[str] = None
+    access_key: Optional[str] = None
+    secret_key: Optional[str] = None
+    service_account_path: Optional[str] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary."""
+        return self.__dict__.copy()
+
+
+@dataclass
+class JobConfig:
+    """Training job configuration for cloud training."""
+
+    model_name: str
+    dataset_source: str
+    job_name: Optional[str] = None
+    description: Optional[str] = None
+
+    # Training parameters
+    training_type: str = "sft"
+    num_epochs: int = 3
+    batch_size: int = 4
+    learning_rate: float = 2e-5
+    max_length: int = 1024
+
+    # LoRA parameters
+    use_lora: bool = True
+    lora_rank: int = 8
+    lora_alpha: int = 16
+    lora_dropout: float = 0.05
+
+    # Storage parameters
+    save_model_to_storage: bool = True
+    model_name_in_storage: Optional[str] = None
+    upload_to_hf: bool = False
+    hf_model_name: Optional[str] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary."""
+        return self.__dict__.copy()
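Because every config exposes `to_dict()` and `TrainingConfig` additionally provides `from_dict()`, a job definition can be round-tripped through JSON, e.g. to hand it to a remote worker such as the RunPod jobs configured above. A sketch with hypothetical model and dataset paths:

```python
import json
from isa_model.training.core.config import TrainingConfig, DatasetConfig

config = TrainingConfig(
    model_name="google/gemma-2b",        # placeholder model
    output_dir="./runs/gemma-sft",       # created by __post_init__
    num_epochs=1,
    bf16=True,                           # __post_init__ forces fp16 off
    dataset_config=DatasetConfig(dataset_path="data/train.json"),
)

# Serialize for a remote worker, then rebuild the typed config on the other side.
payload = json.dumps(config.to_dict())
restored = TrainingConfig.from_dict(json.loads(payload))

assert restored.lora_config.lora_rank == config.lora_config.lora_rank
assert restored.fp16 is False and restored.bf16 is True
```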