isa-model 0.2.0__py3-none-any.whl → 0.2.8__py3-none-any.whl

This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (77)
  1. isa_model/__init__.py +1 -1
  2. isa_model/core/storage/hf_storage.py +419 -0
  3. isa_model/deployment/__init__.py +52 -0
  4. isa_model/deployment/core/__init__.py +34 -0
  5. isa_model/deployment/core/deployment_config.py +356 -0
  6. isa_model/deployment/core/deployment_manager.py +549 -0
  7. isa_model/deployment/core/isa_deployment_service.py +401 -0
  8. isa_model/eval/factory.py +381 -140
  9. isa_model/inference/ai_factory.py +142 -240
  10. isa_model/inference/providers/ml_provider.py +50 -0
  11. isa_model/inference/services/audio/openai_tts_service.py +104 -3
  12. isa_model/inference/services/embedding/base_embed_service.py +112 -0
  13. isa_model/inference/services/embedding/ollama_embed_service.py +28 -2
  14. isa_model/inference/services/llm/__init__.py +2 -0
  15. isa_model/inference/services/llm/base_llm_service.py +111 -1
  16. isa_model/inference/services/llm/ollama_llm_service.py +234 -26
  17. isa_model/inference/services/llm/openai_llm_service.py +225 -28
  18. isa_model/inference/services/llm/triton_llm_service.py +481 -0
  19. isa_model/inference/services/ml/base_ml_service.py +78 -0
  20. isa_model/inference/services/ml/sklearn_ml_service.py +140 -0
  21. isa_model/inference/services/vision/__init__.py +3 -3
  22. isa_model/inference/services/vision/base_image_gen_service.py +161 -0
  23. isa_model/inference/services/vision/base_vision_service.py +177 -0
  24. isa_model/inference/services/vision/ollama_vision_service.py +143 -17
  25. isa_model/inference/services/vision/replicate_image_gen_service.py +139 -7
  26. isa_model/training/__init__.py +62 -32
  27. isa_model/training/cloud/__init__.py +22 -0
  28. isa_model/training/cloud/job_orchestrator.py +402 -0
  29. isa_model/training/cloud/runpod_trainer.py +454 -0
  30. isa_model/training/cloud/storage_manager.py +482 -0
  31. isa_model/training/core/__init__.py +23 -0
  32. isa_model/training/core/config.py +181 -0
  33. isa_model/training/core/dataset.py +222 -0
  34. isa_model/training/core/trainer.py +720 -0
  35. isa_model/training/core/utils.py +213 -0
  36. isa_model/training/factory.py +229 -198
  37. isa_model-0.2.8.dist-info/METADATA +465 -0
  38. isa_model-0.2.8.dist-info/RECORD +86 -0
  39. isa_model/core/model_router.py +0 -226
  40. isa_model/core/model_version.py +0 -0
  41. isa_model/core/resource_manager.py +0 -202
  42. isa_model/deployment/gpu_fp16_ds8/models/deepseek_r1/1/model.py +0 -120
  43. isa_model/deployment/gpu_fp16_ds8/scripts/download_model.py +0 -18
  44. isa_model/training/engine/llama_factory/__init__.py +0 -39
  45. isa_model/training/engine/llama_factory/config.py +0 -115
  46. isa_model/training/engine/llama_factory/data_adapter.py +0 -284
  47. isa_model/training/engine/llama_factory/examples/__init__.py +0 -6
  48. isa_model/training/engine/llama_factory/examples/finetune_with_tracking.py +0 -185
  49. isa_model/training/engine/llama_factory/examples/rlhf_with_tracking.py +0 -163
  50. isa_model/training/engine/llama_factory/factory.py +0 -331
  51. isa_model/training/engine/llama_factory/rl.py +0 -254
  52. isa_model/training/engine/llama_factory/trainer.py +0 -171
  53. isa_model/training/image_model/configs/create_config.py +0 -37
  54. isa_model/training/image_model/configs/create_flux_config.py +0 -26
  55. isa_model/training/image_model/configs/create_lora_config.py +0 -21
  56. isa_model/training/image_model/prepare_massed_compute.py +0 -97
  57. isa_model/training/image_model/prepare_upload.py +0 -17
  58. isa_model/training/image_model/raw_data/create_captions.py +0 -16
  59. isa_model/training/image_model/raw_data/create_lora_captions.py +0 -20
  60. isa_model/training/image_model/raw_data/pre_processing.py +0 -200
  61. isa_model/training/image_model/train/train.py +0 -42
  62. isa_model/training/image_model/train/train_flux.py +0 -41
  63. isa_model/training/image_model/train/train_lora.py +0 -57
  64. isa_model/training/image_model/train_main.py +0 -25
  65. isa_model-0.2.0.dist-info/METADATA +0 -327
  66. isa_model-0.2.0.dist-info/RECORD +0 -92
  67. isa_model-0.2.0.dist-info/licenses/LICENSE +0 -21
  68. /isa_model/training/{llm_model/annotation → annotation}/annotation_schema.py +0 -0
  69. /isa_model/training/{llm_model/annotation → annotation}/processors/annotation_processor.py +0 -0
  70. /isa_model/training/{llm_model/annotation → annotation}/storage/dataset_manager.py +0 -0
  71. /isa_model/training/{llm_model/annotation → annotation}/storage/dataset_schema.py +0 -0
  72. /isa_model/training/{llm_model/annotation → annotation}/tests/test_annotation_flow.py +0 -0
  73. /isa_model/training/{llm_model/annotation → annotation}/tests/test_minio copy.py +0 -0
  74. /isa_model/training/{llm_model/annotation → annotation}/tests/test_minio_upload.py +0 -0
  75. /isa_model/training/{llm_model/annotation → annotation}/views/annotation_controller.py +0 -0
  76. {isa_model-0.2.0.dist-info → isa_model-0.2.8.dist-info}/WHEEL +0 -0
  77. {isa_model-0.2.0.dist-info → isa_model-0.2.8.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,402 @@
+ """
+ Training Job Orchestrator
+ 
+ This module orchestrates the complete training workflow:
+ - Dataset preparation and validation
+ - Job submission to cloud providers
+ - Training monitoring and progress tracking
+ - Model artifact collection and storage
+ """
+ 
+ import os
+ import json
+ import logging
+ from typing import Dict, List, Optional, Any, Union
+ from dataclasses import dataclass
+ from pathlib import Path
+ from datetime import datetime
+ 
+ from .runpod_trainer import RunPodTrainer, RunPodConfig
+ from .storage_manager import CloudStorageManager, StorageConfig
+ # from ..engine.llama_factory.config import SFTConfig, DatasetFormat
+ # Note: LlamaFactory integration is planned but not yet implemented
+ 
+ logger = logging.getLogger(__name__)
+ 
+ 
+ @dataclass
+ class JobConfig:
+     """Configuration for training job orchestration."""
+ 
+     # Model and dataset
+     model_name: str  # e.g., "google/gemma-2-4b-it"
+     dataset_source: str  # HuggingFace dataset name or local path
+ 
+     # Training parameters
+     training_type: str = "sft"  # "sft", "dpo", "rlhf"
+     use_lora: bool = True
+     batch_size: int = 4
+     num_epochs: int = 3
+     learning_rate: float = 2e-5
+     max_length: int = 1024
+ 
+     # LoRA parameters
+     lora_rank: int = 8
+     lora_alpha: int = 16
+     lora_dropout: float = 0.05
+ 
+     # Job settings
+     job_name: Optional[str] = None
+     description: Optional[str] = None
+     tags: Optional[Dict[str, str]] = None
+ 
+     # Storage settings
+     save_model_to_storage: bool = True
+     model_name_in_storage: Optional[str] = None
+ 
+     def __post_init__(self):
+         """Validate configuration."""
+         if not self.model_name:
+             raise ValueError("Model name is required")
+         if not self.dataset_source:
+             raise ValueError("Dataset source is required")
+ 
+         if self.job_name is None:
+             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+             model_short = self.model_name.split("/")[-1] if "/" in self.model_name else self.model_name
+             self.job_name = f"{model_short}_{self.training_type}_{timestamp}"
+ 
+ 
+ class TrainingJobOrchestrator:
+     """
+     Orchestrates complete training workflows.
+ 
+     This class manages the entire training pipeline from dataset preparation
+     to model deployment, handling cloud resources and storage automatically.
+ 
+     Example:
+         ```python
+         # Configure components
+         runpod_config = RunPodConfig(
+             api_key="your-runpod-key",
+             template_id="your-template-id"
+         )
+ 
+         storage_config = StorageConfig(
+             provider="s3",
+             bucket_name="my-training-bucket"
+         )
+ 
+         # Initialize orchestrator
+         orchestrator = TrainingJobOrchestrator(
+             runpod_config=runpod_config,
+             storage_config=storage_config
+         )
+ 
+         # Configure training job
+         job_config = JobConfig(
+             model_name="google/gemma-2-4b-it",
+             dataset_source="tatsu-lab/alpaca",
+             num_epochs=3,
+             batch_size=4
+         )
+ 
+         # Execute training workflow
+         result = orchestrator.execute_training_workflow(job_config)
+         print(f"Training completed: {result['model_path']}")
+         ```
+     """
+ 
+     def __init__(self,
+                  runpod_config: RunPodConfig,
+                  storage_config: Optional[StorageConfig] = None):
+         """
+         Initialize training job orchestrator.
+ 
+         Args:
+             runpod_config: RunPod configuration
+             storage_config: Optional cloud storage configuration
+         """
+         self.runpod_trainer = RunPodTrainer(runpod_config)
+         self.storage_manager = CloudStorageManager(storage_config) if storage_config else None
+ 
+         self.active_jobs: Dict[str, Dict[str, Any]] = {}
+ 
+         logger.info("Training job orchestrator initialized")
+ 
+     def prepare_dataset(self, dataset_source: str, local_cache_dir: str = "./dataset_cache") -> str:
+         """
+         Prepare and validate dataset for training.
+ 
+         Args:
+             dataset_source: Dataset source (HuggingFace name or local path)
+             local_cache_dir: Local directory to cache dataset
+ 
+         Returns:
+             Path to prepared dataset
+         """
+         os.makedirs(local_cache_dir, exist_ok=True)
+ 
+         try:
+             if dataset_source.startswith("hf://") or not os.path.exists(dataset_source):
+                 # HuggingFace dataset
+                 dataset_name = dataset_source.replace("hf://", "") if dataset_source.startswith("hf://") else dataset_source
+ 
+                 logger.info(f"Loading HuggingFace dataset: {dataset_name}")
+ 
+                 # Use datasets library to load and convert
+                 from datasets import load_dataset
+ 
+                 dataset = load_dataset(dataset_name)
+                 train_data = []
+ 
+                 # Convert to Alpaca format
+                 for item in dataset['train']:
+                     if 'instruction' in item and 'output' in item:
+                         train_data.append({
+                             'instruction': item['instruction'],
+                             'input': item.get('input', ''),
+                             'output': item['output']
+                         })
+                     elif 'text' in item:
+                         # Handle raw text datasets
+                         train_data.append({
+                             'instruction': "Continue the following text:",
+                             'input': item['text'][:512],  # First part as input
+                             'output': item['text'][512:1024]  # Next part as output
+                         })
+ 
+                 # Save prepared dataset
+                 dataset_path = os.path.join(local_cache_dir, f"{dataset_name.replace('/', '_')}.json")
+                 with open(dataset_path, 'w') as f:
+                     json.dump(train_data, f, indent=2)
+ 
+                 logger.info(f"Prepared {len(train_data)} training samples")
+ 
+             else:
+                 # Local dataset file
+                 dataset_path = dataset_source
+ 
+                 # Validate format
+                 with open(dataset_path, 'r') as f:
+                     data = json.load(f)
+ 
+                 if not isinstance(data, list):
+                     raise ValueError("Dataset must be a list of training examples")
+ 
+                 # Validate required fields
+                 required_fields = {'instruction', 'output'}
+                 for i, item in enumerate(data[:5]):  # Check first 5 items
+                     if not all(field in item for field in required_fields):
+                         raise ValueError(f"Item {i} missing required fields: {required_fields}")
+ 
+                 logger.info(f"Validated local dataset with {len(data)} samples")
+ 
+             return dataset_path
+ 
+         except Exception as e:
+             logger.error(f"Failed to prepare dataset {dataset_source}: {e}")
+             raise
+ 
+     def execute_training_workflow(self, job_config: JobConfig) -> Dict[str, Any]:
+         """
+         Execute complete training workflow.
+ 
+         Args:
+             job_config: Training job configuration
+ 
+         Returns:
+             Training results with model path and metrics
+         """
+         workflow_start_time = datetime.now()
+ 
+         try:
+             logger.info(f"Starting training workflow: {job_config.job_name}")
+ 
+             # Step 1: Prepare dataset
+             logger.info("Step 1: Preparing dataset...")
+             dataset_path = self.prepare_dataset(job_config.dataset_source)
+ 
+             # Step 2: Upload dataset to storage if configured
+             dataset_url = dataset_path
+             if self.storage_manager:
+                 logger.info("Step 2: Uploading dataset to cloud storage...")
+                 dataset_url = self.storage_manager.upload_dataset(
+                     local_path=dataset_path,
+                     dataset_name=f"{job_config.job_name}_dataset",
+                     metadata={
+                         "source": job_config.dataset_source,
+                         "job_name": job_config.job_name,
+                         "created_at": workflow_start_time.isoformat()
+                     }
+                 )
+ 
+             # Step 3: Start training job
+             logger.info("Step 3: Starting RunPod training job...")
+             training_params = {
+                 "use_lora": job_config.use_lora,
+                 "batch_size": job_config.batch_size,
+                 "num_epochs": job_config.num_epochs,
+                 "learning_rate": job_config.learning_rate,
+                 "max_length": job_config.max_length,
+                 "lora_rank": job_config.lora_rank,
+                 "lora_alpha": job_config.lora_alpha,
+                 "lora_dropout": job_config.lora_dropout,
+                 "dataset_name": dataset_url
+             }
+ 
+             job_id = self.runpod_trainer.start_training_job(
+                 model_name=job_config.model_name,
+                 dataset_path=dataset_url,
+                 training_params=training_params,
+                 job_name=job_config.job_name
+             )
+ 
+             # Track job
+             self.active_jobs[job_id] = {
+                 "config": job_config,
+                 "start_time": workflow_start_time,
+                 "dataset_path": dataset_path,
+                 "dataset_url": dataset_url,
+                 "status": "running"
+             }
+ 
+             # Step 4: Monitor training
+             logger.info("Step 4: Monitoring training progress...")
+             final_status = self.runpod_trainer.monitor_job(job_id)
+ 
+             # Step 5: Collect results
+             logger.info("Step 5: Collecting training results...")
+             if final_status["status"] == "COMPLETED":
+                 # Download trained model
+                 local_model_path = self.runpod_trainer.get_trained_model(job_id)
+ 
+                 # Upload to storage if configured
+                 model_storage_url = None
+                 if self.storage_manager and job_config.save_model_to_storage:
+                     model_name = job_config.model_name_in_storage or job_config.job_name
+                     model_storage_url = self.storage_manager.upload_model(
+                         local_model_dir=local_model_path,
+                         model_name=model_name,
+                         metadata={
+                             "base_model": job_config.model_name,
+                             "dataset_source": job_config.dataset_source,
+                             "training_params": training_params,
+                             "job_id": job_id,
+                             "completed_at": datetime.now().isoformat(),
+                             "training_duration": str(datetime.now() - workflow_start_time)
+                         }
+                     )
+ 
+                 # Update job status
+                 self.active_jobs[job_id].update({
+                     "status": "completed",
+                     "local_model_path": local_model_path,
+                     "model_storage_url": model_storage_url,
+                     "final_status": final_status,
+                     "end_time": datetime.now()
+                 })
+ 
+                 logger.info(f"Training workflow completed successfully: {job_config.job_name}")
+ 
+                 return {
+                     "success": True,
+                     "job_id": job_id,
+                     "job_name": job_config.job_name,
+                     "model_path": local_model_path,
+                     "model_storage_url": model_storage_url,
+                     "training_duration": str(datetime.now() - workflow_start_time),
+                     "final_status": final_status
+                 }
+             else:
+                 # Training failed
+                 self.active_jobs[job_id].update({
+                     "status": "failed",
+                     "final_status": final_status,
+                     "end_time": datetime.now()
+                 })
+ 
+                 raise RuntimeError(f"Training job failed with status: {final_status['status']}")
+ 
+         except Exception as e:
+             logger.error(f"Training workflow failed: {e}")
+ 
+             # Update job status if job_id exists
+             if 'job_id' in locals():
+                 self.active_jobs[job_id].update({
+                     "status": "error",
+                     "error": str(e),
+                     "end_time": datetime.now()
+                 })
+ 
+             return {
+                 "success": False,
+                 "error": str(e),
+                 "job_name": job_config.job_name,
+                 "training_duration": str(datetime.now() - workflow_start_time)
+             }
+ 
+     def get_job_status(self, job_id: str) -> Dict[str, Any]:
+         """Get status of a training job."""
+         if job_id in self.active_jobs:
+             job_info = self.active_jobs[job_id].copy()
+ 
+             # Get real-time status from RunPod if job is still running
+             if job_info["status"] == "running":
+                 try:
+                     runpod_status = self.runpod_trainer.monitor_job(job_id, check_interval=0)
+                     job_info["runpod_status"] = runpod_status
+                 except Exception:
+                     # Best-effort refresh; fall back to cached status on provider errors
+                     pass
+ 
+             return job_info
+         else:
+             return {"error": f"Job {job_id} not found"}
+ 
+     def list_active_jobs(self) -> List[Dict[str, Any]]:
+         """List all active training jobs."""
+         return [
+             {
+                 "job_id": job_id,
+                 "job_name": info["config"].job_name,
+                 "status": info["status"],
+                 "start_time": info["start_time"].isoformat(),
+                 "model_name": info["config"].model_name,
+                 "dataset_source": info["config"].dataset_source
+             }
+             for job_id, info in self.active_jobs.items()
+         ]
+ 
+     def stop_job(self, job_id: str) -> bool:
+         """Stop a running training job."""
+         try:
+             self.runpod_trainer.stop_job(job_id)
+ 
+             if job_id in self.active_jobs:
+                 self.active_jobs[job_id].update({
+                     "status": "stopped",
+                     "end_time": datetime.now()
+                 })
+ 
+             logger.info(f"Stopped training job: {job_id}")
+             return True
+ 
+         except Exception as e:
+             logger.error(f"Failed to stop job {job_id}: {e}")
+             return False
+ 
+     def cleanup_job(self, job_id: str) -> None:
+         """Clean up job resources and remove from tracking."""
+         try:
+             # Stop job if still running
+             if job_id in self.active_jobs and self.active_jobs[job_id]["status"] == "running":
+                 self.stop_job(job_id)
+ 
+             # Remove from tracking
+             if job_id in self.active_jobs:
+                 del self.active_jobs[job_id]
+ 
+             logger.info(f"Cleaned up job: {job_id}")
+ 
+         except Exception as e:
+             logger.error(f"Failed to cleanup job {job_id}: {e}")
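For orientation, here is a minimal sketch of driving the new orchestrator with a local Alpaca-format dataset. It mirrors the class docstring example above; the RunPod credentials are placeholders and the snippet has not been run against the released wheel.

```python
# Minimal local-dataset sketch; import paths follow the file list above,
# credentials and IDs are placeholders, not working values.
import json

from isa_model.training.cloud.job_orchestrator import JobConfig, TrainingJobOrchestrator
from isa_model.training.cloud.runpod_trainer import RunPodConfig

# prepare_dataset() accepts a local file only if it is a JSON list of
# Alpaca-style records containing at least 'instruction' and 'output'.
samples = [
    {"instruction": "Summarize the text.", "input": "RunPod rents GPUs by the hour.", "output": "A GPU rental service."},
    {"instruction": "Translate to French.", "input": "Hello", "output": "Bonjour"},
]
with open("my_dataset.json", "w") as f:
    json.dump(samples, f, indent=2)

orchestrator = TrainingJobOrchestrator(
    runpod_config=RunPodConfig(api_key="your-runpod-key", template_id="your-template-id")
)
result = orchestrator.execute_training_workflow(
    JobConfig(model_name="google/gemma-2-4b-it", dataset_source="my_dataset.json")
)
print(result["success"], result.get("model_path"))
```

Omitting `storage_config` keeps the dataset and model artifacts local: the orchestrator only uploads when a `CloudStorageManager` is configured.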