isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (199) hide show
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +40 -17
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/storage/hf_storage.py +1 -1
  26. isa_model/core/types.py +1 -0
  27. isa_model/deployment/__init__.py +5 -48
  28. isa_model/deployment/core/__init__.py +2 -31
  29. isa_model/deployment/core/deployment_manager.py +1278 -370
  30. isa_model/deployment/local/__init__.py +31 -0
  31. isa_model/deployment/local/config.py +248 -0
  32. isa_model/deployment/local/gpu_gateway.py +607 -0
  33. isa_model/deployment/local/health_checker.py +428 -0
  34. isa_model/deployment/local/provider.py +586 -0
  35. isa_model/deployment/local/tensorrt_service.py +621 -0
  36. isa_model/deployment/local/transformers_service.py +644 -0
  37. isa_model/deployment/local/vllm_service.py +527 -0
  38. isa_model/deployment/modal/__init__.py +8 -0
  39. isa_model/deployment/modal/config.py +136 -0
  40. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  41. isa_model/deployment/modal/services/__init__.py +3 -0
  42. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  43. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  44. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  45. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  46. isa_model/deployment/modal/services/video/__init__.py +1 -0
  47. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  48. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  49. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  50. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  51. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  52. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  53. isa_model/deployment/storage/__init__.py +5 -0
  54. isa_model/deployment/storage/deployment_repository.py +824 -0
  55. isa_model/deployment/triton/__init__.py +10 -0
  56. isa_model/deployment/triton/config.py +196 -0
  57. isa_model/deployment/triton/configs/__init__.py +1 -0
  58. isa_model/deployment/triton/provider.py +512 -0
  59. isa_model/deployment/triton/scripts/__init__.py +1 -0
  60. isa_model/deployment/triton/templates/__init__.py +1 -0
  61. isa_model/inference/__init__.py +47 -1
  62. isa_model/inference/ai_factory.py +137 -10
  63. isa_model/inference/legacy_services/__init__.py +21 -0
  64. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  65. isa_model/inference/legacy_services/model_service.py +573 -0
  66. isa_model/inference/legacy_services/model_serving.py +717 -0
  67. isa_model/inference/legacy_services/model_training.py +561 -0
  68. isa_model/inference/models/__init__.py +21 -0
  69. isa_model/inference/models/inference_config.py +551 -0
  70. isa_model/inference/models/inference_record.py +675 -0
  71. isa_model/inference/models/performance_models.py +714 -0
  72. isa_model/inference/repositories/__init__.py +9 -0
  73. isa_model/inference/repositories/inference_repository.py +828 -0
  74. isa_model/inference/services/audio/base_stt_service.py +184 -11
  75. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  76. isa_model/inference/services/custom_model_manager.py +277 -0
  77. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  78. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  79. isa_model/inference/services/llm/__init__.py +10 -2
  80. isa_model/inference/services/llm/base_llm_service.py +335 -24
  81. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  82. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  83. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  84. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  85. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  86. isa_model/inference/services/llm/local_llm_service.py +747 -0
  87. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  88. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  89. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  90. isa_model/inference/services/vision/__init__.py +22 -1
  91. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  92. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  93. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  94. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  95. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  96. isa_model/serving/api/cache_manager.py +245 -0
  97. isa_model/serving/api/dependencies/__init__.py +1 -0
  98. isa_model/serving/api/dependencies/auth.py +194 -0
  99. isa_model/serving/api/dependencies/database.py +139 -0
  100. isa_model/serving/api/error_handlers.py +284 -0
  101. isa_model/serving/api/fastapi_server.py +172 -22
  102. isa_model/serving/api/middleware/auth.py +8 -2
  103. isa_model/serving/api/middleware/security.py +23 -33
  104. isa_model/serving/api/middleware/tenant_context.py +414 -0
  105. isa_model/serving/api/routes/analytics.py +4 -1
  106. isa_model/serving/api/routes/config.py +645 -0
  107. isa_model/serving/api/routes/deployment_billing.py +315 -0
  108. isa_model/serving/api/routes/deployments.py +138 -2
  109. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  110. isa_model/serving/api/routes/health.py +32 -12
  111. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  112. isa_model/serving/api/routes/local_deployments.py +448 -0
  113. isa_model/serving/api/routes/tenants.py +575 -0
  114. isa_model/serving/api/routes/unified.py +680 -18
  115. isa_model/serving/api/routes/webhooks.py +479 -0
  116. isa_model/serving/api/startup.py +68 -54
  117. isa_model/utils/gpu_utils.py +311 -0
  118. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
  119. isa_model-0.4.3.dist-info/RECORD +193 -0
  120. isa_model/core/storage/minio_storage.py +0 -0
  121. isa_model/deployment/cloud/__init__.py +0 -9
  122. isa_model/deployment/cloud/modal/__init__.py +0 -10
  123. isa_model/deployment/core/deployment_config.py +0 -356
  124. isa_model/deployment/core/isa_deployment_service.py +0 -401
  125. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  126. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  127. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  128. isa_model/deployment/runtime/deployed_service.py +0 -338
  129. isa_model/deployment/services/__init__.py +0 -9
  130. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  131. isa_model/deployment/services/model_service.py +0 -332
  132. isa_model/deployment/services/service_monitor.py +0 -356
  133. isa_model/deployment/services/service_registry.py +0 -527
  134. isa_model/eval/__init__.py +0 -92
  135. isa_model/eval/benchmarks/__init__.py +0 -27
  136. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  137. isa_model/eval/benchmarks.py +0 -701
  138. isa_model/eval/config/__init__.py +0 -10
  139. isa_model/eval/config/evaluation_config.py +0 -108
  140. isa_model/eval/evaluators/__init__.py +0 -24
  141. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  142. isa_model/eval/evaluators/base_evaluator.py +0 -503
  143. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  144. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  145. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  146. isa_model/eval/example_evaluation.py +0 -395
  147. isa_model/eval/factory.py +0 -798
  148. isa_model/eval/infrastructure/__init__.py +0 -24
  149. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  150. isa_model/eval/isa_benchmarks.py +0 -700
  151. isa_model/eval/isa_integration.py +0 -582
  152. isa_model/eval/metrics.py +0 -951
  153. isa_model/eval/tests/unit/test_basic.py +0 -396
  154. isa_model/serving/api/routes/evaluations.py +0 -579
  155. isa_model/training/__init__.py +0 -168
  156. isa_model/training/annotation/annotation_schema.py +0 -47
  157. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  158. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  159. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  160. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  161. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  162. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  163. isa_model/training/annotation/views/annotation_controller.py +0 -158
  164. isa_model/training/cloud/__init__.py +0 -22
  165. isa_model/training/cloud/job_orchestrator.py +0 -402
  166. isa_model/training/cloud/runpod_trainer.py +0 -454
  167. isa_model/training/cloud/storage_manager.py +0 -482
  168. isa_model/training/core/__init__.py +0 -26
  169. isa_model/training/core/config.py +0 -181
  170. isa_model/training/core/dataset.py +0 -222
  171. isa_model/training/core/trainer.py +0 -720
  172. isa_model/training/core/utils.py +0 -213
  173. isa_model/training/examples/intelligent_training_example.py +0 -281
  174. isa_model/training/factory.py +0 -424
  175. isa_model/training/intelligent/__init__.py +0 -25
  176. isa_model/training/intelligent/decision_engine.py +0 -643
  177. isa_model/training/intelligent/intelligent_factory.py +0 -888
  178. isa_model/training/intelligent/knowledge_base.py +0 -751
  179. isa_model/training/intelligent/resource_optimizer.py +0 -839
  180. isa_model/training/intelligent/task_classifier.py +0 -576
  181. isa_model/training/storage/__init__.py +0 -24
  182. isa_model/training/storage/core_integration.py +0 -439
  183. isa_model/training/storage/training_repository.py +0 -552
  184. isa_model/training/storage/training_storage.py +0 -628
  185. isa_model-0.4.0.dist-info/RECORD +0 -182
  186. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  187. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  188. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  189. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  190. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  191. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  192. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  193. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  194. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  195. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  196. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  197. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  198. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  199. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
@@ -1,628 +0,0 @@
1
- """
2
- Training Data Storage System
3
-
4
- Provides persistent storage for training jobs, metrics, and model lifecycle data.
5
- Integrates with the core database system while maintaining training module independence.
6
- """
7
-
8
- import json
9
- import logging
10
- from typing import Dict, List, Optional, Any, Union
11
- from dataclasses import dataclass, field, asdict
12
- from datetime import datetime
13
- from pathlib import Path
14
- import uuid
15
-
16
- try:
17
- from ...core.database.supabase_client import SupabaseClient
18
- SUPABASE_AVAILABLE = True
19
- except ImportError:
20
- SUPABASE_AVAILABLE = False
21
-
22
- logger = logging.getLogger(__name__)
23
-
24
-
25
@dataclass
class TrainingJobRecord:
    """Training job record for persistent storage.

    Required fields identify the job, the model/task being trained and the
    training/resource configuration. Everything else is optional and filled
    in as the job progresses (results, timing, ownership, error details).
    """

    # Basic information
    job_id: str
    job_name: str
    status: str  # "pending", "running", "completed", "failed", "cancelled"

    # Model and task information
    base_model: str
    task_type: str
    domain: str
    dataset_source: str

    # Training configuration
    training_config: Dict[str, Any]
    resource_config: Dict[str, Any]

    # Results and metrics
    output_model_path: Optional[str] = None
    training_metrics: Optional[Dict[str, Any]] = None
    cost_breakdown: Optional[Dict[str, float]] = None

    # Timing information
    created_at: datetime = field(default_factory=datetime.now)
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None

    # User and project information
    user_id: Optional[str] = None
    project_name: Optional[str] = None
    tags: Dict[str, str] = field(default_factory=dict)

    # Error information
    error_message: Optional[str] = None
    error_details: Optional[Dict[str, Any]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert the record to a plain dictionary for storage.

        Returns:
            A dictionary built with ``dataclasses.asdict`` in which every
            top-level ``datetime`` value is replaced by its ISO 8601 string.
        """
        data = asdict(self)
        # Re-assigning existing keys while iterating is safe: the key set
        # of the dictionary does not change.
        for key, value in data.items():
            if isinstance(value, datetime):
                data[key] = value.isoformat()
        return data

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'TrainingJobRecord':
        """Create a record from a dictionary produced by :meth:`to_dict`.

        ISO 8601 strings found in ``created_at``, ``started_at`` and
        ``completed_at`` are parsed back into ``datetime`` objects. The
        input mapping is left untouched.

        Args:
            data: Field values keyed by field name.

        Returns:
            The reconstructed record.

        Raises:
            TypeError: If ``data`` contains keys that are not fields of the
                record or misses a required field.
            ValueError: If a datetime field holds a non-ISO string.
        """
        # Work on a shallow copy so the caller's dictionary (for example a
        # row object returned by a database client) is not mutated.
        payload = dict(data)
        for field_name in ('created_at', 'started_at', 'completed_at'):
            value = payload.get(field_name)
            if value and isinstance(value, str):
                payload[field_name] = datetime.fromisoformat(value)

        return cls(**payload)
83
-
84
-
85
@dataclass
class TrainingMetrics:
    """Training metrics and performance data recorded for one job step."""

    job_id: str

    # Training progress
    epoch: int
    step: int
    total_steps: int

    # Loss metrics
    training_loss: Optional[float] = None
    validation_loss: Optional[float] = None
    perplexity: Optional[float] = None

    # Performance metrics
    accuracy: Optional[float] = None
    f1_score: Optional[float] = None
    bleu_score: Optional[float] = None
    rouge_score: Optional[Dict[str, float]] = None

    # Resource utilization
    gpu_utilization: Optional[float] = None
    memory_usage: Optional[float] = None

    # Time tracking
    epoch_time: Optional[float] = None
    samples_per_second: Optional[float] = None

    # Custom metrics
    custom_metrics: Dict[str, Any] = field(default_factory=dict)

    # Timestamp
    recorded_at: datetime = field(default_factory=datetime.now)

    def to_dict(self) -> Dict[str, Any]:
        """Convert the metrics to a plain dictionary for storage.

        Returns:
            A dictionary built with ``dataclasses.asdict`` in which
            ``recorded_at`` is an ISO 8601 string.
        """
        data = asdict(self)
        if isinstance(data['recorded_at'], datetime):
            data['recorded_at'] = data['recorded_at'].isoformat()
        return data

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'TrainingMetrics':
        """Create metrics from a dictionary produced by :meth:`to_dict`.

        Mirrors ``TrainingJobRecord.from_dict`` so that both record types
        offer the same round-trip API. The input mapping is not modified.

        Args:
            data: Field values keyed by field name.

        Returns:
            The reconstructed metrics object.

        Raises:
            TypeError: If ``data`` contains keys that are not fields of the
                class or misses a required field.
            ValueError: If ``recorded_at`` holds a non-ISO string.
        """
        payload = dict(data)
        recorded_at = payload.get('recorded_at')
        if isinstance(recorded_at, str):
            payload['recorded_at'] = datetime.fromisoformat(recorded_at)
        return cls(**payload)
127
-
128
-
129
class TrainingStorage:
    """
    Training data storage system.

    Provides persistent storage for training jobs, metrics, and related data.
    Uses Supabase when available, falls back to local JSON storage.

    The public methods never raise: failures are logged and reported through
    the return value (``False``, ``None`` or an empty list).

    Example:
        ```python
        storage = TrainingStorage()

        # Store training job
        job_record = TrainingJobRecord(
            job_id="training_123",
            job_name="medical_chatbot_training",
            status="running",
            base_model="google/gemma-2-4b-it",
            task_type="chat",
            domain="medical",
            dataset_source="medical_qa.json",
            training_config={"epochs": 3, "lr": 2e-5},
            resource_config={"gpu": "RTX 4090", "provider": "runpod"}
        )

        storage.save_training_job(job_record)

        # Store metrics
        metrics = TrainingMetrics(
            job_id="training_123",
            epoch=1,
            step=100,
            total_steps=1000,
            training_loss=0.5,
            validation_loss=0.6
        )

        storage.save_training_metrics(metrics)
        ```
    """

    def __init__(self, storage_dir: Optional[str] = None, use_database: bool = True):
        """
        Initialize training storage.

        Args:
            storage_dir: Local storage directory (fallback)
            use_database: Whether to use database storage
        """
        self.use_database = use_database and SUPABASE_AVAILABLE
        self.storage_dir = Path(storage_dir or "./training_data")
        # parents=True so a nested storage_dir (e.g. "data/runs/training")
        # does not fail when the intermediate directories are missing.
        self.storage_dir.mkdir(parents=True, exist_ok=True)
        # Always defined so the attribute exists on the file backend too.
        self.db_client = None

        if self.use_database:
            try:
                self.db_client = SupabaseClient()
                logger.info("Training storage initialized with database backend")
            except Exception as e:
                logger.warning(f"Failed to initialize database client: {e}")
                self.use_database = False

        if not self.use_database:
            logger.info("Training storage initialized with local file backend")

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _validate_job_id(job_id: str) -> str:
        """Return ``job_id`` if it is usable as a single path component.

        The file backend builds paths from the job ID and deletes a whole
        directory tree per job. An empty ID, ``".."`` or an ID containing a
        path separator would resolve outside the per-job location (an empty
        ID or ``".."`` would make the delete remove the entire metrics or
        storage directory), so such IDs are rejected.

        Raises:
            ValueError: If ``job_id`` is not a plain file-name component.
        """
        if not job_id or job_id == ".." or Path(job_id).name != job_id:
            raise ValueError(f"Invalid job_id for file storage: {job_id!r}")
        return job_id

    @staticmethod
    def _serialize_updates(updates: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of ``updates`` with datetimes as ISO 8601 strings.

        Matches what ``to_dict()`` does for inserts, so that updates such as
        ``{"completed_at": datetime.now()}`` are sent in the same format.
        """
        return {
            key: value.isoformat() if isinstance(value, datetime) else value
            for key, value in updates.items()
        }

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def save_training_job(self, job_record: TrainingJobRecord) -> bool:
        """
        Save training job record.

        Args:
            job_record: Training job record to save

        Returns:
            True if successful
        """
        try:
            if self.use_database:
                return self._save_job_to_database(job_record)
            else:
                return self._save_job_to_file(job_record)
        except Exception as e:
            logger.error(f"Failed to save training job {job_record.job_id}: {e}")
            return False

    def get_training_job(self, job_id: str) -> Optional[TrainingJobRecord]:
        """
        Get training job record by ID.

        Args:
            job_id: Job ID to retrieve

        Returns:
            Training job record or None if not found
        """
        try:
            if self.use_database:
                return self._get_job_from_database(job_id)
            else:
                return self._get_job_from_file(job_id)
        except Exception as e:
            logger.error(f"Failed to get training job {job_id}: {e}")
            return None

    def update_training_job(self, job_id: str, updates: Dict[str, Any]) -> bool:
        """
        Update training job record.

        Args:
            job_id: Job ID to update
            updates: Fields to update

        Returns:
            True if successful
        """
        try:
            if self.use_database:
                return self._update_job_in_database(job_id, updates)
            else:
                return self._update_job_in_file(job_id, updates)
        except Exception as e:
            logger.error(f"Failed to update training job {job_id}: {e}")
            return False

    def list_training_jobs(
        self,
        status: Optional[str] = None,
        user_id: Optional[str] = None,
        limit: int = 100
    ) -> List[TrainingJobRecord]:
        """
        List training jobs with optional filtering.

        Args:
            status: Filter by job status
            user_id: Filter by user ID
            limit: Maximum number of jobs to return

        Returns:
            List of training job records, newest first
        """
        try:
            if self.use_database:
                return self._list_jobs_from_database(status, user_id, limit)
            else:
                return self._list_jobs_from_files(status, user_id, limit)
        except Exception as e:
            logger.error(f"Failed to list training jobs: {e}")
            return []

    def save_training_metrics(self, metrics: TrainingMetrics) -> bool:
        """
        Save training metrics.

        Args:
            metrics: Training metrics to save

        Returns:
            True if successful
        """
        try:
            if self.use_database:
                return self._save_metrics_to_database(metrics)
            else:
                return self._save_metrics_to_file(metrics)
        except Exception as e:
            logger.error(f"Failed to save training metrics for job {metrics.job_id}: {e}")
            return False

    def get_training_metrics(self, job_id: str) -> List[TrainingMetrics]:
        """
        Get training metrics for a job.

        Args:
            job_id: Job ID to get metrics for

        Returns:
            List of training metrics ordered by recording time
        """
        try:
            if self.use_database:
                return self._get_metrics_from_database(job_id)
            else:
                return self._get_metrics_from_files(job_id)
        except Exception as e:
            logger.error(f"Failed to get training metrics for job {job_id}: {e}")
            return []

    def delete_training_job(self, job_id: str) -> bool:
        """
        Delete training job and associated data.

        Args:
            job_id: Job ID to delete

        Returns:
            True if successful
        """
        try:
            if self.use_database:
                return self._delete_job_from_database(job_id)
            else:
                return self._delete_job_from_files(job_id)
        except Exception as e:
            logger.error(f"Failed to delete training job {job_id}: {e}")
            return False

    # ------------------------------------------------------------------
    # Database backend methods
    # ------------------------------------------------------------------
    def _save_job_to_database(self, job_record: TrainingJobRecord) -> bool:
        """Insert the job record into the ``training_jobs`` table."""
        if not self.use_database:
            return False

        try:
            client = self.db_client.get_client()
            data = job_record.to_dict()

            result = client.table("training_jobs").insert(data).execute()
            return len(result.data) > 0
        except Exception as e:
            logger.error(f"Database save failed: {e}")
            return False

    def _get_job_from_database(self, job_id: str) -> Optional[TrainingJobRecord]:
        """Fetch one job record from ``training_jobs`` by ``job_id``."""
        if not self.use_database:
            return None

        try:
            client = self.db_client.get_client()
            result = client.table("training_jobs").select("*").eq("job_id", job_id).execute()

            if result.data:
                return TrainingJobRecord.from_dict(result.data[0])
            return None
        except Exception as e:
            logger.error(f"Database get failed: {e}")
            return None

    def _update_job_in_database(self, job_id: str, updates: Dict[str, Any]) -> bool:
        """Apply ``updates`` to the ``training_jobs`` row with ``job_id``."""
        if not self.use_database:
            return False

        try:
            client = self.db_client.get_client()
            # Inserts go through to_dict(), which turns datetimes into ISO
            # strings; do the same here so datetime values in an update are
            # sent in a JSON-compatible form. The caller's dict is untouched.
            payload = self._serialize_updates(updates)
            result = client.table("training_jobs").update(payload).eq("job_id", job_id).execute()
            return len(result.data) > 0
        except Exception as e:
            logger.error(f"Database update failed: {e}")
            return False

    def _list_jobs_from_database(
        self,
        status: Optional[str],
        user_id: Optional[str],
        limit: int
    ) -> List[TrainingJobRecord]:
        """List job records from ``training_jobs``, newest first."""
        if not self.use_database:
            return []

        try:
            client = self.db_client.get_client()
            query = client.table("training_jobs").select("*")

            if status:
                query = query.eq("status", status)
            if user_id:
                query = query.eq("user_id", user_id)

            query = query.order("created_at", desc=True).limit(limit)
            result = query.execute()

            return [TrainingJobRecord.from_dict(record) for record in result.data]
        except Exception as e:
            logger.error(f"Database list failed: {e}")
            return []

    def _save_metrics_to_database(self, metrics: TrainingMetrics) -> bool:
        """Insert the metrics into the ``training_metrics`` table."""
        if not self.use_database:
            return False

        try:
            client = self.db_client.get_client()
            data = metrics.to_dict()

            result = client.table("training_metrics").insert(data).execute()
            return len(result.data) > 0
        except Exception as e:
            logger.error(f"Database metrics save failed: {e}")
            return False

    def _get_metrics_from_database(self, job_id: str) -> List[TrainingMetrics]:
        """Fetch all metrics of a job, ordered by ``recorded_at``."""
        if not self.use_database:
            return []

        try:
            client = self.db_client.get_client()
            result = client.table("training_metrics").select("*").eq("job_id", job_id).order("recorded_at").execute()

            metrics_list = []
            for row in result.data:
                # Copy so the rows held by the response object stay as-is.
                record = dict(row)
                if isinstance(record['recorded_at'], str):
                    record['recorded_at'] = datetime.fromisoformat(record['recorded_at'])
                metrics_list.append(TrainingMetrics(**record))

            return metrics_list
        except Exception as e:
            logger.error(f"Database metrics get failed: {e}")
            return []

    def _delete_job_from_database(self, job_id: str) -> bool:
        """Delete a job and its metrics from the database."""
        if not self.use_database:
            return False

        try:
            client = self.db_client.get_client()

            # Delete metrics first
            client.table("training_metrics").delete().eq("job_id", job_id).execute()

            # Delete job record
            result = client.table("training_jobs").delete().eq("job_id", job_id).execute()
            return len(result.data) > 0
        except Exception as e:
            logger.error(f"Database delete failed: {e}")
            return False

    # ------------------------------------------------------------------
    # File backend methods (fallback)
    # ------------------------------------------------------------------
    def _save_job_to_file(self, job_record: TrainingJobRecord) -> bool:
        """Write the job record to ``<storage_dir>/jobs/<job_id>.json``."""
        try:
            job_id = self._validate_job_id(job_record.job_id)
            job_file = self.storage_dir / "jobs" / f"{job_id}.json"
            job_file.parent.mkdir(parents=True, exist_ok=True)

            # json.dump escapes non-ASCII by default, so UTF-8 here stays
            # compatible with files written under any platform encoding.
            with open(job_file, 'w', encoding='utf-8') as f:
                json.dump(job_record.to_dict(), f, indent=2, default=str)

            return True
        except Exception as e:
            logger.error(f"File save failed: {e}")
            return False

    def _get_job_from_file(self, job_id: str) -> Optional[TrainingJobRecord]:
        """Load a job record from its JSON file, or None if it is missing."""
        try:
            job_id = self._validate_job_id(job_id)
            job_file = self.storage_dir / "jobs" / f"{job_id}.json"
            if not job_file.exists():
                return None

            with open(job_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            return TrainingJobRecord.from_dict(data)
        except Exception as e:
            logger.error(f"File get failed: {e}")
            return None

    def _update_job_in_file(self, job_id: str, updates: Dict[str, Any]) -> bool:
        """Load the job file, apply ``updates`` and write it back."""
        try:
            job_record = self._get_job_from_file(job_id)
            if not job_record:
                return False

            # Update fields; keys that are not attributes are ignored.
            for key, value in updates.items():
                if hasattr(job_record, key):
                    setattr(job_record, key, value)

            return self._save_job_to_file(job_record)
        except Exception as e:
            logger.error(f"File update failed: {e}")
            return False

    def _list_jobs_from_files(
        self,
        status: Optional[str],
        user_id: Optional[str],
        limit: int
    ) -> List[TrainingJobRecord]:
        """List job records from local files, newest first."""
        try:
            jobs_dir = self.storage_dir / "jobs"
            if not jobs_dir.exists():
                return []

            jobs = []
            for job_file in jobs_dir.glob("*.json"):
                # A single unreadable file must not hide the other jobs.
                try:
                    with open(job_file, 'r', encoding='utf-8') as f:
                        data = json.load(f)

                    job_record = TrainingJobRecord.from_dict(data)

                    # Apply filters
                    if status and job_record.status != status:
                        continue
                    if user_id and job_record.user_id != user_id:
                        continue

                    jobs.append(job_record)
                except Exception as e:
                    logger.warning(f"Failed to load job file {job_file}: {e}")
                    continue

            # Sort by creation time (newest first)
            jobs.sort(key=lambda x: x.created_at, reverse=True)

            return jobs[:limit]
        except Exception as e:
            logger.error(f"File list failed: {e}")
            return []

    def _save_metrics_to_file(self, metrics: TrainingMetrics) -> bool:
        """Write metrics to ``<storage_dir>/metrics/<job_id>/metrics_<ts>.json``."""
        try:
            job_id = self._validate_job_id(metrics.job_id)
            metrics_dir = self.storage_dir / "metrics" / job_id
            metrics_dir.mkdir(parents=True, exist_ok=True)

            # Use timestamp (microsecond resolution) for unique filename
            timestamp = metrics.recorded_at.strftime("%Y%m%d_%H%M%S_%f")
            metrics_file = metrics_dir / f"metrics_{timestamp}.json"

            with open(metrics_file, 'w', encoding='utf-8') as f:
                json.dump(metrics.to_dict(), f, indent=2, default=str)

            return True
        except Exception as e:
            logger.error(f"File metrics save failed: {e}")
            return False

    def _get_metrics_from_files(self, job_id: str) -> List[TrainingMetrics]:
        """Load all metrics files of a job, ordered by recording time."""
        try:
            job_id = self._validate_job_id(job_id)
            metrics_dir = self.storage_dir / "metrics" / job_id
            if not metrics_dir.exists():
                return []

            metrics_list = []
            for metrics_file in metrics_dir.glob("metrics_*.json"):
                # A single unreadable file must not hide the other metrics.
                try:
                    with open(metrics_file, 'r', encoding='utf-8') as f:
                        data = json.load(f)

                    if isinstance(data['recorded_at'], str):
                        data['recorded_at'] = datetime.fromisoformat(data['recorded_at'])

                    metrics_list.append(TrainingMetrics(**data))
                except Exception as e:
                    logger.warning(f"Failed to load metrics file {metrics_file}: {e}")
                    continue

            # Sort by recording time
            metrics_list.sort(key=lambda x: x.recorded_at)

            return metrics_list
        except Exception as e:
            logger.error(f"File metrics get failed: {e}")
            return []

    def _delete_job_from_files(self, job_id: str) -> bool:
        """Delete the job file and the job's metrics directory.

        Returns True when nothing went wrong, including when there was
        nothing to delete.
        """
        try:
            # Validate before touching the file system: the metrics path is
            # removed recursively, so it must stay inside the per-job folder.
            job_id = self._validate_job_id(job_id)

            # Delete job file
            job_file = self.storage_dir / "jobs" / f"{job_id}.json"
            if job_file.exists():
                job_file.unlink()

            # Delete metrics directory
            metrics_dir = self.storage_dir / "metrics" / job_id
            if metrics_dir.exists():
                import shutil
                shutil.rmtree(metrics_dir)

            return True
        except Exception as e:
            logger.error(f"File delete failed: {e}")
            return False

    def get_statistics(self) -> Dict[str, Any]:
        """Get storage statistics.

        Returns:
            A dictionary with the number of jobs (at most 1000 are
            inspected), a per-status count, the active backend name and
            whether the Supabase client could be imported. On failure a
            dictionary with a single ``"error"`` key is returned.
        """
        try:
            all_jobs = self.list_training_jobs(limit=1000)

            total_jobs = len(all_jobs)
            status_counts: Dict[str, int] = {}
            for job in all_jobs:
                status_counts[job.status] = status_counts.get(job.status, 0) + 1

            return {
                "total_jobs": total_jobs,
                "status_breakdown": status_counts,
                "backend": "database" if self.use_database else "file",
                "storage_available": SUPABASE_AVAILABLE
            }
        except Exception as e:
            logger.error(f"Failed to get statistics: {e}")
            return {"error": str(e)}