gptmed 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gptmed/__init__.py +37 -3
- gptmed/model/__init__.py +2 -2
- gptmed/observability/__init__.py +43 -0
- gptmed/observability/base.py +369 -0
- gptmed/observability/callbacks.py +397 -0
- gptmed/observability/metrics_tracker.py +544 -0
- gptmed/services/__init__.py +15 -0
- gptmed/services/device_manager.py +252 -0
- gptmed/services/training_service.py +489 -0
- gptmed/training/trainer.py +124 -10
- gptmed/utils/checkpoints.py +1 -1
- {gptmed-0.3.4.dist-info → gptmed-0.4.0.dist-info}/METADATA +180 -43
- {gptmed-0.3.4.dist-info → gptmed-0.4.0.dist-info}/RECORD +17 -10
- {gptmed-0.3.4.dist-info → gptmed-0.4.0.dist-info}/WHEEL +0 -0
- {gptmed-0.3.4.dist-info → gptmed-0.4.0.dist-info}/entry_points.txt +0 -0
- {gptmed-0.3.4.dist-info → gptmed-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {gptmed-0.3.4.dist-info → gptmed-0.4.0.dist-info}/top_level.txt +0 -0
gptmed/services/device_manager.py (new file)

@@ -0,0 +1,252 @@
+"""
+Device Manager Service
+
+PURPOSE:
+Manages device selection and configuration for model training and inference.
+Implements Strategy Pattern for flexible device handling.
+
+DESIGN PATTERNS:
+- Strategy Pattern: Different strategies for CPU vs GPU
+- Dependency Injection: DeviceManager can be injected into services
+- Single Responsibility: Only handles device-related concerns
+
+WHAT THIS FILE DOES:
+1. Validates device availability (CUDA check)
+2. Provides device selection logic with fallback
+3. Manages device-specific configurations
+4. Ensures consistent device handling across the codebase
+
+PACKAGES USED:
+- torch: Device detection and management
+- abc: Abstract base classes for strategy pattern
+"""
+
+from abc import ABC, abstractmethod
+from typing import Optional
+import torch
+
+
+class DeviceStrategy(ABC):
+    """
+    Abstract base class for device strategies.
+    Implements Strategy Pattern for different device types.
+    """
+
+    @abstractmethod
+    def get_device(self) -> str:
+        """
+        Get the device string for PyTorch.
+
+        Returns:
+            Device string ('cuda' or 'cpu')
+        """
+        pass
+
+    @abstractmethod
+    def is_available(self) -> bool:
+        """
+        Check if the device is available.
+
+        Returns:
+            True if device is available, False otherwise
+        """
+        pass
+
+    @abstractmethod
+    def get_device_info(self) -> dict:
+        """
+        Get information about the device.
+
+        Returns:
+            Dictionary with device information
+        """
+        pass
+
+
+class CUDAStrategy(DeviceStrategy):
+    """Strategy for CUDA/GPU devices."""
+
+    def get_device(self) -> str:
+        """Get CUDA device if available."""
+        return 'cuda' if self.is_available() else 'cpu'
+
+    def is_available(self) -> bool:
+        """Check if CUDA is available."""
+        return torch.cuda.is_available()
+
+    def get_device_info(self) -> dict:
+        """Get CUDA device information."""
+        if not self.is_available():
+            return {
+                'device': 'cuda',
+                'available': False,
+                'message': 'CUDA not available'
+            }
+
+        return {
+            'device': 'cuda',
+            'available': True,
+            'device_name': torch.cuda.get_device_name(0),
+            'device_count': torch.cuda.device_count(),
+            'cuda_version': torch.version.cuda if torch.version.cuda else 'N/A',
+        }
+
+
+class CPUStrategy(DeviceStrategy):
+    """Strategy for CPU devices."""
+
+    def get_device(self) -> str:
+        """Always return CPU."""
+        return 'cpu'
+
+    def is_available(self) -> bool:
+        """CPU is always available."""
+        return True
+
+    def get_device_info(self) -> dict:
+        """Get CPU device information."""
+        return {
+            'device': 'cpu',
+            'available': True,
+            'num_threads': torch.get_num_threads(),
+        }
+
+
+class DeviceManager:
+    """
+    Manages device selection and configuration.
+
+    Follows Single Responsibility Principle - only handles device concerns.
+    Uses Strategy Pattern for different device types.
+
+    Example:
+        >>> device_manager = DeviceManager(preferred_device='cuda')
+        >>> device = device_manager.get_device()
+        >>> print(f"Using device: {device}")
+    """
+
+    def __init__(self, preferred_device: str = 'cuda', allow_fallback: bool = True):
+        """
+        Initialize DeviceManager.
+
+        Args:
+            preferred_device: Preferred device ('cuda' or 'cpu')
+            allow_fallback: If True, fallback to CPU if CUDA unavailable
+        """
+        self.preferred_device = preferred_device.lower()
+        self.allow_fallback = allow_fallback
+
+        # Validate device input
+        if self.preferred_device not in ['cuda', 'cpu']:
+            raise ValueError(
+                f"Invalid device: {preferred_device}. Must be 'cuda' or 'cpu'"
+            )
+
+        # Select strategy based on preferred device
+        if self.preferred_device == 'cuda':
+            self.strategy = CUDAStrategy()
+        else:
+            self.strategy = CPUStrategy()
+
+    def get_device(self) -> str:
+        """
+        Get the actual device to use.
+
+        Returns fallback device if preferred is unavailable and fallback is allowed.
+
+        Returns:
+            Device string ('cuda' or 'cpu')
+
+        Raises:
+            RuntimeError: If preferred device unavailable and fallback disabled
+        """
+        if self.strategy.is_available():
+            return self.strategy.get_device()
+
+        # Handle unavailable device
+        if self.allow_fallback and self.preferred_device == 'cuda':
+            # Fallback to CPU
+            return 'cpu'
+        else:
+            raise RuntimeError(
+                f"Device '{self.preferred_device}' is not available and "
+                f"fallback is {'disabled' if not self.allow_fallback else 'not applicable'}"
+            )
+
+    def get_device_info(self) -> dict:
+        """
+        Get information about the current device.
+
+        Returns:
+            Dictionary with device information
+        """
+        info = self.strategy.get_device_info()
+        info['preferred_device'] = self.preferred_device
+        info['actual_device'] = self.get_device()
+        info['allow_fallback'] = self.allow_fallback
+        return info
+
+    def print_device_info(self, verbose: bool = True) -> None:
+        """
+        Print device information.
+
+        Args:
+            verbose: If True, print detailed information
+        """
+        if not verbose:
+            return
+
+        info = self.get_device_info()
+        actual = info['actual_device']
+        preferred = info['preferred_device']
+
+        print(f"\n💻 Device Configuration:")
+        print(f" Preferred: {preferred}")
+        print(f" Using: {actual}")
+
+        if preferred != actual:
+            print(f" ⚠️ Fallback to CPU (CUDA not available)")
+
+        if actual == 'cuda' and info.get('available'):
+            print(f" GPU: {info.get('device_name', 'Unknown')}")
+            print(f" CUDA Version: {info.get('cuda_version', 'N/A')}")
+            print(f" GPU Count: {info.get('device_count', 0)}")
+        elif actual == 'cpu':
+            print(f" CPU Threads: {info.get('num_threads', 'N/A')}")
+
+    @staticmethod
+    def validate_device(device: str) -> str:
+        """
+        Validate and normalize device string.
+
+        Args:
+            device: Device string to validate
+
+        Returns:
+            Normalized device string
+
+        Raises:
+            ValueError: If device is invalid
+        """
+        device = device.lower().strip()
+
+        if device not in ['cuda', 'cpu', 'auto']:
+            raise ValueError(
+                f"Invalid device: '{device}'. Must be 'cuda', 'cpu', or 'auto'"
+            )
+
+        # Auto-select best available device
+        if device == 'auto':
+            return 'cuda' if torch.cuda.is_available() else 'cpu'
+
+        return device
+
+    @staticmethod
+    def get_optimal_device() -> str:
+        """
+        Get the optimal device for the current environment.
+
+        Returns:
+            'cuda' if available, otherwise 'cpu'
+        """
+        return 'cuda' if torch.cuda.is_available() else 'cpu'
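
For orientation, here is a minimal usage sketch of the DeviceManager API added in this release. It is not part of the package; it only exercises the constructor and methods visible in the diff above (validate_device, get_device, get_device_info, print_device_info), and it imports directly from the module path gptmed.services.device_manager, since any re-exports from gptmed.services are not shown in this diff.

# Usage sketch (illustrative, not part of the package).
from gptmed.services.device_manager import DeviceManager

# Resolve 'auto' to 'cuda' when a GPU is visible, otherwise 'cpu'.
preferred = DeviceManager.validate_device('auto')

# Prefer that device, falling back to CPU instead of raising if CUDA is unavailable.
manager = DeviceManager(preferred_device=preferred, allow_fallback=True)
manager.print_device_info(verbose=True)

device = manager.get_device()      # 'cuda' or 'cpu'
info = manager.get_device_info()   # includes 'preferred_device', 'actual_device', 'allow_fallback'

Because the preferred device and fallback policy live in the constructor, a single DeviceManager instance can be injected into consumers such as the new TrainingService rather than each component querying torch.cuda directly, which is the dependency-injection point called out in the module docstring.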