matrice-compute 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -56,7 +56,7 @@ class ResourcesTracker:
             return cpu_utilization, memory_utilization
         return 0, 0
 
-    @log_errors(default_return=(0, 0), raise_exception=False)
+    @log_errors(default_return=(0, 0), raise_exception=False, log_error=False)
     def get_container_cpu_and_memory_with_container_id(self, container_id: str) -> Tuple[float, float]:
         """
         Get CPU and memory usage for a specific container by its ID.
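The only change in this hunk (and in the matching decorator hunks below) is the extra `log_error=False` argument, which pairs with the switch from `logging.warning` to `logging.debug` further down: transient docker/nvidia-smi failures are demoted rather than logged loudly. The `log_errors` decorator itself is not part of this diff; the sketch below is a hypothetical reconstruction based only on the parameters visible at the call sites.

```python
# Hypothetical reconstruction of log_errors -- the real implementation is not
# shown in this diff; only the parameter names come from the call sites.
import functools
import logging


def log_errors(default_return=None, raise_exception=True, log_error=True):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as exc:
                if log_error:
                    logging.error("%s failed: %s", func.__name__, exc)
                if raise_exception:
                    raise
                return default_return
        return wrapper
    return decorator
```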
@@ -67,32 +67,46 @@ class ResourcesTracker:
         Returns:
             Tuple[float, float]: CPU utilization percentage and memory usage in MB.
         """
-        stats_result = subprocess.run(
-            [
-                "docker",
-                "stats",
-                "--no-stream",
-                "--format",
-                "{{.ID}}: {{.CPUPerc}} CPU, {{.MemUsage}} RAM",
-                container_id,
-            ],
-            capture_output=True,
-            text=True,
-            check=True,
-        )
-        stats = stats_result.stdout.strip().split(": ")[1].split(", ")
-        cpu_usage = float(stats[0].replace("% CPU", "").strip())
-        memory_usage = stats[1].split(" / ")[0]
-        mem_value, mem_unit = memory_usage[:-3], memory_usage[-3:]
-        if mem_unit == "KiB":
-            memory_usage_mb = float(mem_value) / 1024
-        elif mem_unit == "MiB":
-            memory_usage_mb = float(mem_value)
-        elif mem_unit == "GiB":
-            memory_usage_mb = float(mem_value) * 1024
-        else:
-            memory_usage_mb = float(mem_value)
-        return cpu_usage, memory_usage_mb
+        try:
+            stats_result = subprocess.run(
+                [
+                    "docker",
+                    "stats",
+                    "--no-stream",
+                    "--format",
+                    "{{.ID}}: {{.CPUPerc}} CPU, {{.MemUsage}} RAM",
+                    container_id,
+                ],
+                capture_output=True,
+                text=True,
+                check=False,
+                timeout=10,
+            )
+            if stats_result.returncode != 0:
+                logging.debug("docker stats command failed for container %s", container_id)
+                return 0, 0
+            stats = stats_result.stdout.strip().split(": ")[1].split(", ")
+            cpu_usage = float(stats[0].replace("% CPU", "").strip())
+            memory_usage = stats[1].split(" / ")[0]
+            mem_value, mem_unit = memory_usage[:-3], memory_usage[-3:]
+            if mem_unit == "KiB":
+                memory_usage_mb = float(mem_value) / 1024
+            elif mem_unit == "MiB":
+                memory_usage_mb = float(mem_value)
+            elif mem_unit == "GiB":
+                memory_usage_mb = float(mem_value) * 1024
+            else:
+                memory_usage_mb = float(mem_value)
+            return cpu_usage, memory_usage_mb
+        except subprocess.TimeoutExpired:
+            logging.debug("docker stats command timed out for container %s", container_id)
+            return 0, 0
+        except (ValueError, IndexError) as e:
+            logging.debug("Error parsing docker stats for container %s: %s", container_id, e)
+            return 0, 0
+        except Exception as e:
+            logging.debug("Unexpected error getting container stats for %s: %s", container_id, e)
+            return 0, 0
 
     @log_errors(default_return=(0, 0), raise_exception=False, log_error=False)
     def get_container_gpu_info(self, container_id: str) -> Tuple[float, int]:
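The new body keeps the original parsing of the `docker stats` output and wraps it in a timeout, a return-code check, and explicit exception handlers. As a worked example of that parsing, on a made-up sample line in the `{{.ID}}: {{.CPUPerc}} CPU, {{.MemUsage}} RAM` format:

```python
# Standalone illustration of the parsing above; the container ID and values
# in the sample line are invented for the example.
def parse_docker_stats_line(line: str) -> tuple[float, float]:
    stats = line.strip().split(": ")[1].split(", ")
    cpu_usage = float(stats[0].replace("% CPU", "").strip())    # "1.23% CPU" -> 1.23
    memory_usage = stats[1].split(" / ")[0]                     # "512MiB / 7.667GiB RAM" -> "512MiB"
    mem_value, mem_unit = memory_usage[:-3], memory_usage[-3:]  # assumes a 3-char unit (KiB/MiB/GiB)
    factor = {"KiB": 1 / 1024, "MiB": 1.0, "GiB": 1024.0}.get(mem_unit, 1.0)
    return cpu_usage, float(mem_value) * factor


print(parse_docker_stats_line("abc123def456: 1.23% CPU, 512MiB / 7.667GiB RAM"))
# (1.23, 512.0)
```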
@@ -110,7 +124,7 @@ class ResourcesTracker:
         gpu_mem_used = self.get_container_gpu_memory_usage(container_pid)
         return gpu_util, gpu_mem_used
 
-    @log_errors(default_return="", raise_exception=False)
+    @log_errors(default_return="", raise_exception=False, log_error=False)
     def get_pid_id_by_container_id(self, container_id: str) -> str:
         """
         Get PID for a container ID.
@@ -121,20 +135,31 @@ class ResourcesTracker:
         Returns:
             str: PID of the container.
         """
-        pid_result = subprocess.run(
-            [
-                "docker",
-                "inspect",
-                "--format",
-                "{{.State.Pid}}",
-                container_id,
-            ],
-            capture_output=True,
-            text=True,
-            check=True,
-        )
-        container_pid = pid_result.stdout.strip()
-        return container_pid
+        try:
+            pid_result = subprocess.run(
+                [
+                    "docker",
+                    "inspect",
+                    "--format",
+                    "{{.State.Pid}}",
+                    container_id,
+                ],
+                capture_output=True,
+                text=True,
+                check=False,
+                timeout=10,
+            )
+            if pid_result.returncode != 0:
+                logging.debug("docker inspect command failed for container %s", container_id)
+                return ""
+            container_pid = pid_result.stdout.strip()
+            return container_pid
+        except subprocess.TimeoutExpired:
+            logging.debug("docker inspect command timed out for container %s", container_id)
+            return ""
+        except Exception as e:
+            logging.debug("Error getting PID for container %s: %s", container_id, e)
+            return ""
 
     @log_errors(default_return=0, raise_exception=False, log_error=False)
     def get_container_gpu_usage(self, container_pid: str) -> float:
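For context, the PID returned by `get_pid_id_by_container_id` is what the GPU helpers match against the per-process rows reported by `nvidia-smi`. Only the two context lines of the `get_container_gpu_info` hunk above and the method signatures appear in this diff; the first two lines of the sketch below are an assumption about how the pieces are wired together, not the package's actual code.

```python
# Illustrative wiring only -- the first two lines are assumed, not taken from the diff.
def get_container_gpu_info(self, container_id: str) -> Tuple[float, int]:
    container_pid = self.get_pid_id_by_container_id(container_id)       # "" when docker inspect fails
    gpu_util = self.get_container_gpu_usage(container_pid)              # summed SM utilization (%)
    gpu_mem_used = self.get_container_gpu_memory_usage(container_pid)   # summed GPU memory (MiB)
    return gpu_util, gpu_mem_used
```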
@@ -155,9 +180,12 @@ class ResourcesTracker:
                 ["nvidia-smi", "pmon", "-c", "1"],
                 capture_output=True,
                 text=True,
-                check=True,
+                check=False,
                 timeout=5,
             )
+            if result.returncode != 0:
+                logging.debug("nvidia-smi pmon command failed in get_container_gpu_usage")
+                return 0
             pmon_output = result.stdout.strip().split("\n")
             for line in pmon_output[2:]:
                 parts = line.split()
@@ -167,7 +195,16 @@ class ResourcesTracker:
                 if pid == str(container_pid):
                     gpu_util += float(gpu_usage) if gpu_usage != "-" else 0
         except subprocess.TimeoutExpired:
-            logging.warning("nvidia-smi pmon command timed out after 5 seconds in get_container_gpu_usage")
+            logging.debug("nvidia-smi pmon command timed out after 5 seconds in get_container_gpu_usage")
+            return 0
+        except (ValueError, IndexError) as e:
+            logging.debug("Error parsing GPU usage info: %s", e)
+            return 0
+        except FileNotFoundError:
+            logging.debug("nvidia-smi not found on this system")
+            return 0
+        except Exception as e:
+            logging.debug("Unexpected error in get_container_gpu_usage: %s", e)
             return 0
         return gpu_util
 
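`nvidia-smi pmon -c 1` prints two header lines followed by one row per GPU/process pair, which is why the loop starts at `pmon_output[2:]` and treats `-` as zero. The column extraction itself falls outside this hunk; the sketch below assumes the usual `pmon` layout (`gpu pid type sm mem enc dec command`), so the PID is the second field and the `sm` utilization the fourth:

```python
# Hedged sketch of the surrounding pmon parsing; column indices are assumed
# from typical "nvidia-smi pmon" output and are not shown in this diff.
def sum_gpu_util_for_pid(pmon_text: str, container_pid: str) -> float:
    gpu_util = 0.0
    for line in pmon_text.strip().split("\n")[2:]:  # skip the two header lines
        parts = line.split()
        if len(parts) < 4:
            continue
        pid, gpu_usage = parts[1], parts[3]         # assumed columns: pid, sm%
        if pid == str(container_pid):
            gpu_util += float(gpu_usage) if gpu_usage != "-" else 0
    return gpu_util
```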
@@ -196,9 +233,12 @@ class ResourcesTracker:
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
                 text=True,
-                check=True,
+                check=False,
                 timeout=5,
             )
+            if result.returncode != 0:
+                logging.debug("nvidia-smi command failed in get_container_gpu_memory_usage")
+                return 0
             for line in result.stdout.splitlines():
                 parts = line.strip().split(", ")
                 if len(parts) == 2:
@@ -206,7 +246,16 @@ class ResourcesTracker:
                     if process_pid == str(container_pid):
                         total_memory += int(used_memory)
         except subprocess.TimeoutExpired:
-            logging.warning("nvidia-smi command timed out after 5 seconds in get_container_gpu_memory_usage")
+            logging.debug("nvidia-smi command timed out after 5 seconds in get_container_gpu_memory_usage")
+            return 0
+        except (ValueError, IndexError) as e:
+            logging.debug("Error parsing GPU memory usage info: %s", e)
+            return 0
+        except FileNotFoundError:
+            logging.debug("nvidia-smi not found on this system")
+            return 0
+        except Exception as e:
+            logging.debug("Unexpected error in get_container_gpu_memory_usage: %s", e)
             return 0
         return total_memory
 
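The loop above expects lines of `pid, used_memory` pairs. The actual `nvidia-smi` arguments are outside this hunk; a query such as the one below (an assumption, not taken from the package) produces output in exactly that shape:

```python
import subprocess

# Assumed query; the package's real command arguments are not shown in this diff.
result = subprocess.run(
    ["nvidia-smi", "--query-compute-apps=pid,used_memory", "--format=csv,noheader,nounits"],
    stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False, timeout=5,
)
total_memory = 0
for line in result.stdout.splitlines():
    parts = line.strip().split(", ")          # e.g. "12345, 1024"
    if len(parts) == 2:
        process_pid, used_memory = parts
        if process_pid == "12345":            # hypothetical container PID
            total_memory += int(used_memory)  # MiB
print(total_memory)
```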
@@ -238,17 +287,40 @@ class ResourcesTracker:
             return gpu_memory_free, gpu_utilization
 
         try:
-            subprocess.check_output("nvidia-smi", timeout=5)
+            result = subprocess.run(
+                ["nvidia-smi"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                timeout=5,
+                check=False,
+            )
+            if result.returncode != 0:
+                logging.debug("nvidia-smi command failed in _get_gpu_resources")
+                return 0, 0.0
         except subprocess.TimeoutExpired:
-            logging.warning("nvidia-smi command timed out after 5 seconds in _get_gpu_resources")
+            logging.debug("nvidia-smi command timed out after 5 seconds in _get_gpu_resources")
+            return 0, 0.0
+        except FileNotFoundError:
+            logging.debug("nvidia-smi not found on this system")
+            return 0, 0.0
+        except Exception as e:
+            logging.debug("Error running nvidia-smi in _get_gpu_resources: %s", e)
             return 0, 0.0
 
         info_list = get_gpu_info()
-        for info in info_list:
-            info_split = info.split(", ")
-            gpu_memory_free += int(info_split[5])
-            gpu_utilization += float(info_split[2])
-        gpu_utilization /= len(info_list) if info_list else 1
+        if not info_list:
+            return 0, 0.0
+
+        try:
+            for info in info_list:
+                info_split = info.split(", ")
+                if len(info_split) >= 6:
+                    gpu_memory_free += int(info_split[5])
+                    gpu_utilization += float(info_split[2])
+            gpu_utilization /= len(info_list) if info_list else 1
+        except (ValueError, IndexError) as e:
+            logging.debug("Error parsing GPU resources: %s", e)
+            return 0, 0.0
 
         return gpu_memory_free, gpu_utilization
 
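`get_gpu_info()` is defined elsewhere in the package; the new code only assumes it returns one comma-separated record per GPU with the utilization percentage at index 2 and free memory at index 5 (those indices come from the parsing above). A self-contained sketch of the aggregation, with a hypothetical query that yields records of that shape:

```python
import subprocess

# Hypothetical stand-in for get_gpu_info(); the real helper is not part of this
# diff. The chosen fields put utilization.gpu at index 2 and memory.free at
# index 5, matching the parsing above.
def get_gpu_info_sketch() -> list[str]:
    result = subprocess.run(
        [
            "nvidia-smi",
            "--query-gpu=index,name,utilization.gpu,memory.total,memory.used,memory.free",
            "--format=csv,noheader,nounits",
        ],
        capture_output=True, text=True, check=False, timeout=5,
    )
    return result.stdout.strip().splitlines() if result.returncode == 0 else []


def aggregate_gpu_resources(info_list: list[str]) -> tuple[int, float]:
    gpu_memory_free, gpu_utilization = 0, 0.0
    for info in info_list:
        info_split = info.split(", ")
        if len(info_split) >= 6:
            gpu_memory_free += int(info_split[5])    # free memory (MiB), summed over GPUs
            gpu_utilization += float(info_split[2])  # utilization (%), averaged below
    gpu_utilization /= len(info_list) if info_list else 1
    return gpu_memory_free, gpu_utilization
```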