ob-metaflow-extensions 1.1.99__py2.py3-none-any.whl → 1.1.100__py2.py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

@@ -13,6 +13,7 @@ import json
13
13
  import sys
14
14
  from tempfile import TemporaryDirectory
15
15
  from subprocess import check_output, Popen
16
+ import subprocess
16
17
  from datetime import datetime, timedelta
17
18
  from functools import wraps
18
19
  from collections import namedtuple
@@ -315,7 +316,23 @@ def _update_charts(results, md_dict):
315
316
 
316
317
  # This code is adapted from: https://github.com/outerbounds/monitorbench
317
318
  class GPUProfiler:
318
- def __init__(self, interval=1, monitor_batch_duration=200):
319
+ def __init__(
320
+ self, interval=1, monitor_batch_duration=200, artifact_name="gpu_profile_data"
321
+ ):
322
+ self._interval = interval
323
+ self._monitor_batch_duration = monitor_batch_duration
324
+ self.artifact_name = artifact_name
325
+ self._card_setup_finished = False
326
+ self._card_comps = {
327
+ "max_utilization": {},
328
+ "charts": {},
329
+ "reading_duration": {},
330
+ "error_component": None,
331
+ }
332
+ self._monitor_started = False
333
+ self._start_monitor()
334
+
335
+ def _start_monitor(self):
319
336
  self.driver_ver, self.cuda_ver, self.error = self._read_versions()
320
337
  (
321
338
  self.interconnect_data,
@@ -327,16 +344,14 @@ class GPUProfiler:
327
344
  else:
328
345
  self.devices = self._read_devices()
329
346
  self._monitor = GPUMonitor(
330
- interval=interval, duration=monitor_batch_duration
347
+ interval=self._interval, duration=self._monitor_batch_duration
331
348
  )
332
349
  self._monitor_thread = threading.Thread(
333
350
  target=self._monitor._monitor_update_thread, daemon=True
334
351
  )
352
+ self.error = None
335
353
  self._monitor_thread.start()
336
- self._interval = interval
337
-
338
- self._card_comps = {"max_utilization": {}, "charts": {}, "reading_duration": {}}
339
- self._card_created = False
354
+ self._monitor_started = True
340
355
 
341
356
  def finish(self):
342
357
  ret = {
@@ -384,21 +399,60 @@ class GPUProfiler:
384
399
  return
385
400
 
386
401
  while True:
402
+ # There is a possibility that the `monitor` thread is not started yet
403
+ # because it somehow crashed at the very start.
404
+ if not self._monitor_started:
405
+ self._start_monitor()
406
+ time.sleep(self._interval)
407
+ continue
408
+
387
409
  readings = self._make_reading()
410
+
388
411
  if readings is None:
389
412
  print("GPU Profiler readings are none", file=sys.stderr)
390
413
  time.sleep(self._interval)
391
414
  continue
415
+
416
+ if not self._card_setup_finished:
417
+ self._setup_card()
418
+ time.sleep(self._interval)
419
+ continue
420
+
392
421
  _update_utilization(readings, self._card_comps["max_utilization"])
393
422
  _update_charts(readings, self._card_comps["charts"])
394
423
  current.card["gpu_profile"].refresh()
395
424
  time.sleep(self._interval)
396
425
 
397
- def _setup_card(self, artifact_name):
426
+ def _setup_card(self):
398
427
  from metaflow import current
399
428
 
400
429
  results = self._make_reading()
430
+ if "profile" not in results:
431
+ if self._card_comps["error_component"] is None:
432
+ self._card_comps["error_component"] = Markdown(
433
+ "## GPU profile failed: %s" % results["error"]
434
+ )
435
+ current.card["gpu_profile"].append(self._card_comps["error_component"])
436
+ else:
437
+ self._card_comps["error_component"].update(
438
+ Markdown("## GPU profile failed: %s" % results["error"])
439
+ )
440
+ current.card["gpu_profile"].refresh()
441
+ return
442
+
401
443
  els = current.card["gpu_profile"]
444
+ self._card_comps["error_component"] = None
445
+ els.clear()
446
+
447
+ current.card["gpu_profile"].append(
448
+ Markdown("# GPU profile for `%s`" % current.pathspec)
449
+ )
450
+ current.card["gpu_profile"].append(
451
+ Markdown(
452
+ "_Started at: %s_"
453
+ % datetime.now().astimezone().strftime("%Y-%m-%dT%H:%M:%S %z")
454
+ )
455
+ )
402
456
 
403
457
  def _drivers():
404
458
  els.append(Markdown("## Drivers"))
@@ -444,7 +498,7 @@ class GPUProfiler:
444
498
  Table(data=_rows, headers=["Device ID", "Max GPU %", "Max memory"])
445
499
  )
446
500
  els.append(
447
- Markdown(f"Detailed data saved in an artifact `{artifact_name}`")
501
+ Markdown(f"Detailed data saved in an artifact `{self.artifact_name}`")
448
502
  )
449
503
  return rows
450
504
 
@@ -479,20 +533,36 @@ class GPUProfiler:
479
533
  _interconnect()
480
534
  self._card_comps["max_utilization"] = _utilization()
481
535
  self._card_comps["charts"] = _plots()
536
+ self._card_setup_finished = True
482
537
 
483
538
  def _read_versions(self):
484
539
  def parse(r, s):
485
540
  return r.search(s).group(1).strip().decode("utf-8")
486
541
 
487
542
  try:
488
- out = check_output(["nvidia-smi"])
543
+ result = subprocess.run(
544
+ ["nvidia-smi"],
545
+ check=True, # This will raise a CalledProcessError if the command fails
546
+ stdout=subprocess.PIPE, # Capture stdout
547
+ stderr=subprocess.PIPE, # Capture stderr
548
+ )
549
+ # Access the standard output
550
+ out = result.stdout
489
551
  return parse(DRIVER_VER, out), parse(CUDA_VER, out), None
490
552
  except FileNotFoundError:
491
553
  return None, None, "nvidia-smi not found"
492
554
  except AttributeError:
493
555
  return None, None, "nvidia-smi output is unexpected"
494
- except:
495
- return None, None, "nvidia-smi error"
556
+ except subprocess.CalledProcessError as e:
557
+ _error_message = "nvidia-smi error (CalledProcessError calling nvidia-smi)"
558
+ if e.stderr is not None:
559
+ _error_message = (
560
+ "nvidia-smi error (CalledProcessError stderr) \n %s \n %s"
561
+ % (e.stderr.decode("utf-8"), e.stdout.decode("utf-8"))
562
+ )
563
+ return None, None, _error_message
564
+ except Exception as e:
565
+ return None, None, "nvidia-smi error (unknown error) \n%s" % str(e)
496
566
 
497
567
  def _read_devices(self):
498
568
  out = check_output(
@@ -572,7 +642,9 @@ class gpu_profile:
572
642
  def __call__(self, f):
573
643
  @wraps(f)
574
644
  def func(s):
575
- prof = GPUProfiler(interval=self.interval)
645
+ prof = GPUProfiler(
646
+ interval=self.interval, artifact_name=self.artifact_prefix + "data"
647
+ )
576
648
  if self.include_artifacts:
577
649
  setattr(s, self.artifact_prefix + "num_gpus", len(prof.devices))
578
650
 
@@ -585,7 +657,7 @@ class gpu_profile:
585
657
  % datetime.now().astimezone().strftime("%Y-%m-%dT%H:%M:%S %z")
586
658
  )
587
659
  )
588
- prof._setup_card(self.artifact_prefix + "data")
660
+ prof._setup_card()
589
661
  current.card["gpu_profile"].refresh()
590
662
  update_thread = threading.Thread(target=prof._update_card, daemon=True)
591
663
  update_thread.start()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob-metaflow-extensions
3
- Version: 1.1.99
3
+ Version: 1.1.100
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial
@@ -28,13 +28,13 @@ metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py,sha256=d_5UhXqZ
28
28
  metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py,sha256=AI_kcm1hZV3JRxJkookcH6twiGnAYjk9Dx-MeoYz60Y,8511
29
29
  metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py,sha256=9lUM4Cqi5RjrHBRfG6AQMRz8-R96eZC8Ih0KD2lv22Y,1858
30
30
  metaflow_extensions/outerbounds/profilers/__init__.py,sha256=wa_jhnCBr82TBxoS0e8b6_6sLyZX0fdHicuGJZNTqKw,29
31
- metaflow_extensions/outerbounds/profilers/gpu.py,sha256=a5YZAepujuP0uDqG9UpXBlZS3wjUt4Yv8CjybXqeT2c,24342
31
+ metaflow_extensions/outerbounds/profilers/gpu.py,sha256=7yeGa_ji1NE_JpvWne8yjuKhykAphiMNpCIVBvU8aiw,27066
32
32
  metaflow_extensions/outerbounds/toplevel/__init__.py,sha256=qWUJSv_r5hXJ7jV_On4nEasKIfUCm6_UjkjXWA_A1Ts,90
33
33
  metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py,sha256=Zq3OuL1bOod8KJra-Zk8B3gNhSHoWEGteM9T7g0pp6E,1881
34
34
  metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py,sha256=WUuhz2YQfI4fz7nIcipwwWq781eaoHEk7n4GAn1npDg,63
35
35
  metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py,sha256=BbZiaH3uILlEZ6ntBLKeNyqn3If8nIXZFq_Apd7Dhco,70
36
36
  metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
37
- ob_metaflow_extensions-1.1.99.dist-info/METADATA,sha256=oBe6bSzZL9iBxorXqhVdGW_yKtv4JupzrO9gXsF21lw,520
38
- ob_metaflow_extensions-1.1.99.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
39
- ob_metaflow_extensions-1.1.99.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
40
- ob_metaflow_extensions-1.1.99.dist-info/RECORD,,
37
+ ob_metaflow_extensions-1.1.100.dist-info/METADATA,sha256=JHOEhTwZ33-QppRgWmYPlwFz-yJC8EsjM8bZsai0hA0,521
38
+ ob_metaflow_extensions-1.1.100.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
39
+ ob_metaflow_extensions-1.1.100.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
40
+ ob_metaflow_extensions-1.1.100.dist-info/RECORD,,