wandb 0.20.2rc20250616__py3-none-macosx_11_0_arm64.whl → 0.21.1__py3-none-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140)
  1. wandb/__init__.py +16 -14
  2. wandb/__init__.pyi +450 -472
  3. wandb/agents/pyagent.py +41 -12
  4. wandb/analytics/sentry.py +7 -2
  5. wandb/apis/importers/mlflow.py +1 -1
  6. wandb/apis/internal.py +3 -0
  7. wandb/apis/paginator.py +17 -4
  8. wandb/apis/public/__init__.py +1 -1
  9. wandb/apis/public/api.py +606 -359
  10. wandb/apis/public/artifacts.py +214 -16
  11. wandb/apis/public/automations.py +19 -3
  12. wandb/apis/public/files.py +177 -38
  13. wandb/apis/public/history.py +67 -15
  14. wandb/apis/public/integrations.py +25 -2
  15. wandb/apis/public/jobs.py +90 -2
  16. wandb/apis/public/projects.py +161 -69
  17. wandb/apis/public/query_generator.py +11 -1
  18. wandb/apis/public/registries/registries_search.py +7 -15
  19. wandb/apis/public/reports.py +147 -13
  20. wandb/apis/public/runs.py +315 -128
  21. wandb/apis/public/sweeps.py +222 -22
  22. wandb/apis/public/teams.py +41 -4
  23. wandb/apis/public/users.py +45 -4
  24. wandb/automations/__init__.py +10 -10
  25. wandb/automations/_filters/run_metrics.py +0 -2
  26. wandb/automations/_utils.py +0 -2
  27. wandb/automations/actions.py +0 -2
  28. wandb/automations/automations.py +0 -2
  29. wandb/automations/events.py +0 -2
  30. wandb/beta/workflows.py +66 -30
  31. wandb/bin/gpu_stats +0 -0
  32. wandb/bin/wandb-core +0 -0
  33. wandb/cli/cli.py +80 -1
  34. wandb/env.py +8 -0
  35. wandb/errors/errors.py +4 -1
  36. wandb/integration/catboost/catboost.py +6 -2
  37. wandb/integration/kfp/kfp_patch.py +3 -1
  38. wandb/integration/lightning/fabric/logger.py +3 -4
  39. wandb/integration/metaflow/__init__.py +6 -0
  40. wandb/integration/metaflow/data_pandas.py +74 -0
  41. wandb/integration/metaflow/errors.py +13 -0
  42. wandb/integration/metaflow/metaflow.py +205 -190
  43. wandb/integration/openai/fine_tuning.py +1 -2
  44. wandb/integration/sb3/sb3.py +3 -3
  45. wandb/integration/ultralytics/callback.py +6 -2
  46. wandb/jupyter.py +5 -5
  47. wandb/plot/__init__.py +2 -0
  48. wandb/plot/bar.py +30 -29
  49. wandb/plot/confusion_matrix.py +75 -71
  50. wandb/plot/custom_chart.py +30 -7
  51. wandb/plot/histogram.py +26 -25
  52. wandb/plot/line.py +33 -32
  53. wandb/plot/line_series.py +100 -103
  54. wandb/plot/pr_curve.py +33 -32
  55. wandb/plot/roc_curve.py +38 -38
  56. wandb/plot/scatter.py +27 -27
  57. wandb/proto/v3/wandb_internal_pb2.py +366 -385
  58. wandb/proto/v3/wandb_settings_pb2.py +2 -2
  59. wandb/proto/v3/wandb_telemetry_pb2.py +4 -4
  60. wandb/proto/v4/wandb_internal_pb2.py +352 -356
  61. wandb/proto/v4/wandb_settings_pb2.py +2 -2
  62. wandb/proto/v4/wandb_telemetry_pb2.py +4 -4
  63. wandb/proto/v5/wandb_internal_pb2.py +352 -356
  64. wandb/proto/v5/wandb_settings_pb2.py +2 -2
  65. wandb/proto/v5/wandb_telemetry_pb2.py +4 -4
  66. wandb/proto/v6/wandb_internal_pb2.py +352 -356
  67. wandb/proto/v6/wandb_settings_pb2.py +2 -2
  68. wandb/proto/v6/wandb_telemetry_pb2.py +4 -4
  69. wandb/proto/wandb_deprecated.py +6 -0
  70. wandb/sdk/artifacts/_generated/__init__.py +12 -1
  71. wandb/sdk/artifacts/_generated/input_types.py +20 -2
  72. wandb/sdk/artifacts/_generated/link_artifact.py +21 -0
  73. wandb/sdk/artifacts/_generated/operations.py +9 -0
  74. wandb/sdk/artifacts/_internal_artifact.py +19 -8
  75. wandb/sdk/artifacts/_validators.py +48 -2
  76. wandb/sdk/artifacts/artifact.py +269 -96
  77. wandb/sdk/data_types/audio.py +38 -10
  78. wandb/sdk/data_types/base_types/media.py +15 -63
  79. wandb/sdk/data_types/base_types/wb_value.py +6 -6
  80. wandb/sdk/data_types/graph.py +48 -14
  81. wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +1 -3
  82. wandb/sdk/data_types/helper_types/image_mask.py +1 -3
  83. wandb/sdk/data_types/histogram.py +34 -21
  84. wandb/sdk/data_types/html.py +35 -12
  85. wandb/sdk/data_types/image.py +104 -68
  86. wandb/sdk/data_types/molecule.py +32 -19
  87. wandb/sdk/data_types/object_3d.py +36 -17
  88. wandb/sdk/data_types/plotly.py +18 -5
  89. wandb/sdk/data_types/saved_model.py +7 -9
  90. wandb/sdk/data_types/table.py +99 -70
  91. wandb/sdk/data_types/trace_tree.py +12 -12
  92. wandb/sdk/data_types/video.py +53 -26
  93. wandb/sdk/integration_utils/auto_logging.py +2 -2
  94. wandb/sdk/interface/interface.py +8 -19
  95. wandb/sdk/interface/interface_shared.py +7 -16
  96. wandb/sdk/internal/datastore.py +18 -18
  97. wandb/sdk/internal/handler.py +3 -5
  98. wandb/sdk/internal/internal_api.py +60 -0
  99. wandb/sdk/internal/job_builder.py +6 -0
  100. wandb/sdk/internal/sender.py +23 -3
  101. wandb/sdk/internal/sender_config.py +9 -0
  102. wandb/sdk/launch/_project_spec.py +3 -3
  103. wandb/sdk/launch/agent/agent.py +11 -4
  104. wandb/sdk/launch/agent/job_status_tracker.py +3 -1
  105. wandb/sdk/launch/agent/run_queue_item_file_saver.py +2 -2
  106. wandb/sdk/launch/create_job.py +3 -1
  107. wandb/sdk/launch/inputs/internal.py +3 -4
  108. wandb/sdk/launch/inputs/schema.py +1 -0
  109. wandb/sdk/launch/runner/kubernetes_monitor.py +1 -0
  110. wandb/sdk/launch/runner/kubernetes_runner.py +328 -1
  111. wandb/sdk/launch/sweeps/scheduler.py +2 -3
  112. wandb/sdk/launch/utils.py +3 -3
  113. wandb/sdk/lib/asyncio_compat.py +3 -0
  114. wandb/sdk/lib/console_capture.py +66 -19
  115. wandb/sdk/lib/deprecate.py +1 -7
  116. wandb/sdk/lib/disabled.py +1 -1
  117. wandb/sdk/lib/hashutil.py +14 -1
  118. wandb/sdk/lib/module.py +7 -13
  119. wandb/sdk/lib/progress.py +0 -19
  120. wandb/sdk/lib/sock_client.py +0 -4
  121. wandb/sdk/wandb_init.py +67 -93
  122. wandb/sdk/wandb_login.py +18 -14
  123. wandb/sdk/wandb_metric.py +2 -0
  124. wandb/sdk/wandb_require.py +0 -1
  125. wandb/sdk/wandb_run.py +429 -527
  126. wandb/sdk/wandb_settings.py +364 -74
  127. wandb/sdk/wandb_setup.py +28 -28
  128. wandb/sdk/wandb_sweep.py +14 -13
  129. wandb/sdk/wandb_watch.py +4 -6
  130. wandb/sync/sync.py +10 -0
  131. wandb/util.py +57 -0
  132. wandb/wandb_run.py +1 -2
  133. {wandb-0.20.2rc20250616.dist-info → wandb-0.21.1.dist-info}/METADATA +1 -1
  134. {wandb-0.20.2rc20250616.dist-info → wandb-0.21.1.dist-info}/RECORD +137 -137
  135. wandb/sdk/wandb_metadata.py +0 -623
  136. wandb/vendor/pynvml/__init__.py +0 -0
  137. wandb/vendor/pynvml/pynvml.py +0 -4779
  138. {wandb-0.20.2rc20250616.dist-info → wandb-0.21.1.dist-info}/WHEEL +0 -0
  139. {wandb-0.20.2rc20250616.dist-info → wandb-0.21.1.dist-info}/entry_points.txt +0 -0
  140. {wandb-0.20.2rc20250616.dist-info → wandb-0.21.1.dist-info}/licenses/LICENSE +0 -0
wandb/sdk/launch/runner/kubernetes_runner.py CHANGED
@@ -6,6 +6,8 @@ import datetime
6
6
  import json
7
7
  import logging
8
8
  import os
9
+ import time
10
+ import uuid
9
11
  from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
10
12
 
11
13
  import yaml
@@ -20,6 +22,7 @@ from wandb.sdk.launch.registry.local_registry import LocalRegistry
20
22
  from wandb.sdk.launch.runner.abstract import Status
21
23
  from wandb.sdk.launch.runner.kubernetes_monitor import (
22
24
  WANDB_K8S_LABEL_AGENT,
25
+ WANDB_K8S_LABEL_AUXILIARY_RESOURCE,
23
26
  WANDB_K8S_LABEL_MONITOR,
24
27
  WANDB_K8S_RUN_ID,
25
28
  CustomResource,
@@ -47,6 +50,9 @@ get_module(
47
50
 
48
51
  import kubernetes_asyncio # type: ignore # noqa: E402
49
52
  from kubernetes_asyncio import client # noqa: E402
53
+ from kubernetes_asyncio.client.api.apps_v1_api import ( # type: ignore # noqa: E402
54
+ AppsV1Api,
55
+ )
50
56
  from kubernetes_asyncio.client.api.batch_v1_api import ( # type: ignore # noqa: E402
51
57
  BatchV1Api,
52
58
  )
@@ -78,9 +84,11 @@ class KubernetesSubmittedRun(AbstractRun):
78
84
  self,
79
85
  batch_api: "BatchV1Api",
80
86
  core_api: "CoreV1Api",
87
+ apps_api: "AppsV1Api",
81
88
  name: str,
82
89
  namespace: Optional[str] = "default",
83
90
  secret: Optional["V1Secret"] = None,
91
+ auxiliary_resource_label_key: Optional[str] = None,
84
92
  ) -> None:
85
93
  """Initialize a KubernetesSubmittedRun.
86
94
 
@@ -104,10 +112,12 @@ class KubernetesSubmittedRun(AbstractRun):
104
112
  """
105
113
  self.batch_api = batch_api
106
114
  self.core_api = core_api
115
+ self.apps_api = apps_api
107
116
  self.name = name
108
117
  self.namespace = namespace
109
118
  self._fail_count = 0
110
119
  self.secret = secret
120
+ self.auxiliary_resource_label_key = auxiliary_resource_label_key
111
121
 
112
122
  @property
113
123
  def id(self) -> str:
@@ -149,6 +159,7 @@ class KubernetesSubmittedRun(AbstractRun):
149
159
  await asyncio.sleep(5)
150
160
 
151
161
  await self._delete_secret()
162
+ await self._delete_auxiliary_resources_by_label()
152
163
  return (
153
164
  status.state == "finished"
154
165
  ) # todo: not sure if this (copied from aws runner) is the right approach? should we return false on failure
@@ -157,6 +168,7 @@ class KubernetesSubmittedRun(AbstractRun):
157
168
  status = LaunchKubernetesMonitor.get_status(self.name)
158
169
  if status in ["stopped", "failed", "finished", "preempted"]:
159
170
  await self._delete_secret()
171
+ await self._delete_auxiliary_resources_by_label()
160
172
  return status
161
173
 
162
174
  async def cancel(self) -> None:
@@ -167,6 +179,7 @@ class KubernetesSubmittedRun(AbstractRun):
167
179
  name=self.name,
168
180
  )
169
181
  await self._delete_secret()
182
+ await self._delete_auxiliary_resources_by_label()
170
183
  except ApiException as e:
171
184
  raise LaunchError(
172
185
  f"Failed to delete Kubernetes Job {self.name} in namespace {self.namespace}: {str(e)}"
@@ -181,6 +194,52 @@ class KubernetesSubmittedRun(AbstractRun):
181
194
  )
182
195
  self.secret = None
183
196
 
197
+ async def _delete_auxiliary_resources_by_label(self) -> None:
198
+ if self.auxiliary_resource_label_key is None:
199
+ return
200
+
201
+ label_selector = (
202
+ f"{WANDB_K8S_LABEL_AUXILIARY_RESOURCE}={self.auxiliary_resource_label_key}"
203
+ )
204
+
205
+ try:
206
+ resource_cleanups = [
207
+ (self.core_api, "service"),
208
+ (self.batch_api, "job"),
209
+ (self.core_api, "pod"),
210
+ (self.core_api, "config_map"),
211
+ (self.core_api, "secret"),
212
+ (self.apps_api, "deployment"),
213
+ (self.apps_api, "replica_set"),
214
+ (self.apps_api, "daemon_set"),
215
+ ]
216
+
217
+ for api_client, resource_type in resource_cleanups:
218
+ try:
219
+ list_method = getattr(
220
+ api_client, f"list_namespaced_{resource_type}"
221
+ )
222
+ delete_method = getattr(
223
+ api_client, f"delete_namespaced_{resource_type}"
224
+ )
225
+
226
+ # List resources with our label
227
+ resources = await list_method(
228
+ namespace=self.namespace, label_selector=label_selector
229
+ )
230
+
231
+ # Delete each resource
232
+ for resource in resources.items:
233
+ await delete_method(
234
+ name=resource.metadata.name, namespace=self.namespace
235
+ )
236
+
237
+ except (AttributeError, ApiException) as e:
238
+ wandb.termwarn(f"Could not clean up {resource_type}: {e}")
239
+
240
+ except Exception as e:
241
+ wandb.termwarn(f"Failed to clean up some auxiliary resources: {e}")
242
+
184
243
 
185
244
  class CrdSubmittedRun(AbstractRun):
186
245
  """Run submitted to a CRD backend, e.g. Volcano."""
@@ -366,6 +425,7 @@ class KubernetesRunner(AbstractRunner):
366
425
  job_metadata["generateName"] = make_name_dns_safe(
367
426
  f"launch-{launch_project.target_entity}-{launch_project.target_project}-"
368
427
  )
428
+ job_metadata["namespace"] = namespace
369
429
 
370
430
  for i, cont in enumerate(containers):
371
431
  if "name" not in cont:
@@ -489,6 +549,235 @@ class KubernetesRunner(AbstractRunner):
489
549
 
490
550
  return job, api_key_secret
491
551
 
552
+ async def _wait_for_resource_ready(
553
+ self,
554
+ api_client: kubernetes_asyncio.client.ApiClient,
555
+ config: Dict[str, Any],
556
+ namespace: str,
557
+ timeout_seconds: int = 300,
558
+ ) -> None:
559
+ """Wait for a Kubernetes resource to be ready.
560
+
561
+ Arguments:
562
+ api_client: The Kubernetes API client.
563
+ config: The resource configuration.
564
+ namespace: The namespace where the resource was created.
565
+ timeout_seconds: Maximum time to wait for readiness.
566
+ """
567
+ resource_kind = config.get("kind")
568
+ resource_name = config.get("metadata", {}).get("name")
569
+
570
+ if not resource_kind or not resource_name:
571
+ wandb.termerror(
572
+ f"{LOG_PREFIX}Cannot wait for resource without kind or name"
573
+ )
574
+ return
575
+
576
+ wandb.termlog(
577
+ f"{LOG_PREFIX}Waiting for {resource_kind} '{resource_name}' to be ready..."
578
+ )
579
+
580
+ start_time = time.time()
581
+
582
+ if resource_kind == "Deployment":
583
+ await self._wait_for_deployment_ready(
584
+ api_client, resource_name, namespace, timeout_seconds
585
+ )
586
+ elif resource_kind == "Service":
587
+ await self._wait_for_service_ready(
588
+ api_client, resource_name, namespace, timeout_seconds
589
+ )
590
+ elif resource_kind == "Pod":
591
+ await self._wait_for_pod_ready(
592
+ api_client, resource_name, namespace, timeout_seconds
593
+ )
594
+ else:
595
+ wandb.termlog(
596
+ f"{LOG_PREFIX}No specific readiness check for {resource_kind}, waiting 5 seconds..."
597
+ )
598
+ await asyncio.sleep(5)
599
+
600
+ elapsed = time.time() - start_time
601
+ wandb.termlog(
602
+ f"{LOG_PREFIX}{resource_kind} '{resource_name}' is ready after {elapsed:.1f}s"
603
+ )
604
+
605
+ async def _wait_for_deployment_ready(
606
+ self,
607
+ api_client: kubernetes_asyncio.client.ApiClient,
608
+ name: str,
609
+ namespace: str,
610
+ timeout_seconds: int,
611
+ ) -> None:
612
+ """Wait for a Deployment to be ready."""
613
+ apps_api = kubernetes_asyncio.client.AppsV1Api(api_client)
614
+
615
+ async def check_deployment_ready():
616
+ deployment = await apps_api.read_namespaced_deployment(
617
+ name=name, namespace=namespace
618
+ )
619
+ status = deployment.status
620
+
621
+ if status.ready_replicas and status.replicas:
622
+ return status.ready_replicas >= status.replicas
623
+
624
+ return False
625
+
626
+ await self._wait_with_timeout(check_deployment_ready, timeout_seconds, name)
627
+
628
+ async def _wait_for_service_ready(
629
+ self,
630
+ api_client: kubernetes_asyncio.client.ApiClient,
631
+ name: str,
632
+ namespace: str,
633
+ timeout_seconds: int,
634
+ ) -> None:
635
+ """Wait for a Service to have endpoints."""
636
+ core_api = kubernetes_asyncio.client.CoreV1Api(api_client)
637
+
638
+ async def check_service_ready():
639
+ endpoints = await core_api.read_namespaced_endpoints(
640
+ name=name, namespace=namespace
641
+ )
642
+ if endpoints.subsets:
643
+ for subset in endpoints.subsets:
644
+ if subset.addresses: # These are ready pod addresses
645
+ return True
646
+ return False
647
+
648
+ await self._wait_with_timeout(check_service_ready, timeout_seconds, name)
649
+
650
+ async def _wait_for_pod_ready(
651
+ self,
652
+ api_client: kubernetes_asyncio.client.ApiClient,
653
+ name: str,
654
+ namespace: str,
655
+ timeout_seconds: int,
656
+ ) -> None:
657
+ """Wait for a Pod to be ready."""
658
+ core_api = kubernetes_asyncio.client.CoreV1Api(api_client)
659
+
660
+ async def check_pod_ready():
661
+ pod = await core_api.read_namespaced_pod(name=name, namespace=namespace)
662
+ if pod.status.phase == "Running":
663
+ if pod.status.container_statuses:
664
+ return all(status.ready for status in pod.status.container_statuses)
665
+ return True
666
+ return False
667
+
668
+ await self._wait_with_timeout(check_pod_ready, timeout_seconds, name)
669
+
670
+ async def _wait_with_timeout(
671
+ self, check_func, timeout_seconds: int, name: str
672
+ ) -> None:
673
+ """Generic timeout wrapper for readiness checks."""
674
+ start_time = time.time()
675
+
676
+ while time.time() - start_time < timeout_seconds:
677
+ try:
678
+ if await check_func():
679
+ return
680
+ except kubernetes_asyncio.client.ApiException as e:
681
+ if e.status == 404:
682
+ pass
683
+ else:
684
+ wandb.termerror(
685
+ f"{LOG_PREFIX}Error waiting for resource '{name}': {e}"
686
+ )
687
+ raise
688
+ except Exception as e:
689
+ wandb.termerror(f"{LOG_PREFIX}Error waiting for resource '{name}': {e}")
690
+ raise
691
+ await asyncio.sleep(2)
692
+
693
+ raise LaunchError(
694
+ f"Resource '{name}' not ready within {timeout_seconds} seconds"
695
+ )
696
+
697
+ async def _prepare_resource(
698
+ self,
699
+ api_client: kubernetes_asyncio.client.ApiClient,
700
+ config: Dict[str, Any],
701
+ namespace: str,
702
+ run_id: str,
703
+ auxiliary_resource_label_key: str,
704
+ launch_project: LaunchProject,
705
+ api_key_secret: Optional["V1Secret"] = None,
706
+ wait_for_ready: bool = True,
707
+ wait_timeout: int = 300,
708
+ ) -> None:
709
+ """Prepare a service for launch.
710
+
711
+ Arguments:
712
+ api_client: The Kubernetes API client.
713
+ config: The resource configuration to prepare.
714
+ namespace: The namespace to create the resource in.
715
+ run_id: The run ID to label the resource with.
716
+ auxiliary_resource_label_key: The key of the auxiliary resource label.
717
+ launch_project: The launch project to get environment variables from.
718
+ api_key_secret: The API key secret to inject.
719
+ wait_for_ready: Whether to wait for the resource to be ready after creation.
720
+ wait_timeout: Maximum time in seconds to wait for resource readiness.
721
+ """
722
+ config.setdefault("metadata", {})
723
+ config["metadata"].setdefault("labels", {})
724
+ config["metadata"]["labels"][WANDB_K8S_RUN_ID] = run_id
725
+ config["metadata"]["labels"][WANDB_K8S_LABEL_AUXILIARY_RESOURCE] = (
726
+ auxiliary_resource_label_key
727
+ )
728
+ config["metadata"]["labels"]["wandb.ai/created-by"] = "launch-agent"
729
+
730
+ if config.get("kind") == "Service" or config.get("kind") == "Deployment":
731
+ config.setdefault("metadata", {})
732
+ original_name = config["metadata"].get("name", config.get("kind"))
733
+ safe_name = make_name_dns_safe(original_name)
734
+ safe_entity = make_name_dns_safe(launch_project.target_entity or "")
735
+ safe_project = make_name_dns_safe(launch_project.target_project or "")
736
+ safe_run_id = make_name_dns_safe(run_id or "")
737
+
738
+ new_name = f"{safe_name}-{safe_entity}-{safe_project}-{safe_run_id}"
739
+ config["metadata"]["name"] = new_name
740
+ wandb.termlog(
741
+ f"{LOG_PREFIX}Modified {config.get('kind')} name from '{original_name}' to '{new_name}'"
742
+ )
743
+
744
+ env_vars = launch_project.get_env_vars_dict(
745
+ self._api, MAX_ENV_LENGTHS[self.__class__.__name__]
746
+ )
747
+ wandb_config_env = {
748
+ "WANDB_CONFIG": env_vars.get("WANDB_CONFIG", "{}"),
749
+ }
750
+ add_wandb_env(config, wandb_config_env)
751
+
752
+ if api_key_secret:
753
+ for cont in yield_containers(config):
754
+ env = cont.setdefault("env", [])
755
+ env.append(
756
+ {
757
+ "name": "WANDB_API_KEY",
758
+ "valueFrom": {
759
+ "secretKeyRef": {
760
+ "name": api_key_secret.metadata.name,
761
+ "key": "password",
762
+ }
763
+ },
764
+ }
765
+ )
766
+ cont["env"] = env
767
+
768
+ try:
769
+ await kubernetes_asyncio.utils.create_from_dict(
770
+ api_client, config, namespace=namespace
771
+ )
772
+
773
+ if wait_for_ready:
774
+ await self._wait_for_resource_ready(
775
+ api_client, config, namespace, wait_timeout
776
+ )
777
+ except Exception as e:
778
+ wandb.termerror(f"{LOG_PREFIX}Failed to create Kubernetes resource: {e}")
779
+ raise LaunchError(f"Failed to create Kubernetes resource: {e}")
780
+
492
781
  async def run(
493
782
  self, launch_project: LaunchProject, image_uri: str
494
783
  ) -> Optional[AbstractRun]:
@@ -630,10 +919,42 @@ class KubernetesRunner(AbstractRunner):
630
919
 
631
920
  batch_api = kubernetes_asyncio.client.BatchV1Api(api_client)
632
921
  core_api = kubernetes_asyncio.client.CoreV1Api(api_client)
922
+ apps_api = kubernetes_asyncio.client.AppsV1Api(api_client)
923
+
633
924
  namespace = self.get_namespace(resource_args, context)
634
925
  job, secret = await self._inject_defaults(
635
926
  resource_args, launch_project, image_uri, namespace, core_api
636
927
  )
928
+
929
+ additional_services = launch_project.launch_spec.get("additional_services", [])
930
+ auxiliary_resource_label_key = None
931
+ if additional_services:
932
+ wandb.termlog(
933
+ f"{LOG_PREFIX}Creating additional services: {additional_services}"
934
+ )
935
+ auxiliary_resource_label_key = f"aux-{uuid.uuid4()}"
936
+
937
+ wait_for_ready = resource_args.get("wait_for_ready", True)
938
+ wait_timeout = resource_args.get("wait_timeout", 300)
939
+
940
+ await asyncio.gather(
941
+ *[
942
+ self._prepare_resource(
943
+ api_client,
944
+ resource.get("config"),
945
+ namespace,
946
+ launch_project.run_id,
947
+ auxiliary_resource_label_key,
948
+ launch_project,
949
+ secret,
950
+ wait_for_ready,
951
+ wait_timeout,
952
+ )
953
+ for resource in additional_services
954
+ if resource.get("config", {})
955
+ ]
956
+ )
957
+
637
958
  msg = "Creating Kubernetes job"
638
959
  if "name" in resource_args:
639
960
  msg += f": {resource_args['name']}"
@@ -658,7 +979,13 @@ class KubernetesRunner(AbstractRunner):
658
979
  job_name = job_response.metadata.name
659
980
  LaunchKubernetesMonitor.monitor_namespace(namespace)
660
981
  submitted_job = KubernetesSubmittedRun(
661
- batch_api, core_api, job_name, namespace, secret
982
+ batch_api,
983
+ core_api,
984
+ apps_api,
985
+ job_name,
986
+ namespace,
987
+ secret,
988
+ auxiliary_resource_label_key,
662
989
  )
663
990
  if self.backend_config[PROJECT_SYNCHRONOUS]:
664
991
  await submitted_job.wait()
wandb/sdk/launch/sweeps/scheduler.py CHANGED
@@ -36,7 +36,6 @@ if TYPE_CHECKING:
36
36
  import wandb.apis.public as public
37
37
  from wandb.apis.internal import Api
38
38
  from wandb.apis.public import QueuedRun, Run
39
- from wandb.sdk.wandb_run import Run as SdkRun
40
39
 
41
40
 
42
41
  _logger = logging.getLogger(__name__)
@@ -255,10 +254,10 @@ class Scheduler(ABC):
255
254
  _id: w for _id, w in self._workers.items() if _id not in self.busy_workers
256
255
  }
257
256
 
258
- def _init_wandb_run(self) -> "SdkRun":
257
+ def _init_wandb_run(self) -> "wandb.Run":
259
258
  """Controls resume or init logic for a scheduler wandb run."""
260
259
  settings = wandb.Settings(disable_job_creation=True)
261
- run: SdkRun = wandb.init( # type: ignore
260
+ run: wandb.Run = wandb.init( # type: ignore
262
261
  name=f"Scheduler.{self._sweep_id}",
263
262
  resume="allow",
264
263
  config=self._kwargs, # when run as a job, this sets config
wandb/sdk/launch/utils.py CHANGED
@@ -380,9 +380,9 @@ def diff_pip_requirements(req_1: List[str], req_2: List[str]) -> Dict[str, str]:
380
380
  else:
381
381
  raise ValueError(f"Unable to parse pip requirements file line: {line}")
382
382
  if _name is not None:
383
- assert re.match(
384
- _VALID_PIP_PACKAGE_REGEX, _name
385
- ), f"Invalid pip package name {_name}"
383
+ assert re.match(_VALID_PIP_PACKAGE_REGEX, _name), (
384
+ f"Invalid pip package name {_name}"
385
+ )
386
386
  d[_name] = _version
387
387
  return d
388
388
 
wandb/sdk/lib/asyncio_compat.py CHANGED
@@ -100,6 +100,9 @@ class _Runner:
100
100
  raise _RunnerCancelledError()
101
101
 
102
102
  finally:
103
+ # NOTE: asyncio.run() cancels all tasks after the main task exits,
104
+ # but this is not documented, so we cancel them explicitly here
105
+ # as well. It also blocks until canceled tasks complete.
103
106
  cancellation_task.cancel()
104
107
  fn_task.cancel()
105
108
 
wandb/sdk/lib/console_capture.py CHANGED
@@ -25,17 +25,38 @@ In particular, it does not work with some combinations of pytest's
25
25
 
26
26
  from __future__ import annotations
27
27
 
28
+ import logging
28
29
  import sys
29
30
  import threading
30
31
  from typing import IO, AnyStr, Callable, Protocol
31
32
 
33
+ from . import wb_logging
34
+
35
+ _logger = logging.getLogger(__name__)
36
+
32
37
 
33
38
  class CannotCaptureConsoleError(Exception):
34
39
  """The module failed to patch stdout or stderr."""
35
40
 
36
41
 
37
42
  class _WriteCallback(Protocol):
38
- """A callback that receives intercepted bytes or string data."""
43
+ """A callback that receives intercepted bytes or string data.
44
+
45
+ This may be called from any thread, but is only called from one thread
46
+ at a time.
47
+
48
+ Note on errors: Any error raised during the callback will clear all
49
+ callbacks. This means that if a user presses Ctrl-C at an unlucky time
50
+ during a run, we will stop uploading console output---but it's not
51
+ likely to be a problem unless something catches the KeyboardInterrupt.
52
+
53
+ Regular Exceptions are caught and logged instead of bubbling up to the
54
+ user's print() statements; other exceptions like KeyboardInterrupt are
55
+ re-raised.
56
+
57
+ Callbacks should handle all exceptions---a callback that raises any
58
+ Exception is considered buggy.
59
+ """
39
60
 
40
61
  def __call__(
41
62
  self,
@@ -45,6 +66,8 @@ class _WriteCallback(Protocol):
45
66
  ) -> None:
46
67
  """Intercept data passed to `write()`.
47
68
 
69
+ See the protocol docstring for information about exceptions.
70
+
48
71
  Args:
49
72
  data: The object passed to stderr's or stdout's `write()`.
50
73
  written: The number of bytes or characters written.
@@ -52,7 +75,9 @@ class _WriteCallback(Protocol):
52
75
  """
53
76
 
54
77
 
55
- _module_lock = threading.Lock()
78
+ # A reentrant lock is used to catch callbacks that write to stderr/stdout.
79
+ _module_rlock = threading.RLock()
80
+ _is_writing = False
56
81
 
57
82
  _patch_exception: CannotCaptureConsoleError | None = None
58
83
 
@@ -67,9 +92,6 @@ def capture_stdout(callback: _WriteCallback) -> Callable[[], None]:
67
92
 
68
93
  Args:
69
94
  callback: A callback to invoke after running `sys.stdout.write`.
70
- This may be called from any thread, so it must be thread-safe.
71
- Exceptions are propagated to the caller of `write`.
72
- See `_WriteCallback` for the exact protocol.
73
95
 
74
96
  Returns:
75
97
  A function to uninstall the callback.
@@ -77,7 +99,7 @@ def capture_stdout(callback: _WriteCallback) -> Callable[[], None]:
77
99
  Raises:
78
100
  CannotCaptureConsoleError: If patching failed on import.
79
101
  """
80
- with _module_lock:
102
+ with _module_rlock:
81
103
  if _patch_exception:
82
104
  raise _patch_exception
83
105
 
@@ -92,9 +114,6 @@ def capture_stderr(callback: _WriteCallback) -> Callable[[], None]:
92
114
 
93
115
  Args:
94
116
  callback: A callback to invoke after running `sys.stderr.write`.
95
- This may be called from any thread, so it must be thread-safe.
96
- Exceptions are propagated to the caller of `write`.
97
- See `_WriteCallback` for the exact protocol.
98
117
 
99
118
  Returns:
100
119
  A function to uninstall the callback.
@@ -102,7 +121,7 @@ def capture_stderr(callback: _WriteCallback) -> Callable[[], None]:
102
121
  Raises:
103
122
  CannotCaptureConsoleError: If patching failed on import.
104
123
  """
105
- with _module_lock:
124
+ with _module_rlock:
106
125
  if _patch_exception:
107
126
  raise _patch_exception
108
127
 
@@ -125,11 +144,11 @@ def _insert_disposably(
125
144
  def dispose() -> None:
126
145
  nonlocal disposed
127
146
 
128
- with _module_lock:
147
+ with _module_rlock:
129
148
  if disposed:
130
149
  return
131
150
 
132
- del callback_dict[id]
151
+ callback_dict.pop(id, None)
133
152
 
134
153
  disposed = True
135
154
 
@@ -143,16 +162,44 @@ def _patch(
143
162
  ) -> None:
144
163
  orig_write: Callable[[AnyStr], int]
145
164
 
165
+ @wb_logging.log_to_all_runs()
146
166
  def write_with_callbacks(s: AnyStr, /) -> int:
167
+ global _is_writing
147
168
  n = orig_write(s)
148
169
 
149
- # We make a copy here because callbacks could, in theory, modify
150
- # the list of callbacks.
151
- with _module_lock:
152
- callbacks_copy = list(callbacks.values())
153
-
154
- for cb in callbacks_copy:
155
- cb(s, n)
170
+ # NOTE: Since _module_rlock is reentrant, this is safe. It will not
171
+ # deadlock if a callback invokes write() again.
172
+ with _module_rlock:
173
+ if _is_writing:
174
+ return n
175
+
176
+ _is_writing = True
177
+ try:
178
+ for cb in callbacks.values():
179
+ cb(s, n)
180
+
181
+ except BaseException as e:
182
+ # Clear all callbacks on any exception to avoid infinite loops:
183
+ #
184
+ # * If we re-raise, an exception handler is likely to print
185
+ # the exception to the console and trigger callbacks again
186
+ # * If we log, we can't guarantee that this doesn't print
187
+ # to console.
188
+ #
189
+ # This is especially important for KeyboardInterrupt.
190
+ _stderr_callbacks.clear()
191
+ _stdout_callbacks.clear()
192
+
193
+ if isinstance(e, Exception):
194
+ # We suppress Exceptions so that bugs in W&B code don't
195
+ # cause the user's print() statements to raise errors.
196
+ _logger.exception("Error in console callback, clearing all!")
197
+ else:
198
+ # Re-raise errors like KeyboardInterrupt.
199
+ raise
200
+
201
+ finally:
202
+ _is_writing = False
156
203
 
157
204
  return n
158
205
 
wandb/sdk/lib/deprecate.py CHANGED
@@ -1,20 +1,14 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import TYPE_CHECKING
4
-
5
3
  import wandb
6
4
  from wandb.proto.wandb_deprecated import DEPRECATED_FEATURES
7
5
  from wandb.sdk.lib import telemetry
8
6
 
9
- # Necessary to break import cycle.
10
- if TYPE_CHECKING:
11
- from wandb import wandb_run
12
-
13
7
 
14
8
  def deprecate(
15
9
  field_name: DEPRECATED_FEATURES,
16
10
  warning_message: str,
17
- run: wandb_run.Run | None = None,
11
+ run: wandb.Run | None = None,
18
12
  ) -> None:
19
13
  """Warn the user that a feature has been deprecated.
20
14
 
wandb/sdk/lib/disabled.py CHANGED
@@ -26,5 +26,5 @@ class RunDisabled:
26
26
  deprecate.deprecate(
27
27
  field_name=Deprecated.run_disabled,
28
28
  warning_message="RunDisabled is deprecated and is a no-op. "
29
- '`wandb.init(mode="disabled")` now returns and instance of `wandb.sdk.wandb_run.Run`.',
29
+ '`wandb.init(mode="disabled")` now returns an instance of `wandb.Run`.',
30
30
  )