wandb 0.20.2rc20250616__py3-none-win_amd64.whl → 0.21.1__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wandb/__init__.py +16 -14
- wandb/__init__.pyi +450 -472
- wandb/agents/pyagent.py +41 -12
- wandb/analytics/sentry.py +7 -2
- wandb/apis/importers/mlflow.py +1 -1
- wandb/apis/internal.py +3 -0
- wandb/apis/paginator.py +17 -4
- wandb/apis/public/__init__.py +1 -1
- wandb/apis/public/api.py +606 -359
- wandb/apis/public/artifacts.py +214 -16
- wandb/apis/public/automations.py +19 -3
- wandb/apis/public/files.py +177 -38
- wandb/apis/public/history.py +67 -15
- wandb/apis/public/integrations.py +25 -2
- wandb/apis/public/jobs.py +90 -2
- wandb/apis/public/projects.py +161 -69
- wandb/apis/public/query_generator.py +11 -1
- wandb/apis/public/registries/registries_search.py +7 -15
- wandb/apis/public/reports.py +147 -13
- wandb/apis/public/runs.py +315 -128
- wandb/apis/public/sweeps.py +222 -22
- wandb/apis/public/teams.py +41 -4
- wandb/apis/public/users.py +45 -4
- wandb/automations/__init__.py +10 -10
- wandb/automations/_filters/run_metrics.py +0 -2
- wandb/automations/_utils.py +0 -2
- wandb/automations/actions.py +0 -2
- wandb/automations/automations.py +0 -2
- wandb/automations/events.py +0 -2
- wandb/beta/workflows.py +66 -30
- wandb/bin/gpu_stats.exe +0 -0
- wandb/bin/wandb-core +0 -0
- wandb/cli/cli.py +80 -1
- wandb/env.py +8 -0
- wandb/errors/errors.py +4 -1
- wandb/integration/catboost/catboost.py +6 -2
- wandb/integration/kfp/kfp_patch.py +3 -1
- wandb/integration/lightning/fabric/logger.py +3 -4
- wandb/integration/metaflow/__init__.py +6 -0
- wandb/integration/metaflow/data_pandas.py +74 -0
- wandb/integration/metaflow/errors.py +13 -0
- wandb/integration/metaflow/metaflow.py +205 -190
- wandb/integration/openai/fine_tuning.py +1 -2
- wandb/integration/sb3/sb3.py +3 -3
- wandb/integration/ultralytics/callback.py +6 -2
- wandb/jupyter.py +5 -5
- wandb/plot/__init__.py +2 -0
- wandb/plot/bar.py +30 -29
- wandb/plot/confusion_matrix.py +75 -71
- wandb/plot/custom_chart.py +30 -7
- wandb/plot/histogram.py +26 -25
- wandb/plot/line.py +33 -32
- wandb/plot/line_series.py +100 -103
- wandb/plot/pr_curve.py +33 -32
- wandb/plot/roc_curve.py +38 -38
- wandb/plot/scatter.py +27 -27
- wandb/proto/v3/wandb_internal_pb2.py +366 -385
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v3/wandb_telemetry_pb2.py +4 -4
- wandb/proto/v4/wandb_internal_pb2.py +352 -356
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_telemetry_pb2.py +4 -4
- wandb/proto/v5/wandb_internal_pb2.py +352 -356
- wandb/proto/v5/wandb_settings_pb2.py +2 -2
- wandb/proto/v5/wandb_telemetry_pb2.py +4 -4
- wandb/proto/v6/wandb_internal_pb2.py +352 -356
- wandb/proto/v6/wandb_settings_pb2.py +2 -2
- wandb/proto/v6/wandb_telemetry_pb2.py +4 -4
- wandb/proto/wandb_deprecated.py +6 -0
- wandb/sdk/artifacts/_generated/__init__.py +12 -1
- wandb/sdk/artifacts/_generated/input_types.py +20 -2
- wandb/sdk/artifacts/_generated/link_artifact.py +21 -0
- wandb/sdk/artifacts/_generated/operations.py +9 -0
- wandb/sdk/artifacts/_internal_artifact.py +19 -8
- wandb/sdk/artifacts/_validators.py +48 -2
- wandb/sdk/artifacts/artifact.py +269 -96
- wandb/sdk/data_types/audio.py +38 -10
- wandb/sdk/data_types/base_types/media.py +15 -63
- wandb/sdk/data_types/base_types/wb_value.py +6 -6
- wandb/sdk/data_types/graph.py +48 -14
- wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +1 -3
- wandb/sdk/data_types/helper_types/image_mask.py +1 -3
- wandb/sdk/data_types/histogram.py +34 -21
- wandb/sdk/data_types/html.py +35 -12
- wandb/sdk/data_types/image.py +104 -68
- wandb/sdk/data_types/molecule.py +32 -19
- wandb/sdk/data_types/object_3d.py +36 -17
- wandb/sdk/data_types/plotly.py +18 -5
- wandb/sdk/data_types/saved_model.py +7 -9
- wandb/sdk/data_types/table.py +99 -70
- wandb/sdk/data_types/trace_tree.py +12 -12
- wandb/sdk/data_types/video.py +53 -26
- wandb/sdk/integration_utils/auto_logging.py +2 -2
- wandb/sdk/interface/interface.py +8 -19
- wandb/sdk/interface/interface_shared.py +7 -16
- wandb/sdk/internal/datastore.py +18 -18
- wandb/sdk/internal/handler.py +3 -5
- wandb/sdk/internal/internal_api.py +60 -0
- wandb/sdk/internal/job_builder.py +6 -0
- wandb/sdk/internal/sender.py +23 -3
- wandb/sdk/internal/sender_config.py +9 -0
- wandb/sdk/launch/_project_spec.py +3 -3
- wandb/sdk/launch/agent/agent.py +11 -4
- wandb/sdk/launch/agent/job_status_tracker.py +3 -1
- wandb/sdk/launch/agent/run_queue_item_file_saver.py +2 -2
- wandb/sdk/launch/create_job.py +3 -1
- wandb/sdk/launch/inputs/internal.py +3 -4
- wandb/sdk/launch/inputs/schema.py +1 -0
- wandb/sdk/launch/runner/kubernetes_monitor.py +1 -0
- wandb/sdk/launch/runner/kubernetes_runner.py +328 -1
- wandb/sdk/launch/sweeps/scheduler.py +2 -3
- wandb/sdk/launch/utils.py +3 -3
- wandb/sdk/lib/asyncio_compat.py +3 -0
- wandb/sdk/lib/console_capture.py +66 -19
- wandb/sdk/lib/deprecate.py +1 -7
- wandb/sdk/lib/disabled.py +1 -1
- wandb/sdk/lib/hashutil.py +14 -1
- wandb/sdk/lib/module.py +7 -13
- wandb/sdk/lib/progress.py +0 -19
- wandb/sdk/lib/sock_client.py +0 -4
- wandb/sdk/wandb_init.py +67 -93
- wandb/sdk/wandb_login.py +18 -14
- wandb/sdk/wandb_metric.py +2 -0
- wandb/sdk/wandb_require.py +0 -1
- wandb/sdk/wandb_run.py +429 -527
- wandb/sdk/wandb_settings.py +364 -74
- wandb/sdk/wandb_setup.py +28 -28
- wandb/sdk/wandb_sweep.py +14 -13
- wandb/sdk/wandb_watch.py +4 -6
- wandb/sync/sync.py +10 -0
- wandb/util.py +57 -0
- wandb/wandb_run.py +1 -2
- {wandb-0.20.2rc20250616.dist-info → wandb-0.21.1.dist-info}/METADATA +1 -1
- {wandb-0.20.2rc20250616.dist-info → wandb-0.21.1.dist-info}/RECORD +137 -137
- wandb/sdk/wandb_metadata.py +0 -623
- wandb/vendor/pynvml/__init__.py +0 -0
- wandb/vendor/pynvml/pynvml.py +0 -4779
- {wandb-0.20.2rc20250616.dist-info → wandb-0.21.1.dist-info}/WHEEL +0 -0
- {wandb-0.20.2rc20250616.dist-info → wandb-0.21.1.dist-info}/entry_points.txt +0 -0
- {wandb-0.20.2rc20250616.dist-info → wandb-0.21.1.dist-info}/licenses/LICENSE +0 -0
@@ -6,6 +6,8 @@ import datetime
|
|
6
6
|
import json
|
7
7
|
import logging
|
8
8
|
import os
|
9
|
+
import time
|
10
|
+
import uuid
|
9
11
|
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
|
10
12
|
|
11
13
|
import yaml
|
@@ -20,6 +22,7 @@ from wandb.sdk.launch.registry.local_registry import LocalRegistry
|
|
20
22
|
from wandb.sdk.launch.runner.abstract import Status
|
21
23
|
from wandb.sdk.launch.runner.kubernetes_monitor import (
|
22
24
|
WANDB_K8S_LABEL_AGENT,
|
25
|
+
WANDB_K8S_LABEL_AUXILIARY_RESOURCE,
|
23
26
|
WANDB_K8S_LABEL_MONITOR,
|
24
27
|
WANDB_K8S_RUN_ID,
|
25
28
|
CustomResource,
|
@@ -47,6 +50,9 @@ get_module(
|
|
47
50
|
|
48
51
|
import kubernetes_asyncio # type: ignore # noqa: E402
|
49
52
|
from kubernetes_asyncio import client # noqa: E402
|
53
|
+
from kubernetes_asyncio.client.api.apps_v1_api import ( # type: ignore # noqa: E402
|
54
|
+
AppsV1Api,
|
55
|
+
)
|
50
56
|
from kubernetes_asyncio.client.api.batch_v1_api import ( # type: ignore # noqa: E402
|
51
57
|
BatchV1Api,
|
52
58
|
)
|
@@ -78,9 +84,11 @@ class KubernetesSubmittedRun(AbstractRun):
|
|
78
84
|
self,
|
79
85
|
batch_api: "BatchV1Api",
|
80
86
|
core_api: "CoreV1Api",
|
87
|
+
apps_api: "AppsV1Api",
|
81
88
|
name: str,
|
82
89
|
namespace: Optional[str] = "default",
|
83
90
|
secret: Optional["V1Secret"] = None,
|
91
|
+
auxiliary_resource_label_key: Optional[str] = None,
|
84
92
|
) -> None:
|
85
93
|
"""Initialize a KubernetesSubmittedRun.
|
86
94
|
|
@@ -104,10 +112,12 @@ class KubernetesSubmittedRun(AbstractRun):
|
|
104
112
|
"""
|
105
113
|
self.batch_api = batch_api
|
106
114
|
self.core_api = core_api
|
115
|
+
self.apps_api = apps_api
|
107
116
|
self.name = name
|
108
117
|
self.namespace = namespace
|
109
118
|
self._fail_count = 0
|
110
119
|
self.secret = secret
|
120
|
+
self.auxiliary_resource_label_key = auxiliary_resource_label_key
|
111
121
|
|
112
122
|
@property
|
113
123
|
def id(self) -> str:
|
@@ -149,6 +159,7 @@ class KubernetesSubmittedRun(AbstractRun):
|
|
149
159
|
await asyncio.sleep(5)
|
150
160
|
|
151
161
|
await self._delete_secret()
|
162
|
+
await self._delete_auxiliary_resources_by_label()
|
152
163
|
return (
|
153
164
|
status.state == "finished"
|
154
165
|
) # todo: not sure if this (copied from aws runner) is the right approach? should we return false on failure
|
@@ -157,6 +168,7 @@ class KubernetesSubmittedRun(AbstractRun):
|
|
157
168
|
status = LaunchKubernetesMonitor.get_status(self.name)
|
158
169
|
if status in ["stopped", "failed", "finished", "preempted"]:
|
159
170
|
await self._delete_secret()
|
171
|
+
await self._delete_auxiliary_resources_by_label()
|
160
172
|
return status
|
161
173
|
|
162
174
|
async def cancel(self) -> None:
|
@@ -167,6 +179,7 @@ class KubernetesSubmittedRun(AbstractRun):
|
|
167
179
|
name=self.name,
|
168
180
|
)
|
169
181
|
await self._delete_secret()
|
182
|
+
await self._delete_auxiliary_resources_by_label()
|
170
183
|
except ApiException as e:
|
171
184
|
raise LaunchError(
|
172
185
|
f"Failed to delete Kubernetes Job {self.name} in namespace {self.namespace}: {str(e)}"
|
@@ -181,6 +194,52 @@ class KubernetesSubmittedRun(AbstractRun):
|
|
181
194
|
)
|
182
195
|
self.secret = None
|
183
196
|
|
197
|
+
async def _delete_auxiliary_resources_by_label(self) -> None:
|
198
|
+
if self.auxiliary_resource_label_key is None:
|
199
|
+
return
|
200
|
+
|
201
|
+
label_selector = (
|
202
|
+
f"{WANDB_K8S_LABEL_AUXILIARY_RESOURCE}={self.auxiliary_resource_label_key}"
|
203
|
+
)
|
204
|
+
|
205
|
+
try:
|
206
|
+
resource_cleanups = [
|
207
|
+
(self.core_api, "service"),
|
208
|
+
(self.batch_api, "job"),
|
209
|
+
(self.core_api, "pod"),
|
210
|
+
(self.core_api, "config_map"),
|
211
|
+
(self.core_api, "secret"),
|
212
|
+
(self.apps_api, "deployment"),
|
213
|
+
(self.apps_api, "replica_set"),
|
214
|
+
(self.apps_api, "daemon_set"),
|
215
|
+
]
|
216
|
+
|
217
|
+
for api_client, resource_type in resource_cleanups:
|
218
|
+
try:
|
219
|
+
list_method = getattr(
|
220
|
+
api_client, f"list_namespaced_{resource_type}"
|
221
|
+
)
|
222
|
+
delete_method = getattr(
|
223
|
+
api_client, f"delete_namespaced_{resource_type}"
|
224
|
+
)
|
225
|
+
|
226
|
+
# List resources with our label
|
227
|
+
resources = await list_method(
|
228
|
+
namespace=self.namespace, label_selector=label_selector
|
229
|
+
)
|
230
|
+
|
231
|
+
# Delete each resource
|
232
|
+
for resource in resources.items:
|
233
|
+
await delete_method(
|
234
|
+
name=resource.metadata.name, namespace=self.namespace
|
235
|
+
)
|
236
|
+
|
237
|
+
except (AttributeError, ApiException) as e:
|
238
|
+
wandb.termwarn(f"Could not clean up {resource_type}: {e}")
|
239
|
+
|
240
|
+
except Exception as e:
|
241
|
+
wandb.termwarn(f"Failed to clean up some auxiliary resources: {e}")
|
242
|
+
|
184
243
|
|
185
244
|
class CrdSubmittedRun(AbstractRun):
|
186
245
|
"""Run submitted to a CRD backend, e.g. Volcano."""
|
@@ -366,6 +425,7 @@ class KubernetesRunner(AbstractRunner):
|
|
366
425
|
job_metadata["generateName"] = make_name_dns_safe(
|
367
426
|
f"launch-{launch_project.target_entity}-{launch_project.target_project}-"
|
368
427
|
)
|
428
|
+
job_metadata["namespace"] = namespace
|
369
429
|
|
370
430
|
for i, cont in enumerate(containers):
|
371
431
|
if "name" not in cont:
|
@@ -489,6 +549,235 @@ class KubernetesRunner(AbstractRunner):
|
|
489
549
|
|
490
550
|
return job, api_key_secret
|
491
551
|
|
552
|
+
async def _wait_for_resource_ready(
|
553
|
+
self,
|
554
|
+
api_client: kubernetes_asyncio.client.ApiClient,
|
555
|
+
config: Dict[str, Any],
|
556
|
+
namespace: str,
|
557
|
+
timeout_seconds: int = 300,
|
558
|
+
) -> None:
|
559
|
+
"""Wait for a Kubernetes resource to be ready.
|
560
|
+
|
561
|
+
Arguments:
|
562
|
+
api_client: The Kubernetes API client.
|
563
|
+
config: The resource configuration.
|
564
|
+
namespace: The namespace where the resource was created.
|
565
|
+
timeout_seconds: Maximum time to wait for readiness.
|
566
|
+
"""
|
567
|
+
resource_kind = config.get("kind")
|
568
|
+
resource_name = config.get("metadata", {}).get("name")
|
569
|
+
|
570
|
+
if not resource_kind or not resource_name:
|
571
|
+
wandb.termerror(
|
572
|
+
f"{LOG_PREFIX}Cannot wait for resource without kind or name"
|
573
|
+
)
|
574
|
+
return
|
575
|
+
|
576
|
+
wandb.termlog(
|
577
|
+
f"{LOG_PREFIX}Waiting for {resource_kind} '{resource_name}' to be ready..."
|
578
|
+
)
|
579
|
+
|
580
|
+
start_time = time.time()
|
581
|
+
|
582
|
+
if resource_kind == "Deployment":
|
583
|
+
await self._wait_for_deployment_ready(
|
584
|
+
api_client, resource_name, namespace, timeout_seconds
|
585
|
+
)
|
586
|
+
elif resource_kind == "Service":
|
587
|
+
await self._wait_for_service_ready(
|
588
|
+
api_client, resource_name, namespace, timeout_seconds
|
589
|
+
)
|
590
|
+
elif resource_kind == "Pod":
|
591
|
+
await self._wait_for_pod_ready(
|
592
|
+
api_client, resource_name, namespace, timeout_seconds
|
593
|
+
)
|
594
|
+
else:
|
595
|
+
wandb.termlog(
|
596
|
+
f"{LOG_PREFIX}No specific readiness check for {resource_kind}, waiting 5 seconds..."
|
597
|
+
)
|
598
|
+
await asyncio.sleep(5)
|
599
|
+
|
600
|
+
elapsed = time.time() - start_time
|
601
|
+
wandb.termlog(
|
602
|
+
f"{LOG_PREFIX}{resource_kind} '{resource_name}' is ready after {elapsed:.1f}s"
|
603
|
+
)
|
604
|
+
|
605
|
+
async def _wait_for_deployment_ready(
|
606
|
+
self,
|
607
|
+
api_client: kubernetes_asyncio.client.ApiClient,
|
608
|
+
name: str,
|
609
|
+
namespace: str,
|
610
|
+
timeout_seconds: int,
|
611
|
+
) -> None:
|
612
|
+
"""Wait for a Deployment to be ready."""
|
613
|
+
apps_api = kubernetes_asyncio.client.AppsV1Api(api_client)
|
614
|
+
|
615
|
+
async def check_deployment_ready():
|
616
|
+
deployment = await apps_api.read_namespaced_deployment(
|
617
|
+
name=name, namespace=namespace
|
618
|
+
)
|
619
|
+
status = deployment.status
|
620
|
+
|
621
|
+
if status.ready_replicas and status.replicas:
|
622
|
+
return status.ready_replicas >= status.replicas
|
623
|
+
|
624
|
+
return False
|
625
|
+
|
626
|
+
await self._wait_with_timeout(check_deployment_ready, timeout_seconds, name)
|
627
|
+
|
628
|
+
async def _wait_for_service_ready(
|
629
|
+
self,
|
630
|
+
api_client: kubernetes_asyncio.client.ApiClient,
|
631
|
+
name: str,
|
632
|
+
namespace: str,
|
633
|
+
timeout_seconds: int,
|
634
|
+
) -> None:
|
635
|
+
"""Wait for a Service to have endpoints."""
|
636
|
+
core_api = kubernetes_asyncio.client.CoreV1Api(api_client)
|
637
|
+
|
638
|
+
async def check_service_ready():
|
639
|
+
endpoints = await core_api.read_namespaced_endpoints(
|
640
|
+
name=name, namespace=namespace
|
641
|
+
)
|
642
|
+
if endpoints.subsets:
|
643
|
+
for subset in endpoints.subsets:
|
644
|
+
if subset.addresses: # These are ready pod addresses
|
645
|
+
return True
|
646
|
+
return False
|
647
|
+
|
648
|
+
await self._wait_with_timeout(check_service_ready, timeout_seconds, name)
|
649
|
+
|
650
|
+
async def _wait_for_pod_ready(
|
651
|
+
self,
|
652
|
+
api_client: kubernetes_asyncio.client.ApiClient,
|
653
|
+
name: str,
|
654
|
+
namespace: str,
|
655
|
+
timeout_seconds: int,
|
656
|
+
) -> None:
|
657
|
+
"""Wait for a Pod to be ready."""
|
658
|
+
core_api = kubernetes_asyncio.client.CoreV1Api(api_client)
|
659
|
+
|
660
|
+
async def check_pod_ready():
|
661
|
+
pod = await core_api.read_namespaced_pod(name=name, namespace=namespace)
|
662
|
+
if pod.status.phase == "Running":
|
663
|
+
if pod.status.container_statuses:
|
664
|
+
return all(status.ready for status in pod.status.container_statuses)
|
665
|
+
return True
|
666
|
+
return False
|
667
|
+
|
668
|
+
await self._wait_with_timeout(check_pod_ready, timeout_seconds, name)
|
669
|
+
|
670
|
+
async def _wait_with_timeout(
|
671
|
+
self, check_func, timeout_seconds: int, name: str
|
672
|
+
) -> None:
|
673
|
+
"""Generic timeout wrapper for readiness checks."""
|
674
|
+
start_time = time.time()
|
675
|
+
|
676
|
+
while time.time() - start_time < timeout_seconds:
|
677
|
+
try:
|
678
|
+
if await check_func():
|
679
|
+
return
|
680
|
+
except kubernetes_asyncio.client.ApiException as e:
|
681
|
+
if e.status == 404:
|
682
|
+
pass
|
683
|
+
else:
|
684
|
+
wandb.termerror(
|
685
|
+
f"{LOG_PREFIX}Error waiting for resource '{name}': {e}"
|
686
|
+
)
|
687
|
+
raise
|
688
|
+
except Exception as e:
|
689
|
+
wandb.termerror(f"{LOG_PREFIX}Error waiting for resource '{name}': {e}")
|
690
|
+
raise
|
691
|
+
await asyncio.sleep(2)
|
692
|
+
|
693
|
+
raise LaunchError(
|
694
|
+
f"Resource '{name}' not ready within {timeout_seconds} seconds"
|
695
|
+
)
|
696
|
+
|
697
|
+
async def _prepare_resource(
|
698
|
+
self,
|
699
|
+
api_client: kubernetes_asyncio.client.ApiClient,
|
700
|
+
config: Dict[str, Any],
|
701
|
+
namespace: str,
|
702
|
+
run_id: str,
|
703
|
+
auxiliary_resource_label_key: str,
|
704
|
+
launch_project: LaunchProject,
|
705
|
+
api_key_secret: Optional["V1Secret"] = None,
|
706
|
+
wait_for_ready: bool = True,
|
707
|
+
wait_timeout: int = 300,
|
708
|
+
) -> None:
|
709
|
+
"""Prepare a service for launch.
|
710
|
+
|
711
|
+
Arguments:
|
712
|
+
api_client: The Kubernetes API client.
|
713
|
+
config: The resource configuration to prepare.
|
714
|
+
namespace: The namespace to create the resource in.
|
715
|
+
run_id: The run ID to label the resource with.
|
716
|
+
auxiliary_resource_label_key: The key of the auxiliary resource label.
|
717
|
+
launch_project: The launch project to get environment variables from.
|
718
|
+
api_key_secret: The API key secret to inject.
|
719
|
+
wait_for_ready: Whether to wait for the resource to be ready after creation.
|
720
|
+
wait_timeout: Maximum time in seconds to wait for resource readiness.
|
721
|
+
"""
|
722
|
+
config.setdefault("metadata", {})
|
723
|
+
config["metadata"].setdefault("labels", {})
|
724
|
+
config["metadata"]["labels"][WANDB_K8S_RUN_ID] = run_id
|
725
|
+
config["metadata"]["labels"][WANDB_K8S_LABEL_AUXILIARY_RESOURCE] = (
|
726
|
+
auxiliary_resource_label_key
|
727
|
+
)
|
728
|
+
config["metadata"]["labels"]["wandb.ai/created-by"] = "launch-agent"
|
729
|
+
|
730
|
+
if config.get("kind") == "Service" or config.get("kind") == "Deployment":
|
731
|
+
config.setdefault("metadata", {})
|
732
|
+
original_name = config["metadata"].get("name", config.get("kind"))
|
733
|
+
safe_name = make_name_dns_safe(original_name)
|
734
|
+
safe_entity = make_name_dns_safe(launch_project.target_entity or "")
|
735
|
+
safe_project = make_name_dns_safe(launch_project.target_project or "")
|
736
|
+
safe_run_id = make_name_dns_safe(run_id or "")
|
737
|
+
|
738
|
+
new_name = f"{safe_name}-{safe_entity}-{safe_project}-{safe_run_id}"
|
739
|
+
config["metadata"]["name"] = new_name
|
740
|
+
wandb.termlog(
|
741
|
+
f"{LOG_PREFIX}Modified {config.get('kind')} name from '{original_name}' to '{new_name}'"
|
742
|
+
)
|
743
|
+
|
744
|
+
env_vars = launch_project.get_env_vars_dict(
|
745
|
+
self._api, MAX_ENV_LENGTHS[self.__class__.__name__]
|
746
|
+
)
|
747
|
+
wandb_config_env = {
|
748
|
+
"WANDB_CONFIG": env_vars.get("WANDB_CONFIG", "{}"),
|
749
|
+
}
|
750
|
+
add_wandb_env(config, wandb_config_env)
|
751
|
+
|
752
|
+
if api_key_secret:
|
753
|
+
for cont in yield_containers(config):
|
754
|
+
env = cont.setdefault("env", [])
|
755
|
+
env.append(
|
756
|
+
{
|
757
|
+
"name": "WANDB_API_KEY",
|
758
|
+
"valueFrom": {
|
759
|
+
"secretKeyRef": {
|
760
|
+
"name": api_key_secret.metadata.name,
|
761
|
+
"key": "password",
|
762
|
+
}
|
763
|
+
},
|
764
|
+
}
|
765
|
+
)
|
766
|
+
cont["env"] = env
|
767
|
+
|
768
|
+
try:
|
769
|
+
await kubernetes_asyncio.utils.create_from_dict(
|
770
|
+
api_client, config, namespace=namespace
|
771
|
+
)
|
772
|
+
|
773
|
+
if wait_for_ready:
|
774
|
+
await self._wait_for_resource_ready(
|
775
|
+
api_client, config, namespace, wait_timeout
|
776
|
+
)
|
777
|
+
except Exception as e:
|
778
|
+
wandb.termerror(f"{LOG_PREFIX}Failed to create Kubernetes resource: {e}")
|
779
|
+
raise LaunchError(f"Failed to create Kubernetes resource: {e}")
|
780
|
+
|
492
781
|
async def run(
|
493
782
|
self, launch_project: LaunchProject, image_uri: str
|
494
783
|
) -> Optional[AbstractRun]:
|
@@ -630,10 +919,42 @@ class KubernetesRunner(AbstractRunner):
|
|
630
919
|
|
631
920
|
batch_api = kubernetes_asyncio.client.BatchV1Api(api_client)
|
632
921
|
core_api = kubernetes_asyncio.client.CoreV1Api(api_client)
|
922
|
+
apps_api = kubernetes_asyncio.client.AppsV1Api(api_client)
|
923
|
+
|
633
924
|
namespace = self.get_namespace(resource_args, context)
|
634
925
|
job, secret = await self._inject_defaults(
|
635
926
|
resource_args, launch_project, image_uri, namespace, core_api
|
636
927
|
)
|
928
|
+
|
929
|
+
additional_services = launch_project.launch_spec.get("additional_services", [])
|
930
|
+
auxiliary_resource_label_key = None
|
931
|
+
if additional_services:
|
932
|
+
wandb.termlog(
|
933
|
+
f"{LOG_PREFIX}Creating additional services: {additional_services}"
|
934
|
+
)
|
935
|
+
auxiliary_resource_label_key = f"aux-{uuid.uuid4()}"
|
936
|
+
|
937
|
+
wait_for_ready = resource_args.get("wait_for_ready", True)
|
938
|
+
wait_timeout = resource_args.get("wait_timeout", 300)
|
939
|
+
|
940
|
+
await asyncio.gather(
|
941
|
+
*[
|
942
|
+
self._prepare_resource(
|
943
|
+
api_client,
|
944
|
+
resource.get("config"),
|
945
|
+
namespace,
|
946
|
+
launch_project.run_id,
|
947
|
+
auxiliary_resource_label_key,
|
948
|
+
launch_project,
|
949
|
+
secret,
|
950
|
+
wait_for_ready,
|
951
|
+
wait_timeout,
|
952
|
+
)
|
953
|
+
for resource in additional_services
|
954
|
+
if resource.get("config", {})
|
955
|
+
]
|
956
|
+
)
|
957
|
+
|
637
958
|
msg = "Creating Kubernetes job"
|
638
959
|
if "name" in resource_args:
|
639
960
|
msg += f": {resource_args['name']}"
|
@@ -658,7 +979,13 @@ class KubernetesRunner(AbstractRunner):
|
|
658
979
|
job_name = job_response.metadata.name
|
659
980
|
LaunchKubernetesMonitor.monitor_namespace(namespace)
|
660
981
|
submitted_job = KubernetesSubmittedRun(
|
661
|
-
batch_api,
|
982
|
+
batch_api,
|
983
|
+
core_api,
|
984
|
+
apps_api,
|
985
|
+
job_name,
|
986
|
+
namespace,
|
987
|
+
secret,
|
988
|
+
auxiliary_resource_label_key,
|
662
989
|
)
|
663
990
|
if self.backend_config[PROJECT_SYNCHRONOUS]:
|
664
991
|
await submitted_job.wait()
|
@@ -36,7 +36,6 @@ if TYPE_CHECKING:
|
|
36
36
|
import wandb.apis.public as public
|
37
37
|
from wandb.apis.internal import Api
|
38
38
|
from wandb.apis.public import QueuedRun, Run
|
39
|
-
from wandb.sdk.wandb_run import Run as SdkRun
|
40
39
|
|
41
40
|
|
42
41
|
_logger = logging.getLogger(__name__)
|
@@ -255,10 +254,10 @@ class Scheduler(ABC):
|
|
255
254
|
_id: w for _id, w in self._workers.items() if _id not in self.busy_workers
|
256
255
|
}
|
257
256
|
|
258
|
-
def _init_wandb_run(self) -> "
|
257
|
+
def _init_wandb_run(self) -> "wandb.Run":
|
259
258
|
"""Controls resume or init logic for a scheduler wandb run."""
|
260
259
|
settings = wandb.Settings(disable_job_creation=True)
|
261
|
-
run:
|
260
|
+
run: wandb.Run = wandb.init( # type: ignore
|
262
261
|
name=f"Scheduler.{self._sweep_id}",
|
263
262
|
resume="allow",
|
264
263
|
config=self._kwargs, # when run as a job, this sets config
|
wandb/sdk/launch/utils.py
CHANGED
@@ -380,9 +380,9 @@ def diff_pip_requirements(req_1: List[str], req_2: List[str]) -> Dict[str, str]:
|
|
380
380
|
else:
|
381
381
|
raise ValueError(f"Unable to parse pip requirements file line: {line}")
|
382
382
|
if _name is not None:
|
383
|
-
assert re.match(
|
384
|
-
|
385
|
-
)
|
383
|
+
assert re.match(_VALID_PIP_PACKAGE_REGEX, _name), (
|
384
|
+
f"Invalid pip package name {_name}"
|
385
|
+
)
|
386
386
|
d[_name] = _version
|
387
387
|
return d
|
388
388
|
|
wandb/sdk/lib/asyncio_compat.py
CHANGED
@@ -100,6 +100,9 @@ class _Runner:
|
|
100
100
|
raise _RunnerCancelledError()
|
101
101
|
|
102
102
|
finally:
|
103
|
+
# NOTE: asyncio.run() cancels all tasks after the main task exits,
|
104
|
+
# but this is not documented, so we cancel them explicitly here
|
105
|
+
# as well. It also blocks until canceled tasks complete.
|
103
106
|
cancellation_task.cancel()
|
104
107
|
fn_task.cancel()
|
105
108
|
|
wandb/sdk/lib/console_capture.py
CHANGED
@@ -25,17 +25,38 @@ In particular, it does not work with some combinations of pytest's
|
|
25
25
|
|
26
26
|
from __future__ import annotations
|
27
27
|
|
28
|
+
import logging
|
28
29
|
import sys
|
29
30
|
import threading
|
30
31
|
from typing import IO, AnyStr, Callable, Protocol
|
31
32
|
|
33
|
+
from . import wb_logging
|
34
|
+
|
35
|
+
_logger = logging.getLogger(__name__)
|
36
|
+
|
32
37
|
|
33
38
|
class CannotCaptureConsoleError(Exception):
|
34
39
|
"""The module failed to patch stdout or stderr."""
|
35
40
|
|
36
41
|
|
37
42
|
class _WriteCallback(Protocol):
|
38
|
-
"""A callback that receives intercepted bytes or string data.
|
43
|
+
"""A callback that receives intercepted bytes or string data.
|
44
|
+
|
45
|
+
This may be called from any thread, but is only called from one thread
|
46
|
+
at a time.
|
47
|
+
|
48
|
+
Note on errors: Any error raised during the callback will clear all
|
49
|
+
callbacks. This means that if a user presses Ctrl-C at an unlucky time
|
50
|
+
during a run, we will stop uploading console output---but it's not
|
51
|
+
likely to be a problem unless something catches the KeyboardInterrupt.
|
52
|
+
|
53
|
+
Regular Exceptions are caught and logged instead of bubbling up to the
|
54
|
+
user's print() statements; other exceptions like KeyboardInterrupt are
|
55
|
+
re-raised.
|
56
|
+
|
57
|
+
Callbacks should handle all exceptions---a callback that raises any
|
58
|
+
Exception is considered buggy.
|
59
|
+
"""
|
39
60
|
|
40
61
|
def __call__(
|
41
62
|
self,
|
@@ -45,6 +66,8 @@ class _WriteCallback(Protocol):
|
|
45
66
|
) -> None:
|
46
67
|
"""Intercept data passed to `write()`.
|
47
68
|
|
69
|
+
See the protocol docstring for information about exceptions.
|
70
|
+
|
48
71
|
Args:
|
49
72
|
data: The object passed to stderr's or stdout's `write()`.
|
50
73
|
written: The number of bytes or characters written.
|
@@ -52,7 +75,9 @@ class _WriteCallback(Protocol):
|
|
52
75
|
"""
|
53
76
|
|
54
77
|
|
55
|
-
|
78
|
+
# A reentrant lock is used to catch callbacks that write to stderr/stdout.
|
79
|
+
_module_rlock = threading.RLock()
|
80
|
+
_is_writing = False
|
56
81
|
|
57
82
|
_patch_exception: CannotCaptureConsoleError | None = None
|
58
83
|
|
@@ -67,9 +92,6 @@ def capture_stdout(callback: _WriteCallback) -> Callable[[], None]:
|
|
67
92
|
|
68
93
|
Args:
|
69
94
|
callback: A callback to invoke after running `sys.stdout.write`.
|
70
|
-
This may be called from any thread, so it must be thread-safe.
|
71
|
-
Exceptions are propagated to the caller of `write`.
|
72
|
-
See `_WriteCallback` for the exact protocol.
|
73
95
|
|
74
96
|
Returns:
|
75
97
|
A function to uninstall the callback.
|
@@ -77,7 +99,7 @@ def capture_stdout(callback: _WriteCallback) -> Callable[[], None]:
|
|
77
99
|
Raises:
|
78
100
|
CannotCaptureConsoleError: If patching failed on import.
|
79
101
|
"""
|
80
|
-
with
|
102
|
+
with _module_rlock:
|
81
103
|
if _patch_exception:
|
82
104
|
raise _patch_exception
|
83
105
|
|
@@ -92,9 +114,6 @@ def capture_stderr(callback: _WriteCallback) -> Callable[[], None]:
|
|
92
114
|
|
93
115
|
Args:
|
94
116
|
callback: A callback to invoke after running `sys.stderr.write`.
|
95
|
-
This may be called from any thread, so it must be thread-safe.
|
96
|
-
Exceptions are propagated to the caller of `write`.
|
97
|
-
See `_WriteCallback` for the exact protocol.
|
98
117
|
|
99
118
|
Returns:
|
100
119
|
A function to uninstall the callback.
|
@@ -102,7 +121,7 @@ def capture_stderr(callback: _WriteCallback) -> Callable[[], None]:
|
|
102
121
|
Raises:
|
103
122
|
CannotCaptureConsoleError: If patching failed on import.
|
104
123
|
"""
|
105
|
-
with
|
124
|
+
with _module_rlock:
|
106
125
|
if _patch_exception:
|
107
126
|
raise _patch_exception
|
108
127
|
|
@@ -125,11 +144,11 @@ def _insert_disposably(
|
|
125
144
|
def dispose() -> None:
|
126
145
|
nonlocal disposed
|
127
146
|
|
128
|
-
with
|
147
|
+
with _module_rlock:
|
129
148
|
if disposed:
|
130
149
|
return
|
131
150
|
|
132
|
-
|
151
|
+
callback_dict.pop(id, None)
|
133
152
|
|
134
153
|
disposed = True
|
135
154
|
|
@@ -143,16 +162,44 @@ def _patch(
|
|
143
162
|
) -> None:
|
144
163
|
orig_write: Callable[[AnyStr], int]
|
145
164
|
|
165
|
+
@wb_logging.log_to_all_runs()
|
146
166
|
def write_with_callbacks(s: AnyStr, /) -> int:
|
167
|
+
global _is_writing
|
147
168
|
n = orig_write(s)
|
148
169
|
|
149
|
-
#
|
150
|
-
#
|
151
|
-
with
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
170
|
+
# NOTE: Since _module_rlock is reentrant, this is safe. It will not
|
171
|
+
# deadlock if a callback invokes write() again.
|
172
|
+
with _module_rlock:
|
173
|
+
if _is_writing:
|
174
|
+
return n
|
175
|
+
|
176
|
+
_is_writing = True
|
177
|
+
try:
|
178
|
+
for cb in callbacks.values():
|
179
|
+
cb(s, n)
|
180
|
+
|
181
|
+
except BaseException as e:
|
182
|
+
# Clear all callbacks on any exception to avoid infinite loops:
|
183
|
+
#
|
184
|
+
# * If we re-raise, an exception handler is likely to print
|
185
|
+
# the exception to the console and trigger callbacks again
|
186
|
+
# * If we log, we can't guarantee that this doesn't print
|
187
|
+
# to console.
|
188
|
+
#
|
189
|
+
# This is especially important for KeyboardInterrupt.
|
190
|
+
_stderr_callbacks.clear()
|
191
|
+
_stdout_callbacks.clear()
|
192
|
+
|
193
|
+
if isinstance(e, Exception):
|
194
|
+
# We suppress Exceptions so that bugs in W&B code don't
|
195
|
+
# cause the user's print() statements to raise errors.
|
196
|
+
_logger.exception("Error in console callback, clearing all!")
|
197
|
+
else:
|
198
|
+
# Re-raise errors like KeyboardInterrupt.
|
199
|
+
raise
|
200
|
+
|
201
|
+
finally:
|
202
|
+
_is_writing = False
|
156
203
|
|
157
204
|
return n
|
158
205
|
|
wandb/sdk/lib/deprecate.py
CHANGED
@@ -1,20 +1,14 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from typing import TYPE_CHECKING
|
4
|
-
|
5
3
|
import wandb
|
6
4
|
from wandb.proto.wandb_deprecated import DEPRECATED_FEATURES
|
7
5
|
from wandb.sdk.lib import telemetry
|
8
6
|
|
9
|
-
# Necessary to break import cycle.
|
10
|
-
if TYPE_CHECKING:
|
11
|
-
from wandb import wandb_run
|
12
|
-
|
13
7
|
|
14
8
|
def deprecate(
|
15
9
|
field_name: DEPRECATED_FEATURES,
|
16
10
|
warning_message: str,
|
17
|
-
run:
|
11
|
+
run: wandb.Run | None = None,
|
18
12
|
) -> None:
|
19
13
|
"""Warn the user that a feature has been deprecated.
|
20
14
|
|
wandb/sdk/lib/disabled.py
CHANGED
@@ -26,5 +26,5 @@ class RunDisabled:
|
|
26
26
|
deprecate.deprecate(
|
27
27
|
field_name=Deprecated.run_disabled,
|
28
28
|
warning_message="RunDisabled is deprecated and is a no-op. "
|
29
|
-
'`wandb.init(mode="disabled")` now returns
|
29
|
+
'`wandb.init(mode="disabled")` now returns an instance of `wandb.Run`.',
|
30
30
|
)
|