wandb 0.16.5__py3-none-any.whl → 0.17.0__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- package_readme.md +95 -0
- wandb/__init__.py +2 -3
- wandb/agents/pyagent.py +0 -1
- wandb/analytics/sentry.py +2 -1
- wandb/apis/importers/internals/internal.py +0 -1
- wandb/apis/importers/internals/protocols.py +30 -56
- wandb/apis/importers/mlflow.py +13 -26
- wandb/apis/importers/wandb.py +8 -14
- wandb/apis/internal.py +0 -3
- wandb/apis/public/api.py +55 -3
- wandb/apis/public/artifacts.py +1 -0
- wandb/apis/public/files.py +1 -0
- wandb/apis/public/history.py +1 -0
- wandb/apis/public/jobs.py +17 -4
- wandb/apis/public/projects.py +1 -0
- wandb/apis/public/reports.py +1 -0
- wandb/apis/public/runs.py +15 -17
- wandb/apis/public/sweeps.py +1 -0
- wandb/apis/public/teams.py +1 -0
- wandb/apis/public/users.py +1 -0
- wandb/apis/reports/v1/_blocks.py +3 -7
- wandb/apis/reports/v2/gql.py +1 -0
- wandb/apis/reports/v2/interface.py +3 -4
- wandb/apis/reports/v2/internal.py +5 -8
- wandb/cli/cli.py +95 -22
- wandb/data_types.py +9 -6
- wandb/docker/__init__.py +1 -1
- wandb/env.py +38 -8
- wandb/errors/__init__.py +5 -0
- wandb/errors/term.py +10 -2
- wandb/filesync/step_checksum.py +1 -4
- wandb/filesync/step_prepare.py +4 -24
- wandb/filesync/step_upload.py +4 -106
- wandb/filesync/upload_job.py +0 -76
- wandb/integration/catboost/catboost.py +1 -1
- wandb/integration/fastai/__init__.py +1 -0
- wandb/integration/huggingface/resolver.py +2 -2
- wandb/integration/keras/__init__.py +1 -0
- wandb/integration/keras/callbacks/metrics_logger.py +1 -1
- wandb/integration/keras/keras.py +7 -7
- wandb/integration/langchain/wandb_tracer.py +1 -0
- wandb/integration/lightning/fabric/logger.py +1 -3
- wandb/integration/metaflow/metaflow.py +41 -6
- wandb/integration/openai/fine_tuning.py +77 -40
- wandb/integration/prodigy/prodigy.py +1 -1
- wandb/old/summary.py +1 -1
- wandb/plot/confusion_matrix.py +1 -1
- wandb/plot/pr_curve.py +2 -1
- wandb/plot/roc_curve.py +2 -1
- wandb/{plots → plot}/utils.py +13 -25
- wandb/proto/v3/wandb_internal_pb2.py +364 -332
- wandb/proto/v3/wandb_settings_pb2.py +2 -2
- wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
- wandb/proto/v4/wandb_internal_pb2.py +322 -316
- wandb/proto/v4/wandb_settings_pb2.py +2 -2
- wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
- wandb/proto/wandb_deprecated.py +7 -1
- wandb/proto/wandb_internal_codegen.py +3 -29
- wandb/sdk/artifacts/artifact.py +51 -20
- wandb/sdk/artifacts/artifact_download_logger.py +1 -0
- wandb/sdk/artifacts/artifact_file_cache.py +18 -4
- wandb/sdk/artifacts/artifact_instance_cache.py +1 -0
- wandb/sdk/artifacts/artifact_manifest.py +1 -0
- wandb/sdk/artifacts/artifact_manifest_entry.py +7 -3
- wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +1 -0
- wandb/sdk/artifacts/artifact_saver.py +18 -27
- wandb/sdk/artifacts/artifact_state.py +1 -0
- wandb/sdk/artifacts/artifact_ttl.py +1 -0
- wandb/sdk/artifacts/exceptions.py +1 -0
- wandb/sdk/artifacts/storage_handlers/azure_handler.py +1 -0
- wandb/sdk/artifacts/storage_handlers/gcs_handler.py +13 -18
- wandb/sdk/artifacts/storage_handlers/http_handler.py +1 -0
- wandb/sdk/artifacts/storage_handlers/local_file_handler.py +1 -0
- wandb/sdk/artifacts/storage_handlers/multi_handler.py +1 -0
- wandb/sdk/artifacts/storage_handlers/s3_handler.py +5 -3
- wandb/sdk/artifacts/storage_handlers/tracking_handler.py +1 -0
- wandb/sdk/artifacts/storage_handlers/wb_artifact_handler.py +1 -0
- wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +1 -0
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +3 -42
- wandb/sdk/artifacts/storage_policy.py +2 -12
- wandb/sdk/data_types/_dtypes.py +8 -8
- wandb/sdk/data_types/base_types/media.py +3 -6
- wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +3 -1
- wandb/sdk/data_types/image.py +1 -1
- wandb/sdk/data_types/video.py +1 -1
- wandb/sdk/integration_utils/auto_logging.py +5 -6
- wandb/sdk/integration_utils/data_logging.py +10 -6
- wandb/sdk/interface/interface.py +86 -38
- wandb/sdk/interface/interface_shared.py +7 -13
- wandb/sdk/internal/datastore.py +1 -1
- wandb/sdk/internal/file_pusher.py +2 -5
- wandb/sdk/internal/file_stream.py +5 -18
- wandb/sdk/internal/handler.py +18 -2
- wandb/sdk/internal/internal.py +0 -1
- wandb/sdk/internal/internal_api.py +1 -129
- wandb/sdk/internal/internal_util.py +0 -1
- wandb/sdk/internal/job_builder.py +159 -45
- wandb/sdk/internal/profiler.py +1 -0
- wandb/sdk/internal/progress.py +0 -28
- wandb/sdk/internal/run.py +1 -0
- wandb/sdk/internal/sender.py +1 -2
- wandb/sdk/internal/system/assets/gpu_amd.py +44 -44
- wandb/sdk/internal/system/assets/gpu_apple.py +56 -11
- wandb/sdk/internal/system/assets/interfaces.py +6 -8
- wandb/sdk/internal/system/assets/open_metrics.py +2 -2
- wandb/sdk/internal/system/assets/trainium.py +1 -3
- wandb/sdk/launch/__init__.py +9 -1
- wandb/sdk/launch/_launch.py +9 -24
- wandb/sdk/launch/_launch_add.py +1 -3
- wandb/sdk/launch/_project_spec.py +188 -241
- wandb/sdk/launch/agent/agent.py +115 -48
- wandb/sdk/launch/agent/config.py +80 -14
- wandb/sdk/launch/builder/abstract.py +69 -1
- wandb/sdk/launch/builder/build.py +156 -555
- wandb/sdk/launch/builder/context_manager.py +235 -0
- wandb/sdk/launch/builder/docker_builder.py +8 -23
- wandb/sdk/launch/builder/kaniko_builder.py +161 -159
- wandb/sdk/launch/builder/noop.py +1 -0
- wandb/sdk/launch/builder/templates/dockerfile.py +92 -0
- wandb/sdk/launch/create_job.py +68 -63
- wandb/sdk/launch/environment/abstract.py +1 -0
- wandb/sdk/launch/environment/gcp_environment.py +1 -0
- wandb/sdk/launch/environment/local_environment.py +1 -0
- wandb/sdk/launch/inputs/files.py +148 -0
- wandb/sdk/launch/inputs/internal.py +217 -0
- wandb/sdk/launch/inputs/manage.py +95 -0
- wandb/sdk/launch/loader.py +1 -0
- wandb/sdk/launch/registry/abstract.py +1 -0
- wandb/sdk/launch/registry/azure_container_registry.py +1 -0
- wandb/sdk/launch/registry/elastic_container_registry.py +1 -0
- wandb/sdk/launch/registry/google_artifact_registry.py +2 -1
- wandb/sdk/launch/registry/local_registry.py +1 -0
- wandb/sdk/launch/runner/abstract.py +1 -0
- wandb/sdk/launch/runner/kubernetes_monitor.py +4 -1
- wandb/sdk/launch/runner/kubernetes_runner.py +9 -10
- wandb/sdk/launch/runner/local_container.py +2 -3
- wandb/sdk/launch/runner/local_process.py +8 -29
- wandb/sdk/launch/runner/sagemaker_runner.py +21 -20
- wandb/sdk/launch/runner/vertex_runner.py +8 -7
- wandb/sdk/launch/sweeps/scheduler.py +7 -4
- wandb/sdk/launch/sweeps/scheduler_sweep.py +2 -1
- wandb/sdk/launch/sweeps/utils.py +3 -3
- wandb/sdk/launch/utils.py +33 -140
- wandb/sdk/lib/_settings_toposort_generated.py +1 -5
- wandb/sdk/lib/fsm.py +8 -12
- wandb/sdk/lib/gitlib.py +4 -4
- wandb/sdk/lib/import_hooks.py +1 -1
- wandb/sdk/lib/lazyloader.py +0 -1
- wandb/sdk/lib/proto_util.py +23 -2
- wandb/sdk/lib/redirect.py +19 -14
- wandb/sdk/lib/retry.py +3 -2
- wandb/sdk/lib/run_moment.py +7 -1
- wandb/sdk/lib/tracelog.py +1 -1
- wandb/sdk/service/service.py +19 -16
- wandb/sdk/verify/verify.py +2 -1
- wandb/sdk/wandb_init.py +16 -63
- wandb/sdk/wandb_manager.py +2 -2
- wandb/sdk/wandb_require.py +5 -0
- wandb/sdk/wandb_run.py +164 -90
- wandb/sdk/wandb_settings.py +2 -48
- wandb/sdk/wandb_setup.py +1 -1
- wandb/sklearn/__init__.py +1 -0
- wandb/sklearn/plot/__init__.py +1 -0
- wandb/sklearn/plot/classifier.py +11 -12
- wandb/sklearn/plot/clusterer.py +2 -1
- wandb/sklearn/plot/regressor.py +1 -0
- wandb/sklearn/plot/shared.py +1 -0
- wandb/sklearn/utils.py +1 -0
- wandb/testing/relay.py +4 -4
- wandb/trigger.py +1 -0
- wandb/util.py +67 -54
- wandb/wandb_controller.py +2 -3
- wandb/wandb_torch.py +1 -2
- {wandb-0.16.5.dist-info → wandb-0.17.0.dist-info}/METADATA +67 -70
- {wandb-0.16.5.dist-info → wandb-0.17.0.dist-info}/RECORD +178 -188
- {wandb-0.16.5.dist-info → wandb-0.17.0.dist-info}/WHEEL +1 -2
- wandb/bin/apple_gpu_stats +0 -0
- wandb/catboost/__init__.py +0 -9
- wandb/fastai/__init__.py +0 -9
- wandb/keras/__init__.py +0 -18
- wandb/lightgbm/__init__.py +0 -9
- wandb/plots/__init__.py +0 -6
- wandb/plots/explain_text.py +0 -36
- wandb/plots/heatmap.py +0 -81
- wandb/plots/named_entity.py +0 -43
- wandb/plots/part_of_speech.py +0 -50
- wandb/plots/plot_definitions.py +0 -768
- wandb/plots/precision_recall.py +0 -121
- wandb/plots/roc.py +0 -103
- wandb/sacred/__init__.py +0 -3
- wandb/xgboost/__init__.py +0 -9
- wandb-0.16.5.dist-info/top_level.txt +0 -1
- {wandb-0.16.5.dist-info → wandb-0.17.0.dist-info}/entry_points.txt +0 -0
- {wandb-0.16.5.dist-info → wandb-0.17.0.dist-info/licenses}/LICENSE +0 -0
wandb/sdk/launch/agent/agent.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
"""Implementation of launch agent."""
|
2
|
+
|
2
3
|
import asyncio
|
3
4
|
import logging
|
4
5
|
import os
|
@@ -8,7 +9,9 @@ import time
|
|
8
9
|
import traceback
|
9
10
|
from dataclasses import dataclass
|
10
11
|
from multiprocessing import Event
|
11
|
-
from typing import Any, Dict, List, Optional, Union
|
12
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
13
|
+
|
14
|
+
import yaml
|
12
15
|
|
13
16
|
import wandb
|
14
17
|
from wandb.apis.internal import Api
|
@@ -17,11 +20,11 @@ from wandb.sdk.launch._launch_add import launch_add
|
|
17
20
|
from wandb.sdk.launch.runner.local_container import LocalSubmittedRun
|
18
21
|
from wandb.sdk.launch.runner.local_process import LocalProcessRunner
|
19
22
|
from wandb.sdk.launch.sweeps.scheduler import Scheduler
|
23
|
+
from wandb.sdk.launch.utils import LAUNCH_CONFIG_FILE, resolve_build_and_registry_config
|
20
24
|
from wandb.sdk.lib import runid
|
21
25
|
|
22
26
|
from .. import loader
|
23
27
|
from .._project_spec import LaunchProject
|
24
|
-
from ..builder.build import construct_agent_configs
|
25
28
|
from ..errors import LaunchDockerError, LaunchError
|
26
29
|
from ..utils import (
|
27
30
|
LAUNCH_DEFAULT_PROJECT,
|
@@ -45,7 +48,10 @@ MAX_RESUME_COUNT = 5
|
|
45
48
|
|
46
49
|
RUN_INFO_GRACE_PERIOD = 60
|
47
50
|
|
48
|
-
|
51
|
+
DEFAULT_STOPPED_RUN_TIMEOUT = 60
|
52
|
+
|
53
|
+
DEFAULT_PRINT_INTERVAL = 5 * 60
|
54
|
+
VERBOSE_PRINT_INTERVAL = 20
|
49
55
|
|
50
56
|
_env_timeout = os.environ.get("WANDB_LAUNCH_START_TIMEOUT")
|
51
57
|
if _env_timeout:
|
@@ -105,30 +111,54 @@ def _max_from_config(
|
|
105
111
|
return max_from_config
|
106
112
|
|
107
113
|
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
_logger.debug("Recieved runSpec in _is_scheduler_job that was empty")
|
114
|
+
class InternalAgentLogger:
    """Logger for the launch agent's internal progress messages.

    Every message is prefixed with ``LOG_PREFIX`` and always written to the
    module-level ``_logger``. When the agent is configured with a verbosity
    of 2 or higher, the message is additionally echoed to the terminal via
    the corresponding ``wandb.term*`` helper.
    """

    def __init__(self, verbosity: int = 0) -> None:
        """Create the logger.

        Args:
            verbosity: Agent verbosity level; terminal echo is enabled only
                when this is >= 2.
        """
        self._print_to_terminal = verbosity >= 2

    def error(self, message: str) -> None:
        """Log ``message`` at ERROR level (and to the terminal if enabled)."""
        if self._print_to_terminal:
            wandb.termerror(f"{LOG_PREFIX}{message}")
        _logger.error(f"{LOG_PREFIX}{message}")

    def warn(self, message: str) -> None:
        """Log ``message`` at WARNING level (and to the terminal if enabled)."""
        if self._print_to_terminal:
            wandb.termwarn(f"{LOG_PREFIX}{message}")
        # Logger.warn is a deprecated alias that emits a DeprecationWarning;
        # Logger.warning is the supported spelling.
        _logger.warning(f"{LOG_PREFIX}{message}")

    def info(self, message: str) -> None:
        """Log ``message`` at INFO level (and to the terminal if enabled)."""
        if self._print_to_terminal:
            wandb.termlog(f"{LOG_PREFIX}{message}")
        _logger.info(f"{LOG_PREFIX}{message}")

    def debug(self, message: str) -> None:
        """Log ``message`` at DEBUG level (and to the terminal if enabled)."""
        if self._print_to_terminal:
            # There is no debug-specific terminal helper in use here, so
            # debug messages are echoed through termlog like info messages.
            wandb.termlog(f"{LOG_PREFIX}{message}")
        _logger.debug(f"{LOG_PREFIX}{message}")
|
130
137
|
|
131
|
-
|
138
|
+
|
139
|
+
def construct_agent_configs(
    launch_config: Optional[Dict] = None,
    build_config: Optional[Dict] = None,
) -> Tuple[Optional[Dict[str, Any]], Dict[str, Any], Dict[str, Any]]:
    """Assemble the environment, builder and registry configs for the agent.

    Args:
        launch_config: Agent/launch config dictionary. When given, its
            ``"builder"`` and ``"registry"`` entries are used.
        build_config: Builder config to use when ``launch_config`` is None.

    Returns:
        A ``(environment_config, build_config, registry_config)`` tuple. The
        environment config is read from the ``"environment"`` key of the
        on-disk launch config file (``LAUNCH_CONFIG_FILE``) and is None when
        that file does not exist. The build and registry configs are the
        result of ``resolve_build_and_registry_config``.
    """
    registry_config = None
    if launch_config is not None:
        # NOTE: a supplied launch_config replaces the build_config argument,
        # even when launch_config has no "builder" entry (it becomes None).
        build_config = launch_config.get("builder")
        registry_config = launch_config.get("registry")

    environment_config = None
    default_launch_config = None
    config_path = os.path.expanduser(LAUNCH_CONFIG_FILE)
    if os.path.exists(config_path):
        with open(config_path) as config_file:
            loaded = yaml.safe_load(config_file)
        # An empty YAML file loads as None; normalize it to an empty dict.
        default_launch_config = loaded or {}
        environment_config = default_launch_config.get("environment")

    build_config, registry_config = resolve_build_and_registry_config(
        default_launch_config, build_config, registry_config
    )
    return environment_config, build_config, registry_config
|
132
162
|
|
133
163
|
|
134
164
|
class LaunchAgent:
|
@@ -170,7 +200,7 @@ class LaunchAgent:
|
|
170
200
|
config: Config dictionary for the agent.
|
171
201
|
"""
|
172
202
|
self._entity = config["entity"]
|
173
|
-
self._project =
|
203
|
+
self._project = LAUNCH_DEFAULT_PROJECT
|
174
204
|
self._api = api
|
175
205
|
self._base_url = self._api.settings().get("base_url")
|
176
206
|
self._ticks = 0
|
@@ -184,7 +214,13 @@ class LaunchAgent:
|
|
184
214
|
self._max_jobs = _max_from_config(config, "max_jobs")
|
185
215
|
self._max_schedulers = _max_from_config(config, "max_schedulers")
|
186
216
|
self._secure_mode = config.get("secure_mode", False)
|
217
|
+
self._verbosity = config.get("verbosity", 0)
|
218
|
+
self._internal_logger = InternalAgentLogger(verbosity=self._verbosity)
|
219
|
+
self._last_status_print_time = 0.0
|
187
220
|
self.default_config: Dict[str, Any] = config
|
221
|
+
self._stopped_run_timeout = config.get(
|
222
|
+
"stopped_run_timeout", DEFAULT_STOPPED_RUN_TIMEOUT
|
223
|
+
)
|
188
224
|
|
189
225
|
# Get agent version from env var if present, otherwise wandb version
|
190
226
|
self.version: str = "wandb@" + wandb.__version__
|
@@ -228,6 +264,33 @@ class LaunchAgent:
|
|
228
264
|
self._name = agent_response["name"]
|
229
265
|
self._init_agent_run()
|
230
266
|
|
267
|
+
def _is_scheduler_job(self, run_spec: Dict[str, Any]) -> bool:
    """Determine whether a job/runSpec is a sweep scheduler.

    A run spec is a scheduler when its ``"uri"`` equals
    ``Scheduler.PLACEHOLDER_URI``. For ``"local-process"`` resources there is
    an extra requirement: the spec must either reference a ``"job"`` or carry
    an entry point override of the form ``["wandb", "scheduler", <sweep>]``.

    Args:
        run_spec: The run spec dictionary of a queued job. May be empty.

    Returns:
        True if the run spec describes a sweep scheduler, False otherwise
        (including when the run spec is empty or None).
    """
    if not run_spec:
        self._internal_logger.debug(
            "Received runSpec in _is_scheduler_job that was empty"
        )
        # An empty spec cannot be a scheduler; returning here also avoids
        # calling .get() on None.
        return False

    if run_spec.get("uri") != Scheduler.PLACEHOLDER_URI:
        return False

    if run_spec.get("resource") == "local-process":
        # Any job pushed to a run queue that has a scheduler uri is
        # allowed to use local-process
        if run_spec.get("job"):
            return True

        # If a scheduler is local-process and run through CLI, also
        # confirm command is in format: [wandb scheduler <sweep>]
        # "overrides" / "entry_point" may be present but null, so fall back
        # to empty containers instead of relying on the .get() default.
        overrides = run_spec.get("overrides") or {}
        cmd = overrides.get("entry_point") or []
        if len(cmd) < 3:
            return False

        if cmd[:2] != ["wandb", "scheduler"]:
            return False

    return True
|
293
|
+
|
231
294
|
async def fail_run_queue_item(
|
232
295
|
self,
|
233
296
|
run_queue_item_id: str,
|
@@ -241,6 +304,8 @@ class LaunchAgent:
|
|
241
304
|
|
242
305
|
def _init_agent_run(self) -> None:
|
243
306
|
# TODO: has it been long enough that all backends support agents?
|
307
|
+
self._wandb_run = None
|
308
|
+
|
244
309
|
if self.gorilla_supports_agents:
|
245
310
|
settings = wandb.Settings(silent=True, disable_git=True)
|
246
311
|
self._wandb_run = wandb.init(
|
@@ -250,8 +315,6 @@ class LaunchAgent:
|
|
250
315
|
id=self._name,
|
251
316
|
job_type=HIDDEN_AGENT_RUN_TYPE,
|
252
317
|
)
|
253
|
-
else:
|
254
|
-
self._wandb_run = None
|
255
318
|
|
256
319
|
@property
|
257
320
|
def thread_ids(self) -> List[int]:
|
@@ -298,14 +361,12 @@ class LaunchAgent:
|
|
298
361
|
|
299
362
|
def print_status(self) -> None:
|
300
363
|
"""Prints the current status of the agent."""
|
364
|
+
self._last_status_print_time = time.time()
|
301
365
|
output_str = "agent "
|
302
366
|
if self._name:
|
303
367
|
output_str += f"{self._name} "
|
304
368
|
if self.num_running_jobs < self._max_jobs:
|
305
|
-
output_str += "polling on "
|
306
|
-
if self._project != LAUNCH_DEFAULT_PROJECT:
|
307
|
-
output_str += f"project {self._project}, "
|
308
|
-
output_str += f"queues {','.join(self._queues)}, "
|
369
|
+
output_str += f"polling on queues {','.join(self._queues)}, "
|
309
370
|
output_str += (
|
310
371
|
f"running {self.num_running_jobs} out of a maximum of {self._max_jobs} jobs"
|
311
372
|
)
|
@@ -344,8 +405,8 @@ class LaunchAgent:
|
|
344
405
|
if run_state.lower() != "pending":
|
345
406
|
return True
|
346
407
|
except CommError:
|
347
|
-
|
348
|
-
f"Run {entity}/{project}/{run_id} with rqi id: {rqi_id} did not have associated run"
|
408
|
+
self._internal_logger.info(
|
409
|
+
f"Run {entity}/{project}/{run_id} with rqi id: {rqi_id} did not have associated run",
|
349
410
|
)
|
350
411
|
return False
|
351
412
|
|
@@ -361,8 +422,8 @@ class LaunchAgent:
|
|
361
422
|
job_and_run_status.entity is not None
|
362
423
|
and job_and_run_status.entity != self._entity
|
363
424
|
):
|
364
|
-
|
365
|
-
"Skipping check for completed run status because run is on a different entity than agent"
|
425
|
+
self._internal_logger.info(
|
426
|
+
"Skipping check for completed run status because run is on a different entity than agent",
|
366
427
|
)
|
367
428
|
elif exception is not None:
|
368
429
|
tb_str = traceback.format_exception(
|
@@ -378,8 +439,8 @@ class LaunchAgent:
|
|
378
439
|
fnames,
|
379
440
|
)
|
380
441
|
elif job_and_run_status.project is None or job_and_run_status.run_id is None:
|
381
|
-
|
382
|
-
f"called finish_thread_id on thread whose tracker has no project or run id. RunQueueItemID: {job_and_run_status.run_queue_item_id}"
|
442
|
+
self._internal_logger.info(
|
443
|
+
f"called finish_thread_id on thread whose tracker has no project or run id. RunQueueItemID: {job_and_run_status.run_queue_item_id}",
|
383
444
|
)
|
384
445
|
wandb.termerror(
|
385
446
|
"Missing project or run id on thread called finish thread id"
|
@@ -397,7 +458,6 @@ class LaunchAgent:
|
|
397
458
|
# We retry for 60 seconds with an exponential backoff in case
|
398
459
|
# upsert run is taking a while.
|
399
460
|
logs = None
|
400
|
-
start_time = time.time()
|
401
461
|
interval = 1
|
402
462
|
while True:
|
403
463
|
called_init = self._check_run_exists_and_inited(
|
@@ -406,7 +466,7 @@ class LaunchAgent:
|
|
406
466
|
job_and_run_status.run_id,
|
407
467
|
job_and_run_status.run_queue_item_id,
|
408
468
|
)
|
409
|
-
if called_init or
|
469
|
+
if called_init or interval > RUN_INFO_GRACE_PERIOD:
|
410
470
|
break
|
411
471
|
if not called_init:
|
412
472
|
# Fetch the logs now if we don't get run info on the
|
@@ -430,7 +490,9 @@ class LaunchAgent:
|
|
430
490
|
job_and_run_status.run_queue_item_id, _msg, "run", fnames
|
431
491
|
)
|
432
492
|
else:
|
433
|
-
|
493
|
+
self._internal_logger.info(
|
494
|
+
f"Finish thread id {thread_id} had no exception and no run"
|
495
|
+
)
|
434
496
|
wandb._sentry.exception(
|
435
497
|
"launch agent called finish thread id on thread without run or exception"
|
436
498
|
)
|
@@ -458,7 +520,7 @@ class LaunchAgent:
|
|
458
520
|
await self.update_status(AGENT_RUNNING)
|
459
521
|
|
460
522
|
# parse job
|
461
|
-
|
523
|
+
self._internal_logger.info("Parsing launch spec")
|
462
524
|
launch_spec = job["runSpec"]
|
463
525
|
|
464
526
|
# Abort if this job attempts to override secure mode
|
@@ -511,6 +573,10 @@ class LaunchAgent:
|
|
511
573
|
KeyboardInterrupt: if the agent is requested to stop.
|
512
574
|
"""
|
513
575
|
self.print_status()
|
576
|
+
if self._verbosity == 0:
|
577
|
+
print_interval = DEFAULT_PRINT_INTERVAL
|
578
|
+
else:
|
579
|
+
print_interval = VERBOSE_PRINT_INTERVAL
|
514
580
|
try:
|
515
581
|
while True:
|
516
582
|
job = None
|
@@ -532,7 +598,7 @@ class LaunchAgent:
|
|
532
598
|
file_saver = RunQueueItemFileSaver(
|
533
599
|
self._wandb_run, job["runQueueItemId"]
|
534
600
|
)
|
535
|
-
if _is_scheduler_job(job.get("runSpec", {})):
|
601
|
+
if self._is_scheduler_job(job.get("runSpec", {})):
|
536
602
|
# If job is a scheduler, and we are already at the cap, ignore,
|
537
603
|
# don't ack, and it will be pushed back onto the queue in 1 min
|
538
604
|
if self.num_running_schedulers >= self._max_schedulers:
|
@@ -567,6 +633,7 @@ class LaunchAgent:
|
|
567
633
|
await self.update_status(AGENT_POLLING)
|
568
634
|
else:
|
569
635
|
await self.update_status(AGENT_RUNNING)
|
636
|
+
if time.time() - self._last_status_print_time > print_interval:
|
570
637
|
self.print_status()
|
571
638
|
|
572
639
|
if self.num_running_jobs == self._max_jobs or job is None:
|
@@ -634,21 +701,21 @@ class LaunchAgent:
|
|
634
701
|
await self.check_sweep_state(launch_spec, api)
|
635
702
|
|
636
703
|
job_tracker.update_run_info(project)
|
637
|
-
|
704
|
+
self._internal_logger.info("Fetching and validating project...")
|
638
705
|
project.fetch_and_validate_project()
|
639
|
-
|
706
|
+
self._internal_logger.info("Fetching resource...")
|
640
707
|
resource = launch_spec.get("resource") or "local-container"
|
641
708
|
backend_config: Dict[str, Any] = {
|
642
709
|
PROJECT_SYNCHRONOUS: False, # agent always runs async
|
643
710
|
}
|
644
|
-
|
711
|
+
self._internal_logger.info("Loading backend")
|
645
712
|
override_build_config = launch_spec.get("builder")
|
646
713
|
|
647
714
|
_, build_config, registry_config = construct_agent_configs(
|
648
715
|
default_config, override_build_config
|
649
716
|
)
|
650
717
|
image_uri = project.docker_image
|
651
|
-
entrypoint = project.
|
718
|
+
entrypoint = project.get_job_entry_point()
|
652
719
|
environment = loader.environment_from_config(
|
653
720
|
default_config.get("environment", {})
|
654
721
|
)
|
@@ -661,13 +728,13 @@ class LaunchAgent:
|
|
661
728
|
assert entrypoint is not None
|
662
729
|
image_uri = await builder.build_image(project, entrypoint, job_tracker)
|
663
730
|
|
664
|
-
|
731
|
+
self._internal_logger.info("Backend loaded...")
|
665
732
|
if isinstance(backend, LocalProcessRunner):
|
666
733
|
run = await backend.run(project, image_uri)
|
667
734
|
else:
|
668
735
|
assert image_uri
|
669
736
|
run = await backend.run(project, image_uri)
|
670
|
-
if _is_scheduler_job(launch_spec):
|
737
|
+
if self._is_scheduler_job(launch_spec):
|
671
738
|
with self._jobs_lock:
|
672
739
|
self._jobs[thread_id].is_scheduler = True
|
673
740
|
wandb.termlog(
|
@@ -700,7 +767,7 @@ class LaunchAgent:
|
|
700
767
|
if stopped_time is None:
|
701
768
|
stopped_time = time.time()
|
702
769
|
else:
|
703
|
-
if time.time() - stopped_time >
|
770
|
+
if time.time() - stopped_time > self._stopped_run_timeout:
|
704
771
|
await run.cancel()
|
705
772
|
await asyncio.sleep(AGENT_POLLING_INTERVAL)
|
706
773
|
|
@@ -720,7 +787,7 @@ class LaunchAgent:
|
|
720
787
|
project=launch_spec["project"],
|
721
788
|
)
|
722
789
|
except Exception as e:
|
723
|
-
|
790
|
+
self._internal_logger.debug(f"Fetch sweep state error: {e}")
|
724
791
|
state = None
|
725
792
|
|
726
793
|
if state != "RUNNING" and state != "PAUSED":
|
wandb/sdk/launch/agent/config.py
CHANGED
@@ -80,17 +80,7 @@ class RegistryConfig(BaseModel):
|
|
80
80
|
@validator("uri") # type: ignore
|
81
81
|
@classmethod
|
82
82
|
def validate_uri(cls, uri: str) -> str:
|
83
|
-
|
84
|
-
GCP_ARTIFACT_REGISTRY_URI_REGEX,
|
85
|
-
AZURE_CONTAINER_REGISTRY_URI_REGEX,
|
86
|
-
ELASTIC_CONTAINER_REGISTRY_URI_REGEX,
|
87
|
-
]:
|
88
|
-
if regex.match(uri):
|
89
|
-
return uri
|
90
|
-
raise ValueError(
|
91
|
-
"Invalid uri. URI must be a repository URI for an "
|
92
|
-
"ECR, ACR, or GCP Artifact Registry."
|
93
|
-
)
|
83
|
+
return validate_registry_uri(uri)
|
94
84
|
|
95
85
|
|
96
86
|
class EnvironmentConfig(BaseModel):
|
@@ -186,6 +176,14 @@ class BuilderConfig(BaseModel):
|
|
186
176
|
"""Right now there are no required fields for docker builds."""
|
187
177
|
return values
|
188
178
|
|
179
|
+
@validator("destination")  # type: ignore
@classmethod
def validate_destination(cls, destination: Optional[str]) -> Optional[str]:
    """Check the builder destination with ``validate_registry_uri``.

    A missing destination (None) is accepted and passed through unchanged;
    any other value is returned as processed by ``validate_registry_uri``.
    """
    if destination is not None:
        return validate_registry_uri(destination)
    return None
|
186
|
+
|
189
187
|
|
190
188
|
class AgentConfig(BaseModel):
|
191
189
|
"""Configuration for the Launch agent."""
|
@@ -194,9 +192,6 @@ class AgentConfig(BaseModel):
|
|
194
192
|
default=[],
|
195
193
|
description="The queues to use for this agent.",
|
196
194
|
)
|
197
|
-
project: Optional[str] = Field(
|
198
|
-
description="The W&B project to use for this agent.",
|
199
|
-
)
|
200
195
|
entity: Optional[str] = Field(
|
201
196
|
description="The W&B entity to use for this agent.",
|
202
197
|
)
|
@@ -225,6 +220,77 @@ class AgentConfig(BaseModel):
|
|
225
220
|
None,
|
226
221
|
description="The builder to use.",
|
227
222
|
)
|
223
|
+
verbosity: Optional[int] = Field(
|
224
|
+
0,
|
225
|
+
description="How verbose to print, 0 = default, 1 = verbose, 2 = very verbose",
|
226
|
+
)
|
227
|
+
stopped_run_timeout: Optional[int] = Field(
|
228
|
+
60,
|
229
|
+
description="How many seconds to wait after receiving the stop command before forcibly cancelling a run.",
|
230
|
+
)
|
228
231
|
|
229
232
|
class Config:
|
230
233
|
extra = "forbid"
|
234
|
+
|
235
|
+
|
236
|
+
def validate_registry_uri(uri: str) -> str:
    """Check that ``uri`` names an image repository in a container registry.

    A leading ``https://`` is stripped before matching. The URI is then
    tested, in order, against the GCP Artifact Registry, Azure Container
    Registry and Elastic Container Registry patterns:

    * If a pattern matches and the URI carries a tag, a ``ValueError`` is
      raised, because built images may not have a tag in their destination.
    * If a pattern matches but the image/repository part is missing, a
      ``ValueError`` describing the expected format is raised.
    * If a pattern matches cleanly, the (prefix-stripped) URI is returned.

    When no pattern matches, a warning is printed saying the registry type
    was not recognized, and the (prefix-stripped) URI is returned anyway.
    """
    tag_msg = (
        "Destination for built images may not include a tag, but the URI provided "
        "includes the suffix '{tag}'. Please remove the tag and try again. The agent "
        "will automatically tag each image with a unique hash of the source code."
    )
    # One row per recognized registry type, checked in this order:
    # (pattern, regex group that must be non-empty, error if it is empty).
    known_registries = (
        (
            GCP_ARTIFACT_REGISTRY_URI_REGEX,
            "image_name",
            "An image name must be specified in the URI for a GCP Artifact Registry. "
            "Please provide a uri with the format "
            "'https://<region>-docker.pkg.dev/<project>/<repository>/<image>'.",
        ),
        (
            AZURE_CONTAINER_REGISTRY_URI_REGEX,
            "repository",
            "A repository name must be specified in the URI for an "
            "Azure Container Registry. Please provide a uri with the format "
            "'https://<registry-name>.azurecr.io/<repository>'.",
        ),
        (
            ELASTIC_CONTAINER_REGISTRY_URI_REGEX,
            "repository",
            "A repository name must be specified in the URI for an "
            "Elastic Container Registry. Please provide a uri with the format "
            "'https://<account-id>.dkr.ecr.<region>.amazonaws.com/<repository>'.",
        ),
    )

    scheme = "https://"
    if uri.startswith(scheme):
        uri = uri[len(scheme) :]

    for pattern, required_group, missing_msg in known_registries:
        found = pattern.match(uri)
        if not found:
            continue
        tag = found.group("tag")
        if tag:
            raise ValueError(tag_msg.format(tag=tag))
        if not found.group(required_group):
            raise ValueError(missing_msg)
        return uri

    wandb.termwarn(
        f"Unable to recognize registry type in URI {uri}. You are responsible "
        "for ensuring the agent can push images to this registry."
    )
    return uri
|
@@ -1,4 +1,5 @@
|
|
1
1
|
"""Abstract plugin class defining the interface needed to build container images for W&B Launch."""
|
2
|
+
|
2
3
|
from abc import ABC, abstractmethod
|
3
4
|
from typing import TYPE_CHECKING, Any, Dict, Optional
|
4
5
|
|
@@ -6,6 +7,12 @@ from wandb.sdk.launch.environment.abstract import AbstractEnvironment
|
|
6
7
|
from wandb.sdk.launch.registry.abstract import AbstractRegistry
|
7
8
|
|
8
9
|
from .._project_spec import EntryPoint, LaunchProject
|
10
|
+
from ..registry.anon import AnonynmousRegistry
|
11
|
+
from ..utils import (
|
12
|
+
AZURE_CONTAINER_REGISTRY_URI_REGEX,
|
13
|
+
ELASTIC_CONTAINER_REGISTRY_URI_REGEX,
|
14
|
+
GCP_ARTIFACT_REGISTRY_URI_REGEX,
|
15
|
+
)
|
9
16
|
|
10
17
|
if TYPE_CHECKING:
|
11
18
|
from wandb.sdk.launch.agent.job_status_tracker import JobAndRunStatusTracker
|
@@ -34,7 +41,7 @@ class AbstractBuilder(ABC):
|
|
34
41
|
verify: Whether to verify the functionality of the builder.
|
35
42
|
|
36
43
|
Raises:
|
37
|
-
LaunchError: If the builder cannot be
|
44
|
+
LaunchError: If the builder cannot be initialized or verified.
|
38
45
|
"""
|
39
46
|
raise NotImplementedError
|
40
47
|
|
@@ -86,3 +93,64 @@ class AbstractBuilder(ABC):
|
|
86
93
|
LaunchError: If the builder cannot be used to build images.
|
87
94
|
"""
|
88
95
|
raise NotImplementedError
|
96
|
+
|
97
|
+
|
98
|
+
def registry_from_uri(uri: str) -> AbstractRegistry:
    """Build the registry helper object that corresponds to a registry URI.

    The URI is classified by matching it against the patterns for the
    supported remote registries, and the helper class for the first match
    is imported lazily and instantiated:

        - Azure Container Registry:
          <registry-name>.azurecr.io/<repo-name>/<image-name>
        - Google Artifact Registry:
          <location>-docker.pkg.dev/<project-id>/<repo-name>/<image-name>
        - AWS Elastic Container Registry:
          <account-id>.dkr.ecr.<region>.amazonaws.com/<repo-name>/<image-name>

    The `https://` prefix is optional for all of the above and is removed
    before matching. A URI that matches none of the patterns is wrapped in
    an anonymous registry helper instead.

    Arguments:
        uri: The uri to create a registry from.

    Returns:
        The registry helper for the given uri.

    Raises:
        LaunchError: Presumably raised by the registry helper constructors
            when the helper cannot be loaded -- not visible in this function.
    """
    scheme = "https://"
    if uri.startswith(scheme):
        uri = uri[len(scheme) :]

    # Cloud-specific helpers are imported only when needed, so their cloud
    # SDK dependencies are not required for unrelated registries.
    if AZURE_CONTAINER_REGISTRY_URI_REGEX.match(uri):
        from wandb.sdk.launch.registry.azure_container_registry import (
            AzureContainerRegistry,
        )

        return AzureContainerRegistry(uri=uri)

    if GCP_ARTIFACT_REGISTRY_URI_REGEX.match(uri):
        from wandb.sdk.launch.registry.google_artifact_registry import (
            GoogleArtifactRegistry,
        )

        return GoogleArtifactRegistry(uri=uri)

    if ELASTIC_CONTAINER_REGISTRY_URI_REGEX.match(uri):
        from wandb.sdk.launch.registry.elastic_container_registry import (
            ElasticContainerRegistry,
        )

        return ElasticContainerRegistry(uri=uri)

    return AnonynmousRegistry(uri=uri)
|