wandb 0.18.1__py3-none-macosx_11_0_x86_64.whl
Sign up to get free protection for your applications and to get access to all the features.
- package_readme.md +89 -0
- wandb/__init__.py +245 -0
- wandb/__init__.pyi +1084 -0
- wandb/__main__.py +3 -0
- wandb/_globals.py +19 -0
- wandb/agents/__init__.py +0 -0
- wandb/agents/pyagent.py +363 -0
- wandb/analytics/__init__.py +3 -0
- wandb/analytics/sentry.py +266 -0
- wandb/apis/__init__.py +48 -0
- wandb/apis/attrs.py +40 -0
- wandb/apis/importers/__init__.py +1 -0
- wandb/apis/importers/internals/internal.py +385 -0
- wandb/apis/importers/internals/protocols.py +99 -0
- wandb/apis/importers/internals/util.py +78 -0
- wandb/apis/importers/mlflow.py +254 -0
- wandb/apis/importers/validation.py +108 -0
- wandb/apis/importers/wandb.py +1603 -0
- wandb/apis/internal.py +229 -0
- wandb/apis/normalize.py +89 -0
- wandb/apis/paginator.py +81 -0
- wandb/apis/public/__init__.py +34 -0
- wandb/apis/public/api.py +1179 -0
- wandb/apis/public/artifacts.py +1086 -0
- wandb/apis/public/const.py +4 -0
- wandb/apis/public/files.py +195 -0
- wandb/apis/public/history.py +149 -0
- wandb/apis/public/jobs.py +651 -0
- wandb/apis/public/projects.py +154 -0
- wandb/apis/public/query_generator.py +166 -0
- wandb/apis/public/reports.py +469 -0
- wandb/apis/public/runs.py +903 -0
- wandb/apis/public/sweeps.py +240 -0
- wandb/apis/public/teams.py +198 -0
- wandb/apis/public/users.py +136 -0
- wandb/apis/reports/__init__.py +1 -0
- wandb/apis/reports/v1/__init__.py +8 -0
- wandb/apis/reports/v2/__init__.py +8 -0
- wandb/apis/workspaces/__init__.py +8 -0
- wandb/beta/workflows.py +288 -0
- wandb/bin/wandb-core +0 -0
- wandb/cli/__init__.py +0 -0
- wandb/cli/cli.py +3007 -0
- wandb/data_types.py +63 -0
- wandb/docker/__init__.py +342 -0
- wandb/docker/auth.py +436 -0
- wandb/docker/wandb-entrypoint.sh +33 -0
- wandb/docker/www_authenticate.py +94 -0
- wandb/env.py +514 -0
- wandb/errors/__init__.py +46 -0
- wandb/errors/term.py +103 -0
- wandb/errors/util.py +57 -0
- wandb/filesync/__init__.py +0 -0
- wandb/filesync/dir_watcher.py +403 -0
- wandb/filesync/stats.py +100 -0
- wandb/filesync/step_checksum.py +142 -0
- wandb/filesync/step_prepare.py +179 -0
- wandb/filesync/step_upload.py +290 -0
- wandb/filesync/upload_job.py +142 -0
- wandb/integration/__init__.py +0 -0
- wandb/integration/catboost/__init__.py +5 -0
- wandb/integration/catboost/catboost.py +178 -0
- wandb/integration/cohere/__init__.py +3 -0
- wandb/integration/cohere/cohere.py +21 -0
- wandb/integration/cohere/resolver.py +347 -0
- wandb/integration/diffusers/__init__.py +3 -0
- wandb/integration/diffusers/autologger.py +76 -0
- wandb/integration/diffusers/pipeline_resolver.py +50 -0
- wandb/integration/diffusers/resolvers/__init__.py +9 -0
- wandb/integration/diffusers/resolvers/multimodal.py +882 -0
- wandb/integration/diffusers/resolvers/utils.py +102 -0
- wandb/integration/fastai/__init__.py +249 -0
- wandb/integration/gym/__init__.py +105 -0
- wandb/integration/huggingface/__init__.py +3 -0
- wandb/integration/huggingface/huggingface.py +18 -0
- wandb/integration/huggingface/resolver.py +213 -0
- wandb/integration/keras/__init__.py +11 -0
- wandb/integration/keras/callbacks/__init__.py +5 -0
- wandb/integration/keras/callbacks/metrics_logger.py +136 -0
- wandb/integration/keras/callbacks/model_checkpoint.py +195 -0
- wandb/integration/keras/callbacks/tables_builder.py +226 -0
- wandb/integration/keras/keras.py +1091 -0
- wandb/integration/kfp/__init__.py +6 -0
- wandb/integration/kfp/helpers.py +28 -0
- wandb/integration/kfp/kfp_patch.py +324 -0
- wandb/integration/kfp/wandb_logging.py +182 -0
- wandb/integration/langchain/__init__.py +3 -0
- wandb/integration/langchain/wandb_tracer.py +48 -0
- wandb/integration/lightgbm/__init__.py +239 -0
- wandb/integration/lightning/__init__.py +0 -0
- wandb/integration/lightning/fabric/__init__.py +3 -0
- wandb/integration/lightning/fabric/logger.py +762 -0
- wandb/integration/magic.py +556 -0
- wandb/integration/metaflow/__init__.py +3 -0
- wandb/integration/metaflow/metaflow.py +383 -0
- wandb/integration/openai/__init__.py +3 -0
- wandb/integration/openai/fine_tuning.py +480 -0
- wandb/integration/openai/openai.py +22 -0
- wandb/integration/openai/resolver.py +240 -0
- wandb/integration/prodigy/__init__.py +3 -0
- wandb/integration/prodigy/prodigy.py +299 -0
- wandb/integration/sacred/__init__.py +117 -0
- wandb/integration/sagemaker/__init__.py +12 -0
- wandb/integration/sagemaker/auth.py +28 -0
- wandb/integration/sagemaker/config.py +49 -0
- wandb/integration/sagemaker/files.py +3 -0
- wandb/integration/sagemaker/resources.py +34 -0
- wandb/integration/sb3/__init__.py +3 -0
- wandb/integration/sb3/sb3.py +153 -0
- wandb/integration/sklearn/__init__.py +37 -0
- wandb/integration/sklearn/calculate/__init__.py +32 -0
- wandb/integration/sklearn/calculate/calibration_curves.py +125 -0
- wandb/integration/sklearn/calculate/class_proportions.py +68 -0
- wandb/integration/sklearn/calculate/confusion_matrix.py +93 -0
- wandb/integration/sklearn/calculate/decision_boundaries.py +40 -0
- wandb/integration/sklearn/calculate/elbow_curve.py +55 -0
- wandb/integration/sklearn/calculate/feature_importances.py +67 -0
- wandb/integration/sklearn/calculate/learning_curve.py +64 -0
- wandb/integration/sklearn/calculate/outlier_candidates.py +69 -0
- wandb/integration/sklearn/calculate/residuals.py +86 -0
- wandb/integration/sklearn/calculate/silhouette.py +118 -0
- wandb/integration/sklearn/calculate/summary_metrics.py +62 -0
- wandb/integration/sklearn/plot/__init__.py +35 -0
- wandb/integration/sklearn/plot/classifier.py +329 -0
- wandb/integration/sklearn/plot/clusterer.py +146 -0
- wandb/integration/sklearn/plot/regressor.py +121 -0
- wandb/integration/sklearn/plot/shared.py +91 -0
- wandb/integration/sklearn/utils.py +183 -0
- wandb/integration/tensorboard/__init__.py +10 -0
- wandb/integration/tensorboard/log.py +355 -0
- wandb/integration/tensorboard/monkeypatch.py +185 -0
- wandb/integration/tensorflow/__init__.py +5 -0
- wandb/integration/tensorflow/estimator_hook.py +54 -0
- wandb/integration/torch/__init__.py +0 -0
- wandb/integration/torch/wandb_torch.py +554 -0
- wandb/integration/ultralytics/__init__.py +11 -0
- wandb/integration/ultralytics/bbox_utils.py +208 -0
- wandb/integration/ultralytics/callback.py +524 -0
- wandb/integration/ultralytics/classification_utils.py +83 -0
- wandb/integration/ultralytics/mask_utils.py +202 -0
- wandb/integration/ultralytics/pose_utils.py +103 -0
- wandb/integration/xgboost/__init__.py +11 -0
- wandb/integration/xgboost/xgboost.py +189 -0
- wandb/integration/yolov8/__init__.py +0 -0
- wandb/integration/yolov8/yolov8.py +284 -0
- wandb/jupyter.py +515 -0
- wandb/magic.py +3 -0
- wandb/mpmain/__init__.py +0 -0
- wandb/mpmain/__main__.py +1 -0
- wandb/old/__init__.py +0 -0
- wandb/old/core.py +131 -0
- wandb/old/settings.py +173 -0
- wandb/old/summary.py +440 -0
- wandb/plot/__init__.py +19 -0
- wandb/plot/bar.py +42 -0
- wandb/plot/confusion_matrix.py +99 -0
- wandb/plot/histogram.py +36 -0
- wandb/plot/line.py +40 -0
- wandb/plot/line_series.py +88 -0
- wandb/plot/pr_curve.py +136 -0
- wandb/plot/roc_curve.py +118 -0
- wandb/plot/scatter.py +32 -0
- wandb/plot/utils.py +183 -0
- wandb/proto/__init__.py +0 -0
- wandb/proto/v3/__init__.py +0 -0
- wandb/proto/v3/wandb_base_pb2.py +55 -0
- wandb/proto/v3/wandb_internal_pb2.py +1608 -0
- wandb/proto/v3/wandb_server_pb2.py +208 -0
- wandb/proto/v3/wandb_settings_pb2.py +112 -0
- wandb/proto/v3/wandb_telemetry_pb2.py +106 -0
- wandb/proto/v4/__init__.py +0 -0
- wandb/proto/v4/wandb_base_pb2.py +30 -0
- wandb/proto/v4/wandb_internal_pb2.py +360 -0
- wandb/proto/v4/wandb_server_pb2.py +63 -0
- wandb/proto/v4/wandb_settings_pb2.py +45 -0
- wandb/proto/v4/wandb_telemetry_pb2.py +41 -0
- wandb/proto/v5/wandb_base_pb2.py +31 -0
- wandb/proto/v5/wandb_internal_pb2.py +361 -0
- wandb/proto/v5/wandb_server_pb2.py +64 -0
- wandb/proto/v5/wandb_settings_pb2.py +46 -0
- wandb/proto/v5/wandb_telemetry_pb2.py +42 -0
- wandb/proto/wandb_base_pb2.py +10 -0
- wandb/proto/wandb_deprecated.py +53 -0
- wandb/proto/wandb_generate_deprecated.py +34 -0
- wandb/proto/wandb_generate_proto.py +49 -0
- wandb/proto/wandb_internal_pb2.py +16 -0
- wandb/proto/wandb_server_pb2.py +10 -0
- wandb/proto/wandb_settings_pb2.py +10 -0
- wandb/proto/wandb_telemetry_pb2.py +10 -0
- wandb/py.typed +0 -0
- wandb/sdk/__init__.py +37 -0
- wandb/sdk/artifacts/__init__.py +0 -0
- wandb/sdk/artifacts/_validators.py +45 -0
- wandb/sdk/artifacts/artifact.py +2415 -0
- wandb/sdk/artifacts/artifact_download_logger.py +43 -0
- wandb/sdk/artifacts/artifact_file_cache.py +251 -0
- wandb/sdk/artifacts/artifact_instance_cache.py +15 -0
- wandb/sdk/artifacts/artifact_manifest.py +72 -0
- wandb/sdk/artifacts/artifact_manifest_entry.py +247 -0
- wandb/sdk/artifacts/artifact_manifests/__init__.py +0 -0
- wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +90 -0
- wandb/sdk/artifacts/artifact_saver.py +267 -0
- wandb/sdk/artifacts/artifact_state.py +11 -0
- wandb/sdk/artifacts/artifact_ttl.py +7 -0
- wandb/sdk/artifacts/exceptions.py +56 -0
- wandb/sdk/artifacts/staging.py +25 -0
- wandb/sdk/artifacts/storage_handler.py +60 -0
- wandb/sdk/artifacts/storage_handlers/__init__.py +0 -0
- wandb/sdk/artifacts/storage_handlers/azure_handler.py +206 -0
- wandb/sdk/artifacts/storage_handlers/gcs_handler.py +226 -0
- wandb/sdk/artifacts/storage_handlers/http_handler.py +113 -0
- wandb/sdk/artifacts/storage_handlers/local_file_handler.py +139 -0
- wandb/sdk/artifacts/storage_handlers/multi_handler.py +54 -0
- wandb/sdk/artifacts/storage_handlers/s3_handler.py +300 -0
- wandb/sdk/artifacts/storage_handlers/tracking_handler.py +70 -0
- wandb/sdk/artifacts/storage_handlers/wb_artifact_handler.py +133 -0
- wandb/sdk/artifacts/storage_handlers/wb_local_artifact_handler.py +72 -0
- wandb/sdk/artifacts/storage_layout.py +6 -0
- wandb/sdk/artifacts/storage_policies/__init__.py +4 -0
- wandb/sdk/artifacts/storage_policies/register.py +1 -0
- wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +376 -0
- wandb/sdk/artifacts/storage_policy.py +72 -0
- wandb/sdk/backend/__init__.py +0 -0
- wandb/sdk/backend/backend.py +240 -0
- wandb/sdk/data_types/__init__.py +0 -0
- wandb/sdk/data_types/_dtypes.py +914 -0
- wandb/sdk/data_types/_private.py +10 -0
- wandb/sdk/data_types/audio.py +165 -0
- wandb/sdk/data_types/base_types/__init__.py +0 -0
- wandb/sdk/data_types/base_types/json_metadata.py +55 -0
- wandb/sdk/data_types/base_types/media.py +315 -0
- wandb/sdk/data_types/base_types/wb_value.py +274 -0
- wandb/sdk/data_types/bokeh.py +70 -0
- wandb/sdk/data_types/graph.py +405 -0
- wandb/sdk/data_types/helper_types/__init__.py +0 -0
- wandb/sdk/data_types/helper_types/bounding_boxes_2d.py +295 -0
- wandb/sdk/data_types/helper_types/classes.py +159 -0
- wandb/sdk/data_types/helper_types/image_mask.py +235 -0
- wandb/sdk/data_types/histogram.py +96 -0
- wandb/sdk/data_types/html.py +115 -0
- wandb/sdk/data_types/image.py +845 -0
- wandb/sdk/data_types/molecule.py +241 -0
- wandb/sdk/data_types/object_3d.py +474 -0
- wandb/sdk/data_types/plotly.py +82 -0
- wandb/sdk/data_types/saved_model.py +446 -0
- wandb/sdk/data_types/table.py +1204 -0
- wandb/sdk/data_types/trace_tree.py +438 -0
- wandb/sdk/data_types/utils.py +229 -0
- wandb/sdk/data_types/video.py +247 -0
- wandb/sdk/integration_utils/__init__.py +0 -0
- wandb/sdk/integration_utils/auto_logging.py +239 -0
- wandb/sdk/integration_utils/data_logging.py +475 -0
- wandb/sdk/interface/__init__.py +0 -0
- wandb/sdk/interface/constants.py +4 -0
- wandb/sdk/interface/interface.py +996 -0
- wandb/sdk/interface/interface_queue.py +59 -0
- wandb/sdk/interface/interface_relay.py +53 -0
- wandb/sdk/interface/interface_shared.py +549 -0
- wandb/sdk/interface/interface_sock.py +61 -0
- wandb/sdk/interface/message_future.py +27 -0
- wandb/sdk/interface/message_future_poll.py +50 -0
- wandb/sdk/interface/router.py +118 -0
- wandb/sdk/interface/router_queue.py +44 -0
- wandb/sdk/interface/router_relay.py +39 -0
- wandb/sdk/interface/router_sock.py +36 -0
- wandb/sdk/interface/summary_record.py +67 -0
- wandb/sdk/internal/__init__.py +0 -0
- wandb/sdk/internal/context.py +89 -0
- wandb/sdk/internal/datastore.py +297 -0
- wandb/sdk/internal/file_pusher.py +181 -0
- wandb/sdk/internal/file_stream.py +695 -0
- wandb/sdk/internal/flow_control.py +263 -0
- wandb/sdk/internal/handler.py +911 -0
- wandb/sdk/internal/internal.py +417 -0
- wandb/sdk/internal/internal_api.py +4287 -0
- wandb/sdk/internal/internal_util.py +100 -0
- wandb/sdk/internal/job_builder.py +629 -0
- wandb/sdk/internal/profiler.py +78 -0
- wandb/sdk/internal/progress.py +83 -0
- wandb/sdk/internal/run.py +25 -0
- wandb/sdk/internal/sample.py +70 -0
- wandb/sdk/internal/sender.py +1729 -0
- wandb/sdk/internal/sender_config.py +197 -0
- wandb/sdk/internal/settings_static.py +90 -0
- wandb/sdk/internal/system/__init__.py +0 -0
- wandb/sdk/internal/system/assets/__init__.py +27 -0
- wandb/sdk/internal/system/assets/aggregators.py +37 -0
- wandb/sdk/internal/system/assets/asset_registry.py +20 -0
- wandb/sdk/internal/system/assets/cpu.py +163 -0
- wandb/sdk/internal/system/assets/disk.py +210 -0
- wandb/sdk/internal/system/assets/gpu.py +416 -0
- wandb/sdk/internal/system/assets/gpu_amd.py +239 -0
- wandb/sdk/internal/system/assets/gpu_apple.py +177 -0
- wandb/sdk/internal/system/assets/interfaces.py +207 -0
- wandb/sdk/internal/system/assets/ipu.py +177 -0
- wandb/sdk/internal/system/assets/memory.py +166 -0
- wandb/sdk/internal/system/assets/network.py +125 -0
- wandb/sdk/internal/system/assets/open_metrics.py +299 -0
- wandb/sdk/internal/system/assets/tpu.py +154 -0
- wandb/sdk/internal/system/assets/trainium.py +399 -0
- wandb/sdk/internal/system/env_probe_helpers.py +13 -0
- wandb/sdk/internal/system/system_info.py +249 -0
- wandb/sdk/internal/system/system_monitor.py +229 -0
- wandb/sdk/internal/tb_watcher.py +518 -0
- wandb/sdk/internal/thread_local_settings.py +18 -0
- wandb/sdk/internal/update.py +113 -0
- wandb/sdk/internal/writer.py +206 -0
- wandb/sdk/launch/__init__.py +14 -0
- wandb/sdk/launch/_launch.py +330 -0
- wandb/sdk/launch/_launch_add.py +255 -0
- wandb/sdk/launch/_project_spec.py +566 -0
- wandb/sdk/launch/agent/__init__.py +5 -0
- wandb/sdk/launch/agent/agent.py +924 -0
- wandb/sdk/launch/agent/config.py +296 -0
- wandb/sdk/launch/agent/job_status_tracker.py +53 -0
- wandb/sdk/launch/agent/run_queue_item_file_saver.py +45 -0
- wandb/sdk/launch/builder/__init__.py +0 -0
- wandb/sdk/launch/builder/abstract.py +156 -0
- wandb/sdk/launch/builder/build.py +297 -0
- wandb/sdk/launch/builder/context_manager.py +235 -0
- wandb/sdk/launch/builder/docker_builder.py +177 -0
- wandb/sdk/launch/builder/kaniko_builder.py +595 -0
- wandb/sdk/launch/builder/noop.py +58 -0
- wandb/sdk/launch/builder/templates/_wandb_bootstrap.py +188 -0
- wandb/sdk/launch/builder/templates/dockerfile.py +92 -0
- wandb/sdk/launch/create_job.py +528 -0
- wandb/sdk/launch/environment/abstract.py +29 -0
- wandb/sdk/launch/environment/aws_environment.py +322 -0
- wandb/sdk/launch/environment/azure_environment.py +105 -0
- wandb/sdk/launch/environment/gcp_environment.py +335 -0
- wandb/sdk/launch/environment/local_environment.py +66 -0
- wandb/sdk/launch/errors.py +19 -0
- wandb/sdk/launch/git_reference.py +109 -0
- wandb/sdk/launch/inputs/files.py +148 -0
- wandb/sdk/launch/inputs/internal.py +315 -0
- wandb/sdk/launch/inputs/manage.py +113 -0
- wandb/sdk/launch/inputs/schema.py +39 -0
- wandb/sdk/launch/loader.py +249 -0
- wandb/sdk/launch/registry/abstract.py +48 -0
- wandb/sdk/launch/registry/anon.py +29 -0
- wandb/sdk/launch/registry/azure_container_registry.py +124 -0
- wandb/sdk/launch/registry/elastic_container_registry.py +192 -0
- wandb/sdk/launch/registry/google_artifact_registry.py +219 -0
- wandb/sdk/launch/registry/local_registry.py +67 -0
- wandb/sdk/launch/runner/__init__.py +0 -0
- wandb/sdk/launch/runner/abstract.py +195 -0
- wandb/sdk/launch/runner/kubernetes_monitor.py +474 -0
- wandb/sdk/launch/runner/kubernetes_runner.py +963 -0
- wandb/sdk/launch/runner/local_container.py +301 -0
- wandb/sdk/launch/runner/local_process.py +78 -0
- wandb/sdk/launch/runner/sagemaker_runner.py +426 -0
- wandb/sdk/launch/runner/vertex_runner.py +230 -0
- wandb/sdk/launch/sweeps/__init__.py +39 -0
- wandb/sdk/launch/sweeps/scheduler.py +742 -0
- wandb/sdk/launch/sweeps/scheduler_sweep.py +91 -0
- wandb/sdk/launch/sweeps/utils.py +316 -0
- wandb/sdk/launch/utils.py +746 -0
- wandb/sdk/launch/wandb_reference.py +138 -0
- wandb/sdk/lib/__init__.py +5 -0
- wandb/sdk/lib/_settings_toposort_generate.py +159 -0
- wandb/sdk/lib/_settings_toposort_generated.py +249 -0
- wandb/sdk/lib/_wburls_generate.py +25 -0
- wandb/sdk/lib/_wburls_generated.py +22 -0
- wandb/sdk/lib/apikey.py +273 -0
- wandb/sdk/lib/capped_dict.py +26 -0
- wandb/sdk/lib/config_util.py +101 -0
- wandb/sdk/lib/credentials.py +141 -0
- wandb/sdk/lib/deprecate.py +42 -0
- wandb/sdk/lib/disabled.py +29 -0
- wandb/sdk/lib/exit_hooks.py +54 -0
- wandb/sdk/lib/file_stream_utils.py +118 -0
- wandb/sdk/lib/filenames.py +64 -0
- wandb/sdk/lib/filesystem.py +372 -0
- wandb/sdk/lib/fsm.py +174 -0
- wandb/sdk/lib/gitlib.py +239 -0
- wandb/sdk/lib/gql_request.py +65 -0
- wandb/sdk/lib/handler_util.py +21 -0
- wandb/sdk/lib/hashutil.py +62 -0
- wandb/sdk/lib/import_hooks.py +275 -0
- wandb/sdk/lib/ipython.py +146 -0
- wandb/sdk/lib/json_util.py +80 -0
- wandb/sdk/lib/lazyloader.py +63 -0
- wandb/sdk/lib/mailbox.py +460 -0
- wandb/sdk/lib/module.py +69 -0
- wandb/sdk/lib/paths.py +106 -0
- wandb/sdk/lib/preinit.py +42 -0
- wandb/sdk/lib/printer.py +313 -0
- wandb/sdk/lib/proto_util.py +90 -0
- wandb/sdk/lib/redirect.py +845 -0
- wandb/sdk/lib/reporting.py +99 -0
- wandb/sdk/lib/retry.py +289 -0
- wandb/sdk/lib/run_moment.py +78 -0
- wandb/sdk/lib/runid.py +12 -0
- wandb/sdk/lib/server.py +52 -0
- wandb/sdk/lib/sock_client.py +291 -0
- wandb/sdk/lib/sparkline.py +45 -0
- wandb/sdk/lib/telemetry.py +100 -0
- wandb/sdk/lib/timed_input.py +133 -0
- wandb/sdk/lib/timer.py +19 -0
- wandb/sdk/lib/tracelog.py +255 -0
- wandb/sdk/lib/viz.py +123 -0
- wandb/sdk/lib/wburls.py +46 -0
- wandb/sdk/service/__init__.py +0 -0
- wandb/sdk/service/_startup_debug.py +22 -0
- wandb/sdk/service/port_file.py +53 -0
- wandb/sdk/service/server.py +119 -0
- wandb/sdk/service/server_sock.py +276 -0
- wandb/sdk/service/service.py +264 -0
- wandb/sdk/service/service_base.py +50 -0
- wandb/sdk/service/service_sock.py +70 -0
- wandb/sdk/service/streams.py +417 -0
- wandb/sdk/verify/__init__.py +0 -0
- wandb/sdk/verify/verify.py +501 -0
- wandb/sdk/wandb_alerts.py +12 -0
- wandb/sdk/wandb_config.py +322 -0
- wandb/sdk/wandb_helper.py +54 -0
- wandb/sdk/wandb_init.py +1256 -0
- wandb/sdk/wandb_login.py +349 -0
- wandb/sdk/wandb_manager.py +232 -0
- wandb/sdk/wandb_metric.py +110 -0
- wandb/sdk/wandb_require.py +97 -0
- wandb/sdk/wandb_require_helpers.py +44 -0
- wandb/sdk/wandb_run.py +4231 -0
- wandb/sdk/wandb_settings.py +1999 -0
- wandb/sdk/wandb_setup.py +400 -0
- wandb/sdk/wandb_summary.py +150 -0
- wandb/sdk/wandb_sweep.py +119 -0
- wandb/sdk/wandb_sync.py +75 -0
- wandb/sdk/wandb_watch.py +128 -0
- wandb/sklearn.py +35 -0
- wandb/sync/__init__.py +3 -0
- wandb/sync/sync.py +443 -0
- wandb/trigger.py +29 -0
- wandb/util.py +1949 -0
- wandb/vendor/__init__.py +0 -0
- wandb/vendor/gql-0.2.0/setup.py +40 -0
- wandb/vendor/gql-0.2.0/tests/__init__.py +0 -0
- wandb/vendor/gql-0.2.0/tests/starwars/__init__.py +0 -0
- wandb/vendor/gql-0.2.0/tests/starwars/fixtures.py +96 -0
- wandb/vendor/gql-0.2.0/tests/starwars/schema.py +146 -0
- wandb/vendor/gql-0.2.0/tests/starwars/test_dsl.py +293 -0
- wandb/vendor/gql-0.2.0/tests/starwars/test_query.py +355 -0
- wandb/vendor/gql-0.2.0/tests/starwars/test_validation.py +171 -0
- wandb/vendor/gql-0.2.0/tests/test_client.py +31 -0
- wandb/vendor/gql-0.2.0/tests/test_transport.py +89 -0
- wandb/vendor/gql-0.2.0/wandb_gql/__init__.py +4 -0
- wandb/vendor/gql-0.2.0/wandb_gql/client.py +75 -0
- wandb/vendor/gql-0.2.0/wandb_gql/dsl.py +152 -0
- wandb/vendor/gql-0.2.0/wandb_gql/gql.py +10 -0
- wandb/vendor/gql-0.2.0/wandb_gql/transport/__init__.py +0 -0
- wandb/vendor/gql-0.2.0/wandb_gql/transport/http.py +6 -0
- wandb/vendor/gql-0.2.0/wandb_gql/transport/local_schema.py +15 -0
- wandb/vendor/gql-0.2.0/wandb_gql/transport/requests.py +46 -0
- wandb/vendor/gql-0.2.0/wandb_gql/utils.py +21 -0
- wandb/vendor/graphql-core-1.1/setup.py +86 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/__init__.py +287 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/error/__init__.py +6 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/error/base.py +42 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/error/format_error.py +11 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/error/located_error.py +29 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/error/syntax_error.py +36 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/execution/__init__.py +26 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/execution/base.py +311 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executor.py +398 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executors/__init__.py +0 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executors/asyncio.py +53 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executors/gevent.py +22 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executors/process.py +32 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executors/sync.py +7 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executors/thread.py +35 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/execution/executors/utils.py +6 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/execution/experimental/__init__.py +0 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/execution/experimental/executor.py +66 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/execution/experimental/fragment.py +252 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/execution/experimental/resolver.py +151 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/execution/experimental/utils.py +7 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/execution/middleware.py +57 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/execution/values.py +145 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/graphql.py +60 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/language/__init__.py +0 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/language/ast.py +1349 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/language/base.py +19 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/language/lexer.py +435 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/language/location.py +30 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/language/parser.py +779 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/language/printer.py +193 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/language/source.py +18 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/language/visitor.py +222 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/language/visitor_meta.py +82 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/pyutils/__init__.py +0 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/pyutils/cached_property.py +17 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/pyutils/contain_subset.py +28 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/pyutils/default_ordered_dict.py +40 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/pyutils/ordereddict.py +8 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/pyutils/pair_set.py +43 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/pyutils/version.py +78 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/type/__init__.py +67 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/type/definition.py +619 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/type/directives.py +132 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/type/introspection.py +440 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/type/scalars.py +131 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/type/schema.py +100 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/type/typemap.py +145 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/__init__.py +0 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/assert_valid_name.py +9 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/ast_from_value.py +65 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/ast_to_code.py +49 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/ast_to_dict.py +24 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/base.py +75 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/build_ast_schema.py +291 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/build_client_schema.py +250 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/concat_ast.py +9 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/extend_schema.py +357 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/get_field_def.py +27 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/get_operation_ast.py +21 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/introspection_query.py +90 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/is_valid_literal_value.py +67 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/is_valid_value.py +66 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/quoted_or_list.py +21 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/schema_printer.py +168 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/suggestion_list.py +56 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/type_comparators.py +69 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/type_from_ast.py +21 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/type_info.py +149 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/utils/value_from_ast.py +69 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/__init__.py +4 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/__init__.py +79 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/arguments_of_correct_type.py +24 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/base.py +8 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/default_values_of_correct_type.py +44 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/fields_on_correct_type.py +113 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/fragments_on_composite_types.py +33 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/known_argument_names.py +70 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/known_directives.py +97 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/known_fragment_names.py +19 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/known_type_names.py +43 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/lone_anonymous_operation.py +23 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/no_fragment_cycles.py +59 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/no_undefined_variables.py +36 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/no_unused_fragments.py +38 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/no_unused_variables.py +37 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/overlapping_fields_can_be_merged.py +529 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/possible_fragment_spreads.py +44 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/provided_non_null_arguments.py +46 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/scalar_leafs.py +33 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/unique_argument_names.py +32 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/unique_fragment_names.py +28 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/unique_input_field_names.py +33 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/unique_operation_names.py +31 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/unique_variable_names.py +27 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/variables_are_input_types.py +21 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/rules/variables_in_allowed_position.py +53 -0
- wandb/vendor/graphql-core-1.1/wandb_graphql/validation/validation.py +158 -0
- wandb/vendor/promise-2.3.0/conftest.py +30 -0
- wandb/vendor/promise-2.3.0/setup.py +64 -0
- wandb/vendor/promise-2.3.0/tests/__init__.py +0 -0
- wandb/vendor/promise-2.3.0/tests/conftest.py +8 -0
- wandb/vendor/promise-2.3.0/tests/test_awaitable.py +32 -0
- wandb/vendor/promise-2.3.0/tests/test_awaitable_35.py +47 -0
- wandb/vendor/promise-2.3.0/tests/test_benchmark.py +116 -0
- wandb/vendor/promise-2.3.0/tests/test_complex_threads.py +23 -0
- wandb/vendor/promise-2.3.0/tests/test_dataloader.py +452 -0
- wandb/vendor/promise-2.3.0/tests/test_dataloader_awaitable_35.py +99 -0
- wandb/vendor/promise-2.3.0/tests/test_dataloader_extra.py +65 -0
- wandb/vendor/promise-2.3.0/tests/test_extra.py +670 -0
- wandb/vendor/promise-2.3.0/tests/test_issues.py +132 -0
- wandb/vendor/promise-2.3.0/tests/test_promise_list.py +70 -0
- wandb/vendor/promise-2.3.0/tests/test_spec.py +584 -0
- wandb/vendor/promise-2.3.0/tests/test_thread_safety.py +115 -0
- wandb/vendor/promise-2.3.0/tests/utils.py +3 -0
- wandb/vendor/promise-2.3.0/wandb_promise/__init__.py +38 -0
- wandb/vendor/promise-2.3.0/wandb_promise/async_.py +135 -0
- wandb/vendor/promise-2.3.0/wandb_promise/compat.py +32 -0
- wandb/vendor/promise-2.3.0/wandb_promise/dataloader.py +326 -0
- wandb/vendor/promise-2.3.0/wandb_promise/iterate_promise.py +12 -0
- wandb/vendor/promise-2.3.0/wandb_promise/promise.py +848 -0
- wandb/vendor/promise-2.3.0/wandb_promise/promise_list.py +151 -0
- wandb/vendor/promise-2.3.0/wandb_promise/pyutils/__init__.py +0 -0
- wandb/vendor/promise-2.3.0/wandb_promise/pyutils/version.py +83 -0
- wandb/vendor/promise-2.3.0/wandb_promise/schedulers/__init__.py +0 -0
- wandb/vendor/promise-2.3.0/wandb_promise/schedulers/asyncio.py +22 -0
- wandb/vendor/promise-2.3.0/wandb_promise/schedulers/gevent.py +21 -0
- wandb/vendor/promise-2.3.0/wandb_promise/schedulers/immediate.py +27 -0
- wandb/vendor/promise-2.3.0/wandb_promise/schedulers/thread.py +18 -0
- wandb/vendor/promise-2.3.0/wandb_promise/utils.py +56 -0
- wandb/vendor/pygments/__init__.py +90 -0
- wandb/vendor/pygments/cmdline.py +568 -0
- wandb/vendor/pygments/console.py +74 -0
- wandb/vendor/pygments/filter.py +74 -0
- wandb/vendor/pygments/filters/__init__.py +350 -0
- wandb/vendor/pygments/formatter.py +95 -0
- wandb/vendor/pygments/formatters/__init__.py +153 -0
- wandb/vendor/pygments/formatters/_mapping.py +85 -0
- wandb/vendor/pygments/formatters/bbcode.py +109 -0
- wandb/vendor/pygments/formatters/html.py +851 -0
- wandb/vendor/pygments/formatters/img.py +600 -0
- wandb/vendor/pygments/formatters/irc.py +182 -0
- wandb/vendor/pygments/formatters/latex.py +482 -0
- wandb/vendor/pygments/formatters/other.py +160 -0
- wandb/vendor/pygments/formatters/rtf.py +147 -0
- wandb/vendor/pygments/formatters/svg.py +153 -0
- wandb/vendor/pygments/formatters/terminal.py +136 -0
- wandb/vendor/pygments/formatters/terminal256.py +309 -0
- wandb/vendor/pygments/lexer.py +871 -0
- wandb/vendor/pygments/lexers/__init__.py +329 -0
- wandb/vendor/pygments/lexers/_asy_builtins.py +1645 -0
- wandb/vendor/pygments/lexers/_cl_builtins.py +232 -0
- wandb/vendor/pygments/lexers/_cocoa_builtins.py +72 -0
- wandb/vendor/pygments/lexers/_csound_builtins.py +1346 -0
- wandb/vendor/pygments/lexers/_lasso_builtins.py +5327 -0
- wandb/vendor/pygments/lexers/_lua_builtins.py +295 -0
- wandb/vendor/pygments/lexers/_mapping.py +500 -0
- wandb/vendor/pygments/lexers/_mql_builtins.py +1172 -0
- wandb/vendor/pygments/lexers/_openedge_builtins.py +2547 -0
- wandb/vendor/pygments/lexers/_php_builtins.py +4756 -0
- wandb/vendor/pygments/lexers/_postgres_builtins.py +621 -0
- wandb/vendor/pygments/lexers/_scilab_builtins.py +3094 -0
- wandb/vendor/pygments/lexers/_sourcemod_builtins.py +1163 -0
- wandb/vendor/pygments/lexers/_stan_builtins.py +532 -0
- wandb/vendor/pygments/lexers/_stata_builtins.py +419 -0
- wandb/vendor/pygments/lexers/_tsql_builtins.py +1004 -0
- wandb/vendor/pygments/lexers/_vim_builtins.py +1939 -0
- wandb/vendor/pygments/lexers/actionscript.py +240 -0
- wandb/vendor/pygments/lexers/agile.py +24 -0
- wandb/vendor/pygments/lexers/algebra.py +221 -0
- wandb/vendor/pygments/lexers/ambient.py +76 -0
- wandb/vendor/pygments/lexers/ampl.py +87 -0
- wandb/vendor/pygments/lexers/apl.py +101 -0
- wandb/vendor/pygments/lexers/archetype.py +318 -0
- wandb/vendor/pygments/lexers/asm.py +641 -0
- wandb/vendor/pygments/lexers/automation.py +374 -0
- wandb/vendor/pygments/lexers/basic.py +500 -0
- wandb/vendor/pygments/lexers/bibtex.py +160 -0
- wandb/vendor/pygments/lexers/business.py +612 -0
- wandb/vendor/pygments/lexers/c_cpp.py +252 -0
- wandb/vendor/pygments/lexers/c_like.py +541 -0
- wandb/vendor/pygments/lexers/capnproto.py +78 -0
- wandb/vendor/pygments/lexers/chapel.py +102 -0
- wandb/vendor/pygments/lexers/clean.py +288 -0
- wandb/vendor/pygments/lexers/compiled.py +34 -0
- wandb/vendor/pygments/lexers/configs.py +833 -0
- wandb/vendor/pygments/lexers/console.py +114 -0
- wandb/vendor/pygments/lexers/crystal.py +393 -0
- wandb/vendor/pygments/lexers/csound.py +366 -0
- wandb/vendor/pygments/lexers/css.py +689 -0
- wandb/vendor/pygments/lexers/d.py +251 -0
- wandb/vendor/pygments/lexers/dalvik.py +125 -0
- wandb/vendor/pygments/lexers/data.py +555 -0
- wandb/vendor/pygments/lexers/diff.py +165 -0
- wandb/vendor/pygments/lexers/dotnet.py +691 -0
- wandb/vendor/pygments/lexers/dsls.py +878 -0
- wandb/vendor/pygments/lexers/dylan.py +289 -0
- wandb/vendor/pygments/lexers/ecl.py +125 -0
- wandb/vendor/pygments/lexers/eiffel.py +65 -0
- wandb/vendor/pygments/lexers/elm.py +121 -0
- wandb/vendor/pygments/lexers/erlang.py +533 -0
- wandb/vendor/pygments/lexers/esoteric.py +277 -0
- wandb/vendor/pygments/lexers/ezhil.py +69 -0
- wandb/vendor/pygments/lexers/factor.py +344 -0
- wandb/vendor/pygments/lexers/fantom.py +250 -0
- wandb/vendor/pygments/lexers/felix.py +273 -0
- wandb/vendor/pygments/lexers/forth.py +177 -0
- wandb/vendor/pygments/lexers/fortran.py +205 -0
- wandb/vendor/pygments/lexers/foxpro.py +428 -0
- wandb/vendor/pygments/lexers/functional.py +21 -0
- wandb/vendor/pygments/lexers/go.py +101 -0
- wandb/vendor/pygments/lexers/grammar_notation.py +213 -0
- wandb/vendor/pygments/lexers/graph.py +80 -0
- wandb/vendor/pygments/lexers/graphics.py +553 -0
- wandb/vendor/pygments/lexers/haskell.py +843 -0
- wandb/vendor/pygments/lexers/haxe.py +936 -0
- wandb/vendor/pygments/lexers/hdl.py +382 -0
- wandb/vendor/pygments/lexers/hexdump.py +103 -0
- wandb/vendor/pygments/lexers/html.py +602 -0
- wandb/vendor/pygments/lexers/idl.py +270 -0
- wandb/vendor/pygments/lexers/igor.py +288 -0
- wandb/vendor/pygments/lexers/inferno.py +96 -0
- wandb/vendor/pygments/lexers/installers.py +322 -0
- wandb/vendor/pygments/lexers/int_fiction.py +1343 -0
- wandb/vendor/pygments/lexers/iolang.py +63 -0
- wandb/vendor/pygments/lexers/j.py +146 -0
- wandb/vendor/pygments/lexers/javascript.py +1525 -0
- wandb/vendor/pygments/lexers/julia.py +333 -0
- wandb/vendor/pygments/lexers/jvm.py +1573 -0
- wandb/vendor/pygments/lexers/lisp.py +2621 -0
- wandb/vendor/pygments/lexers/make.py +202 -0
- wandb/vendor/pygments/lexers/markup.py +595 -0
- wandb/vendor/pygments/lexers/math.py +21 -0
- wandb/vendor/pygments/lexers/matlab.py +663 -0
- wandb/vendor/pygments/lexers/ml.py +769 -0
- wandb/vendor/pygments/lexers/modeling.py +358 -0
- wandb/vendor/pygments/lexers/modula2.py +1561 -0
- wandb/vendor/pygments/lexers/monte.py +204 -0
- wandb/vendor/pygments/lexers/ncl.py +894 -0
- wandb/vendor/pygments/lexers/nimrod.py +159 -0
- wandb/vendor/pygments/lexers/nit.py +64 -0
- wandb/vendor/pygments/lexers/nix.py +136 -0
- wandb/vendor/pygments/lexers/oberon.py +105 -0
- wandb/vendor/pygments/lexers/objective.py +504 -0
- wandb/vendor/pygments/lexers/ooc.py +85 -0
- wandb/vendor/pygments/lexers/other.py +41 -0
- wandb/vendor/pygments/lexers/parasail.py +79 -0
- wandb/vendor/pygments/lexers/parsers.py +835 -0
- wandb/vendor/pygments/lexers/pascal.py +644 -0
- wandb/vendor/pygments/lexers/pawn.py +199 -0
- wandb/vendor/pygments/lexers/perl.py +620 -0
- wandb/vendor/pygments/lexers/php.py +267 -0
- wandb/vendor/pygments/lexers/praat.py +294 -0
- wandb/vendor/pygments/lexers/prolog.py +306 -0
- wandb/vendor/pygments/lexers/python.py +939 -0
- wandb/vendor/pygments/lexers/qvt.py +152 -0
- wandb/vendor/pygments/lexers/r.py +453 -0
- wandb/vendor/pygments/lexers/rdf.py +270 -0
- wandb/vendor/pygments/lexers/rebol.py +431 -0
- wandb/vendor/pygments/lexers/resource.py +85 -0
- wandb/vendor/pygments/lexers/rnc.py +67 -0
- wandb/vendor/pygments/lexers/roboconf.py +82 -0
- wandb/vendor/pygments/lexers/robotframework.py +560 -0
- wandb/vendor/pygments/lexers/ruby.py +519 -0
- wandb/vendor/pygments/lexers/rust.py +220 -0
- wandb/vendor/pygments/lexers/sas.py +228 -0
- wandb/vendor/pygments/lexers/scripting.py +1222 -0
- wandb/vendor/pygments/lexers/shell.py +794 -0
- wandb/vendor/pygments/lexers/smalltalk.py +195 -0
- wandb/vendor/pygments/lexers/smv.py +79 -0
- wandb/vendor/pygments/lexers/snobol.py +83 -0
- wandb/vendor/pygments/lexers/special.py +103 -0
- wandb/vendor/pygments/lexers/sql.py +681 -0
- wandb/vendor/pygments/lexers/stata.py +108 -0
- wandb/vendor/pygments/lexers/supercollider.py +90 -0
- wandb/vendor/pygments/lexers/tcl.py +145 -0
- wandb/vendor/pygments/lexers/templates.py +2283 -0
- wandb/vendor/pygments/lexers/testing.py +207 -0
- wandb/vendor/pygments/lexers/text.py +25 -0
- wandb/vendor/pygments/lexers/textedit.py +169 -0
- wandb/vendor/pygments/lexers/textfmts.py +297 -0
- wandb/vendor/pygments/lexers/theorem.py +458 -0
- wandb/vendor/pygments/lexers/trafficscript.py +54 -0
- wandb/vendor/pygments/lexers/typoscript.py +226 -0
- wandb/vendor/pygments/lexers/urbi.py +133 -0
- wandb/vendor/pygments/lexers/varnish.py +190 -0
- wandb/vendor/pygments/lexers/verification.py +111 -0
- wandb/vendor/pygments/lexers/web.py +24 -0
- wandb/vendor/pygments/lexers/webmisc.py +988 -0
- wandb/vendor/pygments/lexers/whiley.py +116 -0
- wandb/vendor/pygments/lexers/x10.py +69 -0
- wandb/vendor/pygments/modeline.py +44 -0
- wandb/vendor/pygments/plugin.py +68 -0
- wandb/vendor/pygments/regexopt.py +92 -0
- wandb/vendor/pygments/scanner.py +105 -0
- wandb/vendor/pygments/sphinxext.py +158 -0
- wandb/vendor/pygments/style.py +155 -0
- wandb/vendor/pygments/styles/__init__.py +80 -0
- wandb/vendor/pygments/styles/abap.py +29 -0
- wandb/vendor/pygments/styles/algol.py +63 -0
- wandb/vendor/pygments/styles/algol_nu.py +63 -0
- wandb/vendor/pygments/styles/arduino.py +98 -0
- wandb/vendor/pygments/styles/autumn.py +65 -0
- wandb/vendor/pygments/styles/borland.py +51 -0
- wandb/vendor/pygments/styles/bw.py +49 -0
- wandb/vendor/pygments/styles/colorful.py +81 -0
- wandb/vendor/pygments/styles/default.py +73 -0
- wandb/vendor/pygments/styles/emacs.py +72 -0
- wandb/vendor/pygments/styles/friendly.py +72 -0
- wandb/vendor/pygments/styles/fruity.py +42 -0
- wandb/vendor/pygments/styles/igor.py +29 -0
- wandb/vendor/pygments/styles/lovelace.py +97 -0
- wandb/vendor/pygments/styles/manni.py +75 -0
- wandb/vendor/pygments/styles/monokai.py +106 -0
- wandb/vendor/pygments/styles/murphy.py +80 -0
- wandb/vendor/pygments/styles/native.py +65 -0
- wandb/vendor/pygments/styles/paraiso_dark.py +125 -0
- wandb/vendor/pygments/styles/paraiso_light.py +125 -0
- wandb/vendor/pygments/styles/pastie.py +75 -0
- wandb/vendor/pygments/styles/perldoc.py +69 -0
- wandb/vendor/pygments/styles/rainbow_dash.py +89 -0
- wandb/vendor/pygments/styles/rrt.py +33 -0
- wandb/vendor/pygments/styles/sas.py +44 -0
- wandb/vendor/pygments/styles/stata.py +40 -0
- wandb/vendor/pygments/styles/tango.py +141 -0
- wandb/vendor/pygments/styles/trac.py +63 -0
- wandb/vendor/pygments/styles/vim.py +63 -0
- wandb/vendor/pygments/styles/vs.py +38 -0
- wandb/vendor/pygments/styles/xcode.py +51 -0
- wandb/vendor/pygments/token.py +213 -0
- wandb/vendor/pygments/unistring.py +217 -0
- wandb/vendor/pygments/util.py +388 -0
- wandb/vendor/pynvml/__init__.py +0 -0
- wandb/vendor/pynvml/pynvml.py +4779 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/__init__.py +17 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/events.py +615 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/__init__.py +98 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/api.py +369 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/fsevents.py +172 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/fsevents2.py +239 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/inotify.py +218 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/inotify_buffer.py +81 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/inotify_c.py +575 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/kqueue.py +730 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/polling.py +145 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/read_directory_changes.py +133 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/observers/winapi.py +348 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/patterns.py +265 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/tricks/__init__.py +174 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/__init__.py +151 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/bricks.py +249 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/compat.py +29 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/decorators.py +198 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/delayed_queue.py +88 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/dirsnapshot.py +293 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/echo.py +157 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/event_backport.py +41 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/importlib2.py +40 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/platform.py +57 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/unicode_paths.py +64 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/utils/win32stat.py +123 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/version.py +28 -0
- wandb/vendor/watchdog_0_9_0/wandb_watchdog/watchmedo.py +577 -0
- wandb/wandb_agent.py +588 -0
- wandb/wandb_controller.py +721 -0
- wandb/wandb_run.py +9 -0
- wandb-0.18.1.dist-info/METADATA +212 -0
- wandb-0.18.1.dist-info/RECORD +826 -0
- wandb-0.18.1.dist-info/WHEEL +4 -0
- wandb-0.18.1.dist-info/entry_points.txt +3 -0
- wandb-0.18.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,924 @@
|
|
1
|
+
"""Implementation of launch agent."""
|
2
|
+
|
3
|
+
import asyncio
|
4
|
+
import logging
|
5
|
+
import os
|
6
|
+
import pprint
|
7
|
+
import threading
|
8
|
+
import time
|
9
|
+
import traceback
|
10
|
+
from dataclasses import dataclass
|
11
|
+
from multiprocessing import Event
|
12
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
13
|
+
|
14
|
+
import yaml
|
15
|
+
|
16
|
+
import wandb
|
17
|
+
from wandb.apis.internal import Api
|
18
|
+
from wandb.errors import CommError
|
19
|
+
from wandb.sdk.launch._launch_add import launch_add
|
20
|
+
from wandb.sdk.launch.runner.local_container import LocalSubmittedRun
|
21
|
+
from wandb.sdk.launch.runner.local_process import LocalProcessRunner
|
22
|
+
from wandb.sdk.launch.sweeps.scheduler import Scheduler
|
23
|
+
from wandb.sdk.launch.utils import LAUNCH_CONFIG_FILE, resolve_build_and_registry_config
|
24
|
+
from wandb.sdk.lib import runid
|
25
|
+
|
26
|
+
from .. import loader
|
27
|
+
from .._project_spec import LaunchProject
|
28
|
+
from ..errors import LaunchDockerError, LaunchError
|
29
|
+
from ..utils import (
|
30
|
+
LAUNCH_DEFAULT_PROJECT,
|
31
|
+
LOG_PREFIX,
|
32
|
+
PROJECT_SYNCHRONOUS,
|
33
|
+
event_loop_thread_exec,
|
34
|
+
)
|
35
|
+
from .job_status_tracker import JobAndRunStatusTracker
|
36
|
+
from .run_queue_item_file_saver import RunQueueItemFileSaver
|
37
|
+
|
38
|
+
# Module-level tunables for the launch agent.
AGENT_POLLING_INTERVAL = 10
RECEIVED_JOB_POLLING_INTERVAL = 0.0  # more frequent when we know we have jobs

# Status strings passed to LaunchAgent.update_status().
AGENT_POLLING = "POLLING"
AGENT_RUNNING = "RUNNING"
AGENT_KILLED = "KILLED"

# job_type given to wandb.init() for the agent's own run.
HIDDEN_AGENT_RUN_TYPE = "sweep-controller"

MAX_RESUME_COUNT = 5

# Upper bound for the retry interval while waiting for a run to show up in W&B.
RUN_INFO_GRACE_PERIOD = 60

# Fallback for the "stopped_run_timeout" agent config key.
DEFAULT_STOPPED_RUN_TIMEOUT = 60

# Seconds between status prints at verbosity 0 / at higher verbosity.
DEFAULT_PRINT_INTERVAL = 5 * 60
VERBOSE_PRINT_INTERVAL = 20

# RUN_START_TIMEOUT may be overridden through the environment; a value that
# cannot be parsed as a float aborts the import with a LaunchError.
_env_timeout = os.environ.get("WANDB_LAUNCH_START_TIMEOUT")
if _env_timeout:
    try:
        RUN_START_TIMEOUT = float(_env_timeout)
    except ValueError as err:
        # Chain the original error so the parsing failure stays visible.
        raise LaunchError(
            f"Invalid value for WANDB_LAUNCH_START_TIMEOUT: {_env_timeout}"
        ) from err
else:
    RUN_START_TIMEOUT = 60 * 30  # default 30 minutes

_logger = logging.getLogger(__name__)
|
68
|
+
|
69
|
+
|
70
|
+
@dataclass
class JobSpecAndQueue:
    """Pair a job specification with the name of a queue."""

    # Job payload as a plain dictionary.
    # NOTE(review): presumably the item popped from the run queue - confirm.
    job: Dict[str, Any]
    # Queue name associated with the job.
    queue: str
|
74
|
+
|
75
|
+
|
76
|
+
def _convert_access(access: str) -> str:
    """Convert access string to a value accepted by wandb.

    Arguments:
        access: Queue access level, case-insensitive ("project" or "user").

    Returns:
        The upper-cased access string.

    Raises:
        AssertionError: if the value is neither "project" nor "user".
    """
    access = access.upper()
    if access not in ("PROJECT", "USER"):
        # Raised explicitly instead of via `assert` so the check is not
        # stripped when Python runs with -O; the exception type is unchanged.
        raise AssertionError("Queue access must be either project or user")
    return access
|
83
|
+
|
84
|
+
|
85
|
+
def _max_from_config(
    config: Dict[str, Any], key: str, default: int = 1
) -> Union[int, float]:
    """Get an integer from the config, or float.inf if -1.

    Utility for parsing integers from the agent config with a default, infinity
    handling, and integer parsing. Raises more informative error if parse error.

    Arguments:
        config: Agent config dictionary.
        key: Key to read from the config.
        default: Value used when the key is missing or set to None.

    Returns:
        The parsed non-negative integer, or float("inf") when the value is -1.

    Raises:
        LaunchError: if the value cannot be converted to an integer or is
            negative (other than -1).
    """
    raw = config.get(key)
    if raw is None:
        raw = default
    try:
        parsed = int(raw)
    except (TypeError, ValueError) as e:
        # TypeError covers values such as lists or dicts, which int() rejects
        # without raising ValueError.
        raise LaunchError(
            f"Error when parsing LaunchAgent config key: ['{key}': "
            f"{config.get(key)}]. Error: {str(e)}"
        ) from e

    # -1 is the sentinel for "no limit".
    if parsed == -1:
        return float("inf")

    if parsed < 0:
        raise LaunchError(
            f"Error when parsing LaunchAgent config key: ['{key}': "
            f"{config.get(key)}]. Error: negative value."
        )
    return parsed
|
112
|
+
|
113
|
+
|
114
|
+
class InternalAgentLogger:
    """Log agent messages to the module logger and optionally the terminal.

    Every message is prefixed with LOG_PREFIX and always sent to the module
    logger; it is echoed to the terminal only when verbosity is 2 or higher.
    """

    def __init__(self, verbosity=0):
        self._print_to_terminal = verbosity >= 2

    def error(self, message: str):
        """Log `message` at error level."""
        if self._print_to_terminal:
            wandb.termerror(f"{LOG_PREFIX}{message}")
        _logger.error(f"{LOG_PREFIX}{message}")

    def warn(self, message: str):
        """Log `message` at warning level."""
        if self._print_to_terminal:
            wandb.termwarn(f"{LOG_PREFIX}{message}")
        # Logger.warn is a deprecated alias of Logger.warning.
        _logger.warning(f"{LOG_PREFIX}{message}")

    def info(self, message: str):
        """Log `message` at info level."""
        if self._print_to_terminal:
            wandb.termlog(f"{LOG_PREFIX}{message}")
        _logger.info(f"{LOG_PREFIX}{message}")

    def debug(self, message: str):
        """Log `message` at debug level (terminal echo uses termlog)."""
        if self._print_to_terminal:
            wandb.termlog(f"{LOG_PREFIX}{message}")
        _logger.debug(f"{LOG_PREFIX}{message}")
|
137
|
+
|
138
|
+
|
139
|
+
def construct_agent_configs(
    launch_config: Optional[Dict] = None,
    build_config: Optional[Dict] = None,
) -> Tuple[Optional[Dict[str, Any]], Dict[str, Any], Dict[str, Any]]:
    """Assemble the environment, build and registry configs for the agent.

    Arguments:
        launch_config: Optional launch config. When given, its "builder" and
            "registry" entries are used (replacing `build_config`).
        build_config: Builder config used when `launch_config` is None.

    Returns:
        Tuple of (environment_config, build_config, registry_config). The
        environment config is read from the "environment" key of the default
        launch config file and is None when that file does not exist.
    """
    registry_config = None
    environment_config = None
    if launch_config is not None:
        build_config = launch_config.get("builder")
        registry_config = launch_config.get("registry")

    default_launch_config = None
    config_path = os.path.expanduser(LAUNCH_CONFIG_FILE)
    if os.path.exists(config_path):
        with open(config_path) as f:
            # In case the config is empty, we want it to be {} instead of None.
            default_launch_config = yaml.safe_load(f) or {}
        environment_config = default_launch_config.get("environment")

    build_config, registry_config = resolve_build_and_registry_config(
        default_launch_config, build_config, registry_config
    )

    return environment_config, build_config, registry_config
|
162
|
+
|
163
|
+
|
164
|
+
class LaunchAgent:
|
165
|
+
"""Launch agent class which polls run given run queues and launches runs for wandb launch."""
|
166
|
+
|
167
|
+
_instance = None
|
168
|
+
|
169
|
+
def __new__(cls, *args: Any, **kwargs: Any) -> "LaunchAgent":
    """Create a new instance of the LaunchAgent.

    This method ensures that only one instance of the LaunchAgent is created.
    This is done so that information about the agent can be accessed from
    elsewhere in the library.
    """
    # The first construction stores the instance on the class; every later
    # construction returns that same object.
    if cls._instance is None:
        cls._instance = super().__new__(cls)
    return cls._instance
|
179
|
+
|
180
|
+
@classmethod
def name(cls) -> str:
    """Return the name of the agent.

    Raises:
        LaunchError: if no agent instance exists yet, or the stored name is
            not a string.
    """
    if cls._instance is None:
        raise LaunchError("LaunchAgent has not been initialized")
    name = cls._instance._name
    if isinstance(name, str):
        return name
    raise LaunchError(f"Found invalid name for agent {name}")
|
189
|
+
|
190
|
+
@classmethod
def initialized(cls) -> bool:
    """Return whether the agent is initialized (an instance has been created)."""
    return cls._instance is not None
|
194
|
+
|
195
|
+
def __init__(self, api: Api, config: Dict[str, Any]):
    """Initialize a launch agent.

    Registers the agent with the backend, stores its id and name, and starts
    the agent's own run via `_init_agent_run`.

    Arguments:
        api: Api object to use for making requests to the backend.
        config: Config dictionary for the agent. Must contain "entity";
            "max_jobs", "max_schedulers", "secure_mode", "verbosity",
            "stopped_run_timeout" and "queues" are optional.
    """
    self._entity = config["entity"]
    self._project = LAUNCH_DEFAULT_PROJECT
    self._api = api
    self._base_url = self._api.settings().get("base_url")
    self._ticks = 0
    # Running jobs keyed by thread id; guarded by _jobs_lock.
    self._jobs: Dict[int, JobAndRunStatusTracker] = {}
    self._jobs_lock = threading.Lock()
    self._jobs_event = Event()
    self._jobs_event.set()
    self._cwd = os.getcwd()
    self._namespace = runid.generate_id()
    self._access = _convert_access("project")
    self._max_jobs = _max_from_config(config, "max_jobs")
    self._max_schedulers = _max_from_config(config, "max_schedulers")
    self._secure_mode = config.get("secure_mode", False)
    self._verbosity = config.get("verbosity", 0)
    self._internal_logger = InternalAgentLogger(verbosity=self._verbosity)
    self._last_status_print_time = 0.0
    self.default_config: Dict[str, Any] = config
    self._stopped_run_timeout = config.get(
        "stopped_run_timeout", DEFAULT_STOPPED_RUN_TIMEOUT
    )
    self._known_warnings: List[str] = []

    # Get agent version from env var if present, otherwise wandb version
    self.version: str = "wandb@" + wandb.__version__
    env_agent_version = os.environ.get("WANDB_AGENT_VERSION")
    if env_agent_version and env_agent_version != "wandb-launch-agent":
        self.version = env_agent_version

    # serverside creation
    self.gorilla_supports_agents = (
        self._api.launch_agent_introspection() is not None
    )
    self._gorilla_supports_fail_run_queue_items = (
        self._api.fail_run_queue_item_introspection()
    )

    self._queues: List[str] = config.get("queues", ["default"])

    # remove project field from agent config before sending to back end
    # because otherwise it shows up in the config in the UI and confuses users
    sent_config = config.copy()
    sent_config.pop("project", None)

    create_response = self._api.create_launch_agent(
        self._entity,
        self._project,
        self._queues,
        sent_config,
        self.version,
        self.gorilla_supports_agents,
    )
    self._id = create_response["launchAgentId"]
    if self._api.entity_is_team(self._entity):
        wandb.termwarn(
            f"{LOG_PREFIX}Agent is running on team entity ({self._entity}). Members of this team will be able to run code on this device."
        )

    agent_response = self._api.get_launch_agent(
        self._id, self.gorilla_supports_agents
    )
    self._name = agent_response["name"]
    self._init_agent_run()
|
267
|
+
|
268
|
+
def _is_scheduler_job(self, run_spec: Dict[str, Any]) -> bool:
    """Determine whether a job/runSpec is a sweep scheduler.

    Arguments:
        run_spec: The run spec of the queued job.

    Returns:
        True when the spec's uri is the scheduler placeholder uri and, for the
        "local-process" resource, it either names a job or its entry point
        starts with ["wandb", "scheduler"] and has at least three elements.
        False otherwise, including for an empty or missing spec.
    """
    if not run_spec:
        self._internal_logger.debug(
            "Received runSpec in _is_scheduler_job that was empty"
        )
        # Nothing to inspect; bail out instead of calling .get() on None.
        return False

    if run_spec.get("uri") != Scheduler.PLACEHOLDER_URI:
        return False

    if run_spec.get("resource") == "local-process":
        # Any job pushed to a run queue that has a scheduler uri is
        # allowed to use local-process
        if run_spec.get("job"):
            return True

        # If a scheduler is local-process and run through CLI, also
        # confirm command is in format: [wandb scheduler <sweep>]
        cmd = (run_spec.get("overrides") or {}).get("entry_point") or []
        if len(cmd) < 3:
            return False

        if cmd[:2] != ["wandb", "scheduler"]:
            return False

    return True
|
294
|
+
|
295
|
+
async def fail_run_queue_item(
    self,
    run_queue_item_id: str,
    message: str,
    phase: str,
    files: Optional[List[str]] = None,
) -> None:
    """Mark a run queue item as failed on the backend, if supported.

    Does nothing when the backend introspection done in `__init__` reported
    no support for failing run queue items.

    Arguments:
        run_queue_item_id: Id of the run queue item to fail.
        message: Failure message.
        phase: Phase in which the failure happened (e.g. "agent" or "run").
        files: Optional list of file names to attach to the failure.
    """
    if self._gorilla_supports_fail_run_queue_items:
        fail_rqi = event_loop_thread_exec(self._api.fail_run_queue_item)
        await fail_rqi(run_queue_item_id, message, phase, files)
|
305
|
+
|
306
|
+
def _init_agent_run(self) -> None:
    """Start the agent's own W&B run when the backend supports agents.

    Leaves `self._wandb_run` as None when agents are not supported.
    """
    # TODO: has it been long enough that all backends support agents?
    self._wandb_run = None

    if self.gorilla_supports_agents:
        settings = wandb.Settings(
            silent=True, disable_git=True, disable_job_creation=True
        )
        self._wandb_run = wandb.init(
            project=self._project,
            entity=self._entity,
            settings=settings,
            id=self._name,
            job_type=HIDDEN_AGENT_RUN_TYPE,
        )
|
321
|
+
|
322
|
+
@property
def thread_ids(self) -> List[int]:
    """Return the thread ids of all tracked jobs (the keys of `_jobs`)."""
    with self._jobs_lock:
        return list(self._jobs.keys())
|
327
|
+
|
328
|
+
@property
def num_running_schedulers(self) -> int:
    """Return just the number of schedulers."""
    with self._jobs_lock:
        # Count trackers flagged as schedulers without building a list.
        return sum(1 for tracker in self._jobs.values() if tracker.is_scheduler)
|
333
|
+
|
334
|
+
@property
def num_running_jobs(self) -> int:
    """Return the number of jobs not including schedulers."""
    with self._jobs_lock:
        # Count trackers that are not flagged as schedulers.
        return sum(1 for tracker in self._jobs.values() if not tracker.is_scheduler)
|
339
|
+
|
340
|
+
async def pop_from_queue(self, queue: str) -> Any:
    """Pops an item off the runqueue to run as a job.

    Arguments:
        queue: Queue to pop from.

    Returns:
        Item popped off the queue, or None if popping failed. Errors are
        reported and logged rather than raised.
    """
    try:
        pop = event_loop_thread_exec(self._api.pop_from_run_queue)
        return await pop(
            queue,
            entity=self._entity,
            project=self._project,
            agent_id=self._id,
        )
    except Exception as e:
        # Best-effort: a failed pop is reported and treated as "no item".
        wandb.termerror(f"{LOG_PREFIX}Exception popping from queue {queue}: {e}")
        _logger.exception("Exception popping from queue %s", queue)
        return None
|
364
|
+
|
365
|
+
def print_status(self) -> None:
    """Prints the current status of the agent.

    The terminal line reports the agent name, the polled queues (only while
    below the job limit) and the running-job count; the logged line also
    lists the tracked thread ids when any job is running.
    """
    self._last_status_print_time = time.time()
    # Read the count once so the whole message uses one consistent value.
    running_jobs = self.num_running_jobs

    output_str = "agent "
    if self._name:
        output_str += f"{self._name} "
    if running_jobs < self._max_jobs:
        output_str += f"polling on queues {','.join(self._queues)}, "
    output_str += (
        f"running {running_jobs} out of a maximum of {self._max_jobs} jobs"
    )

    wandb.termlog(f"{LOG_PREFIX}{output_str}")
    if running_jobs > 0:
        output_str += f": {','.join(str(job_id) for job_id in self.thread_ids)}"

    _logger.info(output_str)
|
382
|
+
|
383
|
+
async def update_status(self, status: str) -> None:
    """Update the status of the agent.

    Prints a terminal error when the backend response does not report
    success; no exception is raised in that case.

    Arguments:
        status: Status to update the agent to.
    """
    _update_status = event_loop_thread_exec(self._api.update_launch_agent_status)
    update_ret = await _update_status(
        self._id, status, self.gorilla_supports_agents
    )
    if not update_ret["success"]:
        wandb.termerror(f"{LOG_PREFIX}Failed to update agent status to {status}")
|
395
|
+
|
396
|
+
def _check_run_exists_and_inited(
    self, entity: str, project: str, run_id: str, rqi_id: str
) -> bool:
    """Checks the state of the run to ensure it has been inited.

    Note this will not behave well with resuming.

    Arguments:
        entity: Entity of the run.
        project: Project of the run.
        run_id: Id of the run.
        rqi_id: Run queue item id, used only in the log message.

    Returns:
        True if the run exists and its state is anything but "pending";
        False if it is pending or could not be fetched.
    """
    # TODO: handle resuming runs

    # Sweep runs exist but are in pending state, normal launch runs won't exist
    # so will raise a CommError.
    try:
        run_state = self._api.get_run_state(entity, project, run_id)
        if run_state.lower() != "pending":
            return True
    except CommError:
        self._internal_logger.info(
            f"Run {entity}/{project}/{run_id} with rqi id: {rqi_id} did not have associated run",
        )
    return False
|
415
|
+
|
416
|
+
async def finish_thread_id(
    self,
    thread_id: int,
    exception: Optional[Union[Exception, LaunchDockerError]] = None,
) -> None:
    """Removes the job from our list for now.

    Before removal, the run queue item is failed on the backend when the job
    raised, has no project or run id, or never showed up as an initialized
    run in W&B. The agent goes back to polling once no jobs remain.

    Arguments:
        thread_id: Key of the job in `self._jobs`.
        exception: Exception raised while handling the job, if any.
    """
    # Hold the lock only for the lookup; the awaits below must not run while
    # the (non-reentrant) lock is held.
    with self._jobs_lock:
        job_and_run_status = self._jobs[thread_id]

    if (
        job_and_run_status.entity is not None
        and job_and_run_status.entity != self._entity
    ):
        self._internal_logger.info(
            "Skipping check for completed run status because run is on a different entity than agent",
        )
    elif exception is not None:
        # Save the traceback as a file and attach it to the failed item.
        tb_str = traceback.format_exception(
            type(exception), value=exception, tb=exception.__traceback__
        )
        fnames = job_and_run_status.saver.save_contents(
            "".join(tb_str), "error.log", "error"
        )
        await self.fail_run_queue_item(
            job_and_run_status.run_queue_item_id,
            str(exception),
            job_and_run_status.err_stage,
            fnames,
        )
    elif job_and_run_status.project is None or job_and_run_status.run_id is None:
        self._internal_logger.info(
            f"called finish_thread_id on thread whose tracker has no project or run id. RunQueueItemID: {job_and_run_status.run_queue_item_id}",
        )
        wandb.termerror(
            "Missing project or run id on thread called finish thread id"
        )
        await self.fail_run_queue_item(
            job_and_run_status.run_queue_item_id,
            "submitted job was finished without assigned project or run id",
            "agent",
        )
    elif job_and_run_status.run is not None:
        called_init = False
        # We do some weird stuff here getting run info to check for a
        # created in run in W&B.
        #
        # We retry for 60 seconds with an exponential backoff in case
        # upsert run is taking a while.
        logs = None
        interval = 1
        while True:
            called_init = self._check_run_exists_and_inited(
                self._entity,
                job_and_run_status.project,
                job_and_run_status.run_id,
                job_and_run_status.run_queue_item_id,
            )
            if called_init or interval > RUN_INFO_GRACE_PERIOD:
                break
            # Fetch the logs now if we don't get run info on the
            # first try, in case the logs are cleaned from the runner
            # environment (e.g. k8s) during the run info grace period.
            if interval == 1:
                logs = await job_and_run_status.run.get_logs()
            await asyncio.sleep(interval)
            interval *= 2
        if not called_init:
            fnames = None
            if job_and_run_status.completed_status == "finished":
                _msg = "The submitted job exited successfully but failed to call wandb.init"
            else:
                _msg = "The submitted run was not successfully started"
            if logs:
                fnames = job_and_run_status.saver.save_contents(
                    logs, "error.log", "error"
                )
            await self.fail_run_queue_item(
                job_and_run_status.run_queue_item_id, _msg, "run", fnames
            )
    else:
        self._internal_logger.info(
            f"Finish thread id {thread_id} had no exception and no run"
        )
        wandb._sentry.exception(
            "launch agent called finish thread id on thread without run or exception"
        )

    # TODO: keep logs or something for the finished jobs
    with self._jobs_lock:
        del self._jobs[thread_id]

    # update status back to polling if no jobs are running
    if len(self.thread_ids) == 0:
        await self.update_status(AGENT_POLLING)
|
510
|
+
|
511
|
+
async def run_job(
    self, job: Dict[str, Any], queue: str, file_saver: RunQueueItemFileSaver
) -> None:
    """Set up project and run the job.

    Marks the agent as running, validates the run spec against secure mode,
    and schedules `task_run_job` as a background task.

    Arguments:
        job: Job to run; its "runSpec" and "runQueueItemId" entries are read.
        queue: Name of the queue the job was popped from.
        file_saver: File saver handed to the job's status tracker.

    Raises:
        ValueError: if secure mode is on and the run spec overrides a locked key.
    """
    _msg = f"{LOG_PREFIX}Launch agent received job:\n{pprint.pformat(job)}\n"
    wandb.termlog(_msg)
    _logger.info(_msg)
    # update agent status
    await self.update_status(AGENT_RUNNING)

    # parse job
    self._internal_logger.info("Parsing launch spec")
    launch_spec = job["runSpec"]

    # Abort if this job attempts to override secure mode
    self._assert_secure(launch_spec)
    job_tracker = JobAndRunStatusTracker(job["runQueueItemId"], queue, file_saver)

    task = asyncio.create_task(
        self.task_run_job(
            launch_spec,
            job,
            self.default_config,
            self._api,
            job_tracker,
        )
    )
    # The event loop only keeps weak references to tasks, so hold a strong
    # reference until the task finishes to keep it from being collected.
    background_tasks = getattr(self, "_background_tasks", None)
    if background_tasks is None:
        background_tasks = self._background_tasks = set()
    background_tasks.add(task)
    task.add_done_callback(background_tasks.discard)
|
542
|
+
|
543
|
+
def _assert_secure(self, launch_spec: Dict[str, Any]) -> None:
    """Reject launch specs that override locked settings in secure mode.

    Nothing is checked unless `self._secure_mode` is truthy.

    Arguments:
        launch_spec: Run spec of the job. The Kubernetes pod spec is read from
            resource_args.kubernetes.spec.template.spec, and the entry point
            override from overrides.entry_point.

    Raises:
        ValueError: if the pod spec contains "hostPID", "hostIPC",
            "hostNetwork" or "initContainers", if any container spec contains
            "command", or if a truthy entry point override is present.
    """
    if not self._secure_mode:
        return

    kubernetes_args = launch_spec.get("resource_args", {}).get("kubernetes", {})
    locked_pod_keys = ("hostPID", "hostIPC", "hostNetwork", "initContainers")
    pod_template = kubernetes_args.get("spec", {}).get("template", {})
    pod_spec = pod_template.get("spec", {})

    # Report the first locked key (in declaration order) found in the pod spec.
    offending_key = next(
        (candidate for candidate in locked_pod_keys if candidate in pod_spec),
        None,
    )
    if offending_key is not None:
        raise ValueError(
            f'This agent is configured to lock "{offending_key}" in pod spec '
            "but the job specification attempts to override it."
        )

    containers = pod_spec.get("containers", [])
    if any("command" in container for container in containers):
        raise ValueError(
            'This agent is configured to lock "command" in container spec '
            "but the job specification attempts to override it."
        )

    entry_point_override = launch_spec.get("overrides", {}).get("entry_point")
    if entry_point_override:
        raise ValueError(
            'This agent is configured to lock the "entrypoint" override '
            "but the job specification attempts to override it."
        )
|
571
|
+
|
572
|
+
async def loop(self) -> None:
    """Loop infinitely to poll for jobs and run them.

    Each iteration fetches the agent record, stops if the backend reports
    "stopPolling", pops at most one job when below `self._max_jobs`, hands it
    to `run_job`, refreshes the agent status on every second tick, and then
    sleeps before polling again.

    KeyboardInterrupt is used as the stop signal: it is raised inside the loop
    when "stopPolling" is set and is caught here, where the agent status is set
    to AGENT_KILLED. `self._jobs_event` is always cleared on exit.
    """
    self.print_status()
    # Status is re-printed at most once per print_interval seconds.
    if self._verbosity == 0:
        print_interval = DEFAULT_PRINT_INTERVAL
    else:
        print_interval = VERBOSE_PRINT_INTERVAL
    try:
        while True:
            # job stays None when nothing was popped this tick; this selects
            # the sleep interval at the bottom of the loop.
            job = None
            self._ticks += 1
            agent_response = self._api.get_launch_agent(
                self._id, self.gorilla_supports_agents
            )
            if agent_response["stopPolling"]:
                # shutdown process and all jobs if requested from ui
                raise KeyboardInterrupt
            if self.num_running_jobs < self._max_jobs:
                # only check for new jobs if we're not at max
                job_and_queue = await self.get_job_and_queue()
                # these will either both be None, or neither will be None
                if job_and_queue is not None:
                    job = job_and_queue.job
                    queue = job_and_queue.queue
                    try:
                        file_saver = RunQueueItemFileSaver(
                            self._wandb_run, job["runQueueItemId"]
                        )
                        if self._is_scheduler_job(job.get("runSpec", {})):
                            # If job is a scheduler, and we are already at the cap, ignore,
                            # don't ack, and it will be pushed back onto the queue in 1 min
                            if self.num_running_schedulers >= self._max_schedulers:
                                wandb.termwarn(
                                    f"{LOG_PREFIX}Agent already running the maximum number "
                                    f"of sweep schedulers: {self._max_schedulers}. To set "
                                    "this value use `max_schedulers` key in the agent config"
                                )
                                # Jumps straight to the next iteration: the status
                                # update and the sleep below are skipped this tick.
                                continue
                        await self.run_job(job, queue, file_saver)
                    except Exception as e:
                        wandb.termerror(
                            f"{LOG_PREFIX}Error running job: {traceback.format_exc()}"
                        )
                        wandb._sentry.exception(e)

                        # always the first phase, because we only enter phase 2 within the thread
                        # NOTE(review): if RunQueueItemFileSaver(...) itself raised,
                        # `file_saver` is unbound here on the first job, or still
                        # refers to a previous job's saver on later ones.
                        files = file_saver.save_contents(
                            contents=traceback.format_exc(),
                            fname="error.log",
                            file_sub_type="error",
                        )
                        await self.fail_run_queue_item(
                            run_queue_item_id=job["runQueueItemId"],
                            message=str(e),
                            phase="agent",
                            files=files,
                        )

            # Refresh the reported agent status on every second tick.
            if self._ticks % 2 == 0:
                if len(self.thread_ids) == 0:
                    await self.update_status(AGENT_POLLING)
                else:
                    await self.update_status(AGENT_RUNNING)
                if time.time() - self._last_status_print_time > print_interval:
                    self.print_status()

            if self.num_running_jobs == self._max_jobs or job is None:
                # all threads busy or did not receive job
                await asyncio.sleep(AGENT_POLLING_INTERVAL)
            else:
                # A job was just received and there is still capacity.
                await asyncio.sleep(RECEIVED_JOB_POLLING_INTERVAL)

    except KeyboardInterrupt:
        await self.update_status(AGENT_KILLED)
        wandb.termlog(f"{LOG_PREFIX}Shutting down, active jobs:")
        self.print_status()
    finally:
        # _task_run_job's monitoring loop runs only while this event is set,
        # so clearing it lets those loops exit.
        self._jobs_event.clear()
|
654
|
+
|
655
|
+
# Threaded functions
|
656
|
+
async def task_run_job(
    self,
    launch_spec: Dict[str, Any],
    job: Dict[str, Any],
    default_config: Dict[str, Any],
    api: Api,
    job_tracker: JobAndRunStatusTracker,
) -> None:
    """Run one job to completion and always report that it has finished.

    The tracker is registered in `self._jobs` under the job's run queue item
    id, then `_task_run_job` is awaited. Any exception it raises is printed,
    sent to Sentry and remembered rather than propagated. `finish_thread_id`
    is awaited in every case with the remembered exception, or None when
    nothing was caught.

    Arguments:
        launch_spec: Run spec of the job.
        job: Job dictionary; its "runQueueItemId" must be truthy.
        default_config: Configuration forwarded to `_task_run_job`.
        api: Api object forwarded to `_task_run_job`.
        job_tracker: Tracker stored in `self._jobs` for this job.
    """
    queue_item_id = job["runQueueItemId"]
    assert queue_item_id
    failure: Optional[Union[LaunchDockerError, Exception]] = None
    try:
        with self._jobs_lock:
            self._jobs[queue_item_id] = job_tracker
        await self._task_run_job(
            launch_spec,
            job,
            default_config,
            api,
            queue_item_id,
            job_tracker,
        )
    except LaunchDockerError as docker_error:
        docker_message = f"{LOG_PREFIX}agent {self._name} encountered an issue while starting Docker, see above output for details."
        wandb.termerror(docker_message)
        failure = docker_error
        wandb._sentry.exception(docker_error)
    except LaunchError as launch_error:
        wandb.termerror(f"{LOG_PREFIX}Error running job: {launch_error}")
        failure = launch_error
        wandb._sentry.exception(launch_error)
    except Exception as unexpected_error:
        # Unknown failure: print the full traceback instead of just the message.
        formatted = traceback.format_exc()
        wandb.termerror(f"{LOG_PREFIX}Error running job: {formatted}")
        failure = unexpected_error
        wandb._sentry.exception(unexpected_error)
    finally:
        await self.finish_thread_id(queue_item_id, failure)
|
689
|
+
|
690
|
+
async def _task_run_job(
    self,
    launch_spec: Dict[str, Any],
    job: Dict[str, Any],
    default_config: Dict[str, Any],
    api: Api,
    thread_id: int,
    job_tracker: JobAndRunStatusTracker,
) -> None:
    """Ack, build, launch and monitor a single job.

    Builds a LaunchProject from the spec, acks the run queue item, checks the
    sweep state, loads the environment/registry/builder/runner, builds an image
    when none is supplied, starts the run and then polls it until it finishes,
    fails to start in time, or `self._jobs_event` is cleared.

    Arguments:
        launch_spec: Run spec of the job.
        job: Job dictionary; "runQueueItemId" is read for the ack.
        default_config: Agent configuration used to build the
            environment, registry and builder.
        api: Api object used for the ack and the sweep-state check.
        thread_id: Key of this job in `self._jobs`.
            NOTE(review): annotated as int, but `task_run_job` passes
            job["runQueueItemId"] here — confirm the intended type.
        job_tracker: Tracker updated with run info, the submitted run and
            the failed-to-start flag.

    Raises:
        LaunchError: if the run is still "starting" after RUN_START_TIMEOUT
            seconds (the run is cancelled first), or propagated from
            `check_sweep_state`.
    """
    project = LaunchProject.from_spec(launch_spec, api)
    self._set_queue_and_rqi_in_project(project, job, job_tracker.queue)
    # api.ack_run_queue_item is wrapped by event_loop_thread_exec so it can be awaited.
    ack = event_loop_thread_exec(api.ack_run_queue_item)
    await ack(job["runQueueItemId"], project.run_id)
    # don't launch sweep runs if the sweep isn't healthy
    await self.check_sweep_state(launch_spec, api)

    job_tracker.update_run_info(project)
    self._internal_logger.info("Fetching and validating project...")
    project.fetch_and_validate_project()
    self._internal_logger.info("Fetching resource...")
    # A missing or falsy "resource" falls back to "local-container".
    resource = launch_spec.get("resource") or "local-container"
    backend_config: Dict[str, Any] = {
        PROJECT_SYNCHRONOUS: False,  # agent always runs async
    }
    self._internal_logger.info("Loading backend")
    override_build_config = launch_spec.get("builder")

    _, build_config, registry_config = construct_agent_configs(
        default_config, override_build_config
    )
    # A pre-supplied image takes precedence; otherwise one may be built below.
    image_uri = project.docker_image or project.job_base_image
    entrypoint = project.get_job_entry_point()
    environment = loader.environment_from_config(
        default_config.get("environment", {})
    )
    registry = loader.registry_from_config(registry_config, environment)
    builder = loader.builder_from_config(build_config, environment, registry)
    backend = loader.runner_from_config(
        resource, api, backend_config, environment, registry
    )
    # Build an image only when none was supplied and the backend is not a
    # LocalProcessRunner.
    if not (
        project.docker_image
        or project.job_base_image
        or isinstance(backend, LocalProcessRunner)
    ):
        assert entrypoint is not None
        image_uri = await builder.build_image(project, entrypoint, job_tracker)

    self._internal_logger.info("Backend loaded...")
    # Both branches make the same call; only non-local runners require an image.
    if isinstance(backend, LocalProcessRunner):
        run = await backend.run(project, image_uri)
    else:
        assert image_uri
        run = await backend.run(project, image_uri)
    if self._is_scheduler_job(launch_spec):
        with self._jobs_lock:
            self._jobs[thread_id].is_scheduler = True
        wandb.termlog(
            f"{LOG_PREFIX}Preparing to run sweep scheduler "
            f"({self.num_running_schedulers}/{self._max_schedulers})"
        )

    # A falsy run means the backend did not start anything; flag it and stop.
    if not run:
        with self._jobs_lock:
            job_tracker.failed_to_start = True
        return
    with self._jobs_lock:
        job_tracker.run = run
    start_time = time.time()
    # Time at which the wandb run was first seen as stopped, if it has been.
    stopped_time: Optional[float] = None
    while self._jobs_event.is_set():
        # If run has failed to start before timeout, kill it
        state = (await run.get_status()).state
        # A RUN_START_TIMEOUT of zero or less disables the start timeout.
        if state == "starting" and RUN_START_TIMEOUT > 0:
            if time.time() - start_time > RUN_START_TIMEOUT:
                await run.cancel()
                raise LaunchError(
                    f"Run failed to start within {RUN_START_TIMEOUT} seconds. "
                    "If you want to increase this timeout, set WANDB_LAUNCH_START_TIMEOUT "
                    "to a larger value."
                )
        if await self._check_run_finished(job_tracker, launch_spec):
            return
        # Once the wandb run is reported stopped, cancel the submitted run if it
        # is still going after self._stopped_run_timeout seconds.
        if await job_tracker.check_wandb_run_stopped(self._api):
            if stopped_time is None:
                stopped_time = time.time()
            else:
                if time.time() - stopped_time > self._stopped_run_timeout:
                    await run.cancel()
        await asyncio.sleep(AGENT_POLLING_INTERVAL)

    # Reached only when self._jobs_event was cleared while the run was still
    # being monitored.
    # temp: for local, kill all jobs. we don't yet have good handling for different
    # types of runners in general
    if isinstance(run, LocalSubmittedRun) and run._command_proc is not None:
        run._command_proc.kill()
|
785
|
+
|
786
|
+
async def check_sweep_state(self, launch_spec: Dict[str, Any], api: Api) -> None:
    """Verify that a sweep is still active before launching one of its runs.

    Specs without a truthy "sweep_id" are accepted without any lookup. A
    failure while fetching the state is logged at debug level and treated as
    an unknown (None) state.

    Arguments:
        launch_spec: Run spec; "sweep_id", "entity" and "project" are read.
        api: Api object whose `get_sweep_state` is queried.

    Raises:
        LaunchError: if the sweep state is neither "RUNNING" nor "PAUSED",
            including when it could not be fetched.
    """
    if not launch_spec.get("sweep_id"):
        return

    try:
        state = await event_loop_thread_exec(api.get_sweep_state)(
            sweep=launch_spec["sweep_id"],
            entity=launch_spec["entity"],
            project=launch_spec["project"],
        )
    except Exception as fetch_error:
        self._internal_logger.debug(f"Fetch sweep state error: {fetch_error}")
        state = None

    if state not in ("RUNNING", "PAUSED"):
        raise LaunchError(
            f"Launch agent picked up sweep job, but sweep ({launch_spec['sweep_id']}) was in a terminal state ({state})"
        )
|
804
|
+
|
805
|
+
async def _check_run_finished(
    self, job_tracker: JobAndRunStatusTracker, launch_spec: Dict[str, Any]
) -> bool:
    """Report whether monitoring of a job's run should stop.

    Arguments:
        job_tracker: Tracker holding the submitted run and its status flags.
        launch_spec: Run spec of the job; copied and re-queued when the run
            was preempted.

    Returns:
        True when the tracker already has a completed status, when the run
        failed to start, when the run reached "stopped", "failed", "finished"
        or "preempted", or when a LaunchError was raised while checking.
        False when the run is not finished yet or when any other exception
        occurred while fetching its status.
    """
    if job_tracker.completed_status:
        return True

    # the run can be done before the run has started
    # but can also be none if the run failed to start
    # so if there is no run, either the run hasn't started yet
    # or it has failed
    if job_tracker.run is None:
        if job_tracker.failed_to_start:
            return True
        return False

    # Set only in the LaunchError handler; it is the value returned whenever
    # the try block exits through an exception.
    known_error = False
    try:
        run = job_tracker.run
        status = await run.get_status()
        state = status.state

        # Forward each status message to the run queue item once; a message is
        # forgotten again if the update fails so it is retried on a later poll.
        for warning in status.messages:
            if warning not in self._known_warnings:
                self._known_warnings.append(warning)
                success = self._api.update_run_queue_item_warning(
                    job_tracker.run_queue_item_id,
                    warning,
                    "Kubernetes",
                    [],
                )
                if not success:
                    _logger.warning(
                        f"Error adding warning {warning} to run queue item {job_tracker.run_queue_item_id}"
                    )
                    self._known_warnings.remove(warning)

        # Preempted runs belonging to this agent's entity are re-queued under
        # the same run id, up to MAX_RESUME_COUNT resumes.
        if state == "preempted" and job_tracker.entity == self._entity:
            config = launch_spec.copy()
            config["run_id"] = job_tracker.run_id
            config["_resume_count"] = config.get("_resume_count", 0) + 1
            with self._jobs_lock:
                job_tracker.completed_status = state
            if config["_resume_count"] > MAX_RESUME_COUNT:
                wandb.termlog(
                    f"{LOG_PREFIX}Run {job_tracker.run_id} has already resumed {MAX_RESUME_COUNT} times."
                )
                return True
            wandb.termlog(
                f"{LOG_PREFIX}Run {job_tracker.run_id} was preempted, requeueing..."
            )

            if "sweep_id" in config:
                # allow resumed runs from sweeps that have already completed by removing
                # the sweep id before pushing to queue
                del config["sweep_id"]

            launch_add(
                config=config,
                project_queue=self._project,
                queue_name=job_tracker.queue,
            )
            return True
        # TODO change these statuses to an enum
        if state in ["stopped", "failed", "finished", "preempted"]:
            if job_tracker.is_scheduler:
                wandb.termlog(f"{LOG_PREFIX}Scheduler finished with ID: {run.id}")
                if state == "failed":
                    # on fail, update sweep state. scheduler run_id should == sweep_id
                    try:
                        self._api.set_sweep_state(
                            sweep=job_tracker.run_id,
                            entity=job_tracker.entity,
                            project=job_tracker.project,
                            state="CANCELED",
                        )
                    except Exception as e:
                        # Caught by the LaunchError handler below, which marks
                        # the job as failed to start.
                        raise LaunchError(f"Failed to update sweep state: {e}")
            else:
                wandb.termlog(f"{LOG_PREFIX}Job finished with ID: {run.id}")
            with self._jobs_lock:
                job_tracker.completed_status = state
            return True

        return False
    except LaunchError as e:
        wandb.termerror(
            f"{LOG_PREFIX}Terminating job {run.id} because it failed to start: {str(e)}"
        )
        known_error = True
        with self._jobs_lock:
            job_tracker.failed_to_start = True
    # TODO: make get_status robust to errors for each runner, and handle them
    except Exception as e:
        # Unexpected failures are reported but leave known_error False, so the
        # caller keeps polling.
        wandb.termerror(f"{LOG_PREFIX}Error getting status for job {run.id}")
        wandb.termerror(traceback.format_exc())
        _logger.info("---")
        _logger.info("Caught exception while getting status.")
        _logger.info(f"Job ID: {run.id}")
        _logger.info(traceback.format_exc())
        _logger.info("---")
        wandb._sentry.exception(e)
    return known_error
|
907
|
+
|
908
|
+
async def get_job_and_queue(self) -> Optional[JobSpecAndQueue]:
    """Pop a job from the first of the agent's queues that has one.

    Queues are tried in the order they appear in `self._queues`. The queue
    that yields a job is moved to the end of that list before returning, so
    the other queues are tried first on the next call.

    Returns:
        A JobSpecAndQueue holding the popped job and the queue it came from,
        or None when every queue returned None.
    """
    for candidate_queue in self._queues:
        popped_job = await self.pop_from_queue(candidate_queue)
        if popped_job is None:
            continue
        # Rotate the successful queue to the back; iteration stops right
        # after, so mutating the list here is safe.
        self._queues.remove(candidate_queue)
        self._queues.append(candidate_queue)
        return JobSpecAndQueue(popped_job, candidate_queue)
    return None
|
916
|
+
|
917
|
+
def _set_queue_and_rqi_in_project(
|
918
|
+
self, project: LaunchProject, job: Dict[str, Any], queue: str
|
919
|
+
) -> None:
|
920
|
+
project.queue_name = queue
|
921
|
+
|
922
|
+
# queue entity currently always matches the agent
|
923
|
+
project.queue_entity = self._entity
|
924
|
+
project.run_queue_item_id = job["runQueueItemId"]
|