PyPI - wandb - Versions diffs - 0.16.3__py3-none-any.whl → 0.16.5__py3-none-any.whl - Mend

wandb 0.16.3__py3-none-any.whl → 0.16.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90) hide show

wandb/__init__.py +2 -2
wandb/agents/pyagent.py +1 -1
wandb/apis/importers/__init__.py +1 -4
wandb/apis/importers/internals/internal.py +386 -0
wandb/apis/importers/internals/protocols.py +125 -0
wandb/apis/importers/internals/util.py +78 -0
wandb/apis/importers/mlflow.py +125 -88
wandb/apis/importers/validation.py +108 -0
wandb/apis/importers/wandb.py +1604 -0
wandb/apis/public/api.py +7 -10
wandb/apis/public/artifacts.py +38 -0
wandb/apis/public/files.py +11 -2
wandb/apis/reports/v2/__init__.py +0 -19
wandb/apis/reports/v2/expr_parsing.py +0 -1
wandb/apis/reports/v2/interface.py +15 -18
wandb/apis/reports/v2/internal.py +12 -45
wandb/cli/cli.py +52 -55
wandb/integration/gym/__init__.py +2 -1
wandb/integration/keras/callbacks/model_checkpoint.py +1 -1
wandb/integration/keras/keras.py +6 -4
wandb/integration/kfp/kfp_patch.py +2 -2
wandb/integration/openai/fine_tuning.py +1 -2
wandb/integration/ultralytics/callback.py +0 -1
wandb/proto/v3/wandb_internal_pb2.py +332 -312
wandb/proto/v3/wandb_settings_pb2.py +13 -3
wandb/proto/v3/wandb_telemetry_pb2.py +10 -10
wandb/proto/v4/wandb_internal_pb2.py +316 -312
wandb/proto/v4/wandb_settings_pb2.py +5 -3
wandb/proto/v4/wandb_telemetry_pb2.py +10 -10
wandb/sdk/artifacts/artifact.py +75 -31
wandb/sdk/artifacts/artifact_manifest.py +5 -2
wandb/sdk/artifacts/artifact_manifest_entry.py +6 -1
wandb/sdk/artifacts/artifact_manifests/artifact_manifest_v1.py +8 -2
wandb/sdk/artifacts/artifact_saver.py +19 -47
wandb/sdk/artifacts/storage_handler.py +2 -1
wandb/sdk/artifacts/storage_policies/wandb_storage_policy.py +22 -9
wandb/sdk/artifacts/storage_policy.py +4 -1
wandb/sdk/data_types/base_types/wb_value.py +1 -1
wandb/sdk/data_types/image.py +2 -2
wandb/sdk/interface/interface.py +49 -13
wandb/sdk/interface/interface_shared.py +17 -11
wandb/sdk/internal/file_stream.py +20 -1
wandb/sdk/internal/handler.py +1 -4
wandb/sdk/internal/internal_api.py +3 -1
wandb/sdk/internal/job_builder.py +49 -19
wandb/sdk/internal/profiler.py +1 -1
wandb/sdk/internal/sender.py +96 -124
wandb/sdk/internal/sender_config.py +197 -0
wandb/sdk/internal/settings_static.py +9 -0
wandb/sdk/internal/system/system_info.py +5 -3
wandb/sdk/internal/update.py +1 -1
wandb/sdk/launch/_launch.py +3 -3
wandb/sdk/launch/_launch_add.py +28 -29
wandb/sdk/launch/_project_spec.py +148 -136
wandb/sdk/launch/agent/agent.py +3 -7
wandb/sdk/launch/agent/config.py +0 -27
wandb/sdk/launch/builder/build.py +54 -28
wandb/sdk/launch/builder/docker_builder.py +4 -15
wandb/sdk/launch/builder/kaniko_builder.py +72 -45
wandb/sdk/launch/create_job.py +6 -40
wandb/sdk/launch/loader.py +10 -0
wandb/sdk/launch/registry/anon.py +29 -0
wandb/sdk/launch/registry/local_registry.py +4 -1
wandb/sdk/launch/runner/kubernetes_runner.py +20 -2
wandb/sdk/launch/runner/local_container.py +15 -10
wandb/sdk/launch/runner/sagemaker_runner.py +1 -1
wandb/sdk/launch/sweeps/scheduler.py +11 -3
wandb/sdk/launch/utils.py +14 -0
wandb/sdk/lib/__init__.py +2 -5
wandb/sdk/lib/_settings_toposort_generated.py +4 -1
wandb/sdk/lib/apikey.py +0 -5
wandb/sdk/lib/config_util.py +0 -31
wandb/sdk/lib/filesystem.py +11 -1
wandb/sdk/lib/run_moment.py +72 -0
wandb/sdk/service/service.py +7 -2
wandb/sdk/service/streams.py +1 -6
wandb/sdk/verify/verify.py +2 -1
wandb/sdk/wandb_init.py +12 -1
wandb/sdk/wandb_login.py +43 -26
wandb/sdk/wandb_run.py +164 -110
wandb/sdk/wandb_settings.py +58 -16
wandb/testing/relay.py +5 -6
wandb/util.py +50 -7
{wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/METADATA +8 -1
{wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/RECORD +89 -82
{wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/WHEEL +1 -1
wandb/apis/importers/base.py +0 -400
{wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/LICENSE +0 -0
{wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/entry_points.txt +0 -0
{wandb-0.16.3.dist-info → wandb-0.16.5.dist-info}/top_level.txt +0 -0

wandb/apis/importers/mlflow.py CHANGED Viewed

@@ -1,33 +1,27 @@
+import itertools
+import logging
 import re
 from collections import defaultdict
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Any, Dict, Iterable, List, Optional, Tuple
-from unittest.mock import patch
+import mlflow
 from packaging.version import Version  # type: ignore
-from tqdm.auto import tqdm
 import wandb
 from wandb import Artifact
-from wandb.util import coalesce, get_module
-from .base import ImporterRun, send_run_with_send_manager
-with patch("click.echo"):
-    from wandb.apis.reports import Report
-mlflow = get_module(
-    "mlflow",
-    required="To use the MlflowImporter, please install mlflow: `pip install mlflow`",
-)
+from .internals import internal
+from .internals.util import Namespace, for_each
 mlflow_version = Version(mlflow.__version__)
+logger = logging.getLogger("import_logger")
 class MlflowRun:
     def __init__(self, run, mlflow_client):
         self.run = run
-        self.mlflow_client = mlflow_client
+        self.mlflow_client: mlflow.MlflowClient = mlflow_client
     def run_id(self) -> str:
         return self.run.info.run_id
@@ -39,7 +33,13 @@ class MlflowRun:
         return "imported-from-mlflow"
     def config(self) -> Dict[str, Any]:
-        return self.run.data.params
+        conf = self.run.data.params
+        # Add tags here since mlflow supports very long tag names but we only support up to 64 chars
+        tags = {
+            k: v for k, v in self.run.data.tags.items() if not k.startswith("mlflow.")
+        }
+        return {**conf, "imported_mlflow_tags": tags}
     def summary(self) -> Dict[str, float]:
         return self.run.data.metrics
@@ -71,19 +71,22 @@ class MlflowRun:
         return self.run.data.tags.get("mlflow.note.content")
     def tags(self) -> Optional[List[str]]:
-        mlflow_tags = {
-            k: v for k, v in self.run.data.tags.items() if not k.startswith("mlflow.")
-        }
-        return [f"{k}={v}" for k, v in mlflow_tags.items()]
+        ...
+        # W&B tags are different than mlflow tags.
+        # The full mlflow tags are added to config under key `imported_mlflow_tags` instead
     def artifacts(self) -> Optional[Iterable[Artifact]]:  # type: ignore
         if mlflow_version < Version("2.0.0"):
             dir_path = self.mlflow_client.download_artifacts(
-                run_id=self.run.info.run_id, path=""
+                run_id=self.run.info.run_id,
+                path="",
             )
         else:
             dir_path = mlflow.artifacts.download_artifacts(run_id=self.run.info.run_id)
+        # Since mlflow doesn't have extra metadata about the artifacts,
+        # we just lump them all together into a single wandb.Artifact
         artifact_name = self._handle_incompatible_strings(self.display_name())
         art = wandb.Artifact(artifact_name, "imported-artifacts")
         art.add_dir(dir_path)
@@ -91,37 +94,37 @@ class MlflowRun:
         return [art]
     def used_artifacts(self) -> Optional[Iterable[Artifact]]:  # type: ignore
-        ...
+        ...  # pragma: no cover
     def os_version(self) -> Optional[str]:
-        ...
+        ...  # pragma: no cover
     def python_version(self) -> Optional[str]:
-        ...
+        ...  # pragma: no cover
     def cuda_version(self) -> Optional[str]:
-        ...
+        ...  # pragma: no cover
     def program(self) -> Optional[str]:
-        ...
+        ...  # pragma: no cover
     def host(self) -> Optional[str]:
-        ...
+        ...  # pragma: no cover
     def username(self) -> Optional[str]:
-        ...
+        ...  # pragma: no cover
     def executable(self) -> Optional[str]:
-        ...
+        ...  # pragma: no cover
     def gpus_used(self) -> Optional[str]:
-        ...
+        ...  # pragma: no cover
     def cpus_used(self) -> Optional[int]:  # can we get the model?
-        ...
+        ...  # pragma: no cover
     def memory_used(self) -> Optional[int]:
-        ...
+        ...  # pragma: no cover
     def runtime(self) -> Optional[int]:
         end_time = (
@@ -135,16 +138,16 @@ class MlflowRun:
         return self.run.info.start_time // 1000
     def code_path(self) -> Optional[str]:
-        ...
+        ...  # pragma: no cover
     def cli_version(self) -> Optional[str]:
-        ...
+        ...  # pragma: no cover
     def files(self) -> Optional[Iterable[Tuple[str, str]]]:
-        ...
+        ...  # pragma: no cover
     def logs(self) -> Optional[Iterable[str]]:
-        ...
+        ...  # pragma: no cover
     @staticmethod
     def _handle_incompatible_strings(s: str) -> str:
@@ -155,76 +158,110 @@ class MlflowRun:
 class MlflowImporter:
-    def __init__(self, mlflow_tracking_uri, mlflow_registry_uri=None) -> None:
-        self.mlflow_tracking_uri = mlflow_tracking_uri
+    def __init__(
+        self,
+        dst_base_url: str,
+        dst_api_key: str,
+        mlflow_tracking_uri: str,
+        mlflow_registry_uri: Optional[str] = None,
+        *,
+        custom_api_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        self.dst_base_url = dst_base_url
+        self.dst_api_key = dst_api_key
+        if custom_api_kwargs is None:
+            custom_api_kwargs = {"timeout": 600}
+        self.dst_api = wandb.Api(
+            api_key=dst_api_key,
+            overrides={"base_url": dst_base_url},
+            **custom_api_kwargs,
+        )
+        self.mlflow_tracking_uri = mlflow_tracking_uri
         mlflow.set_tracking_uri(self.mlflow_tracking_uri)
         if mlflow_registry_uri:
             mlflow.set_registry_uri(mlflow_registry_uri)
         self.mlflow_client = mlflow.tracking.MlflowClient(mlflow_tracking_uri)
-    def collect_runs(self, limit: Optional[int] = None) -> Iterable[MlflowRun]:
+    def __repr__(self):
+        return f"<MlflowImporter src={self.mlflow_tracking_uri}>"
+    def collect_runs(self, *, limit: Optional[int] = None) -> Iterable[MlflowRun]:
         if mlflow_version < Version("1.28.0"):
             experiments = self.mlflow_client.list_experiments()
         else:
             experiments = self.mlflow_client.search_experiments()
-        runs = (
-            run
-            for exp in experiments
-            for run in self.mlflow_client.search_runs(exp.experiment_id)
-        )
-        for i, run in enumerate(runs):
-            if limit and i >= limit:
-                break
-            yield MlflowRun(run, self.mlflow_client)
+        def _runs():
+            for exp in experiments:
+                for run in self.mlflow_client.search_runs(exp.experiment_id):
+                    yield MlflowRun(run, self.mlflow_client)
+        runs = itertools.islice(_runs(), limit)
+        yield from runs
-    def import_run(
+    def _import_run(
         self,
-        run: ImporterRun,
-        overrides: Optional[Dict[str, Any]] = None,
+        run: MlflowRun,
+        *,
+        artifacts: bool = True,
+        namespace: Optional[Namespace] = None,
+        config: Optional[internal.SendManagerConfig] = None,
     ) -> None:
+        if namespace is None:
+            namespace = Namespace(run.entity(), run.project())
+        if config is None:
+            config = internal.SendManagerConfig(
+                metadata=True,
+                files=True,
+                media=True,
+                code=True,
+                history=True,
+                summary=True,
+                terminal_output=True,
+            )
+        settings_override = {
+            "api_key": self.dst_api_key,
+            "base_url": self.dst_base_url,
+            "resume": "true",
+            "resumed": True,
+        }
         mlflow.set_tracking_uri(self.mlflow_tracking_uri)
-        send_run_with_send_manager(run, overrides)
+        internal.send_run(
+            run,
+            overrides=namespace.send_manager_overrides,
+            settings_override=settings_override,
+            config=config,
+        )
+        # in mlflow, the artifacts come with the runs, so import them together
+        if artifacts:
+            arts = list(run.artifacts())
+            logger.debug(f"Importing history artifacts, {run=}")
+            internal.send_run(
+                run,
+                extra_arts=arts,
+                overrides=namespace.send_manager_overrides,
+                settings_override=settings_override,
+                config=internal.SendManagerConfig(log_artifacts=True),
+            )
     def import_runs(
         self,
-        runs: Iterable[ImporterRun],
-        overrides: Optional[Dict[str, Any]] = None,
-        pool_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        _overrides = coalesce(overrides, {})
-        _pool_kwargs = coalesce(pool_kwargs, {})
-        runs = list(self.collect_runs())
-        with ThreadPoolExecutor(**_pool_kwargs) as exc:
-            futures = {
-                exc.submit(self.import_run, run, overrides=_overrides): run
-                for run in runs
-            }
-            with tqdm(desc="Importing runs", total=len(futures), unit="run") as pbar:
-                for future in as_completed(futures):
-                    run = futures[future]
-                    try:
-                        future.result()
-                    except Exception as e:
-                        wandb.termerror(f"Failed to import {run.display_name()}: {e}")
-                        raise e
-                    else:
-                        pbar.set_description(
-                            f"Imported Run: {run.run_group()} {run.display_name()}"
-                        )
-                    finally:
-                        pbar.update(1)
-    def import_all_runs(
-        self,
-        limit: Optional[int] = None,
-        overrides: Optional[Dict[str, Any]] = None,
-        pool_kwargs: Optional[Dict[str, Any]] = None,
+        runs: Iterable[MlflowRun],
+        *,
+        artifacts: bool = True,
+        namespace: Optional[Namespace] = None,
+        parallel: bool = True,
+        max_workers: Optional[int] = None,
     ) -> None:
-        runs = self.collect_runs(limit)
-        self.import_runs(runs, overrides, pool_kwargs)
+        def _import_run_wrapped(run):
+            self._import_run(run, namespace=namespace, artifacts=artifacts)
-    def import_report(self, report: Report):
-        raise NotImplementedError("MLFlow does not have a reports concept")
+        for_each(_import_run_wrapped, runs, parallel=parallel, max_workers=max_workers)

wandb/apis/importers/validation.py ADDED Viewed

@@ -0,0 +1,108 @@
+import filecmp
+import logging
+import os
+import requests
+import wandb
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+def _compare_artifact_manifests(
+    src_art: wandb.Artifact, dst_art: wandb.Artifact
+) -> list:
+    problems = []
+    if isinstance(dst_art, wandb.CommError):
+        return ["commError"]
+    if src_art.digest != dst_art.digest:
+        problems.append(f"digest mismatch {src_art.digest=}, {dst_art.digest=}")
+    for name, src_entry in src_art.manifest.entries.items():
+        dst_entry = dst_art.manifest.entries.get(name)
+        if dst_entry is None:
+            problems.append(f"missing manifest entry {name=}, {src_entry=}")
+            continue
+        for attr in ["path", "digest", "size"]:
+            if getattr(src_entry, attr) != getattr(dst_entry, attr):
+                problems.append(
+                    f"manifest entry mismatch {attr=}, {getattr(src_entry, attr)=}, {getattr(dst_entry, attr)=}"
+                )
+    return problems
+def _compare_artifact_dirs(src_dir, dst_dir) -> list:
+    def compare(src_dir, dst_dir):
+        comparison = filecmp.dircmp(src_dir, dst_dir)
+        differences = {
+            "left_only": comparison.left_only,
+            "right_only": comparison.right_only,
+            "diff_files": comparison.diff_files,
+            "subdir_differences": {},
+        }
+        # Recursively find differences in subdirectories
+        for subdir in comparison.subdirs:
+            subdir_src = os.path.join(src_dir, subdir)
+            subdir_dst = os.path.join(dst_dir, subdir)
+            subdir_differences = compare(subdir_src, subdir_dst)
+            # If there are differences, add them to the result
+            if subdir_differences and any(subdir_differences.values()):
+                differences["subdir_differences"][subdir] = subdir_differences
+        if all(not diff for diff in differences.values()):
+            return None
+        return differences
+    return compare(src_dir, dst_dir)
+def _check_entries_are_downloadable(art):
+    entries = _collect_entries(art)
+    for entry in entries:
+        if not _check_entry_is_downloable(entry):
+            return False
+    return True
+def _collect_entries(art):
+    has_next_page = True
+    cursor = None
+    entries = []
+    while has_next_page:
+        attrs = art._fetch_file_urls(cursor)
+        has_next_page = attrs["pageInfo"]["hasNextPage"]
+        cursor = attrs["pageInfo"]["endCursor"]
+        for edge in attrs["edges"]:
+            name = edge["node"]["name"]
+            entry = art.get_entry(name)
+            entry._download_url = edge["node"]["directUrl"]
+            entries.append(entry)
+    return entries
+def _check_entry_is_downloable(entry):
+    url = entry._download_url
+    expected_size = entry.size
+    try:
+        resp = requests.head(url, allow_redirects=True)
+    except Exception as e:
+        logger.error(f"Problem validating {entry=}, {e=}")
+        return False
+    if resp.status_code != 200:
+        return False
+    actual_size = resp.headers.get("content-length", -1)
+    actual_size = int(actual_size)
+    if expected_size == actual_size:
+        return True
+    return False

wandb 0.16.3__py3-none-any.whl → 0.16.5__py3-none-any.whl

wandb 0.16.3__py3-none-any.whl → 0.16.5__py3-none-any.whl