flyteplugins-wandb 2.0.0b52__py3-none-any.whl → 2.0.0b54__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
@@ -6,6 +6,7 @@
6
6
  - Parent/child task support with automatic run reuse
7
7
  - W&B sweep creation and management with `@wandb_sweep` decorator
8
8
  - Configuration management with `wandb_config()` and `wandb_sweep_config()`
9
+ - Distributed training support (auto-detects PyTorch DDP/torchrun)
9
10
 
10
11
  ## Basic usage:
11
12
 
@@ -119,6 +120,64 @@
119
120
  ).run(run_parallel_sweep, num_agents=2, trials_per_agent=5)
120
121
  ```
121
122
 
123
+ 6. Distributed Training Support:
124
+
125
+ The plugin auto-detects distributed training from environment variables
126
+ (RANK, WORLD_SIZE, LOCAL_RANK, etc.) set by torchrun/torch.distributed.elastic.
127
+
128
+ By default (`run_mode="auto"`):
129
+ - Single-node: Only rank 0 logs (1 run)
130
+ - Multi-node: Local rank 0 of each worker logs (1 run per worker)
131
+
132
+ ```python
133
+ from flyteplugins.pytorch.task import Elastic
134
+ from flyteplugins.wandb import wandb_init, get_wandb_run
135
+
136
+ torch_env = flyte.TaskEnvironment(
137
+ name="torch_env",
138
+ resources=flyte.Resources(cpu=(1, 2), memory=("1Gi", "5Gi"), gpu="V100:4"),
139
+ plugin_config=Elastic(nnodes=2, nproc_per_node=2),
140
+ )
141
+
142
+ @wandb_init
143
+ @torch_env.task
144
+ async def train_distributed():
145
+ torch.distributed.init_process_group("nccl")
146
+
147
+ # Only local rank 0 gets a W&B run; other ranks get None
148
+ run = get_wandb_run()
149
+ if run:
150
+ run.log({"loss": loss})
151
+
152
+ return run.id if run else "non-primary-rank"
153
+ ```
154
+
155
+ Use `run_mode="shared"` for all ranks to log to a single shared run:
156
+
157
+ ```python
158
+ @wandb_init(run_mode="shared")
159
+ @torch_env.task
160
+ async def train_distributed_shared():
161
+ # All ranks log to the same W&B run (with x_label to identify each rank)
162
+ run = get_wandb_run()
163
+ run.log({"rank_metric": value})
164
+ return run.id
165
+ ```
166
+
167
+ Use `run_mode="new"` for each rank to have its own W&B run:
168
+
169
+ ```python
170
+ @wandb_init(run_mode="new")
171
+ @torch_env.task
172
+ async def train_distributed_separate_runs():
173
+ # Each rank gets its own W&B run (grouped in W&B UI)
174
+ # Run IDs: {run_name}-{action_name}-rank-{rank} (single-node)
175
+ # Run IDs: {run_name}-{action_name}-worker-{worker_index}-rank-{local_rank} (multi-node)
176
+ run = get_wandb_run()
177
+ run.log({"rank_metric": value})
178
+ return run.id
179
+ ```
180
+
122
181
  Decorator order: `@wandb_init` or `@wandb_sweep` must be the outermost decorator:
123
182
 
124
183
  ```python
@@ -145,7 +204,11 @@ from ._context import (
145
204
  wandb_config,
146
205
  wandb_sweep_config,
147
206
  )
148
- from ._decorator import wandb_init, wandb_sweep
207
+ from ._decorator import (
208
+ _get_distributed_info,
209
+ wandb_init,
210
+ wandb_sweep,
211
+ )
149
212
  from ._link import Wandb, WandbSweep
150
213
 
151
214
  logger = logging.getLogger(__name__)
@@ -158,6 +221,7 @@ __all__ = [
158
221
  "download_wandb_run_logs",
159
222
  "download_wandb_sweep_dirs",
160
223
  "download_wandb_sweep_logs",
224
+ "get_distributed_info",
161
225
  "get_wandb_context",
162
226
  "get_wandb_run",
163
227
  "get_wandb_run_dir",
@@ -183,11 +247,15 @@ def get_wandb_run():
183
247
  Returns:
184
248
  `wandb.sdk.wandb_run.Run` | `None`: The current wandb run object or None.
185
249
  """
250
+ # First check Flyte context
186
251
  ctx = flyte.ctx()
187
- if not ctx or not ctx.data:
188
- return None
252
+ if ctx and ctx.data:
253
+ run = ctx.data.get("_wandb_run")
254
+ if run:
255
+ return run
189
256
 
190
- return ctx.data.get("_wandb_run")
257
+ # Fallback to wandb's global run
258
+ return wandb.run
191
259
 
192
260
 
193
261
  def get_wandb_sweep_id() -> str | None:
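Because `get_wandb_run()` now falls back to `wandb.run`, sweep objective callbacks (which run inside `wandb.agent()` without a Flyte context) can use the same accessor as tasks. A minimal sketch, assuming a sweep wired up as in the README examples; the metric value is a placeholder:

```python
from flyteplugins.wandb import get_wandb_run, wandb_init

@wandb_init
def objective():
    # No Flyte context exists inside wandb.agent() callbacks, so this resolves
    # through the new wandb.run fallback rather than ctx.data.
    run = get_wandb_run()
    run.log({"loss": 0.123})  # placeholder metric
```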
@@ -224,6 +292,25 @@ def get_wandb_run_dir() -> Optional[str]:
224
292
  return run.dir
225
293
 
226
294
 
295
+ def get_distributed_info() -> dict | None:
296
+ """
297
+ Get distributed training info if running in a distributed context.
298
+
299
+ This function auto-detects distributed training from environment variables
300
+ set by torchrun/torch.distributed.elastic.
301
+
302
+ Returns:
303
+ dict | None: Dictionary with distributed info or None if not distributed.
304
+ - rank: Global rank (0 to world_size-1)
305
+ - local_rank: Rank within the node (0 to local_world_size-1)
306
+ - world_size: Total number of processes
307
+ - local_world_size: Processes per node
308
+ - worker_index: Node/worker index (0 to num_workers-1)
309
+ - num_workers: Total number of nodes/workers
310
+ """
311
+ return _get_distributed_info()
312
+
313
+
227
314
  def download_wandb_run_dir(
228
315
  run_id: Optional[str] = None,
229
316
  path: Optional[str] = None,
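A short usage sketch for the new public `get_distributed_info()` wrapper. The environment name, resources, and metric are illustrative; the dict keys are the ones listed in the docstring above:

```python
import flyte
from flyteplugins.pytorch.task import Elastic
from flyteplugins.wandb import get_distributed_info, get_wandb_run, wandb_init

env = flyte.TaskEnvironment(
    name="dist_env",
    resources=flyte.Resources(cpu=(1, 2), memory=("1Gi", "5Gi"), gpu="V100:4"),
    plugin_config=Elastic(nnodes=2, nproc_per_node=4),
)

@wandb_init
@env.task
async def train() -> str:
    info = get_distributed_info()
    if info is None:
        return "not-distributed"  # plain single-process execution
    # For global rank 5 of this 2x4 job the dict would look like:
    # {"rank": 5, "local_rank": 1, "world_size": 8,
    #  "local_world_size": 4, "worker_index": 1, "num_workers": 2}
    run = get_wandb_run()  # None on ranks that skip W&B under run_mode="auto"
    if run:
        run.log({"num_workers": info["num_workers"]})
    return f"rank-{info['rank']}"
```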
@@ -213,7 +213,12 @@ def wandb_config(
213
213
  mode: "online", "offline" or "disabled"
214
214
  group: Group name for related runs
215
215
  run_mode: Flyte-specific run mode - "auto", "new" or "shared".
216
- Controls whether tasks create new W&B runs or share existing ones
216
+ Controls whether tasks create new W&B runs or share existing ones.
217
+ In distributed training context:
218
+ - "auto" (default): Single-node: only rank 0 logs.
219
+ Multi-node: local rank 0 of each worker logs (1 run per worker).
220
+ - "shared": All ranks log to a single shared W&B run.
221
+ - "new": Each rank gets its own W&B run (grouped in W&B UI).
217
222
  download_logs: If `True`, downloads wandb run files after task completes
218
223
  and shows them as a trace output in the Flyte UI
219
224
  **kwargs: Additional `wandb.init()` parameters
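Since `run_mode` is part of `wandb_config()`, the distributed logging behavior can also be chosen at launch time rather than on the decorator. A hedged sketch, reusing the `train_distributed` task shown earlier in this diff; project and entity names are placeholders:

```python
import flyte
from flyteplugins.wandb import wandb_config

# All ranks log into a single shared W&B run (one per worker on multi-node jobs).
run = flyte.with_runcontext(
    custom_context=wandb_config(
        project="my-project",
        entity="my-team",
        run_mode="shared",
    )
).run(train_distributed)
```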
@@ -1,5 +1,6 @@
1
1
  import functools
2
2
  import logging
3
+ import os
3
4
  from contextlib import contextmanager
4
5
  from dataclasses import asdict
5
6
  from inspect import iscoroutinefunction
@@ -18,6 +19,144 @@ logger = logging.getLogger(__name__)
18
19
  F = TypeVar("F", bound=Callable[..., Any])
19
20
 
20
21
 
22
+ def _get_distributed_info() -> dict | None:
23
+ """
24
+ Auto-detect distributed training info from environment variables.
25
+
26
+ Returns None if not in a distributed training context.
27
+ Environment variables are set by torchrun/torch.distributed.elastic.
28
+ """
29
+ if "RANK" not in os.environ or "WORLD_SIZE" not in os.environ:
30
+ return None
31
+
32
+ world_size = int(os.environ["WORLD_SIZE"])
33
+ if world_size <= 1:
34
+ return None
35
+
36
+ local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", world_size))
37
+
38
+ return {
39
+ "rank": int(os.environ["RANK"]),
40
+ "local_rank": int(os.environ.get("LOCAL_RANK", "0")),
41
+ "world_size": world_size,
42
+ "local_world_size": local_world_size,
43
+ "worker_index": int(os.environ.get("GROUP_RANK", "0")),
44
+ "num_workers": world_size // local_world_size if local_world_size > 0 else 1,
45
+ }
46
+
47
+
48
+ def _is_multi_node(info: dict) -> bool:
49
+ """Check if this is a multi-node distributed setup."""
50
+ return info["num_workers"] > 1
51
+
52
+
53
+ def _is_primary_rank(info: dict) -> bool:
54
+ """Check if current process is rank 0 (primary)."""
55
+ return info["rank"] == 0
56
+
57
+
58
+ def _should_skip_rank(run_mode: RunMode, dist_info: dict) -> bool:
59
+ """
60
+ Check if this rank should skip wandb initialization.
61
+
62
+ For run_mode="auto":
63
+ - Single-node: Only rank 0 initializes wandb
64
+ - Multi-node: Only local rank 0 of each worker initializes wandb
65
+
66
+ For run_mode="shared" or "new": All ranks initialize wandb.
67
+ """
68
+ if run_mode != "auto":
69
+ return False
70
+
71
+ is_multi_node = _is_multi_node(dist_info)
72
+ is_primary = _is_primary_rank(dist_info)
73
+ is_local_primary = dist_info["local_rank"] == 0
74
+
75
+ if is_multi_node:
76
+ # Multi-node: only local rank 0 of each node logs
77
+ return not is_local_primary
78
+ else:
79
+ # Single-node: only rank 0 logs
80
+ return not is_primary
81
+
82
+
83
+ def _configure_distributed_run(
84
+ init_kwargs: dict,
85
+ run_mode: RunMode,
86
+ dist_info: dict,
87
+ base_run_id: str,
88
+ ) -> dict:
89
+ """
90
+ Configure wandb.init() kwargs for distributed training.
91
+
92
+ Sets run ID, group, and shared mode settings based on:
93
+ - run_mode: "auto", "new", or "shared"
94
+ - dist_info: distributed topology (rank, worker_index, etc.)
95
+ - base_run_id: base string for generating run IDs
96
+
97
+ Run ID patterns:
98
+ - Single-node auto/shared: {base_run_id}
99
+ - Single-node new: {base_run_id}-rank-{rank}
100
+ - Multi-node auto/shared: {base_run_id}-worker-{worker_index}
101
+ - Multi-node new: {base_run_id}-worker-{worker_index}-rank-{local_rank}
102
+ """
103
+ is_multi_node = _is_multi_node(dist_info)
104
+ is_primary = _is_primary_rank(dist_info)
105
+
106
+ # Build run ID based on mode and topology
107
+ if "id" not in init_kwargs or init_kwargs["id"] is None:
108
+ if run_mode == "new":
109
+ # Each rank gets its own run
110
+ if is_multi_node:
111
+ init_kwargs["id"] = f"{base_run_id}-worker-{dist_info['worker_index']}-rank-{dist_info['local_rank']}"
112
+ else:
113
+ init_kwargs["id"] = f"{base_run_id}-rank-{dist_info['rank']}"
114
+ else: # run_mode == "auto" or "shared"
115
+ if is_multi_node:
116
+ init_kwargs["id"] = f"{base_run_id}-worker-{dist_info['worker_index']}"
117
+ else:
118
+ init_kwargs["id"] = base_run_id
119
+
120
+ # Set group for multiple runs (run_mode="new")
121
+ if run_mode == "new" and "group" not in init_kwargs:
122
+ if is_multi_node:
123
+ init_kwargs["group"] = f"{base_run_id}-worker-{dist_info['worker_index']}"
124
+ else:
125
+ init_kwargs["group"] = base_run_id
126
+
127
+ # Configure W&B shared mode for run_mode="shared"
128
+ if run_mode == "shared":
129
+ if is_multi_node:
130
+ x_label = f"worker-{dist_info['worker_index']}-rank-{dist_info['local_rank']}"
131
+ # For multi-node, primary is local_rank 0 within each worker
132
+ is_worker_primary = dist_info["local_rank"] == 0
133
+ else:
134
+ x_label = f"rank-{dist_info['rank']}"
135
+ # For single-node, primary is rank 0
136
+ is_worker_primary = is_primary
137
+
138
+ existing_settings = init_kwargs.get("settings")
139
+ shared_config = {
140
+ "mode": "shared",
141
+ "x_primary": is_worker_primary,
142
+ "x_label": x_label,
143
+ "x_update_finish_state": is_worker_primary,
144
+ }
145
+
146
+ # Handle both dict and wandb.Settings objects
147
+ if existing_settings is None:
148
+ init_kwargs["settings"] = wandb.Settings(**shared_config)
149
+ elif isinstance(existing_settings, dict):
150
+ init_kwargs["settings"] = wandb.Settings(**{**existing_settings, **shared_config})
151
+ else:
152
+ # existing_settings is already a wandb.Settings object
153
+ for key, value in shared_config.items():
154
+ setattr(existing_settings, key, value)
155
+ init_kwargs["settings"] = existing_settings
156
+
157
+ return init_kwargs
158
+
159
+
21
160
  def _build_init_kwargs() -> dict[str, Any]:
22
161
  """Build wandb.init() kwargs from current context config."""
23
162
  context_config = get_wandb_context()
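To make the helpers above concrete, a small illustration that simulates the variables torchrun would export for one rank of a 2-node x 2-process job and then calls the private helpers directly. This is illustration only: the run/action names are placeholders and the plugin must be installed for the import to work.

```python
import os

# What torchrun would set for global rank 3 of a 2-node x 2-process job.
os.environ.update({
    "RANK": "3",
    "WORLD_SIZE": "4",
    "LOCAL_RANK": "1",
    "LOCAL_WORLD_SIZE": "2",
    "GROUP_RANK": "1",
})

from flyteplugins.wandb._decorator import _get_distributed_info, _should_skip_rank

info = _get_distributed_info()
# {"rank": 3, "local_rank": 1, "world_size": 4,
#  "local_world_size": 2, "worker_index": 1, "num_workers": 2}

base_run_id = "myrun-train"  # placeholder for {run_name}-{action_name}
print(_should_skip_rank("auto", info))  # True: only local rank 0 of each node logs
print(f"{base_run_id}-worker-{info['worker_index']}")  # run ID for auto/shared
print(f"{base_run_id}-worker-{info['worker_index']}-rank-{info['local_rank']}")  # run ID for "new"
```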
@@ -50,6 +189,7 @@ def _wandb_run(
50
189
  """
51
190
  # Try to get Flyte context
52
191
  ctx = flyte.ctx()
192
+ dist_info = _get_distributed_info()
53
193
 
54
194
  # This enables @wandb_init to work in wandb.agent() callbacks (sweep objectives)
55
195
  if func and ctx is None:
@@ -61,6 +201,12 @@ def _wandb_run(
61
201
  run.finish()
62
202
  return
63
203
  elif func and ctx:
204
+ # Check if there's already a W&B run from parent
205
+ existing_run = ctx.data.get("_wandb_run")
206
+ if existing_run:
207
+ yield existing_run
208
+ return
209
+
64
210
  raise RuntimeError(
65
211
  "@wandb_init cannot be applied to traces. Traces can access the parent's wandb run via get_wandb_run()."
66
212
  )
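A sketch of what this change enables: a helper decorated with `@wandb_init` for standalone use (for example a sweep objective) can now also be called from inside a `@wandb_init` task and reuses the parent's run instead of raising. Environment and metric names are illustrative:

```python
import flyte
from flyteplugins.wandb import get_wandb_run, wandb_init

env = flyte.TaskEnvironment(name="wandb_reuse_demo")

@wandb_init
def log_step(step: int):
    # When called inside a @wandb_init task, the parent's run is found in
    # ctx.data and reused rather than triggering the RuntimeError above.
    run = get_wandb_run()
    run.log({"step": step})

@wandb_init
@env.task
async def train():
    log_step(1)
```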
@@ -85,46 +231,64 @@ def _wandb_run(
85
231
 
86
232
  # Get current action name for run ID generation
87
233
  current_action = ctx.action.name
234
+ base_run_id = f"{ctx.action.run_name}-{current_action}"
235
+
236
+ # Handle distributed training
237
+ if dist_info:
238
+ if _should_skip_rank(run_mode, dist_info):
239
+ yield None
240
+ return
241
+
242
+ init_kwargs = _configure_distributed_run(init_kwargs, run_mode, dist_info, base_run_id)
243
+ else:
244
+ # Non-distributed training
245
+ # Determine if we should reuse parent's run
246
+ should_reuse = False
247
+ if run_mode == "shared":
248
+ should_reuse = True
249
+ elif run_mode == "auto":
250
+ should_reuse = bool(saved_run_id)
251
+
252
+ # Determine run ID
253
+ if "id" not in init_kwargs or init_kwargs["id"] is None:
254
+ if should_reuse:
255
+ if not saved_run_id:
256
+ raise RuntimeError("Cannot reuse parent run: no parent run ID found")
257
+ init_kwargs["id"] = saved_run_id
258
+ else:
259
+ init_kwargs["id"] = base_run_id
88
260
 
89
- # Determine if we should reuse parent's run
90
- should_reuse = False
91
- if run_mode == "shared":
92
- should_reuse = True
93
- elif run_mode == "auto":
94
- should_reuse = bool(saved_run_id)
95
-
96
- # Determine run ID
97
- if "id" not in init_kwargs or init_kwargs["id"] is None:
98
- if should_reuse:
99
- if not saved_run_id:
100
- raise RuntimeError("Cannot reuse parent run: no parent run ID found")
101
- init_kwargs["id"] = saved_run_id
102
- else:
103
- init_kwargs["id"] = f"{ctx.action.run_name}-{current_action}"
104
-
105
- # Configure reinit parameter (only for local mode)
106
- # In remote/shared mode, wandb handles run creation/joining automatically
107
- if flyte.ctx().mode == "local":
108
- if should_reuse:
109
- if "reinit" not in init_kwargs:
110
- init_kwargs["reinit"] = "return_previous"
111
- else:
112
- init_kwargs["reinit"] = "create_new"
113
-
114
- # Configure remote mode settings
115
- if flyte.ctx().mode == "remote":
116
- is_primary = not should_reuse
117
- existing_settings = init_kwargs.get("settings", {})
118
-
119
- shared_config = {
120
- "mode": "shared",
121
- "x_primary": is_primary,
122
- "x_label": current_action,
123
- }
124
- if not is_primary:
125
- shared_config["x_update_finish_state"] = False
126
-
127
- init_kwargs["settings"] = wandb.Settings(**{**existing_settings, **shared_config})
261
+ # Configure reinit parameter (only for local mode)
262
+ if ctx.mode == "local":
263
+ if should_reuse:
264
+ if "reinit" not in init_kwargs:
265
+ init_kwargs["reinit"] = "return_previous"
266
+ else:
267
+ init_kwargs["reinit"] = "create_new"
268
+
269
+ # Configure remote mode settings
270
+ if ctx.mode == "remote":
271
+ is_primary = not should_reuse
272
+ existing_settings = init_kwargs.get("settings")
273
+
274
+ shared_config = {
275
+ "mode": "shared",
276
+ "x_primary": is_primary,
277
+ "x_label": current_action,
278
+ }
279
+ if not is_primary:
280
+ shared_config["x_update_finish_state"] = False
281
+
282
+ # Handle None, dict, and wandb.Settings objects
283
+ if existing_settings is None:
284
+ init_kwargs["settings"] = wandb.Settings(**shared_config)
285
+ elif isinstance(existing_settings, dict):
286
+ init_kwargs["settings"] = wandb.Settings(**{**existing_settings, **shared_config})
287
+ else:
288
+ # existing_settings is already a wandb.Settings object
289
+ for key, value in shared_config.items():
290
+ setattr(existing_settings, key, value)
291
+ init_kwargs["settings"] = existing_settings
128
292
 
129
293
  # Initialize wandb
130
294
  run = wandb.init(**init_kwargs)
@@ -141,18 +305,18 @@ def _wandb_run(
141
305
  # Determine if this is a primary run
142
306
  is_primary_run = run_mode == "new" or (run_mode == "auto" and saved_run_id is None)
143
307
 
308
+ # Determine if we should call finish()
309
+ should_finish = False
144
310
  if run:
145
- # Different cleanup logic for local vs remote mode
146
- should_finish = False
147
-
148
- if flyte.ctx().mode == "remote":
149
- # In remote/shared mode, always call run.finish() to flush data
150
- # For secondary tasks, x_update_finish_state=False prevents actually finishing
151
- # For primary tasks, this properly finishes the run
152
- should_finish = True
153
- elif is_primary_run:
154
- # In local mode, only primary tasks should call run.finish()
155
- # Secondary tasks reuse the parent's run object, so they must not finish it
311
+ if dist_info and run_mode == "shared":
312
+ # For distributed shared mode, only primary (local_rank 0) finishes
313
+ is_multi_node = _is_multi_node(dist_info)
314
+ if is_multi_node:
315
+ should_finish = dist_info["local_rank"] == 0
316
+ else:
317
+ should_finish = dist_info["rank"] == 0
318
+ elif ctx.mode == "remote" or is_primary_run:
319
+ # In remote mode or for primary runs, always finish
156
320
  should_finish = True
157
321
 
158
322
  if should_finish:
@@ -192,10 +356,14 @@ def wandb_init(
192
356
 
193
357
  Args:
194
358
  run_mode: Controls whether to create a new W&B run or share an existing one:
195
-
196
- 1. "auto" (default): Creates new run if no parent run exists, otherwise shares parent's run
197
- 2. "new": Always creates a new wandb run with a unique ID
198
- 3. "shared": Always shares the parent's run ID (useful for child tasks)
359
+ - "auto" (default): Creates new run if no parent run exists, otherwise shares parent's run
360
+ - "new": Always creates a new wandb run with a unique ID
361
+ - "shared": Always shares the parent's run ID (useful for child tasks)
362
+ In distributed training context:
363
+ - "auto" (default): Single-node: only rank 0 logs.
364
+ Multi-node: local rank 0 of each worker logs (1 run per worker).
365
+ - "shared": All ranks log to a single shared W&B run.
366
+ - "new": Each rank gets its own W&B run (grouped in W&B UI).
199
367
  download_logs: If `True`, downloads wandb run files after task completes
200
368
  and shows them as a trace output in the Flyte UI. If None, uses
201
369
  the value from `wandb_config()` context if set.
@@ -230,15 +398,59 @@ def wandb_init(
230
398
 
231
399
  # Check if it's a Flyte task (AsyncFunctionTaskTemplate)
232
400
  if isinstance(func, AsyncFunctionTaskTemplate):
233
- # Create a Wandb link
234
- # Even if run_mode="shared", we still add a link - it will point to the parent's run
235
- wandb_link = Wandb(project=project, entity=entity, run_mode=run_mode)
401
+ # Detect distributed config from plugin_config
402
+ nnodes = 1
403
+ nproc_per_node = 1
404
+ plugin_config = getattr(func, "plugin_config", None)
405
+
406
+ if plugin_config is not None and type(plugin_config).__name__ == "Elastic":
407
+ nnodes_val = getattr(plugin_config, "nnodes", 1)
408
+ if isinstance(nnodes_val, int):
409
+ nnodes = nnodes_val
410
+ elif isinstance(nnodes_val, str):
411
+ parts = nnodes_val.split(":")
412
+ nnodes = int(parts[-1]) if parts else 1
413
+
414
+ nproc_val = getattr(plugin_config, "nproc_per_node", 1)
415
+ if isinstance(nproc_val, int):
416
+ nproc_per_node = nproc_val
417
+ elif isinstance(nproc_val, str):
418
+ try:
419
+ nproc_per_node = int(nproc_val)
420
+ except ValueError:
421
+ nproc_per_node = 1
236
422
 
237
- # Get existing links from the task and add wandb link
423
+ is_distributed = nnodes > 1 or nproc_per_node > 1
424
+
425
+ # Add W&B links
426
+ wandb_id = kwargs.get("id")
238
427
  existing_links = getattr(func, "links", ())
239
428
 
240
- # Use override to properly add the link to the task
241
- func = func.override(links=(*existing_links, wandb_link))
429
+ if nnodes > 1:
430
+ # Multi-node: one link per worker
431
+ wandb_links = tuple(
432
+ Wandb(
433
+ project=project,
434
+ entity=entity,
435
+ run_mode=run_mode,
436
+ id=wandb_id,
437
+ _is_distributed=True,
438
+ _worker_index=i,
439
+ name=f"Weights & Biases Worker {i}",
440
+ )
441
+ for i in range(nnodes)
442
+ )
443
+ func = func.override(links=(*existing_links, *wandb_links))
444
+ else:
445
+ # Single-node (distributed or not): one link
446
+ wandb_link = Wandb(
447
+ project=project,
448
+ entity=entity,
449
+ run_mode=run_mode,
450
+ id=wandb_id,
451
+ _is_distributed=is_distributed,
452
+ )
453
+ func = func.override(links=(*existing_links, wandb_link))
242
454
 
243
455
  # Wrap the task's execute method with wandb_run
244
456
  original_execute = func.execute
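The `nnodes` field on `Elastic` may be an int or an elastic range string such as `"1:2"`, and the block above takes the last component as the node count. A standalone sketch of that parsing rule (the function name is ours, not part of the plugin):

```python
def parse_nnodes(nnodes_val) -> int:
    """Mirror the parsing above: ints pass through, "min:max" strings use the upper bound."""
    if isinstance(nnodes_val, int):
        return nnodes_val
    if isinstance(nnodes_val, str):
        parts = nnodes_val.split(":")
        return int(parts[-1]) if parts else 1
    return 1

assert parse_nnodes(2) == 2
assert parse_nnodes("4") == 4
assert parse_nnodes("1:2") == 2  # elastic range: the upper bound determines worker links
```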
@@ -15,11 +15,15 @@ class Wandb(Link):
15
15
  host: Base W&B host URL
16
16
  project: W&B project name (overrides context config if provided)
17
17
  entity: W&B entity/team name (overrides context config if provided)
18
- run_mode: Controls whether to create a new W&B run or share an existing one:
19
-
20
- 1. "auto" (default): Creates new run if no parent run exists, otherwise shares parent's run
21
- 2. "new": Always creates a new wandb run with a unique ID
22
- 3. "shared": Always shares the parent's run ID (useful for child tasks)
18
+ run_mode: Determines the link behavior:
19
+ - "auto" (default): Use parent's run if available, otherwise create new
20
+ - "new": Always creates a new wandb run with a unique ID
21
+ - "shared": Always shares the parent's run ID (useful for child tasks)
22
+ In distributed training context:
23
+ - "auto" (default): Single-node: only rank 0 logs
24
+ Multi-node: only local rank 0 of each worker logs
25
+ - "shared": Link to a single shared W&B run.
26
+ - "new": Link to group view.
23
27
  id: Optional W&B run ID (overrides context config if provided)
24
28
  name: Link name in the Flyte UI
25
29
  """
@@ -30,6 +34,10 @@ class Wandb(Link):
30
34
  run_mode: RunMode = "auto"
31
35
  id: Optional[str] = None
32
36
  name: str = "Weights & Biases"
37
+ # Internal: set by @wandb_init for distributed training tasks
38
+ _is_distributed: bool = False
39
+ # Internal: worker index for multi-node distributed training (set by @wandb_init)
40
+ _worker_index: Optional[int] = None
33
41
 
34
42
  def get_link(
35
43
  self,
@@ -69,6 +77,35 @@ class Wandb(Link):
69
77
  if not wandb_project or not wandb_entity:
70
78
  return self.host
71
79
 
80
+ # Distributed training links - derived from decorator-time info (plugin_config)
81
+ # _is_distributed and _worker_index are set by @wandb_init based on Elastic config
82
+ is_multi_node = self._worker_index is not None
83
+
84
+ if self._is_distributed:
85
+ base_id = user_provided_id or f"{run_name}-{action_name}"
86
+
87
+ # For run_mode="new", link to group view
88
+ if run_mode == "new":
89
+ if is_multi_node:
90
+ # Multi-node: link to per-worker group
91
+ group_name = f"{base_id}-worker-{self._worker_index}"
92
+ else:
93
+ # Single-node: link to single group
94
+ group_name = base_id
95
+
96
+ return f"{self.host}/{wandb_entity}/{wandb_project}/groups/{group_name}"
97
+
98
+ # For run_mode="auto" or "shared", link to run directly
99
+ if is_multi_node:
100
+ # Multi-node: link to worker-specific run
101
+ wandb_run_id = f"{base_id}-worker-{self._worker_index}"
102
+ else:
103
+ # Single-node: link to single run
104
+ wandb_run_id = base_id
105
+
106
+ return f"{self.host}/{wandb_entity}/{wandb_project}/runs/{wandb_run_id}"
107
+
108
+ # Non-distributed: link to specific run
72
109
  # Determine run ID based on run_mode setting
73
110
  if run_mode == "new":
74
111
  # Always create new run - use user-provided ID if available, otherwise generate
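For reference, a sketch of the URLs this logic yields for worker 1 of a multi-node task; the host, entity, project, and base ID below are placeholders:

```python
host, entity, project = "https://wandb.ai", "my-team", "my-project"
base_id = "myrun-train"  # {run_name}-{action_name}
worker_index = 1

# run_mode="new": link to the per-worker group view
print(f"{host}/{entity}/{project}/groups/{base_id}-worker-{worker_index}")
# run_mode="auto" or "shared": link to the worker-specific run page
print(f"{host}/{entity}/{project}/runs/{base_id}-worker-{worker_index}")
```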
@@ -0,0 +1,266 @@
1
+ Metadata-Version: 2.4
2
+ Name: flyteplugins-wandb
3
+ Version: 2.0.0b54
4
+ Summary: Weights & Biases plugin for Flyte
5
+ Author: Flyte Contributors
6
+ Requires-Python: >=3.10
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: wandb
9
+ Requires-Dist: flyte
10
+
11
+ # Weights & Biases Plugin
12
+
13
+ This plugin provides integration between Flyte and Weights & Biases (W&B) for experiment tracking, including support for distributed training with PyTorch Elastic.
14
+
15
+ ## Quickstart
16
+
17
+ ```python
18
+ from flyteplugins.wandb import wandb_init, wandb_config, get_wandb_run
19
+
20
+ @wandb_init(project="my-project", entity="my-team")
21
+ @env.task
22
+ def train():
23
+ run = get_wandb_run()
24
+ run.log({"loss": 0.5, "accuracy": 0.9})
25
+ ```
26
+
27
+ ## Core concepts
28
+
29
+ ### Decorator order
30
+
31
+ `@wandb_init` and `@wandb_sweep` must be the **outermost decorators** (applied after `@env.task`):
32
+
33
+ ```python
34
+ @wandb_init # Outermost
35
+ @env.task # Task decorator
36
+ def my_task():
37
+ ...
38
+ ```
39
+
40
+ ### Run modes
41
+
42
+ The `run_mode` parameter controls how W&B runs are created:
43
+
44
+ - **`"auto"`** (default): Creates a new run if no parent exists, otherwise shares the parent's run
45
+ - **`"new"`**: Always creates a new W&B run with a unique ID
46
+ - **`"shared"`**: Always shares the parent's run ID (useful for child tasks)
47
+
48
+ ### Accessing the run
49
+
50
+ Use `get_wandb_run()` to access the current W&B run:
51
+
52
+ ```python
53
+ from flyteplugins.wandb import get_wandb_run
54
+
55
+ run = get_wandb_run()
56
+ if run:
57
+ run.log({"metric": value})
58
+ ```
59
+
60
+ Returns `None` if not within a `@wandb_init` decorated task or if the current rank should not log (in distributed training).
61
+
62
+ ## Distributed training
63
+
64
+ The plugin automatically detects distributed training environments (PyTorch Elastic) and configures W&B appropriately.
65
+
66
+ ### Environment variables
67
+
68
+ Distributed training is detected via these environment variables (set by `torchrun`/`torch.distributed.elastic`):
69
+
70
+ | Variable | Description |
71
+ |----------|-------------|
72
+ | `RANK` | Global rank of the process |
73
+ | `WORLD_SIZE` | Total number of processes |
74
+ | `LOCAL_RANK` | Rank within the current node |
75
+ | `LOCAL_WORLD_SIZE` | Number of processes per node |
76
+ | `GROUP_RANK` | Worker/node index (0, 1, 2, ...) |
77
+
78
+ ### Run modes in distributed context
79
+
80
+ | Mode | Single-Node | Multi-Node |
81
+ |------|-------------|------------|
82
+ | `"auto"` | Only rank 0 logs → 1 run | Local rank 0 of each worker logs → N runs (1 per worker) |
83
+ | `"shared"` | All ranks log to 1 shared run | All ranks per worker log to shared run → N runs (1 per worker) |
84
+ | `"new"` | Each rank gets its own run (grouped) → N runs | Each rank gets its own run (grouped per worker) → N×GPUs runs |
85
+
86
+ ### Run ID patterns
87
+
88
+ | Scenario | Run ID Pattern |
89
+ |----------|----------------|
90
+ | Single-node auto/shared | `{run_name}-{action_name}` |
91
+ | Single-node new | `{run_name}-{action_name}-rank-{rank}` |
92
+ | Multi-node auto/shared | `{run_name}-{action_name}-worker-{worker_index}` |
93
+ | Multi-node new | `{run_name}-{action_name}-worker-{worker_index}-rank-{local_rank}` |
94
+
95
+ ### Example: Distributed training task
96
+
97
+ ```python
98
+ from flyteplugins.wandb import wandb_init, wandb_config, get_wandb_run, get_distributed_info
99
+ from flyteplugins.pytorch.task import Elastic
100
+
101
+ # Multi-node environment (2 nodes, 4 GPUs each)
102
+ multi_node_env = flyte.TaskEnvironment(
103
+ name="multi_node_env",
104
+ resources=flyte.Resources(gpu="V100:4", shm="auto"),
105
+ plugin_config=Elastic(nproc_per_node=4, nnodes=2),
106
+ secrets=flyte.Secret(key="wandb_api_key", as_env_var="WANDB_API_KEY"),
107
+ )
108
+
109
+ @wandb_init # run_mode="auto" by default
110
+ @multi_node_env.task
111
+ def train_multi_node():
112
+ import torch.distributed as dist
113
+ dist.init_process_group("nccl")
114
+
115
+ run = get_wandb_run() # Returns run for local_rank 0, None for others
116
+ dist_info = get_distributed_info()
117
+
118
+ # Training loop...
119
+ if run:
120
+ run.log({"loss": loss.item()})
121
+
122
+ dist.destroy_process_group()
123
+ ```
124
+
125
+ ### Shared mode for all-rank logging
126
+
127
+ Use `run_mode="shared"` when you want all ranks to log to the same W&B run:
128
+
129
+ ```python
130
+ @wandb_init(run_mode="shared")
131
+ @multi_node_env.task
132
+ def train_all_ranks_log():
133
+ run = get_wandb_run() # All ranks get a run object
134
+
135
+ # All ranks can log - W&B handles deduplication
136
+ run.log({"loss": loss.item(), "rank": dist.get_rank()})
137
+ ```
138
+
139
+ ### New mode for per-rank runs
140
+
141
+ Use `run_mode="new"` when you want each rank to have its own W&B run:
142
+
143
+ ```python
144
+ @wandb_init(run_mode="new")
145
+ @multi_node_env.task
146
+ def train_per_rank():
147
+ run = get_wandb_run() # Each rank gets its own run
148
+
149
+ # Runs are grouped in W&B UI for easy comparison
150
+ run.log({"loss": loss.item()})
151
+ ```
152
+
153
+ ## Configuration
154
+
155
+ ### wandb_config
156
+
157
+ Use `wandb_config()` to pass configuration that propagates to child tasks:
158
+
159
+ ```python
160
+ from flyteplugins.wandb import wandb_config
161
+
162
+ # With flyte.with_runcontext
163
+ run = flyte.with_runcontext(
164
+ custom_context=wandb_config(
165
+ project="my-project",
166
+ entity="my-team",
167
+ tags=["experiment-1"],
168
+ )
169
+ ).run(my_task)
170
+
171
+ # As a context manager
172
+ with wandb_config(project="override-project"):
173
+ await child_task()
174
+ ```
175
+
176
+ ### Decorator vs context config
177
+
178
+ - **Decorator arguments** (`@wandb_init(project=...)`) are available only within the current task and its traces
179
+ - **Context config** (`wandb_config(...)`) propagates to child tasks
180
+
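
A small sketch contrasting the two scopes; the environment, task, and project names are placeholders:

```python
import flyte
from flyteplugins.wandb import get_wandb_run, wandb_config, wandb_init

env = flyte.TaskEnvironment(name="scope_demo")

@wandb_init
@env.task
async def child() -> str | None:
    run = get_wandb_run()
    return run.project if run else None

@wandb_init(project="parent-project")  # decorator arg: visible only inside parent
@env.task
async def parent() -> str | None:
    # wandb_config() propagates to everything awaited inside the block,
    # so the child logs to "child-project" rather than inheriting the decorator arg.
    with wandb_config(project="child-project"):
        return await child()
```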
181
+ ## W&B links
182
+
183
+ Tasks decorated with `@wandb_init` or `@wandb_sweep` automatically get W&B links in the Flyte UI:
184
+
185
+ - For distributed training with multiple workers, each worker gets its own link
186
+ - Links point directly to the corresponding W&B runs or sweeps
187
+ - Project/entity are retrieved from decorator parameters or context configuration
188
+
189
+ ## Sweeps
190
+
191
+ Use `@wandb_sweep` to create W&B sweeps:
192
+
193
+ ```python
194
+ from flyteplugins.wandb import wandb_sweep, wandb_sweep_config, get_wandb_sweep_id
195
+
196
+ @wandb_init
197
+ def objective():
198
+ # Training logic - this runs for each sweep trial
199
+ run = get_wandb_run()
200
+ config = run.config # Sweep parameters are passed via run.config
201
+
202
+ # Train with sweep-suggested hyperparameters
203
+ model = train(lr=config.lr, batch_size=config.batch_size)
204
+ wandb.log({"loss": loss, "accuracy": accuracy})
205
+
206
+ @wandb_sweep
207
+ @env.task
208
+ def run_sweep():
209
+ sweep_id = get_wandb_sweep_id()
210
+
211
+ # Launch sweep agents to run trials
212
+ # count=10 means run 10 trials total
213
+ wandb.agent(sweep_id, function=objective, count=10)
214
+ ```
215
+
216
+ **Note:** A maximum of **20 sweep agents** can be launched at a time.
217
+
218
+ Configure sweeps with `wandb_sweep_config()`:
219
+
220
+ ```python
221
+ run = flyte.with_runcontext(
222
+ custom_context=wandb_sweep_config(
223
+ method="bayes",
224
+ metric={"name": "loss", "goal": "minimize"},
225
+ parameters={"lr": {"min": 1e-5, "max": 1e-2}},
226
+ project="my-project",
227
+ )
228
+ ).run(run_sweep)
229
+ ```
230
+
231
+ ## Downloading logs
232
+
233
+ Set `download_logs=True` to download W&B run/sweep logs after task completion. The download I/O is traced by Flyte's `@flyte.trace`, making the logs visible in the Flyte UI:
234
+
235
+ ```python
236
+ @wandb_init(download_logs=True)
237
+ @env.task
238
+ def train():
239
+ ...
240
+
241
+ # Or via context
242
+ wandb_config(download_logs=True)
243
+ wandb_sweep_config(download_logs=True)
244
+ ```
245
+
246
+ The downloaded logs include all files uploaded to W&B during the run (metrics, artifacts, etc.).
247
+
248
+ ## API reference
249
+
250
+ ### Functions
251
+
252
+ - `get_wandb_run()` - Get the current W&B run object (or `None`)
253
+ - `get_wandb_sweep_id()` - Get the current sweep ID (or `None`)
254
+ - `get_distributed_info()` - Get distributed training info dict (or `None`)
255
+ - `wandb_config(...)` - Create W&B configuration for context
256
+ - `wandb_sweep_config(...)` - Create sweep configuration for context
257
+
258
+ ### Decorators
259
+
260
+ - `@wandb_init` - Initialize W&B for a task or function
261
+ - `@wandb_sweep` - Create a W&B sweep for a task
262
+
263
+ ### Links
264
+
265
+ - `Wandb` - Link class for W&B runs
266
+ - `WandbSweep` - Link class for W&B sweeps
@@ -0,0 +1,8 @@
1
+ flyteplugins/wandb/__init__.py,sha256=kfCwLEFT2foHYaPrqtpOMnV_qwVM7qw9IPFC2lJUhBM,18704
2
+ flyteplugins/wandb/_context.py,sha256=7-MnHpoh4OdjKWI6SWYljLqa2cS-wDA6PmDVGudyDHY,13073
3
+ flyteplugins/wandb/_decorator.py,sha256=ge3ZT3AxFNDWtOCzhoRvJsM2WPjzQmX4SV3rhu1vHdc,22749
4
+ flyteplugins/wandb/_link.py,sha256=dQfH9BoI0eMwm-hf_rh9aFZ5bgYZWDFCQ6jU5TgA214,7027
5
+ flyteplugins_wandb-2.0.0b54.dist-info/METADATA,sha256=RRYiuP4fX4giv7pNQhsVzinkwDJ_tvEFG0lDxhld23M,7710
6
+ flyteplugins_wandb-2.0.0b54.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
7
+ flyteplugins_wandb-2.0.0b54.dist-info/top_level.txt,sha256=cgd779rPu9EsvdtuYgUxNHHgElaQvPn74KhB5XSeMBE,13
8
+ flyteplugins_wandb-2.0.0b54.dist-info/RECORD,,
@@ -1,34 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: flyteplugins-wandb
3
- Version: 2.0.0b52
4
- Summary: Weights & Biases plugin for Flyte
5
- Author: Flyte Contributors
6
- Requires-Python: >=3.10
7
- Description-Content-Type: text/markdown
8
- Requires-Dist: wandb
9
- Requires-Dist: flyte
10
-
11
- # Weights & Biases Plugin
12
-
13
- - Tasks decorated with `@wandb_init` or `@wandb_sweep` automatically get W&B links in the Flyte UI that point directly to the corresponding W&B runs or sweeps. Links retrieve project/entity from decorator parameters or context configuration (from `with_runcontext`).
14
- - `@wandb_init` and `@wandb_sweep` must be the **outermost decorators** (applied after `@env.task`). For example:
15
-
16
- ```python
17
- @wandb_init
18
- @env.task
19
- def my_task():
20
- ...
21
- ```
22
-
23
- - By default (`run_mode="auto"`), child tasks automatically reuse their parent's W&B run if one exists, or create a new run if they're top-level tasks. You can override this with `run_mode="new"` (always create new) or `run_mode="shared"` (always reuse parent).
24
- - `@wandb_init` should be applied to tasks (not traces). Traces can access the parent task's W&B run via `get_wandb_run()`. `@wandb_init` can also be applied to regular Python functions for use in `wandb.agent()` sweep callbacks.
25
- - The wandb run can be accessed via `get_wandb_run()`, which returns the run object or `None` if not within a `@wandb_init` decorated task.
26
- - When using `run_mode="shared"` or `run_mode="auto"` (with a parent run), child tasks reuse the parent's run ID. Configuration from `wandb_config()` is merged with decorator parameters.
27
- - `wandb_config` can be used to pass configuration to tasks enclosed within the context manager and can also be provided via `with_runcontext`.
28
- - When the context manager exits, the configuration falls back to the parent task's config.
29
- - Arguments passed to `wandb_init` decorator are available only within the current task and traces and are not propagated to child tasks (use `wandb_config` for child tasks).
30
- - At most 20 sweep agents can be launched at a time: https://docs.wandb.ai/models/sweeps/existing-project#3-launch-agents.
31
- - `@wandb_sweep` creates a W&B sweep and adds a sweep link to the decorated task. The sweep ID is available via `get_wandb_sweep_id()`. For the parent task that creates the sweep, the link points to the project's sweeps list page. For child tasks, the link points to the specific sweep (they inherit the `sweep_id` from the parent's context).
32
- - The objective function passed to `wandb.agent()` should be a vanilla Python function decorated with `@wandb_init` to initialize the run. You can access the run with `wandb.run` since the Flyte context won't be available during the objective function call.
33
- - Set `download_logs=True` in `wandb_config` or `@wandb_init` to download W&B run logs after task completion. The I/O of this download functionality is traced by Flyte's `@flyte.trace`.
34
- - Set `download_logs=True` in `wandb_sweep_config` or `@wandb_sweep` to download W&B sweep logs after task completion. The I/O of this download functionality is traced by Flyte's `@flyte.trace`.
@@ -1,8 +0,0 @@
1
- flyteplugins/wandb/__init__.py,sha256=D5gqDOIy6ePcE2tcbNVsp9ZzxZKC6Qmd-6eHxNX3L88,15881
2
- flyteplugins/wandb/_context.py,sha256=va_TlRhSW-QBbHhvKmIAggsLw5VFAq4gXMIu7n5ZKSA,12746
3
- flyteplugins/wandb/_decorator.py,sha256=HenEVJI7kmDMQdHo6jDy3vXvjxT89CCYRBCR2CuGE3s,14785
4
- flyteplugins/wandb/_link.py,sha256=tEzfW06GPsVMECGAnEhwNzCI2h0d0UnJHMqso6t8Pnw,5319
5
- flyteplugins_wandb-2.0.0b52.dist-info/METADATA,sha256=oOQOpcjQa99Iy-bhYuop9KtFktttFzNzNYj4yQvUjBc,3058
6
- flyteplugins_wandb-2.0.0b52.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
7
- flyteplugins_wandb-2.0.0b52.dist-info/top_level.txt,sha256=cgd779rPu9EsvdtuYgUxNHHgElaQvPn74KhB5XSeMBE,13
8
- flyteplugins_wandb-2.0.0b52.dist-info/RECORD,,