PyPI - dr-wandb - Versions diffs - 0.1.0__py3-none-any.whl - Mend

dr-wandb 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dr-wandb might be problematic. Click here for more details.

Files changed (15) hide show

dr_wandb/__init__.py +2 -0
dr_wandb/cli/__init__.py +0 -0
dr_wandb/cli/download.py +128 -0
dr_wandb/constants.py +20 -0
dr_wandb/downloader.py +118 -0
dr_wandb/history_entry_record.py +62 -0
dr_wandb/py.typed +0 -0
dr_wandb/run_record.py +115 -0
dr_wandb/store.py +193 -0
dr_wandb/utils.py +57 -0
dr_wandb-0.1.0.dist-info/METADATA +123 -0
dr_wandb-0.1.0.dist-info/RECORD +15 -0
dr_wandb-0.1.0.dist-info/WHEEL +4 -0
dr_wandb-0.1.0.dist-info/entry_points.txt +2 -0
dr_wandb-0.1.0.dist-info/licenses/LICENSE +21 -0

dr_wandb/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ def hello() -> str:
2	+ return "Hello from dr-wandb!"

dr_wandb/cli/__init__.py ADDED Viewed

File without changes

dr_wandb/cli/download.py ADDED Viewed

@@ -0,0 +1,128 @@
+import logging
+from pathlib import Path
+import click
+from pydantic_settings import BaseSettings, SettingsConfigDict
+from dr_wandb.downloader import Downloader
+from dr_wandb.store import ProjectStore
+class ProjDownloadSettings(BaseSettings):
+    """Settings for the project download CLI.
+    Values are loaded by pydantic-settings from the environment and from a
+    ``.env`` file, using the ``DR_WANDB_`` prefix (e.g. ``DR_WANDB_ENTITY``).
+    """
+    model_config = SettingsConfigDict(env_file=".env", env_prefix="DR_WANDB_")
+    # entity/project are optional at load time; resolve_config() rejects missing values.
+    entity: str | None = None
+    project: str | None = None
+    database_url: str = "postgresql+psycopg2://localhost/wandb"
+    # Default is a "data" directory under the parent of the directory containing this file.
+    output_dir: Path = Path(__file__).parent.parent / "data"
+    # Forwarded to Downloader(runs_per_page=...) by execute_download().
+    runs_per_page: int = 500
+def setup_logging(level: str = "INFO") -> None:
+    logging.basicConfig(
+        level=getattr(logging, level.upper()),
+        format="%(asctime)s - %(levelname)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+def validate_settings(entity: str | None, project: str | None) -> None:
+    if not entity:
+        raise click.ClickException(
+            "--entity is required, or set DR_WANDB_ENTITY in .env"
+        )
+    if not project:
+        raise click.ClickException(
+            "--project is required, or set DR_WANDB_PROJECT in .env"
+        )
+def resolve_config(
+    entity: str | None,
+    project: str | None,
+    db_url: str | None,
+    output_dir: str | None,
+) -> ProjDownloadSettings:
+    cfg = ProjDownloadSettings()
+    final_entity = entity if entity else cfg.entity
+    final_project = project if project else cfg.project
+    final_db_url = db_url if db_url else cfg.database_url
+    final_output_dir = output_dir if output_dir else cfg.output_dir
+    validate_settings(final_entity, final_project)
+    return ProjDownloadSettings(
+        entity=final_entity,
+        project=final_project,
+        database_url=final_db_url,
+        output_dir=final_output_dir,
+        runs_per_page=cfg.runs_per_page,
+    )
+def execute_download(
+    cfg: ProjDownloadSettings, runs_only: bool, force_refresh: bool
+) -> None:
+    store = ProjectStore(
+        cfg.database_url,
+        output_dir=cfg.output_dir,
+    )
+    downloader = Downloader(store, runs_per_page=cfg.runs_per_page)
+    click.echo(">> Beginning download:")
+    stats = downloader.download_project(
+        entity=cfg.entity,
+        project=cfg.project,
+        runs_only=runs_only,
+        force_refresh=force_refresh,
+    )
+    click.echo(str(stats))
+    return downloader
+# CLI entry point: resolves settings, downloads the project into the database,
+# then exports the stored data to parquet files.
+# NOTE: documented with comments rather than a docstring, because click would
+# surface a docstring as the command's --help text.
+@click.command()
+@click.option(
+    "--entity",
+    envvar="DR_WANDB_ENTITY",
+    help="WandB entity (username or team name)",
+)
+@click.option("--project", envvar="DR_WANDB_PROJECT", help="WandB project name")
+@click.option(
+    "--runs-only",
+    is_flag=True,
+    help="Only download runs, don't download history",
+)
+@click.option(
+    "--force-refresh",
+    is_flag=True,
+    help="Force refresh, download all data",
+)
+@click.option(
+    "--db-url",
+    envvar="DR_WANDB_DATABASE_URL",
+    help="PostgreSQL connection string",
+)
+@click.option(
+    "--output-dir",
+    envvar="DR_WANDB_OUTPUT_DIR",
+    help="Output directory",
+)
+def download_project(
+    entity: str | None,
+    project: str | None,
+    runs_only: bool,
+    force_refresh: bool,
+    db_url: str | None,
+    output_dir: str | None,
+) -> None:
+    setup_logging()
+    click.echo("\n:: Beginning Dr. Wandb Project Downloading Tool ::\n")
+    # Truthy CLI values win over environment/.env settings; raises a
+    # ClickException if entity or project cannot be resolved.
+    cfg = resolve_config(entity, project, db_url, output_dir)
+    click.echo(f">> Downloading project {cfg.entity}/{cfg.project}")
+    click.echo(f">> Database: {cfg.database_url}")
+    click.echo(f">> Output directory: {cfg.output_dir}")
+    click.echo(f">> Force refresh: {force_refresh} Runs only: {runs_only}")
+    click.echo()
+    # execute_download hands back the Downloader so the export reuses its store.
+    downloader = execute_download(cfg, runs_only, force_refresh)
+    downloader.write_downloaded_to_parquet()
+# Allow running this module directly as a script.
+if __name__ == "__main__":
+    download_project()

dr_wandb/constants.py ADDED Viewed

@@ -0,0 +1,20 @@
+from collections.abc import Callable
+from typing import Literal
+from sqlalchemy.orm import DeclarativeBase
+class Base(DeclarativeBase):
+    """Declarative base class for the package's SQLAlchemy ORM models."""
+    pass
+# Largest signed 32-bit integer (2_147_483_647).
+MAX_INT = 2**31 - 1
+# Runtime list of filter keys; mirrors the FilterField literal below.
+SUPPORTED_FILTER_FIELDS = ["project", "entity", "state", "run_ids"]
+type FilterField = Literal["project", "entity", "state", "run_ids"]
+# Runtime list of run states; mirrors the RunState literal below.
+WANDB_RUN_STATES = ["finished", "running", "crashed", "failed", "killed"]
+type RunState = Literal["finished", "running", "crashed", "failed", "killed"]
+type RunId = str
+# Progress hook signature: (int, int, str) -> None.
+type ProgressCallback = Callable[[int, int, str], None]

dr_wandb/downloader.py ADDED Viewed

@@ -0,0 +1,118 @@
+from __future__ import annotations
+import logging
+from dataclasses import dataclass
+import wandb
+from dr_wandb.constants import ProgressCallback
+from dr_wandb.store import ProjectStore
+from dr_wandb.utils import default_progress_callback, select_updated_runs
+@dataclass
+class DownloaderStats:
+    num_wandb_runs: int = 0
+    num_stored_runs: int = 0
+    num_new_runs: int = 0
+    num_updated_runs: int = 0
+    def __str__(self) -> str:
+        return "\n".join(
+            [
+                "",
+                ":: Downloader Stats ::",
+                f" - # WandB runs: {self.num_wandb_runs:,}",
+                f" - # Stored runs: {self.num_stored_runs:,}",
+                f" - # New runs: {self.num_new_runs:,}",
+                f" - # Updated runs: {self.num_updated_runs:,}",
+                "",
+            ]
+        )
+class Downloader:
+    def __init__(
+        self,
+        store: ProjectStore,
+        runs_per_page: int = 500,
+    ) -> None:
+        self.store = store
+        self._api: wandb.Api | None = None
+        self.runs_per_page = runs_per_page
+        self.progress_callback: ProgressCallback = default_progress_callback
+    @property
+    def api(self) -> wandb.Api:
+        if self._api is None:
+            try:
+                self._api = wandb.Api()
+            except wandb.errors.UsageError as e:
+                if "api_key not configured" in str(e):
+                    raise RuntimeError(
+                        "WandB API key not configured. "
+                        "Please run 'wandb login' or set WANDB_API_KEY env var"
+                    ) from e
+                raise
+        return self._api
+    def set_progress_callback(self, progress_callback: ProgressCallback) -> None:
+        self.progress_callback = progress_callback
+    def get_all_runs(self, entity: str, project: str) -> list[wandb.apis.public.Run]:
+        return list(self.api.runs(f"{entity}/{project}", per_page=self.runs_per_page))
+    def download_runs(
+        self,
+        entity: str,
+        project: str,
+        force_refresh: bool = False,
+        with_history: bool = False,
+    ) -> DownloaderStats:
+        wandb_runs = self.get_all_runs(entity, project)
+        stored_states = self.store.get_existing_run_states(
+            {"entity": entity, "project": project}
+        )
+        runs_to_download = (
+            wandb_runs
+            if force_refresh
+            else select_updated_runs(wandb_runs, stored_states)
+        )
+        num_new_runs = len([r for r in runs_to_download if r.id not in stored_states])
+        stats = DownloaderStats(
+            num_wandb_runs=len(wandb_runs),
+            num_stored_runs=len(stored_states),
+            num_new_runs=num_new_runs,
+            num_updated_runs=len(runs_to_download) - num_new_runs,
+        )
+        if len(runs_to_download) == 0:
+            logging.info(">> No runs to download")
+            return stats
+        if not with_history:
+            logging.info(">> Runs only mode, bulk downloading runs")
+            self.store.store_runs(runs_to_download)
+            return stats
+        logging.info(">> Downloading runs and history data together")
+        for i, run in enumerate(runs_to_download):
+            self.store.store_run_and_history(run, list(run.scan_history()))
+            self.progress_callback(i + 1, len(runs_to_download), run.name)
+        return stats
+    def download_project(
+        self,
+        entity: str,
+        project: str,
+        runs_only: bool = False,
+        force_refresh: bool = False,
+    ) -> DownloaderStats:
+        stats = self.download_runs(
+            entity, project, force_refresh, with_history=not runs_only
+        )
+        logging.info(">> Download completed")
+        return stats
+    def write_downloaded_to_parquet(self) -> None:
+        logging.info(">> Beginning export to parquet")
+        self.store.export_to_parquet()

dr_wandb/history_entry_record.py ADDED Viewed

@@ -0,0 +1,62 @@
+from __future__ import annotations
+from datetime import datetime
+from typing import Any
+from sqlalchemy import Select, select
+from sqlalchemy.dialects.postgresql import JSONB
+from sqlalchemy.orm import Mapped, mapped_column
+from dr_wandb.constants import Base, RunId
+from dr_wandb.utils import extract_as_datetime
+type HistoryEntry = dict[str, Any]
+class HistoryEntryRecord(Base):
+    """ORM row for a single history entry of a run (table ``wandb_history``)."""
+    __tablename__ = "wandb_history"
+    id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
+    run_id: Mapped[str]
+    step: Mapped[int | None]
+    timestamp: Mapped[datetime | None]
+    runtime: Mapped[int | None]
+    wandb_metadata: Mapped[dict[str, Any]] = mapped_column(JSONB)
+    metrics: Mapped[dict[str, Any]] = mapped_column(JSONB)
+    @classmethod
+    def from_wandb_history(
+        cls, history_entry: HistoryEntry, run_id: str
+    ) -> HistoryEntryRecord:
+        """Build a record from one history dict.
+        The ``_step``, ``_timestamp``, ``_runtime`` and ``_wandb`` keys fill the
+        dedicated columns; every key that does not start with ``_`` is stored
+        in ``metrics``.
+        """
+        return cls(
+            run_id=run_id,
+            step=history_entry.get("_step"),
+            timestamp=extract_as_datetime(history_entry, "_timestamp"),
+            runtime=history_entry.get("_runtime"),
+            wandb_metadata=history_entry.get("_wandb", {}),
+            metrics={k: v for k, v in history_entry.items() if not k.startswith("_")},
+        )
+    @classmethod
+    def standard_fields(cls) -> list[str]:
+        """Return the column names other than ``wandb_metadata`` and ``metrics``."""
+        return [
+            col.name
+            for col in cls.__table__.columns
+            if col.name not in ["wandb_metadata", "metrics"]
+        ]
+    def to_dict(self, include_metadata: bool = False) -> dict[str, Any]:
+        """Flatten the record: standard columns plus one key per metric.
+        ``wandb_metadata`` is added as a single key only when
+        ``include_metadata`` is true.
+        """
+        # Later entries win: a metric whose name equals a standard column
+        # (e.g. "step") overwrites that column's value in the result.
+        # NOTE(review): confirm this precedence is intended.
+        return {
+            **{field: getattr(self, field) for field in self.standard_fields()},
+            **self.metrics,
+            **({"wandb_metadata": self.wandb_metadata} if include_metadata else {}),
+        }
+def build_history_query(
+    run_ids: list[RunId] | None = None,
+) -> Select[HistoryEntryRecord]:
+    query = select(HistoryEntryRecord)
+    if run_ids is not None:
+        query = query.where(HistoryEntryRecord.run_id.in_(run_ids))
+    return query

dr_wandb/py.typed ADDED Viewed

File without changes

dr_wandb/run_record.py ADDED Viewed

@@ -0,0 +1,115 @@
+from __future__ import annotations
+from datetime import datetime
+from typing import Any, Literal
+import wandb
+from sqlalchemy import Select, select
+from sqlalchemy.dialects.postgresql import JSONB
+from sqlalchemy.orm import Mapped, mapped_column
+from dr_wandb.constants import (
+    SUPPORTED_FILTER_FIELDS,
+    Base,
+    FilterField,
+    RunId,
+    RunState,
+)
+# Names of the JSONB columns of RunRecord; these are excluded from
+# RunRecord.standard_fields() and selectable through to_dict(include=...).
+RUN_DATA_COMPONENTS = [
+    "config",
+    "summary",
+    "wandb_metadata",
+    "system_metrics",
+    "system_attrs",
+    "sweep_info",
+]
+# Sentinel value accepted by to_dict(include=...) to request every component.
+type All = Literal["all"]
+# Static-typing mirror of RUN_DATA_COMPONENTS; keep both in sync.
+type RunDataComponent = Literal[
+    "config",
+    "summary",
+    "wandb_metadata",
+    "system_metrics",
+    "system_attrs",
+    "sweep_info",
+]
+class RunRecord(Base):
+    """ORM row for one WandB run (table ``wandb_runs``), keyed by ``run_id``."""
+    __tablename__ = "wandb_runs"
+    run_id: Mapped[RunId] = mapped_column(primary_key=True)
+    run_name: Mapped[str]
+    state: Mapped[RunState]
+    project: Mapped[str]
+    entity: Mapped[str]
+    created_at: Mapped[datetime | None]
+    # The six JSONB columns below are the entries of RUN_DATA_COMPONENTS.
+    config: Mapped[dict[str, Any]] = mapped_column(JSONB)
+    summary: Mapped[dict[str, Any]] = mapped_column(JSONB)
+    wandb_metadata: Mapped[dict[str, Any]] = mapped_column(JSONB)
+    system_metrics: Mapped[dict[str, Any]] = mapped_column(JSONB)
+    system_attrs: Mapped[dict[str, Any]] = mapped_column(JSONB)
+    sweep_info: Mapped[dict[str, Any]] = mapped_column(JSONB)
+    @classmethod
+    def standard_fields(cls) -> list[str]:
+        """Return the column names that are not in ``RUN_DATA_COMPONENTS``."""
+        return [
+            col.name
+            for col in cls.__table__.columns
+            if col.name not in RUN_DATA_COMPONENTS
+        ]
+    @classmethod
+    def from_wandb_run(cls, wandb_run: wandb.apis.public.Run) -> RunRecord:
+        """Build a record from a WandB run object.
+        Falsy ``summary``, ``metadata`` and ``system_metrics`` values are stored
+        as empty dicts; sweep attributes missing on the run are stored as ``None``.
+        """
+        return cls(
+            run_id=wandb_run.id,
+            run_name=wandb_run.name,
+            state=wandb_run.state,
+            project=wandb_run.project,
+            entity=wandb_run.entity,
+            # NOTE(review): passed through unchanged; confirm the value's type
+            # matches the datetime column.
+            created_at=wandb_run.created_at,
+            config=dict(wandb_run.config),
+            # Reads private attributes of the wandb objects (hence the SLF001 waivers).
+            summary=dict(wandb_run.summary._json_dict) if wandb_run.summary else {},  # noqa: SLF001
+            wandb_metadata=wandb_run.metadata or {},
+            system_metrics=wandb_run.system_metrics or {},
+            system_attrs=dict(wandb_run._attrs),  # noqa: SLF001
+            sweep_info={
+                "sweep_id": getattr(wandb_run, "sweep_id", None),
+                "sweep_url": getattr(wandb_run, "sweep_url", None),
+            },
+        )
+    def update_from_wandb_run(self, wandb_run: wandb.apis.public.Run) -> None:
+        """Overwrite every column except ``run_id`` with values from ``wandb_run``."""
+        # Build a throwaway record so the field extraction lives in one place.
+        updated = self.__class__.from_wandb_run(wandb_run)
+        for col in self.__table__.columns:
+            if col.name != "run_id":
+                setattr(self, col.name, getattr(updated, col.name))
+    def to_dict(
+        self, include: list[RunDataComponent] | All | None = None
+    ) -> dict[str, Any]:
+        """Return the standard fields plus the requested data components.
+        Args:
+            include: ``None`` or an empty list for standard fields only, a list
+                of component names, or ``"all"`` for every component.
+        Raises:
+            AssertionError: if a requested name is not in ``RUN_DATA_COMPONENTS``.
+        """
+        include = include or []
+        if include == "all":
+            include = RUN_DATA_COMPONENTS
+        assert all(field in RUN_DATA_COMPONENTS for field in include)
+        data = {k: getattr(self, k) for k in self.standard_fields()}
+        for field in include:
+            data[field] = getattr(self, field)
+        return data
+def build_run_query(kwargs: dict[FilterField, Any] | None = None) -> Select[RunRecord]:
+    query = select(RunRecord)
+    if kwargs is not None:
+        assert all(k in SUPPORTED_FILTER_FIELDS for k in kwargs)
+        assert all(v is not None for v in kwargs.values())
+        if "project" in kwargs:
+            query = query.where(RunRecord.project == kwargs["project"])
+        if "entity" in kwargs:
+            query = query.where(RunRecord.entity == kwargs["entity"])
+        if "state" in kwargs:
+            query = query.where(RunRecord.state == kwargs["state"])
+        if "run_ids" in kwargs:
+            query = query.where(RunRecord.run_id.in_(kwargs["run_ids"]))
+    return query

dr_wandb/store.py ADDED Viewed

@@ -0,0 +1,193 @@
+from __future__ import annotations
+import logging
+from pathlib import Path
+from typing import Any
+from urllib.parse import urlparse
+import pandas as pd
+import wandb
+from sqlalchemy import Engine, create_engine, text
+from sqlalchemy.exc import OperationalError
+from sqlalchemy.orm import Session
+from dr_wandb.constants import (
+    Base,
+    FilterField,
+    RunId,
+    RunState,
+)
+from dr_wandb.history_entry_record import (
+    HistoryEntry,
+    HistoryEntryRecord,
+    build_history_query,
+)
+from dr_wandb.run_record import (
+    RUN_DATA_COMPONENTS,
+    All,
+    RunDataComponent,
+    RunRecord,
+    build_run_query,
+)
+from dr_wandb.utils import safe_convert_for_parquet
+# Fallback export directory: "data" under the parent of the directory containing this file.
+DEFAULT_OUTPUT_DIR = Path(__file__).parent.parent / "data"
+# Base file names (without ".parquet") used by ProjectStore.export_to_parquet.
+DEFAULT_RUNS_FILENAME = "runs_metadata"
+DEFAULT_HISTORY_FILENAME = "runs_history"
+# All history entries of one run.
+type History = list[HistoryEntry]
+def delete_history_for_runs(session: Session, run_ids: list[RunId]) -> None:
+    if not run_ids:
+        return
+    session.execute(
+        text("DELETE FROM wandb_history WHERE run_id = ANY(:run_ids)"),
+        {"run_ids": run_ids},
+    )
+def save_update_run(session: Session, run: wandb.apis.public.Run) -> None:
+    existing_run = session.get(RunRecord, run.id)
+    if existing_run:
+        existing_run.update_from_wandb_run(run)
+    else:
+        session.add(RunRecord.from_wandb_run(run))
+def delete_add_history(session: Session, run_id: RunId, history: History) -> None:
+    delete_history_for_runs(session, [run_id])
+    for history_entry in history:
+        session.add(HistoryEntryRecord.from_wandb_history(history_entry, run_id))
+def ensure_database_exists(database_url: str) -> str:
+    parsed = urlparse(database_url)
+    db_name = parsed.path.lstrip("/")
+    postgres_url = database_url.replace(f"/{db_name}", "/postgres")
+    try:
+        test_engine = create_engine(database_url)
+        with test_engine.connect():
+            pass
+        return database_url
+    except OperationalError as e:
+        if "does not exist" in str(e):
+            logging.info(f"Database '{db_name}' doesn't exist, creating it...")
+            postgres_engine = create_engine(postgres_url)
+            with postgres_engine.connect() as conn:
+                conn.execute(text("COMMIT"))
+                conn.execute(text(f'CREATE DATABASE "{db_name}"'))
+            logging.info(f"Created database '{db_name}'")
+            return database_url
+        else:
+            raise
+class ProjectStore:
+    def __init__(self, connection_string: str, output_dir: str | None = None) -> None:
+        connection_string = ensure_database_exists(connection_string)
+        self.engine: Engine = create_engine(connection_string)
+        self.create_tables()
+        self.output_dir = output_dir if output_dir is not None else DEFAULT_OUTPUT_DIR
+    def create_tables(self) -> None:
+        Base.metadata.create_all(self.engine)
+    def store_run(self, run: wandb.apis.public.Run) -> None:
+        with Session(self.engine) as session:
+            save_update_run(session, run)
+            session.commit()
+    def store_runs(self, runs: list[wandb.apis.public.Run]) -> None:
+        with Session(self.engine) as session:
+            for run in runs:
+                save_update_run(session, run)
+            session.commit()
+    def store_history(self, run_id: RunId, history: History) -> None:
+        with Session(self.engine) as session:
+            delete_add_history(session, run_id, history)
+            session.commit()
+    def store_histories(
+        self,
+        runs: list[wandb.apis.public.Run],
+        histories: list[History],
+    ) -> None:
+        assert len(runs) == len(histories)
+        run_ids = [run.id for run in runs]
+        with Session(self.engine) as session:
+            delete_history_for_runs(session, run_ids)
+            for run_id, history in zip(run_ids, histories, strict=False):
+                for history_entry in history:
+                    session.add(
+                        HistoryEntryRecord.from_wandb_history(history_entry, run_id)
+                    )
+            session.commit()
+    def store_run_and_history(
+        self, run: wandb.apis.public.Run, history: History
+    ) -> None:
+        with Session(self.engine) as session:
+            delete_add_history(session, run.id, history)
+            save_update_run(session, run)
+            session.commit()
+    def get_runs_df(
+        self,
+        include: list[RunDataComponent] | All | None = None,
+        kwargs: dict[FilterField, Any] | None = None,
+    ) -> pd.DataFrame:
+        with Session(self.engine) as session:
+            result = session.execute(build_run_query(kwargs=kwargs))
+            return pd.DataFrame(
+                [run.to_dict(include=include) for run in result.scalars().all()]
+            )
+    def get_history_df(
+        self,
+        include_metadata: bool = False,
+        run_ids: list[RunId] | None = None,
+    ) -> pd.DataFrame:
+        with Session(self.engine) as session:
+            result = session.execute(build_history_query(run_ids=run_ids))
+            return pd.DataFrame(
+                [
+                    history.to_dict(include_metadata=include_metadata)
+                    for history in result.scalars().all()
+                ]
+            )
+    def get_existing_run_states(
+        self, kwargs: dict[FilterField, Any] | None = None
+    ) -> dict[RunId, RunState]:
+        with Session(self.engine) as session:
+            result = session.execute(build_run_query(kwargs=kwargs))
+            return {run.run_id: run.state for run in result.scalars().all()}
+    def export_to_parquet(
+        self,
+        runs_filename: str = DEFAULT_RUNS_FILENAME,
+        history_filename: str = DEFAULT_HISTORY_FILENAME,
+    ) -> None:
+        self.output_dir.mkdir(exist_ok=True)
+        logging.info(f">> Using data output directory: {self.output_dir}")
+        history_df = self.get_history_df()
+        if not history_df.empty:
+            history_path = self.output_dir / f"{history_filename}.parquet"
+            history_df = safe_convert_for_parquet(history_df)
+            history_df.to_parquet(history_path, engine="pyarrow", index=False)
+            logging.info(f">> Wrote history_df to {history_path}")
+        for include_type in RUN_DATA_COMPONENTS:
+            runs_df = self.get_runs_df(include=[include_type])
+            if not runs_df.empty:
+                runs_path = self.output_dir / f"{runs_filename}_{include_type}.parquet"
+                runs_df = safe_convert_for_parquet(runs_df)
+                runs_df.to_parquet(runs_path, engine="pyarrow", index=False)
+                logging.info(f">> Wrote runs_df with {include_type} to {runs_path}")
+        runs_df_full = self.get_runs_df(include="all")
+        if not runs_df_full.empty:
+            runs_path = self.output_dir / f"{runs_filename}.parquet"
+            runs_df_full = safe_convert_for_parquet(runs_df_full)
+            runs_df_full.to_parquet(runs_path, engine="pyarrow", index=False)
+            logging.info(f">> Wrote runs_df with all parts to {runs_path}")

dr_wandb/utils.py ADDED Viewed

@@ -0,0 +1,57 @@
+import json
+import logging
+from datetime import datetime
+from typing import Any
+import pandas as pd
+import wandb
+from dr_wandb.constants import MAX_INT, RunId, RunState
+def extract_as_datetime(data: dict[str, Any], key: str) -> datetime | None:
+    timestamp = data.get(key)
+    return datetime.fromtimestamp(timestamp) if timestamp is not None else None
+def select_updated_runs(
+    all_runs: list[wandb.apis.public.Run],
+    existing_run_states: dict[RunId, RunState],
+) -> list[wandb.apis.public.Run]:
+    return [
+        run
+        for run in all_runs
+        if run.id not in existing_run_states or existing_run_states[run.id] == "running"
+    ]
+def default_progress_callback(run_index: int, total_runs: int, message: str) -> None:
+    logging.info(f">> {run_index}/{total_runs}: {message}")
+def convert_large_ints_in_data(data: Any, max_int: int = MAX_INT) -> Any:
+    if isinstance(data, dict):
+        return {k: convert_large_ints_in_data(v, max_int) for k, v in data.items()}
+    elif isinstance(data, list):
+        return [convert_large_ints_in_data(item, max_int) for item in data]
+    elif isinstance(data, int) and abs(data) > max_int:
+        return float(data)
+    return data
+def safe_convert_for_parquet(df: pd.DataFrame) -> pd.DataFrame:
+    df = df.copy()
+    for col in df.columns:
+        if df[col].dtype == "int64":
+            mask = df[col].abs() > MAX_INT
+            if mask.any():
+                df[col] = df[col].astype("float64")
+        elif df[col].dtype == "object":
+            df[col] = df[col].apply(
+                lambda x: json.dumps(convert_large_ints_in_data(x), default=str)
+                if isinstance(x, dict | list)
+                else str(x)
+                if x is not None
+                else None
+            )
+    return df

dr_wandb-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,123 @@
+Metadata-Version: 2.4
+Name: dr-wandb
+Version: 0.1.0
+Summary: Interact with wandb from python
+Author-email: Danielle Rothermel <danielle.rothermel@gmail.com>
+License-File: LICENSE
+Requires-Python: >=3.12
+Requires-Dist: pandas>=2.3.2
+Requires-Dist: psycopg2>=2.9.10
+Requires-Dist: pyarrow>=21.0.0
+Requires-Dist: pydantic-settings>=2.10.1
+Requires-Dist: sqlalchemy>=2.0.43
+Requires-Dist: wandb>=0.21.4
+Description-Content-Type: text/markdown
+# dr_wandb
+A command-line utility for downloading and archiving Weights & Biases experiment data to local storage formats optimized for offline analysis. Stores to PostgreSQL db + Parquet files, supports incremental updates and selective data retrieval.
+## Installation
+```bash
+uv add dr_wandb
+```
+### Prerequisites
+- Python 3.12 or higher
+- PostgreSQL database server
+- Weights & Biases account with API access
+- PyArrow for Parquet file operations
+### Authentication
+Configure Weights & Biases authentication using one of these methods:
+```bash
+wandb login
+```
+Or set the API key as an environment variable:
+```bash
+export WANDB_API_KEY=your_api_key_here
+```
+## Basic Usage
+Download all runs from a Weights & Biases project:
+```bash
+wandb-download --entity your_entity --project your_project
+Options:
+  --entity TEXT        WandB entity (username or team name)
+  --project TEXT       WandB project name
+  --runs-only          Download only run metadata, skip training history
+  --force-refresh      Download all data, ignoring existing records
+  --db-url TEXT        PostgreSQL connection string
+  --output-dir TEXT    Directory for exported Parquet files
+  --help              Show help message and exit
+```
+The tool creates a PostgreSQL database, downloads experiment data, and exports Parquet files to the configured output directory. The tool tracks existing data and downloads only new or updated runs by default. A run is considered for update if:
+- It does not exist in the local database
+- Its state is "running" (indicating potential new data)
+Use `--force-refresh` to download all runs regardless of existing data.
+### Environment Variables
+The tool reads configuration from environment variables with the `DR_WANDB_` prefix and supports `.env` files:
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `DR_WANDB_ENTITY` | Weights & Biases entity name | None |
+| `DR_WANDB_PROJECT` | Weights & Biases project name | None |
+| `DR_WANDB_DATABASE_URL` | PostgreSQL connection string | `postgresql+psycopg2://localhost/wandb` |
+| `DR_WANDB_OUTPUT_DIR` | Directory for exported files | `data/` inside the installed `dr_wandb` package directory |
+### Database Configuration
+The PostgreSQL connection string follows the standard format:
+```
+postgresql+psycopg2://username:password@host:port/database_name
+```
+If the specified database does not exist, the tool will attempt to create it automatically.
+## Data Schema
+The tool generates the following files in the output directory:
+- `runs_metadata.parquet` - Complete run metadata including configurations, summaries, and system information
+- `runs_history.parquet` - Training metrics and logged values over time
+- `runs_metadata_{component}.parquet` - Component-specific files for config, summary, wandb_metadata, system_metrics, system_attrs, and sweep_info
+**Run Records**
+- **run_id**: Unique identifier for the experiment run
+- **run_name**: Human-readable name assigned to the run
+- **state**: Current state (finished, running, crashed, failed, killed)
+- **project**: Project name
+- **entity**: Entity name
+- **created_at**: Timestamp of run creation
+- **config**: Experiment configuration parameters (JSONB)
+- **summary**: Final metrics and outputs (JSONB)
+- **wandb_metadata**: Platform-specific metadata (JSONB)
+- **system_metrics**: Hardware and system information (JSONB)
+- **system_attrs**: Additional system attributes (JSONB)
+- **sweep_info**: Hyperparameter sweep information (JSONB)
+**Training History Records**
+- **run_id**: Reference to the parent run
+- **step**: Training step number
+- **timestamp**: Time of metric logging
+- **runtime**: Elapsed time since run start
+- **wandb_metadata**: Platform logging metadata (JSONB)
+- **metrics**: All logged metrics and values (JSONB, flattened in Parquet export)

dr_wandb-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,15 @@
+dr_wandb/__init__.py,sha256=aAqpPH5MBqIT_dPF5aEdqnjglgsas1MTaYqEJxyHc6s,54
+dr_wandb/constants.py,sha256=aKbkVU08aRdfrcSYu_UYXPI48ZpSTZUSpVZeXK2L4L8,534
+dr_wandb/downloader.py,sha256=X-NN1A1GilnUoxdEyHCsKJolqGBke_dIzS5wJWEAvvE,3888
+dr_wandb/history_entry_record.py,sha256=ni9rXhYWxOg2kdidQ4norYAK37tGWj8xaB8R_lU4tw0,2010
+dr_wandb/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+dr_wandb/run_record.py,sha256=X3fmNOhsBfJsaCZEWhOtnulI16mXAYiH8K194CPdjfk,3794
+dr_wandb/store.py,sha256=gWvlC0NIjcKeRP1rZooBz6dDq2nS2wIudsgahLso3VM,7063
+dr_wandb/utils.py,sha256=zzpHVOVo0QD82ik9ksQCP_vN7Zw0ov9dPGFfNMFgfmg,1796
+dr_wandb/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+dr_wandb/cli/download.py,sha256=XvUY8Jl2u9BGo1l8QXn0foEFi5a3SfCARbvzZ-HxPoA,3710
+dr_wandb-0.1.0.dist-info/METADATA,sha256=2UBB8JfOPTCMWJbjXG8jOa0KC91GFwKlZguSzBdRwc8,4226
+dr_wandb-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+dr_wandb-0.1.0.dist-info/entry_points.txt,sha256=l4X0h3JbfOr_-3pgqiq3iy4MqUTSiaFUMeVf0DTck88,74
+dr_wandb-0.1.0.dist-info/licenses/LICENSE,sha256=6tUm1Q55M1UBMbbawzFlF0-DgCazM1BELo_5-RXA1K4,1075
+dr_wandb-0.1.0.dist-info/RECORD,,

dr_wandb-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.27.0
+Root-Is-Purelib: true
+Tag: py3-none-any

dr_wandb-0.1.0.dist-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [console_scripts]
2	+ wandb-download = dr_wandb.cli.download:download_project

dr_wandb-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2025 Danielle Rothermel
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.