dr_wandb-0.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dr_wandb/__init__.py ADDED
@@ -0,0 +1,9 @@
+ """dr_wandb public API."""
+
+ from .fetch import fetch_project_runs, serialize_history_entry, serialize_run
+
+ __all__ = [
+     "fetch_project_runs",
+     "serialize_history_entry",
+     "serialize_run",
+ ]
dr_wandb/cli/__init__.py ADDED
File without changes
dr_wandb/cli/download.py ADDED
@@ -0,0 +1,97 @@
+ import logging
+ import pickle
+ from pathlib import Path
+ from typing import Any
+
+ import typer
+ from pydantic import BaseModel, Field, computed_field
+
+ from dr_wandb.fetch import fetch_project_runs
+
+ app = typer.Typer()
+
+
+ class ProjDownloadConfig(BaseModel):
+     entity: str
+     project: str
+     output_dir: Path = Field(
+         default_factory=lambda: Path(__file__).parent.parent.parent.parent / "data"
+     )
+     runs_only: bool = False
+     runs_per_page: int = 500
+     log_every: int = 20
+
+     runs_output_filename: str = Field(
+         default_factory=lambda data: f"{data['entity']}_{data['project']}_runs.pkl"
+     )
+     histories_output_filename: str = Field(
+         default_factory=lambda data: f"{data['entity']}_{data['project']}_histories.pkl"
+     )
+
+     def progress_callback(self, run_index: int, total_runs: int, message: str) -> None:
+         if run_index % self.log_every == 0:
+             logging.info(f">> {run_index}/{total_runs}: {message}")
+
+     @computed_field
+     @property
+     def fetch_runs_cfg(self) -> dict[str, Any]:
+         return {
+             "entity": self.entity,
+             "project": self.project,
+             "runs_per_page": self.runs_per_page,
+             "progress_callback": self.progress_callback,
+             "include_history": not self.runs_only,
+         }
+
+
+ def setup_logging(level: str = "INFO") -> None:
+     logging.basicConfig(
+         level=getattr(logging, level.upper()),
+         format="%(asctime)s - %(levelname)s - %(message)s",
+         datefmt="%Y-%m-%d %H:%M:%S",
+     )
+
+
+ @app.command()
+ def download_project(
+     entity: str,
+     project: str,
+     output_dir: str,
+     runs_only: bool = False,
+     runs_per_page: int = 500,
+     log_every: int = 20,
+ ) -> None:
+     setup_logging()
+     logging.info("\n:: Beginning Dr. Wandb Project Downloading Tool ::\n")
+
+     cfg = ProjDownloadConfig(
+         entity=entity,
+         project=project,
+         output_dir=output_dir,
+         runs_only=runs_only,
+         runs_per_page=runs_per_page,
+         log_every=log_every,
+     )
+     # exclude must be a set: the computed field holds a bound method,
+     # which cannot be JSON-serialized.
+     logging.info(str(cfg.model_dump_json(indent=4, exclude={"fetch_runs_cfg"})))
+     logging.info("")
+
+     runs, histories = fetch_project_runs(**cfg.fetch_runs_cfg)
+     runs_filename = f"{output_dir}/{cfg.runs_output_filename}"
+     histories_filename = f"{output_dir}/{cfg.histories_output_filename}"
+     with open(runs_filename, "wb") as run_file:
+         pickle.dump(runs, run_file)
+     logging.info(f">> Dumped runs data to: {runs_filename}")
+     if not cfg.runs_only:
+         with open(histories_filename, "wb") as hist_file:
+             pickle.dump(histories, hist_file)
+         logging.info(f">> Dumped histories data to: {histories_filename}")
+     else:
+         logging.info(f">> Runs only, not dumping histories to: {histories_filename}")
+
+
+ if __name__ == "__main__":
+     app()
dr_wandb/cli/postgres_download.py ADDED
@@ -0,0 +1,128 @@
+ import logging
+ from pathlib import Path
+
+ import click
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+
+ from dr_wandb.downloader import Downloader
+ from dr_wandb.store import ProjectStore
+
+
+ class ProjDownloadSettings(BaseSettings):
+     model_config = SettingsConfigDict(env_file=".env", env_prefix="DR_WANDB_")
+
+     entity: str | None = None
+     project: str | None = None
+     database_url: str = "postgresql+psycopg2://localhost/wandb"
+     output_dir: Path = Path(__file__).parent.parent / "data"
+     runs_per_page: int = 500
+
+
+ def setup_logging(level: str = "INFO") -> None:
+     logging.basicConfig(
+         level=getattr(logging, level.upper()),
+         format="%(asctime)s - %(levelname)s - %(message)s",
+         datefmt="%Y-%m-%d %H:%M:%S",
+     )
+
+
+ def validate_settings(entity: str | None, project: str | None) -> None:
+     if not entity:
+         raise click.ClickException(
+             "--entity is required, or set DR_WANDB_ENTITY in .env"
+         )
+     if not project:
+         raise click.ClickException(
+             "--project is required, or set DR_WANDB_PROJECT in .env"
+         )
+
+
+ def resolve_config(
+     entity: str | None,
+     project: str | None,
+     db_url: str | None,
+     output_dir: str | None,
+ ) -> ProjDownloadSettings:
+     cfg = ProjDownloadSettings()
+     final_entity = entity if entity else cfg.entity
+     final_project = project if project else cfg.project
+     final_db_url = db_url if db_url else cfg.database_url
+     final_output_dir = output_dir if output_dir else cfg.output_dir
+     validate_settings(final_entity, final_project)
+     return ProjDownloadSettings(
+         entity=final_entity,
+         project=final_project,
+         database_url=final_db_url,
+         output_dir=final_output_dir,
+         runs_per_page=cfg.runs_per_page,
+     )
+
+
+ def execute_download(
+     cfg: ProjDownloadSettings, runs_only: bool, force_refresh: bool
+ ) -> Downloader:
+     store = ProjectStore(
+         cfg.database_url,
+         output_dir=cfg.output_dir,
+     )
+     downloader = Downloader(store, runs_per_page=cfg.runs_per_page)
+     click.echo(">> Beginning download:")
+     stats = downloader.download_project(
+         entity=cfg.entity,
+         project=cfg.project,
+         runs_only=runs_only,
+         force_refresh=force_refresh,
+     )
+     click.echo(str(stats))
+     return downloader
+
+
+ @click.command()
+ @click.option(
+     "--entity",
+     envvar="DR_WANDB_ENTITY",
+     help="WandB entity (username or team name)",
+ )
+ @click.option("--project", envvar="DR_WANDB_PROJECT", help="WandB project name")
+ @click.option(
+     "--runs-only",
+     is_flag=True,
+     help="Only download runs, don't download history",
+ )
+ @click.option(
+     "--force-refresh",
+     is_flag=True,
+     help="Force refresh, download all data",
+ )
+ @click.option(
+     "--db-url",
+     envvar="DR_WANDB_DATABASE_URL",
+     help="PostgreSQL connection string",
+ )
+ @click.option(
+     "--output-dir",
+     envvar="DR_WANDB_OUTPUT_DIR",
+     help="Output directory",
+ )
+ def download_project(
+     entity: str | None,
+     project: str | None,
+     runs_only: bool,
+     force_refresh: bool,
+     db_url: str | None,
+     output_dir: str | None,
+ ) -> None:
+     setup_logging()
+     click.echo("\n:: Beginning Dr. Wandb Project Downloading Tool ::\n")
+     cfg = resolve_config(entity, project, db_url, output_dir)
+     click.echo(f">> Downloading project {cfg.entity}/{cfg.project}")
+     click.echo(f">> Database: {cfg.database_url}")
+     click.echo(f">> Output directory: {cfg.output_dir}")
+     click.echo(f">> Force refresh: {force_refresh} Runs only: {runs_only}")
+     click.echo()
+     downloader = execute_download(cfg, runs_only, force_refresh)
+     downloader.write_downloaded_to_parquet()
+
+
+ if __name__ == "__main__":
+     download_project()
dr_wandb/constants.py ADDED
@@ -0,0 +1,23 @@
+ from collections.abc import Callable
+ from typing import Literal
+
+ from sqlalchemy import String
+ from sqlalchemy.orm import DeclarativeBase
+
+
+ class Base(DeclarativeBase):
+     pass
+
+
+ MAX_INT = 2**31 - 1
+
+ SUPPORTED_FILTER_FIELDS = ["project", "entity", "state", "run_ids"]
+ type FilterField = Literal["project", "entity", "state", "run_ids"]
+
+ WANDB_RUN_STATES = ["finished", "running", "crashed", "failed", "killed"]
+ type RunState = Literal["finished", "running", "crashed", "failed", "killed"]
+ type RunId = str
+
+ Base.type_annotation_map = {RunId: String}
+
+ type ProgressCallback = Callable[[int, int, str], None]
dr_wandb/downloader.py ADDED
@@ -0,0 +1,118 @@
+ from __future__ import annotations
+
+ import logging
+ from dataclasses import dataclass
+
+ import wandb
+
+ from dr_wandb.constants import ProgressCallback
+ from dr_wandb.store import ProjectStore
+ from dr_wandb.utils import default_progress_callback, select_updated_runs
+
+
+ @dataclass
+ class DownloaderStats:
+     num_wandb_runs: int = 0
+     num_stored_runs: int = 0
+     num_new_runs: int = 0
+     num_updated_runs: int = 0
+
+     def __str__(self) -> str:
+         return "\n".join(
+             [
+                 "",
+                 ":: Downloader Stats ::",
+                 f" - # WandB runs: {self.num_wandb_runs:,}",
+                 f" - # Stored runs: {self.num_stored_runs:,}",
+                 f" - # New runs: {self.num_new_runs:,}",
+                 f" - # Updated runs: {self.num_updated_runs:,}",
+                 "",
+             ]
+         )
+
+
+ class Downloader:
+     def __init__(
+         self,
+         store: ProjectStore,
+         runs_per_page: int = 500,
+     ) -> None:
+         self.store = store
+         self._api: wandb.Api | None = None
+         self.runs_per_page = runs_per_page
+         self.progress_callback: ProgressCallback = default_progress_callback
+
+     @property
+     def api(self) -> wandb.Api:
+         if self._api is None:
+             try:
+                 self._api = wandb.Api()
+             except wandb.errors.UsageError as e:
+                 if "api_key not configured" in str(e):
+                     raise RuntimeError(
+                         "WandB API key not configured. "
+                         "Please run 'wandb login' or set WANDB_API_KEY env var"
+                     ) from e
+                 raise
+         return self._api
+
+     def set_progress_callback(self, progress_callback: ProgressCallback) -> None:
+         self.progress_callback = progress_callback
+
+     def get_all_runs(self, entity: str, project: str) -> list[wandb.apis.public.Run]:
+         return list(self.api.runs(f"{entity}/{project}", per_page=self.runs_per_page))
+
+     def download_runs(
+         self,
+         entity: str,
+         project: str,
+         force_refresh: bool = False,
+         with_history: bool = False,
+     ) -> DownloaderStats:
+         wandb_runs = self.get_all_runs(entity, project)
+         stored_states = self.store.get_existing_run_states(
+             {"entity": entity, "project": project}
+         )
+         runs_to_download = (
+             wandb_runs
+             if force_refresh
+             else select_updated_runs(wandb_runs, stored_states)
+         )
+         num_new_runs = len([r for r in runs_to_download if r.id not in stored_states])
+         stats = DownloaderStats(
+             num_wandb_runs=len(wandb_runs),
+             num_stored_runs=len(stored_states),
+             num_new_runs=num_new_runs,
+             num_updated_runs=len(runs_to_download) - num_new_runs,
+         )
+         if len(runs_to_download) == 0:
+             logging.info(">> No runs to download")
+             return stats
+
+         if not with_history:
+             logging.info(">> Runs only mode, bulk downloading runs")
+             self.store.store_runs(runs_to_download)
+             return stats
+
+         logging.info(">> Downloading runs and history data together")
+         for i, run in enumerate(runs_to_download):
+             self.store.store_run_and_history(run, list(run.scan_history()))
+             self.progress_callback(i + 1, len(runs_to_download), run.name)
+         return stats
+
+     def download_project(
+         self,
+         entity: str,
+         project: str,
+         runs_only: bool = False,
+         force_refresh: bool = False,
+     ) -> DownloaderStats:
+         stats = self.download_runs(
+             entity, project, force_refresh, with_history=not runs_only
+         )
+         logging.info(">> Download completed")
+         return stats
+
+     def write_downloaded_to_parquet(self) -> None:
+         logging.info(">> Beginning export to parquet")
+         self.store.export_to_parquet()
dr_wandb/fetch.py ADDED
@@ -0,0 +1,84 @@
+ """Lightweight WandB fetch utilities that avoid database storage."""
+
+ from __future__ import annotations
+
+ import logging
+ from collections.abc import Callable, Iterator
+ from typing import Any
+
+ import wandb
+
+ from dr_wandb.history_entry_record import HistoryEntryRecord
+ from dr_wandb.run_record import RunRecord
+ from dr_wandb.utils import default_progress_callback
+
+ ProgressFn = Callable[[int, int, str], None]
+
+
+ def _iterate_runs(
+     entity: str,
+     project: str,
+     *,
+     runs_per_page: int,
+ ) -> Iterator[wandb.apis.public.Run]:
+     api = wandb.Api()
+     yield from api.runs(f"{entity}/{project}", per_page=runs_per_page)
+
+
+ def serialize_run(run: wandb.apis.public.Run) -> dict[str, Any]:
+     """Convert a WandB run into a JSON-friendly dict."""
+     record = RunRecord.from_wandb_run(run)
+     return record.to_dict(include="all")
+
+
+ def serialize_history_entry(
+     run: wandb.apis.public.Run, history_entry: dict[str, Any]
+ ) -> dict[str, Any]:
+     """Convert a raw history payload into a structured dict."""
+     record = HistoryEntryRecord.from_wandb_history(history_entry, run.id)
+     return {
+         "run_id": record.run_id,
+         "step": record.step,
+         "timestamp": record.timestamp,
+         "runtime": record.runtime,
+         "wandb_metadata": record.wandb_metadata,
+         "metrics": record.metrics,
+     }
+
+
+ def fetch_project_runs(
+     entity: str,
+     project: str,
+     *,
+     runs_per_page: int = 500,
+     include_history: bool = True,
+     progress_callback: ProgressFn | None = None,
+ ) -> tuple[list[dict[str, Any]], list[list[dict[str, Any]]]]:
+     """Download runs (and optional history) without requiring Postgres."""
+     progress = progress_callback or default_progress_callback
+
+     runs: list[dict[str, Any]] = []
+     histories: list[list[dict[str, Any]]] = []
+
+     logging.info(">> Downloading runs, this will take a while (minutes)")
+     run_iter = list(_iterate_runs(entity, project, runs_per_page=runs_per_page))
+     total = len(run_iter)
+     logging.info(f" - total runs found: {total}")
+
+     logging.info(f">> Serializing runs and maybe getting histories: {include_history}")
+     for index, run in enumerate(run_iter, start=1):
+         runs.append(serialize_run(run))
+         if include_history:
+             history_payloads = [
+                 serialize_history_entry(run, entry) for entry in run.scan_history()
+             ]
+             histories.append(history_payloads)
+         progress(index, total, run.name)
+
+     return runs, histories
dr_wandb/history_entry_record.py ADDED
@@ -0,0 +1,62 @@
+ from __future__ import annotations
+
+ from datetime import datetime
+ from typing import Any
+
+ from sqlalchemy import Select, select
+ from sqlalchemy.dialects.postgresql import JSONB
+ from sqlalchemy.orm import Mapped, mapped_column
+
+ from dr_wandb.constants import Base, RunId
+ from dr_wandb.utils import extract_as_datetime
+
+ type HistoryEntry = dict[str, Any]
+
+
+ class HistoryEntryRecord(Base):
+     __tablename__ = "wandb_history"
+
+     id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
+     run_id: Mapped[str]
+     step: Mapped[int | None]
+     timestamp: Mapped[datetime | None]
+     runtime: Mapped[int | None]
+     wandb_metadata: Mapped[dict[str, Any]] = mapped_column(JSONB)
+     metrics: Mapped[dict[str, Any]] = mapped_column(JSONB)
+
+     @classmethod
+     def from_wandb_history(
+         cls, history_entry: HistoryEntry, run_id: str
+     ) -> HistoryEntryRecord:
+         return cls(
+             run_id=run_id,
+             step=history_entry.get("_step"),
+             timestamp=extract_as_datetime(history_entry, "_timestamp"),
+             runtime=history_entry.get("_runtime"),
+             wandb_metadata=history_entry.get("_wandb", {}),
+             metrics={k: v for k, v in history_entry.items() if not k.startswith("_")},
+         )
+
+     @classmethod
+     def standard_fields(cls) -> list[str]:
+         return [
+             col.name
+             for col in cls.__table__.columns
+             if col.name not in ["wandb_metadata", "metrics"]
+         ]
+
+     def to_dict(self, include_metadata: bool = False) -> dict[str, Any]:
+         return {
+             **{field: getattr(self, field) for field in self.standard_fields()},
+             **self.metrics,
+             **({"wandb_metadata": self.wandb_metadata} if include_metadata else {}),
+         }
+
+
+ def build_history_query(
+     run_ids: list[RunId] | None = None,
+ ) -> Select[HistoryEntryRecord]:
+     query = select(HistoryEntryRecord)
+     if run_ids is not None:
+         query = query.where(HistoryEntryRecord.run_id.in_(run_ids))
+     return query
dr_wandb/py.typed ADDED
File without changes
dr_wandb/run_record.py ADDED
@@ -0,0 +1,115 @@
+ from __future__ import annotations
+
+ from datetime import datetime
+ from typing import Any, Literal
+
+ import wandb
+ from sqlalchemy import Select, select
+ from sqlalchemy.dialects.postgresql import JSONB
+ from sqlalchemy.orm import Mapped, mapped_column
+
+ from dr_wandb.constants import (
+     SUPPORTED_FILTER_FIELDS,
+     Base,
+     FilterField,
+     RunId,
+     RunState,
+ )
+
+ RUN_DATA_COMPONENTS = [
+     "config",
+     "summary",
+     "wandb_metadata",
+     "system_metrics",
+     "system_attrs",
+     "sweep_info",
+ ]
+ type All = Literal["all"]
+ type RunDataComponent = Literal[
+     "config",
+     "summary",
+     "wandb_metadata",
+     "system_metrics",
+     "system_attrs",
+     "sweep_info",
+ ]
+
+
+ class RunRecord(Base):
+     __tablename__ = "wandb_runs"
+
+     run_id: Mapped[RunId] = mapped_column(primary_key=True)
+     run_name: Mapped[str]
+     state: Mapped[RunState]
+     project: Mapped[str]
+     entity: Mapped[str]
+     created_at: Mapped[datetime | None]
+
+     config: Mapped[dict[str, Any]] = mapped_column(JSONB)
+     summary: Mapped[dict[str, Any]] = mapped_column(JSONB)
+     wandb_metadata: Mapped[dict[str, Any]] = mapped_column(JSONB)
+     system_metrics: Mapped[dict[str, Any]] = mapped_column(JSONB)
+     system_attrs: Mapped[dict[str, Any]] = mapped_column(JSONB)
+     sweep_info: Mapped[dict[str, Any]] = mapped_column(JSONB)
+
+     @classmethod
+     def standard_fields(cls) -> list[str]:
+         return [
+             col.name
+             for col in cls.__table__.columns
+             if col.name not in RUN_DATA_COMPONENTS
+         ]
+
+     @classmethod
+     def from_wandb_run(cls, wandb_run: wandb.apis.public.Run) -> RunRecord:
+         return cls(
+             run_id=wandb_run.id,
+             run_name=wandb_run.name,
+             state=wandb_run.state,
+             project=wandb_run.project,
+             entity=wandb_run.entity,
+             created_at=wandb_run.created_at,
+             config=dict(wandb_run.config),
+             summary=dict(wandb_run.summary._json_dict) if wandb_run.summary else {},  # noqa: SLF001
+             wandb_metadata=wandb_run.metadata or {},
+             system_metrics=wandb_run.system_metrics or {},
+             system_attrs=dict(wandb_run._attrs),  # noqa: SLF001
+             sweep_info={
+                 "sweep_id": getattr(wandb_run, "sweep_id", None),
+                 "sweep_url": getattr(wandb_run, "sweep_url", None),
+             },
+         )
+
+     def update_from_wandb_run(self, wandb_run: wandb.apis.public.Run) -> None:
+         updated = self.__class__.from_wandb_run(wandb_run)
+         for col in self.__table__.columns:
+             if col.name != "run_id":
+                 setattr(self, col.name, getattr(updated, col.name))
+
+     def to_dict(
+         self, include: list[RunDataComponent] | All | None = None
+     ) -> dict[str, Any]:
+         include = include or []
+         if include == "all":
+             include = RUN_DATA_COMPONENTS
+         assert all(field in RUN_DATA_COMPONENTS for field in include)
+         data = {k: getattr(self, k) for k in self.standard_fields()}
+         for field in include:
+             data[field] = getattr(self, field)
+         return data
+
+
+ def build_run_query(kwargs: dict[FilterField, Any] | None = None) -> Select[RunRecord]:
+     query = select(RunRecord)
+     if kwargs is not None:
+         assert all(k in SUPPORTED_FILTER_FIELDS for k in kwargs)
+         assert all(v is not None for v in kwargs.values())
+         if "project" in kwargs:
+             query = query.where(RunRecord.project == kwargs["project"])
+         if "entity" in kwargs:
+             query = query.where(RunRecord.entity == kwargs["entity"])
+         if "state" in kwargs:
+             query = query.where(RunRecord.state == kwargs["state"])
+         if "run_ids" in kwargs:
+             query = query.where(RunRecord.run_id.in_(kwargs["run_ids"]))
+     return query
dr_wandb/store.py ADDED
@@ -0,0 +1,193 @@
+ from __future__ import annotations
+
+ import logging
+ from pathlib import Path
+ from typing import Any
+ from urllib.parse import urlparse
+
+ import pandas as pd
+ import wandb
+ from sqlalchemy import Engine, create_engine, text
+ from sqlalchemy.exc import OperationalError
+ from sqlalchemy.orm import Session
+
+ from dr_wandb.constants import (
+     Base,
+     FilterField,
+     RunId,
+     RunState,
+ )
+ from dr_wandb.history_entry_record import (
+     HistoryEntry,
+     HistoryEntryRecord,
+     build_history_query,
+ )
+ from dr_wandb.run_record import (
+     RUN_DATA_COMPONENTS,
+     All,
+     RunDataComponent,
+     RunRecord,
+     build_run_query,
+ )
+ from dr_wandb.utils import safe_convert_for_parquet
+
+ DEFAULT_OUTPUT_DIR = Path(__file__).parent.parent / "data"
+ DEFAULT_RUNS_FILENAME = "runs_metadata"
+ DEFAULT_HISTORY_FILENAME = "runs_history"
+ type History = list[HistoryEntry]
+
+
+ def delete_history_for_runs(session: Session, run_ids: list[RunId]) -> None:
+     if not run_ids:
+         return
+     session.execute(
+         text("DELETE FROM wandb_history WHERE run_id = ANY(:run_ids)"),
+         {"run_ids": run_ids},
+     )
+
+
+ def save_update_run(session: Session, run: wandb.apis.public.Run) -> None:
+     existing_run = session.get(RunRecord, run.id)
+     if existing_run:
+         existing_run.update_from_wandb_run(run)
+     else:
+         session.add(RunRecord.from_wandb_run(run))
+
+
+ def delete_add_history(session: Session, run_id: RunId, history: History) -> None:
+     delete_history_for_runs(session, [run_id])
+     for history_entry in history:
+         session.add(HistoryEntryRecord.from_wandb_history(history_entry, run_id))
+
+
+ def ensure_database_exists(database_url: str) -> str:
+     parsed = urlparse(database_url)
+     db_name = parsed.path.lstrip("/")
+     postgres_url = database_url.replace(f"/{db_name}", "/postgres")
+
+     try:
+         test_engine = create_engine(database_url)
+         with test_engine.connect():
+             pass
+         return database_url
+     except OperationalError as e:
+         if "does not exist" in str(e):
+             logging.info(f"Database '{db_name}' doesn't exist, creating it...")
+             postgres_engine = create_engine(postgres_url)
+             with postgres_engine.connect() as conn:
+                 # CREATE DATABASE cannot run inside a transaction block, so
+                 # close the implicit transaction first.
+                 conn.execute(text("COMMIT"))
+                 conn.execute(text(f'CREATE DATABASE "{db_name}"'))
+             logging.info(f"Created database '{db_name}'")
+             return database_url
+         else:
+             raise
+
+
+ class ProjectStore:
+     def __init__(self, connection_string: str, output_dir: Path | None = None) -> None:
+         connection_string = ensure_database_exists(connection_string)
+         self.engine: Engine = create_engine(connection_string)
+         self.create_tables()
+         self.output_dir = output_dir if output_dir is not None else DEFAULT_OUTPUT_DIR
+
+     def create_tables(self) -> None:
+         Base.metadata.create_all(self.engine)
+
+     def store_run(self, run: wandb.apis.public.Run) -> None:
+         with Session(self.engine) as session:
+             save_update_run(session, run)
+             session.commit()
+
+     def store_runs(self, runs: list[wandb.apis.public.Run]) -> None:
+         with Session(self.engine) as session:
+             for run in runs:
+                 save_update_run(session, run)
+             session.commit()
+
+     def store_history(self, run_id: RunId, history: History) -> None:
+         with Session(self.engine) as session:
+             delete_add_history(session, run_id, history)
+             session.commit()
+
+     def store_histories(
+         self,
+         runs: list[wandb.apis.public.Run],
+         histories: list[History],
+     ) -> None:
+         assert len(runs) == len(histories)
+         run_ids = [run.id for run in runs]
+         with Session(self.engine) as session:
+             delete_history_for_runs(session, run_ids)
+             for run_id, history in zip(run_ids, histories, strict=False):
+                 for history_entry in history:
+                     session.add(
+                         HistoryEntryRecord.from_wandb_history(history_entry, run_id)
+                     )
+             session.commit()
+
+     def store_run_and_history(
+         self, run: wandb.apis.public.Run, history: History
+     ) -> None:
+         with Session(self.engine) as session:
+             delete_add_history(session, run.id, history)
+             save_update_run(session, run)
+             session.commit()
+
+     def get_runs_df(
+         self,
+         include: list[RunDataComponent] | All | None = None,
+         kwargs: dict[FilterField, Any] | None = None,
+     ) -> pd.DataFrame:
+         with Session(self.engine) as session:
+             result = session.execute(build_run_query(kwargs=kwargs))
+             return pd.DataFrame(
+                 [run.to_dict(include=include) for run in result.scalars().all()]
+             )
+
+     def get_history_df(
+         self,
+         include_metadata: bool = False,
+         run_ids: list[RunId] | None = None,
+     ) -> pd.DataFrame:
+         with Session(self.engine) as session:
+             result = session.execute(build_history_query(run_ids=run_ids))
+             return pd.DataFrame(
+                 [
+                     history.to_dict(include_metadata=include_metadata)
+                     for history in result.scalars().all()
+                 ]
+             )
+
+     def get_existing_run_states(
+         self, kwargs: dict[FilterField, Any] | None = None
+     ) -> dict[RunId, RunState]:
+         with Session(self.engine) as session:
+             result = session.execute(build_run_query(kwargs=kwargs))
+             return {run.run_id: run.state for run in result.scalars().all()}
+
+     def export_to_parquet(
+         self,
+         runs_filename: str = DEFAULT_RUNS_FILENAME,
+         history_filename: str = DEFAULT_HISTORY_FILENAME,
+     ) -> None:
+         self.output_dir.mkdir(exist_ok=True)
+         logging.info(f">> Using data output directory: {self.output_dir}")
+         history_df = self.get_history_df()
+         if not history_df.empty:
+             history_path = self.output_dir / f"{history_filename}.parquet"
+             history_df = safe_convert_for_parquet(history_df)
+             history_df.to_parquet(history_path, engine="pyarrow", index=False)
+             logging.info(f">> Wrote history_df to {history_path}")
+         for include_type in RUN_DATA_COMPONENTS:
+             runs_df = self.get_runs_df(include=[include_type])
+             if not runs_df.empty:
+                 runs_path = self.output_dir / f"{runs_filename}_{include_type}.parquet"
+                 runs_df = safe_convert_for_parquet(runs_df)
+                 runs_df.to_parquet(runs_path, engine="pyarrow", index=False)
+                 logging.info(f">> Wrote runs_df with {include_type} to {runs_path}")
+         runs_df_full = self.get_runs_df(include="all")
+         if not runs_df_full.empty:
+             runs_path = self.output_dir / f"{runs_filename}.parquet"
+             runs_df_full = safe_convert_for_parquet(runs_df_full)
+             runs_df_full.to_parquet(runs_path, engine="pyarrow", index=False)
+             logging.info(f">> Wrote runs_df with all parts to {runs_path}")
dr_wandb/utils.py ADDED
@@ -0,0 +1,57 @@
+ import json
+ import logging
+ from datetime import datetime
+ from typing import Any
+
+ import pandas as pd
+ import wandb
+
+ from dr_wandb.constants import MAX_INT, RunId, RunState
+
+
+ def extract_as_datetime(data: dict[str, Any], key: str) -> datetime | None:
+     timestamp = data.get(key)
+     return datetime.fromtimestamp(timestamp) if timestamp is not None else None
+
+
+ def select_updated_runs(
+     all_runs: list[wandb.apis.public.Run],
+     existing_run_states: dict[RunId, RunState],
+ ) -> list[wandb.apis.public.Run]:
+     return [
+         run
+         for run in all_runs
+         if run.id not in existing_run_states
+         or existing_run_states[run.id] == "running"
+     ]
+
+
+ def default_progress_callback(run_index: int, total_runs: int, message: str) -> None:
+     logging.info(f">> {run_index}/{total_runs}: {message}")
+
+
+ def convert_large_ints_in_data(data: Any, max_int: int = MAX_INT) -> Any:
+     if isinstance(data, dict):
+         return {k: convert_large_ints_in_data(v, max_int) for k, v in data.items()}
+     elif isinstance(data, list):
+         return [convert_large_ints_in_data(item, max_int) for item in data]
+     elif isinstance(data, int) and abs(data) > max_int:
+         return float(data)
+     return data
+
+
+ def safe_convert_for_parquet(df: pd.DataFrame) -> pd.DataFrame:
+     df = df.copy()
+     for col in df.columns:
+         if df[col].dtype == "int64":
+             mask = df[col].abs() > MAX_INT
+             if mask.any():
+                 df[col] = df[col].astype("float64")
+         elif df[col].dtype == "object":
+             df[col] = df[col].apply(
+                 lambda x: json.dumps(convert_large_ints_in_data(x), default=str)
+                 if isinstance(x, dict | list)
+                 else str(x)
+                 if x is not None
+                 else None
+             )
+     return df
dr_wandb-0.1.2.dist-info/METADATA ADDED
@@ -0,0 +1,179 @@
+ Metadata-Version: 2.4
+ Name: dr-wandb
+ Version: 0.1.2
+ Summary: Interact with wandb from python
+ Author-email: Danielle Rothermel <danielle.rothermel@gmail.com>
+ License-File: LICENSE
+ Requires-Python: >=3.12
+ Requires-Dist: pandas>=2.3.2
+ Requires-Dist: pyarrow>=21.0.0
+ Requires-Dist: sqlalchemy>=2.0.43
+ Requires-Dist: typer>=0.20.0
+ Requires-Dist: wandb>=0.21.4
+ Description-Content-Type: text/markdown
+
+ # dr_wandb
+
+ A command-line utility for downloading and archiving Weights & Biases experiment data to local storage formats optimized for offline analysis.
+
+ ## Installation
+
+ CLI tool install (provides the `wandb-download` command):
+ ```bash
+ uv tool install dr_wandb
+ ```
+
+ Or, to use the library functions:
+ ```bash
+ uv add dr_wandb
+ # Optionally, with Postgres support
+ uv add "dr_wandb[postgres]"
+ uv sync
+ ```
+
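+ The fetch helpers are also importable from Python. A minimal sketch of calling `fetch_project_runs` (the entity and project names below are placeholders):
+
+ ```python
+ from dr_wandb import fetch_project_runs
+
+ # Metadata only; pass include_history=True to also pull per-step metrics.
+ runs, histories = fetch_project_runs(
+     "my-entity", "my-project", include_history=False
+ )
+ print(f"Fetched {len(runs)} runs")
+ ```
+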
+ ### Authentication
+
+ Configure Weights & Biases authentication using one of these methods:
+
+ ```bash
+ wandb login
+ ```
+
+ Or set the API key as an environment variable:
+
+ ```bash
+ export WANDB_API_KEY=your_api_key_here
+ ```
+
+ ## Quickstart
+
+ The default workflow doesn't involve Postgres: it fetches the runs, and optionally their histories, and dumps them to local `.pkl` files.
+
+ ```bash
+ » wandb-download --help
+
+  Usage: wandb-download [OPTIONS] ENTITY PROJECT OUTPUT_DIR
+
+ ╭─ Arguments ──────────────────────────────────────────────────────────────╮
+ │ *  entity      TEXT  [required]                                          │
+ │ *  project     TEXT  [required]                                          │
+ │ *  output_dir  TEXT  [required]                                          │
+ ╰──────────────────────────────────────────────────────────────────────────╯
+ ╭─ Options ────────────────────────────────────────────────────────────────╮
+ │ --runs-only            --no-runs-only      [default: no-runs-only]       │
+ │ --runs-per-page        INTEGER             [default: 500]                │
+ │ --log-every            INTEGER             [default: 20]                 │
+ │ --install-completion           Install completion for the current shell. │
+ │ --show-completion              Show completion for the current shell, to │
+ │                                copy it or customize the installation.    │
+ │ --help                         Show this message and exit.               │
+ ╰──────────────────────────────────────────────────────────────────────────╯
+ ```
+
+ An example:
+ ```bash
+ » wandb-download --runs-only "ml-moe" "ft-scaling" "./data"
+ 2025-11-10 21:47:54 - INFO -
+ :: Beginning Dr. Wandb Project Downloading Tool ::
+
+ 2025-11-10 21:47:54 - INFO - {
+     "entity": "ml-moe",
+     "project": "ft-scaling",
+     "output_dir": "data",
+     "runs_only": true,
+     "runs_per_page": 500,
+     "log_every": 20,
+     "runs_output_filename": "ml-moe_ft-scaling_runs.pkl",
+     "histories_output_filename": "ml-moe_ft-scaling_histories.pkl"
+ }
+ 2025-11-10 21:47:54 - INFO -
+ 2025-11-10 21:47:54 - INFO - >> Downloading runs, this will take a while (minutes)
+ wandb: Currently logged in as: danielle-rothermel (ml-moe) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+ 2025-11-10 21:48:00 - INFO -  - total runs found: 517
+ 2025-11-10 21:48:00 - INFO - >> Serializing runs and maybe getting histories: False
+ 2025-11-10 21:48:07 - INFO - >> 20/517: 2025_08_21-08_24_43_test_finetune_DD-dolma1_7-10M_main_1Mtx1_--learning_rate=5e-05
+ 2025-11-10 21:48:12 - INFO - >> 40/517: 2025_08_21-08_24_43_test_finetune_DD-dolma1_7-150M_main_10Mtx1_--learning_rate=5e-06
+ ...
+ 2025-11-10 21:50:46 - INFO - >> Dumped runs data to: ./data/ml-moe_ft-scaling_runs.pkl
+ 2025-11-10 21:50:46 - INFO - >> Runs only, not dumping histories to: ./data/ml-moe_ft-scaling_histories.pkl
+ ```
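+
+ The dumped `.pkl` files are plain pickled lists of dicts, so they load back with the standard library. A minimal sketch (the path follows the `{entity}_{project}_runs.pkl` pattern from the log above):
+
+ ```python
+ import pickle
+
+ import pandas as pd
+
+ with open("./data/ml-moe_ft-scaling_runs.pkl", "rb") as f:
+     runs = pickle.load(f)  # list[dict], one dict per run
+
+ runs_df = pd.DataFrame(runs)
+ print(runs_df[["run_id", "run_name", "state"]].head())
+ ```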
+
+ ## Very Alpha: Postgres Version
+
+ **It's very likely this won't currently work.** Download all runs from a Weights & Biases project:
+
+ ```bash
+ uv run python src/dr_wandb/cli/postgres_download.py --entity your_entity --project your_project
+
+ Options:
+   --entity TEXT       WandB entity (username or team name)
+   --project TEXT      WandB project name
+   --runs-only         Download only run metadata, skip training history
+   --force-refresh     Download all data, ignoring existing records
+   --db-url TEXT       PostgreSQL connection string
+   --output-dir TEXT   Directory for exported Parquet files
+   --help              Show help message and exit
+ ```
+
+ The tool creates a PostgreSQL database, downloads experiment data, and exports Parquet files to the configured output directory. The tool tracks existing data and downloads only new or updated runs by default. A run is considered for update if (see the predicate sketch after this list):
+
+ - It does not exist in the local database
+ - Its state is "running" (indicating potential new data)
+
+ Use `--force-refresh` to download all runs regardless of existing data.
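+
+ In code, that rule reduces to a single predicate, mirroring `select_updated_runs` in `dr_wandb/utils.py`:
+
+ ```python
+ def needs_download(run_id: str, existing_run_states: dict[str, str]) -> bool:
+     # New run, or a run that was still "running" when last stored.
+     return run_id not in existing_run_states or existing_run_states[run_id] == "running"
+
+ assert needs_download("abc", {})                       # never stored
+ assert needs_download("abc", {"abc": "running"})       # may have new data
+ assert not needs_download("abc", {"abc": "finished"})  # already final
+ ```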
+
+ ### Environment Variables
+
+ The tool reads configuration from environment variables with the `DR_WANDB_` prefix and supports `.env` files:
+
+ | Variable | Description | Default |
+ |----------|-------------|---------|
+ | `DR_WANDB_ENTITY` | Weights & Biases entity name | None |
+ | `DR_WANDB_PROJECT` | Weights & Biases project name | None |
+ | `DR_WANDB_DATABASE_URL` | PostgreSQL connection string | `postgresql+psycopg2://localhost/wandb` |
+ | `DR_WANDB_OUTPUT_DIR` | Directory for exported files | `./data` |
+
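+ For example, a `.env` file in the working directory might look like this (values are placeholders):
+
+ ```bash
+ DR_WANDB_ENTITY=my-entity
+ DR_WANDB_PROJECT=my-project
+ DR_WANDB_DATABASE_URL=postgresql+psycopg2://localhost/wandb
+ DR_WANDB_OUTPUT_DIR=./data
+ ```
+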
+ ### Database Configuration
+
+ The PostgreSQL connection string follows the standard format:
+
+ ```
+ postgresql+psycopg2://username:password@host:port/database_name
+ ```
+
+ If the specified database does not exist, the tool will attempt to create it automatically.
+
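+ The same machinery is available from Python, subject to the same alpha caveat as the CLI. A minimal sketch using the store and downloader directly (connection string, entity, and project are placeholders):
+
+ ```python
+ from dr_wandb.downloader import Downloader
+ from dr_wandb.store import ProjectStore
+
+ # Creates the database (if missing) and the tables on construction.
+ store = ProjectStore("postgresql+psycopg2://localhost/wandb")
+ downloader = Downloader(store)
+ stats = downloader.download_project(entity="my-entity", project="my-project")
+ print(stats)
+ downloader.write_downloaded_to_parquet()
+ ```
+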
+ ### Data Schema
+
+ The tool generates the following files in the output directory (a loading sketch follows the list):
+
+ - `runs_metadata.parquet` - Complete run metadata including configurations, summaries, and system information
+ - `runs_history.parquet` - Training metrics and logged values over time
+ - `runs_metadata_{component}.parquet` - Component-specific files for config, summary, wandb_metadata, system_metrics, system_attrs, and sweep_info
+
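+ Once exported, the Parquet files load directly into pandas (pandas and pyarrow are already dependencies). Paths here assume the default `./data` output directory:
+
+ ```python
+ import pandas as pd
+
+ runs = pd.read_parquet("./data/runs_metadata.parquet")
+ history = pd.read_parquet("./data/runs_history.parquet")
+ print(runs[["run_id", "run_name", "state"]].head())
+ print(history.columns.tolist())
+ ```
+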
+ **Run Records**
+ - **run_id**: Unique identifier for the experiment run
+ - **run_name**: Human-readable name assigned to the run
+ - **state**: Current state (finished, running, crashed, failed, killed)
+ - **project**: Project name
+ - **entity**: Entity name
+ - **created_at**: Timestamp of run creation
+ - **config**: Experiment configuration parameters (JSONB)
+ - **summary**: Final metrics and outputs (JSONB)
+ - **wandb_metadata**: Platform-specific metadata (JSONB)
+ - **system_metrics**: Hardware and system information (JSONB)
+ - **system_attrs**: Additional system attributes (JSONB)
+ - **sweep_info**: Hyperparameter sweep information (JSONB)
+
+ **Training History Records**
+ - **run_id**: Reference to the parent run
+ - **step**: Training step number
+ - **timestamp**: Time of metric logging
+ - **runtime**: Elapsed time since run start
+ - **wandb_metadata**: Platform logging metadata (JSONB)
+ - **metrics**: All logged metrics and values (JSONB, flattened in the Parquet export; see the sketch below)
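+
+ The flattening comes from `HistoryEntryRecord.to_dict`, which merges the `metrics` dict into the top level, so the Parquet export gets one column per logged metric. A toy illustration (the metric names are hypothetical):
+
+ ```python
+ record = {"run_id": "abc", "step": 10, "metrics": {"loss": 0.42, "lr": 1e-4}}
+ flat = {k: v for k, v in record.items() if k != "metrics"} | record["metrics"]
+ assert flat == {"run_id": "abc", "step": 10, "loss": 0.42, "lr": 1e-4}
+ ```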
dr_wandb-0.1.2.dist-info/RECORD ADDED
@@ -0,0 +1,17 @@
+ dr_wandb/__init__.py,sha256=C1FWh869zNWF5XU4XKyGfBPJ2hVpW_UsawIIusfXuQQ,199
+ dr_wandb/constants.py,sha256=HuIDOe_MRp2BTTuD1uyVzPJPUm3DbDQDIDk7HNltspc,608
+ dr_wandb/downloader.py,sha256=X-NN1A1GilnUoxdEyHCsKJolqGBke_dIzS5wJWEAvvE,3888
+ dr_wandb/fetch.py,sha256=wtpY78-VeNjCjro4Ata0N6-uV6neqBq8ooLPxJgXE7k,2533
+ dr_wandb/history_entry_record.py,sha256=ni9rXhYWxOg2kdidQ4norYAK37tGWj8xaB8R_lU4tw0,2010
+ dr_wandb/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ dr_wandb/run_record.py,sha256=X3fmNOhsBfJsaCZEWhOtnulI16mXAYiH8K194CPdjfk,3794
+ dr_wandb/store.py,sha256=gWvlC0NIjcKeRP1rZooBz6dDq2nS2wIudsgahLso3VM,7063
+ dr_wandb/utils.py,sha256=zzpHVOVo0QD82ik9ksQCP_vN7Zw0ov9dPGFfNMFgfmg,1796
+ dr_wandb/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ dr_wandb/cli/download.py,sha256=V_V1q5HPXbnBorS0l1gMJlVRg4QlVl5LYL1K7-_j--s,2870
+ dr_wandb/cli/postgres_download.py,sha256=XvUY8Jl2u9BGo1l8QXn0foEFi5a3SfCARbvzZ-HxPoA,3710
+ dr_wandb-0.1.2.dist-info/METADATA,sha256=EI1oFoFETRG-3eYnzrdJBBUtuRulOEChqhEJE3HU4co,9250
+ dr_wandb-0.1.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ dr_wandb-0.1.2.dist-info/entry_points.txt,sha256=BATf5eJjnFMRULrNGiXfzL3ImYPdNK-MlatSzOFrtII,61
+ dr_wandb-0.1.2.dist-info/licenses/LICENSE,sha256=6tUm1Q55M1UBMbbawzFlF0-DgCazM1BELo_5-RXA1K4,1075
+ dr_wandb-0.1.2.dist-info/RECORD,,
dr_wandb-0.1.2.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.27.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any
dr_wandb-0.1.2.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ wandb-download = dr_wandb.cli.download:app
dr_wandb-0.1.2.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Danielle Rothermel
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.