dr-wandb 0.1.1__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. dr_wandb-0.1.2/.python-version +1 -0
  2. dr_wandb-0.1.2/PKG-INFO +179 -0
  3. dr_wandb-0.1.2/README.md +165 -0
  4. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/pyproject.toml +18 -5
  5. dr_wandb-0.1.2/src/dr_wandb/cli/download.py +97 -0
  6. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/constants.py +3 -0
  7. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/fetch.py +4 -0
  8. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/uv.lock +383 -324
  9. dr_wandb-0.1.1/.python-version +0 -1
  10. dr_wandb-0.1.1/PKG-INFO +0 -124
  11. dr_wandb-0.1.1/README.md +0 -109
  12. dr_wandb-0.1.1/docs/processes/CODING_PRINCIPLES.md +0 -1
  13. dr_wandb-0.1.1/docs/processes/README.md +0 -1
  14. dr_wandb-0.1.1/docs/processes/audit_synthesis_pipeline.md +0 -1
  15. dr_wandb-0.1.1/docs/processes/design_philosophy.md +0 -1
  16. dr_wandb-0.1.1/docs/processes/documentation_organizer_guide.md +0 -1
  17. dr_wandb-0.1.1/docs/processes/fresh_eyes_review_guide.md +0 -1
  18. dr_wandb-0.1.1/docs/processes/general_project_extraction_prompt.md +0 -1
  19. dr_wandb-0.1.1/docs/processes/project_consolidation_methodology.md +0 -1
  20. dr_wandb-0.1.1/docs/processes/reporting_guide.md +0 -1
  21. dr_wandb-0.1.1/docs/processes/strategic_collaboration_guide.md +0 -1
  22. dr_wandb-0.1.1/docs/processes/tactical_execution_guide.md +0 -1
  23. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/.claude/settings.local.json +0 -0
  24. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/.example.env +0 -0
  25. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/.gitignore +0 -0
  26. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/CLAUDE.md +0 -0
  27. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/LICENSE +0 -0
  28. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/__init__.py +0 -0
  29. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/cli/__init__.py +0 -0
  30. /dr_wandb-0.1.1/src/dr_wandb/cli/download.py → /dr_wandb-0.1.2/src/dr_wandb/cli/postgres_download.py +0 -0
  31. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/downloader.py +0 -0
  32. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/history_entry_record.py +0 -0
  33. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/py.typed +0 -0
  34. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/run_record.py +0 -0
  35. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/store.py +0 -0
  36. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/utils.py +0 -0
  37. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/tests/conftest.py +0 -0
  38. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/tests/test_cli_contract.py +0 -0
  39. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/tests/test_cli_download.py +0 -0
  40. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/tests/test_fetch.py +0 -0
  41. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/tests/test_history_entry_record.py +0 -0
  42. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/tests/test_query_builders.py +0 -0
  43. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/tests/test_run_record.py +0 -0
  44. {dr_wandb-0.1.1 → dr_wandb-0.1.2}/tests/test_utils.py +0 -0
@@ -0,0 +1 @@
1
+ 3.13
@@ -0,0 +1,179 @@
1
+ Metadata-Version: 2.4
2
+ Name: dr-wandb
3
+ Version: 0.1.2
4
+ Summary: Interact with wandb from python
5
+ Author-email: Danielle Rothermel <danielle.rothermel@gmail.com>
6
+ License-File: LICENSE
7
+ Requires-Python: >=3.12
8
+ Requires-Dist: pandas>=2.3.2
9
+ Requires-Dist: pyarrow>=21.0.0
10
+ Requires-Dist: sqlalchemy>=2.0.43
11
+ Requires-Dist: typer>=0.20.0
12
+ Requires-Dist: wandb>=0.21.4
13
+ Description-Content-Type: text/markdown
14
+
15
+ # dr_wandb
16
+
17
+ A command-line utility for downloading and archiving Weights & Biases experiment data to local storage formats optimized for offline analysis.
18
+
19
+
20
+ ## Installation
21
+
22
+ CLI Tool Install: `wandb-download`
23
+ ```
24
+ uv tool install dr_wandb
25
+ ```
26
+
27
+ Or, to use the library functions
28
+ ```bash
29
+ # To use the library functions
30
+ uv add dr_wandb
31
+ # Optionally
32
+ uv add dr_wandb[postgres]
33
+ uv sync
34
+ ```
35
+
36
+ ### Authentication
37
+
38
+ Configure Weights & Biases authentication using one of these methods:
39
+
40
+ ```bash
41
+ wandb login
42
+ ```
43
+
44
+ Or set the API key as an environment variable:
45
+
46
+ ```bash
47
+ export WANDB_API_KEY=your_api_key_here
48
+ ```
49
+
50
+ ## Quickstart
51
+
52
+ The default approach doesn't involve postgres. It fetches the runs, and optionally histories, and dumps them to local pkl files.
53
+
54
+ ```bash
55
+ » wandb-download --help
56
+
57
+ Usage: wandb-download [OPTIONS] ENTITY PROJECT OUTPUT_DIR
58
+
59
+ ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
60
+ │ * entity TEXT [required] │
61
+ │ * project TEXT [required] │
62
+ │ * output_dir TEXT [required] │
63
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
64
+ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
65
+ │ --runs-only --no-runs-only [default: no-runs-only] │
66
+ │ --runs-per-page INTEGER [default: 500] │
67
+ │ --log-every INTEGER [default: 20] │
68
+ │ --install-completion Install completion for the current shell. │
69
+ │ --show-completion Show completion for the current shell, to copy it or customize the installation. │
70
+ │ --help Show this message and exit. │
71
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
72
+ ```
73
+
74
+ An example:
75
+ ```bash
76
+ » wandb-download --runs-only "ml-moe" "ft-scaling" "./data" 1 ↵
77
+ 2025-11-10 21:47:54 - INFO -
78
+ :: Beginning Dr. Wandb Project Downloading Tool ::
79
+
80
+ 2025-11-10 21:47:54 - INFO - {
81
+ "entity": "ml-me",
82
+ "project": "scaling",
83
+ "output_dir": "data",
84
+ "runs_only": true,
85
+ "runs_per_page": 500,
86
+ "log_every": 20,
87
+ "runs_output_filename": "ml-me_scaling_runs.pkl",
88
+ "histories_output_filename": "ml-me_scaling_histories.pkl"
89
+ }
90
+ 2025-11-10 21:47:54 - INFO -
91
+ 2025-11-10 21:47:54 - INFO - >> Downloading runs, this will take a while (minutes)
92
+ wandb: Currently logged in as: danielle-rothermel (ml-moe) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
93
+ 2025-11-10 21:48:00 - INFO - - total runs found: 517
94
+ 2025-11-10 21:48:00 - INFO - >> Serializing runs and maybe getting histories: False
95
+ 2025-11-10 21:48:07 - INFO - >> 20/517: 2025_08_21-08_24_43_test_finetune_DD-dolma1_7-10M_main_1Mtx1_--learning_rate=5e-05
96
+ 2025-11-10 21:48:12 - INFO - >> 40/517: 2025_08_21-08_24_43_test_finetune_DD-dolma1_7-150M_main_10Mtx1_--learning_rate=5e-06
97
+ ...
98
+ 2025-11-10 21:50:46 - INFO - >> Dumped runs data to: ./data/ml-moe_ft-scaling_runs.pkl
99
+ 2025-11-10 21:50:46 - INFO - >> Runs only, not dumping histories to: ./data/ml-moe_ft-scaling_histories.pkl
100
+ ```
101
+
102
+
103
+
104
+ ## Very Alpha: Postgres Version
105
+
106
+ **It's very likely this won't currently work.** Download all runs from a Weights & Biases project:
107
+
108
+ ```bash
109
+ uv run python src/dr_wandb/cli/postgres_download.py --entity your_entity --project your_project
110
+
111
+ Options:
112
+ --entity TEXT WandB entity (username or team name)
113
+ --project TEXT WandB project name
114
+ --runs-only Download only run metadata, skip training history
115
+ --force-refresh Download all data, ignoring existing records
116
+ --db-url TEXT PostgreSQL connection string
117
+ --output-dir TEXT Directory for exported Parquet files
118
+ --help Show help message and exit
119
+ ```
120
+
121
+ The tool creates a PostgreSQL database, downloads experiment data, and exports Parquet files to the configured output directory. The tool tracks existing data and downloads only new or updated runs by default. A run is considered for update if:
122
+
123
+ - It does not exist in the local database
124
+ - Its state is "running" (indicating potential new data)
125
+
126
+ Use `--force-refresh` to download all runs regardless of existing data.
127
+
128
+ ### Environment Variables
129
+
130
+ The tool reads configuration from environment variables with the `DR_WANDB_` prefix and supports `.env` files:
131
+
132
+ | Variable | Description | Default |
133
+ |----------|-------------|---------|
134
+ | `DR_WANDB_ENTITY` | Weights & Biases entity name | None |
135
+ | `DR_WANDB_PROJECT` | Weights & Biases project name | None |
136
+ | `DR_WANDB_DATABASE_URL` | PostgreSQL connection string | `postgresql+psycopg2://localhost/wandb` |
137
+ | `DR_WANDB_OUTPUT_DIR` | Directory for exported files | `./data` |
138
+
139
+ ### Database Configuration
140
+
141
+ The PostgreSQL connection string follows the standard format:
142
+
143
+ ```
144
+ postgresql+psycopg2://username:password@host:port/database_name
145
+ ```
146
+
147
+ If the specified database does not exist, the tool will attempt to create it automatically.
148
+
149
+ ### Data Schema
150
+
151
+
152
+ The tool generates the following files in the output directory:
153
+
154
+ - `runs_metadata.parquet` - Complete run metadata including configurations, summaries, and system information
155
+ - `runs_history.parquet` - Training metrics and logged values over time
156
+ - `runs_metadata_{component}.parquet` - Component-specific files for config, summary, wandb_metadata, system_metrics, system_attrs, and sweep_info
157
+
158
+
159
+ **Run Records**
160
+ - **run_id**: Unique identifier for the experiment run
161
+ - **run_name**: Human-readable name assigned to the run
162
+ - **state**: Current state (finished, running, crashed, failed, killed)
163
+ - **project**: Project name
164
+ - **entity**: Entity name
165
+ - **created_at**: Timestamp of run creation
166
+ - **config**: Experiment configuration parameters (JSONB)
167
+ - **summary**: Final metrics and outputs (JSONB)
168
+ - **wandb_metadata**: Platform-specific metadata (JSONB)
169
+ - **system_metrics**: Hardware and system information (JSONB)
170
+ - **system_attrs**: Additional system attributes (JSONB)
171
+ - **sweep_info**: Hyperparameter sweep information (JSONB)
172
+
173
+ **Training History Records**
174
+ - **run_id**: Reference to the parent run
175
+ - **step**: Training step number
176
+ - **timestamp**: Time of metric logging
177
+ - **runtime**: Elapsed time since run start
178
+ - **wandb_metadata**: Platform logging metadata (JSONB)
179
+ - **metrics**: All logged metrics and values (JSONB, flattened in Parquet export)
@@ -0,0 +1,165 @@
1
+ # dr_wandb
2
+
3
+ A command-line utility for downloading and archiving Weights & Biases experiment data to local storage formats optimized for offline analysis.
4
+
5
+
6
+ ## Installation
7
+
8
+ CLI Tool Install: `wandb-download`
9
+ ```
10
+ uv tool install dr_wandb
11
+ ```
12
+
13
+ Or, to use the library functions
14
+ ```bash
15
+ # To use the library functions
16
+ uv add dr_wandb
17
+ # Optionally
18
+ uv add dr_wandb[postgres]
19
+ uv sync
20
+ ```
21
+
22
+ ### Authentication
23
+
24
+ Configure Weights & Biases authentication using one of these methods:
25
+
26
+ ```bash
27
+ wandb login
28
+ ```
29
+
30
+ Or set the API key as an environment variable:
31
+
32
+ ```bash
33
+ export WANDB_API_KEY=your_api_key_here
34
+ ```
35
+
36
+ ## Quickstart
37
+
38
+ The default approach doesn't involve postgres. It fetches the runs, and optionally histories, and dumps them to local pkl files.
39
+
40
+ ```bash
41
+ » wandb-download --help
42
+
43
+ Usage: wandb-download [OPTIONS] ENTITY PROJECT OUTPUT_DIR
44
+
45
+ ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
46
+ │ * entity TEXT [required] │
47
+ │ * project TEXT [required] │
48
+ │ * output_dir TEXT [required] │
49
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
50
+ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
51
+ │ --runs-only --no-runs-only [default: no-runs-only] │
52
+ │ --runs-per-page INTEGER [default: 500] │
53
+ │ --log-every INTEGER [default: 20] │
54
+ │ --install-completion Install completion for the current shell. │
55
+ │ --show-completion Show completion for the current shell, to copy it or customize the installation. │
56
+ │ --help Show this message and exit. │
57
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
58
+ ```
59
+
60
+ An example:
61
+ ```bash
62
+ » wandb-download --runs-only "ml-moe" "ft-scaling" "./data" 1 ↵
63
+ 2025-11-10 21:47:54 - INFO -
64
+ :: Beginning Dr. Wandb Project Downloading Tool ::
65
+
66
+ 2025-11-10 21:47:54 - INFO - {
67
+ "entity": "ml-me",
68
+ "project": "scaling",
69
+ "output_dir": "data",
70
+ "runs_only": true,
71
+ "runs_per_page": 500,
72
+ "log_every": 20,
73
+ "runs_output_filename": "ml-me_scaling_runs.pkl",
74
+ "histories_output_filename": "ml-me_scaling_histories.pkl"
75
+ }
76
+ 2025-11-10 21:47:54 - INFO -
77
+ 2025-11-10 21:47:54 - INFO - >> Downloading runs, this will take a while (minutes)
78
+ wandb: Currently logged in as: danielle-rothermel (ml-moe) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
79
+ 2025-11-10 21:48:00 - INFO - - total runs found: 517
80
+ 2025-11-10 21:48:00 - INFO - >> Serializing runs and maybe getting histories: False
81
+ 2025-11-10 21:48:07 - INFO - >> 20/517: 2025_08_21-08_24_43_test_finetune_DD-dolma1_7-10M_main_1Mtx1_--learning_rate=5e-05
82
+ 2025-11-10 21:48:12 - INFO - >> 40/517: 2025_08_21-08_24_43_test_finetune_DD-dolma1_7-150M_main_10Mtx1_--learning_rate=5e-06
83
+ ...
84
+ 2025-11-10 21:50:46 - INFO - >> Dumped runs data to: ./data/ml-moe_ft-scaling_runs.pkl
85
+ 2025-11-10 21:50:46 - INFO - >> Runs only, not dumping histories to: ./data/ml-moe_ft-scaling_histories.pkl
86
+ ```
87
+
88
+
89
+
90
+ ## Very Alpha: Postgres Version
91
+
92
+ **It's very likely this won't currently work.** Download all runs from a Weights & Biases project:
93
+
94
+ ```bash
95
+ uv run python src/dr_wandb/cli/postgres_download.py --entity your_entity --project your_project
96
+
97
+ Options:
98
+ --entity TEXT WandB entity (username or team name)
99
+ --project TEXT WandB project name
100
+ --runs-only Download only run metadata, skip training history
101
+ --force-refresh Download all data, ignoring existing records
102
+ --db-url TEXT PostgreSQL connection string
103
+ --output-dir TEXT Directory for exported Parquet files
104
+ --help Show help message and exit
105
+ ```
106
+
107
+ The tool creates a PostgreSQL database, downloads experiment data, and exports Parquet files to the configured output directory. The tool tracks existing data and downloads only new or updated runs by default. A run is considered for update if:
108
+
109
+ - It does not exist in the local database
110
+ - Its state is "running" (indicating potential new data)
111
+
112
+ Use `--force-refresh` to download all runs regardless of existing data.
113
+
114
+ ### Environment Variables
115
+
116
+ The tool reads configuration from environment variables with the `DR_WANDB_` prefix and supports `.env` files:
117
+
118
+ | Variable | Description | Default |
119
+ |----------|-------------|---------|
120
+ | `DR_WANDB_ENTITY` | Weights & Biases entity name | None |
121
+ | `DR_WANDB_PROJECT` | Weights & Biases project name | None |
122
+ | `DR_WANDB_DATABASE_URL` | PostgreSQL connection string | `postgresql+psycopg2://localhost/wandb` |
123
+ | `DR_WANDB_OUTPUT_DIR` | Directory for exported files | `./data` |
124
+
125
+ ### Database Configuration
126
+
127
+ The PostgreSQL connection string follows the standard format:
128
+
129
+ ```
130
+ postgresql+psycopg2://username:password@host:port/database_name
131
+ ```
132
+
133
+ If the specified database does not exist, the tool will attempt to create it automatically.
134
+
135
+ ### Data Schema
136
+
137
+
138
+ The tool generates the following files in the output directory:
139
+
140
+ - `runs_metadata.parquet` - Complete run metadata including configurations, summaries, and system information
141
+ - `runs_history.parquet` - Training metrics and logged values over time
142
+ - `runs_metadata_{component}.parquet` - Component-specific files for config, summary, wandb_metadata, system_metrics, system_attrs, and sweep_info
143
+
144
+
145
+ **Run Records**
146
+ - **run_id**: Unique identifier for the experiment run
147
+ - **run_name**: Human-readable name assigned to the run
148
+ - **state**: Current state (finished, running, crashed, failed, killed)
149
+ - **project**: Project name
150
+ - **entity**: Entity name
151
+ - **created_at**: Timestamp of run creation
152
+ - **config**: Experiment configuration parameters (JSONB)
153
+ - **summary**: Final metrics and outputs (JSONB)
154
+ - **wandb_metadata**: Platform-specific metadata (JSONB)
155
+ - **system_metrics**: Hardware and system information (JSONB)
156
+ - **system_attrs**: Additional system attributes (JSONB)
157
+ - **sweep_info**: Hyperparameter sweep information (JSONB)
158
+
159
+ **Training History Records**
160
+ - **run_id**: Reference to the parent run
161
+ - **step**: Training step number
162
+ - **timestamp**: Time of metric logging
163
+ - **runtime**: Elapsed time since run start
164
+ - **wandb_metadata**: Platform logging metadata (JSONB)
165
+ - **metrics**: All logged metrics and values (JSONB, flattened in Parquet export)
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "dr-wandb"
3
- version = "0.1.1"
3
+ version = "0.1.2"
4
4
  description = "Interact with wandb from python"
5
5
  readme = "README.md"
6
6
  authors = [
@@ -9,15 +9,14 @@ authors = [
9
9
  requires-python = ">=3.12"
10
10
  dependencies = [
11
11
  "pandas>=2.3.2",
12
- "psycopg2>=2.9.10",
13
12
  "pyarrow>=21.0.0",
14
- "pydantic-settings>=2.10.1",
15
- "sqlalchemy>=2.0.43",
13
+ "typer>=0.20.0",
16
14
  "wandb>=0.21.4",
15
+ "sqlalchemy>=2.0.43",
17
16
  ]
18
17
 
19
18
  [project.scripts]
20
- wandb-download = "dr_wandb.cli.download:download_project"
19
+ wandb-download = "dr_wandb.cli.download:app"
21
20
 
22
21
  [build-system]
23
22
  requires = ["hatchling"]
@@ -26,14 +25,25 @@ build-backend = "hatchling.build"
26
25
  [tool.hatch.metadata]
27
26
  allow-direct-references = true
28
27
 
28
+ [tool.hatch.build]
29
+ exclude = [
30
+ "docs",
31
+ "docs/**",
32
+ ]
33
+
34
+
29
35
  [tool.hatch.build.targets.wheel]
30
36
  packages = ["src/dr_wandb"]
31
37
 
32
38
 
33
39
  [dependency-groups]
34
40
  dev = [
41
+ "dr-wandb",
35
42
  "pytest>=8.4.1",
36
43
  ]
44
+ postgres = [
45
+ "pydantic-settings>=2.10.1",
46
+ ]
37
47
 
38
48
  [tool.ruff]
39
49
  include = [
@@ -154,3 +164,6 @@ seaborn = "sns"
154
164
  polars = "pl"
155
165
  lightning = "L"
156
166
  "jax.numpy" = "jnp"
167
+
168
+ [tool.uv.sources]
169
+ dr-wandb = { workspace = true }
@@ -0,0 +1,97 @@
1
+ from typing import Any
2
+ import logging
3
+ from pydantic import BaseModel, Field, computed_field
4
+ from pathlib import Path
5
+ import typer
6
+ import pickle
7
+
8
+ from dr_wandb.fetch import fetch_project_runs
9
+
10
+ app = typer.Typer()
11
+
12
+ class ProjDownloadConfig(BaseModel):
13
+ entity: str
14
+ project: str
15
+ output_dir: Path = Field(
16
+ default_factory=lambda: (
17
+ Path(__file__).parent.parent.parent.parent / "data"
18
+ )
19
+ )
20
+ runs_only: bool = False
21
+ runs_per_page: int = 500
22
+ log_every: int = 20
23
+
24
+ runs_output_filename: str = Field(
25
+ default_factory=lambda data: (
26
+ f"{data['entity']}_{data['project']}_runs.pkl"
27
+ )
28
+ )
29
+ histories_output_filename: str = Field(
30
+ default_factory=lambda data: (
31
+ f"{data['entity']}_{data['project']}_histories.pkl"
32
+ )
33
+ )
34
+
35
+ def progress_callback(self, run_index: int, total_runs: int, message: str)-> None:
36
+ if run_index % self.log_every == 0:
37
+ logging.info(f">> {run_index}/{total_runs}: {message}")
38
+
39
+
40
+ @computed_field
41
+ @property
42
+ def fetch_runs_cfg(self) -> dict[str, Any]:
43
+ return {
44
+ "entity": self.entity,
45
+ "project": self.project,
46
+ "runs_per_page": self.runs_per_page,
47
+ "progress_callback": self.progress_callback,
48
+ "include_history": not self.runs_only,
49
+ }
50
+
51
+ def setup_logging(level: str = "INFO") -> None:
52
+ logging.basicConfig(
53
+ level=getattr(logging, level.upper()),
54
+ format="%(asctime)s - %(levelname)s - %(message)s",
55
+ datefmt="%Y-%m-%d %H:%M:%S",
56
+ )
57
+
58
+
59
+ @app.command()
60
+ def download_project(
61
+ entity: str,
62
+ project: str,
63
+ output_dir: str,
64
+ runs_only: bool = False,
65
+ runs_per_page: int = 500,
66
+ log_every: int = 20,
67
+ ) -> None:
68
+ setup_logging()
69
+ logging.info("\n:: Beginning Dr. Wandb Project Downloading Tool ::\n")
70
+
71
+ cfg = ProjDownloadConfig(
72
+ entity=entity,
73
+ project=project,
74
+ output_dir=output_dir,
75
+ runs_only=runs_only,
76
+ runs_per_page=runs_per_page,
77
+ log_every=log_every,
78
+ )
79
+ logging.info(str(cfg.model_dump_json(indent=4, exclude="fetch_runs_cfg")))
80
+ logging.info("")
81
+
82
+ runs, histories = fetch_project_runs(**cfg.fetch_runs_cfg)
83
+ runs_filename = f"{output_dir}/{cfg.runs_output_filename}"
84
+ histories_filename = f"{output_dir}/{cfg.histories_output_filename}"
85
+ with open(runs_filename, 'wb') as run_file:
86
+ pickle.dump(runs, run_file)
87
+ logging.info(f">> Dumped runs data to: {runs_filename}")
88
+ if not cfg.runs_only:
89
+ with open(histories_filename, 'wb') as hist_file:
90
+ pickle.dump(histories, hist_file)
91
+ logging.info(f">> Dumped histories data to: {histories_filename}")
92
+ else:
93
+ logging.info(f">> Runs only, not dumping histories to: {histories_filename}")
94
+
95
+
96
+ if __name__ == "__main__":
97
+ app()
@@ -1,6 +1,7 @@
1
1
  from collections.abc import Callable
2
2
  from typing import Literal
3
3
 
4
+ from sqlalchemy import String
4
5
  from sqlalchemy.orm import DeclarativeBase
5
6
 
6
7
 
@@ -17,4 +18,6 @@ WANDB_RUN_STATES = ["finished", "running", "crashed", "failed", "killed"]
17
18
  type RunState = Literal["finished", "running", "crashed", "failed", "killed"]
18
19
  type RunId = str
19
20
 
21
+ Base.type_annotation_map = {RunId: String}
22
+
20
23
  type ProgressCallback = Callable[[int, int, str], None]
@@ -6,6 +6,7 @@ from collections.abc import Callable, Iterator
6
6
  from typing import Any
7
7
 
8
8
  import wandb
9
+ import logging
9
10
 
10
11
  from dr_wandb.history_entry_record import HistoryEntryRecord
11
12
  from dr_wandb.run_record import RunRecord
@@ -62,9 +63,12 @@ def fetch_project_runs(
62
63
  runs: list[dict[str, Any]] = []
63
64
  histories: list[list[dict[str, Any]]] = []
64
65
 
66
+ logging.info(">> Downloading runs, this will take a while (minutes)")
65
67
  run_iter = list(_iterate_runs(entity, project, runs_per_page=runs_per_page))
66
68
  total = len(run_iter)
69
+ logging.info(f" - total runs found: {total}")
67
70
 
71
+ logging.info(f">> Serializing runs and maybe getting histories: {include_history}")
68
72
  for index, run in enumerate(run_iter, start=1):
69
73
  runs.append(serialize_run(run))
70
74
  if include_history: