dr-wandb 0.1.1__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dr_wandb-0.1.2/.python-version +1 -0
- dr_wandb-0.1.2/PKG-INFO +179 -0
- dr_wandb-0.1.2/README.md +165 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/pyproject.toml +18 -5
- dr_wandb-0.1.2/src/dr_wandb/cli/download.py +97 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/constants.py +3 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/fetch.py +4 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/uv.lock +383 -324
- dr_wandb-0.1.1/.python-version +0 -1
- dr_wandb-0.1.1/PKG-INFO +0 -124
- dr_wandb-0.1.1/README.md +0 -109
- dr_wandb-0.1.1/docs/processes/CODING_PRINCIPLES.md +0 -1
- dr_wandb-0.1.1/docs/processes/README.md +0 -1
- dr_wandb-0.1.1/docs/processes/audit_synthesis_pipeline.md +0 -1
- dr_wandb-0.1.1/docs/processes/design_philosophy.md +0 -1
- dr_wandb-0.1.1/docs/processes/documentation_organizer_guide.md +0 -1
- dr_wandb-0.1.1/docs/processes/fresh_eyes_review_guide.md +0 -1
- dr_wandb-0.1.1/docs/processes/general_project_extraction_prompt.md +0 -1
- dr_wandb-0.1.1/docs/processes/project_consolidation_methodology.md +0 -1
- dr_wandb-0.1.1/docs/processes/reporting_guide.md +0 -1
- dr_wandb-0.1.1/docs/processes/strategic_collaboration_guide.md +0 -1
- dr_wandb-0.1.1/docs/processes/tactical_execution_guide.md +0 -1
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/.claude/settings.local.json +0 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/.example.env +0 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/.gitignore +0 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/CLAUDE.md +0 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/LICENSE +0 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/__init__.py +0 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/cli/__init__.py +0 -0
- /dr_wandb-0.1.1/src/dr_wandb/cli/download.py → /dr_wandb-0.1.2/src/dr_wandb/cli/postgres_download.py +0 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/downloader.py +0 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/history_entry_record.py +0 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/py.typed +0 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/run_record.py +0 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/store.py +0 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/utils.py +0 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/tests/conftest.py +0 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/tests/test_cli_contract.py +0 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/tests/test_cli_download.py +0 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/tests/test_fetch.py +0 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/tests/test_history_entry_record.py +0 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/tests/test_query_builders.py +0 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/tests/test_run_record.py +0 -0
- {dr_wandb-0.1.1 → dr_wandb-0.1.2}/tests/test_utils.py +0 -0
dr_wandb-0.1.2/.python-version ADDED
@@ -0,0 +1 @@
+3.13

dr_wandb-0.1.2/PKG-INFO ADDED
@@ -0,0 +1,179 @@
+Metadata-Version: 2.4
+Name: dr-wandb
+Version: 0.1.2
+Summary: Interact with wandb from python
+Author-email: Danielle Rothermel <danielle.rothermel@gmail.com>
+License-File: LICENSE
+Requires-Python: >=3.12
+Requires-Dist: pandas>=2.3.2
+Requires-Dist: pyarrow>=21.0.0
+Requires-Dist: sqlalchemy>=2.0.43
+Requires-Dist: typer>=0.20.0
+Requires-Dist: wandb>=0.21.4
+Description-Content-Type: text/markdown
+
+# dr_wandb
+
+A command-line utility for downloading and archiving Weights & Biases experiment data to local storage formats optimized for offline analysis.
+
+
+## Installation
+
+CLI Tool Install: `wandb-download`
+```
+uv tool install dr_wandb
+```
+
+Or, to use the library functions:
+```bash
+# To use the library functions
+uv add dr_wandb
+# Optionally
+uv add dr_wandb[postgres]
+uv sync
+```
+
+### Authentication
+
+Configure Weights & Biases authentication using one of these methods:
+
+```bash
+wandb login
+```
+
+Or set the API key as an environment variable:
+
+```bash
+export WANDB_API_KEY=your_api_key_here
+```
+
+## Quickstart
+
+The default approach doesn't involve Postgres: it fetches the runs, and optionally their histories, and dumps them to local `.pkl` files.
+
+```bash
+» wandb-download --help
+
+Usage: wandb-download [OPTIONS] ENTITY PROJECT OUTPUT_DIR
+
+╭─ Arguments ────────────────────────────────────────────────────────────────────────╮
+│ *    entity          TEXT  [required]                                              │
+│ *    project         TEXT  [required]                                              │
+│ *    output_dir      TEXT  [required]                                              │
+╰────────────────────────────────────────────────────────────────────────────────────╯
+╭─ Options ──────────────────────────────────────────────────────────────────────────╮
+│ --runs-only             --no-runs-only      [default: no-runs-only]                │
+│ --runs-per-page                    INTEGER  [default: 500]                         │
+│ --log-every                        INTEGER  [default: 20]                          │
+│ --install-completion                        Install completion for the current shell. │
+│ --show-completion                           Show completion for the current shell, to copy it or customize the installation. │
+│ --help                                      Show this message and exit.            │
+╰────────────────────────────────────────────────────────────────────────────────────╯
+```
+
+An example:
+```bash
+» wandb-download --runs-only "ml-moe" "ft-scaling" "./data"
+2025-11-10 21:47:54 - INFO -
+:: Beginning Dr. Wandb Project Downloading Tool ::
+
+2025-11-10 21:47:54 - INFO - {
+    "entity": "ml-me",
+    "project": "scaling",
+    "output_dir": "data",
+    "runs_only": true,
+    "runs_per_page": 500,
+    "log_every": 20,
+    "runs_output_filename": "ml-me_scaling_runs.pkl",
+    "histories_output_filename": "ml-me_scaling_histories.pkl"
+}
+2025-11-10 21:47:54 - INFO -
+2025-11-10 21:47:54 - INFO - >> Downloading runs, this will take a while (minutes)
+wandb: Currently logged in as: danielle-rothermel (ml-moe) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+2025-11-10 21:48:00 - INFO -  - total runs found: 517
+2025-11-10 21:48:00 - INFO - >> Serializing runs and maybe getting histories: False
+2025-11-10 21:48:07 - INFO - >> 20/517: 2025_08_21-08_24_43_test_finetune_DD-dolma1_7-10M_main_1Mtx1_--learning_rate=5e-05
+2025-11-10 21:48:12 - INFO - >> 40/517: 2025_08_21-08_24_43_test_finetune_DD-dolma1_7-150M_main_10Mtx1_--learning_rate=5e-06
+...
+2025-11-10 21:50:46 - INFO - >> Dumped runs data to: ./data/ml-moe_ft-scaling_runs.pkl
+2025-11-10 21:50:46 - INFO - >> Runs only, not dumping histories to: ./data/ml-moe_ft-scaling_histories.pkl
+```
+
+
+## Very Alpha: Postgres Version
+
+**It's very likely this won't currently work.** Download all runs from a Weights & Biases project:
+
+```bash
+uv run python src/dr_wandb/cli/postgres_download.py --entity your_entity --project your_project
+
+Options:
+  --entity TEXT        WandB entity (username or team name)
+  --project TEXT       WandB project name
+  --runs-only          Download only run metadata, skip training history
+  --force-refresh      Download all data, ignoring existing records
+  --db-url TEXT        PostgreSQL connection string
+  --output-dir TEXT    Directory for exported Parquet files
+  --help               Show help message and exit
+```
+
+The tool creates a PostgreSQL database, downloads experiment data, and exports Parquet files to the configured output directory. The tool tracks existing data and downloads only new or updated runs by default. A run is considered for update if:
+
+- It does not exist in the local database
+- Its state is "running" (indicating potential new data)
+
+Use `--force-refresh` to download all runs regardless of existing data.
+
+### Environment Variables
+
+The tool reads configuration from environment variables with the `DR_WANDB_` prefix and supports `.env` files:
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `DR_WANDB_ENTITY` | Weights & Biases entity name | None |
+| `DR_WANDB_PROJECT` | Weights & Biases project name | None |
+| `DR_WANDB_DATABASE_URL` | PostgreSQL connection string | `postgresql+psycopg2://localhost/wandb` |
+| `DR_WANDB_OUTPUT_DIR` | Directory for exported files | `./data` |
+
+### Database Configuration
+
+The PostgreSQL connection string follows the standard format:
+
+```
+postgresql+psycopg2://username:password@host:port/database_name
+```
+
+If the specified database does not exist, the tool will attempt to create it automatically.
+
+### Data Schema
+
+The tool generates the following files in the output directory:
+
+- `runs_metadata.parquet` - Complete run metadata including configurations, summaries, and system information
+- `runs_history.parquet` - Training metrics and logged values over time
+- `runs_metadata_{component}.parquet` - Component-specific files for config, summary, wandb_metadata, system_metrics, system_attrs, and sweep_info
+
+**Run Records**
+- **run_id**: Unique identifier for the experiment run
+- **run_name**: Human-readable name assigned to the run
+- **state**: Current state (finished, running, crashed, failed, killed)
+- **project**: Project name
+- **entity**: Entity name
+- **created_at**: Timestamp of run creation
+- **config**: Experiment configuration parameters (JSONB)
+- **summary**: Final metrics and outputs (JSONB)
+- **wandb_metadata**: Platform-specific metadata (JSONB)
+- **system_metrics**: Hardware and system information (JSONB)
+- **system_attrs**: Additional system attributes (JSONB)
+- **sweep_info**: Hyperparameter sweep information (JSONB)
+
+**Training History Records**
+- **run_id**: Reference to the parent run
+- **step**: Training step number
+- **timestamp**: Time of metric logging
+- **runtime**: Elapsed time since run start
+- **wandb_metadata**: Platform logging metadata (JSONB)
+- **metrics**: All logged metrics and values (JSONB, flattened in Parquet export)

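The quickstart above dumps plain pickled Python objects: per the new `cli/download.py` in this release (shown further down), `runs` is a `list[dict]` with one entry per run and `histories` is a list of per-run lists of history rows. A minimal sketch of loading those archives for offline analysis; the pandas conversion and the flat-dict assumption are illustrative, not part of the package:

```python
import pickle

import pandas as pd

# Filenames follow the "{entity}_{project}_runs.pkl" pattern from the example output above.
runs_path = "./data/ml-moe_ft-scaling_runs.pkl"
histories_path = "./data/ml-moe_ft-scaling_histories.pkl"

with open(runs_path, "rb") as f:
    runs = pickle.load(f)  # list[dict]: one serialized run per entry

runs_df = pd.DataFrame(runs)  # assumes each run serializes to a mostly flat dict
print(runs_df.shape)

# Histories are only written when the download was not --runs-only.
with open(histories_path, "rb") as f:
    histories = pickle.load(f)  # list[list[dict]]: history rows grouped by run

history_df = pd.concat(
    (pd.DataFrame(rows) for rows in histories if rows),
    ignore_index=True,
)
print(history_df.head())
```
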
dr_wandb-0.1.2/README.md ADDED
@@ -0,0 +1,165 @@
+# dr_wandb
+
+A command-line utility for downloading and archiving Weights & Biases experiment data to local storage formats optimized for offline analysis.
+
+
+## Installation
+
+CLI Tool Install: `wandb-download`
+```
+uv tool install dr_wandb
+```
+
+Or, to use the library functions:
+```bash
+# To use the library functions
+uv add dr_wandb
+# Optionally
+uv add dr_wandb[postgres]
+uv sync
+```
+
+### Authentication
+
+Configure Weights & Biases authentication using one of these methods:
+
+```bash
+wandb login
+```
+
+Or set the API key as an environment variable:
+
+```bash
+export WANDB_API_KEY=your_api_key_here
+```
+
+## Quickstart
+
+The default approach doesn't involve Postgres: it fetches the runs, and optionally their histories, and dumps them to local `.pkl` files.
+
+```bash
+» wandb-download --help
+
+Usage: wandb-download [OPTIONS] ENTITY PROJECT OUTPUT_DIR
+
+╭─ Arguments ────────────────────────────────────────────────────────────────────────╮
+│ *    entity          TEXT  [required]                                              │
+│ *    project         TEXT  [required]                                              │
+│ *    output_dir      TEXT  [required]                                              │
+╰────────────────────────────────────────────────────────────────────────────────────╯
+╭─ Options ──────────────────────────────────────────────────────────────────────────╮
+│ --runs-only             --no-runs-only      [default: no-runs-only]                │
+│ --runs-per-page                    INTEGER  [default: 500]                         │
+│ --log-every                        INTEGER  [default: 20]                          │
+│ --install-completion                        Install completion for the current shell. │
+│ --show-completion                           Show completion for the current shell, to copy it or customize the installation. │
+│ --help                                      Show this message and exit.            │
+╰────────────────────────────────────────────────────────────────────────────────────╯
+```
+
+An example:
+```bash
+» wandb-download --runs-only "ml-moe" "ft-scaling" "./data"
+2025-11-10 21:47:54 - INFO -
+:: Beginning Dr. Wandb Project Downloading Tool ::
+
+2025-11-10 21:47:54 - INFO - {
+    "entity": "ml-me",
+    "project": "scaling",
+    "output_dir": "data",
+    "runs_only": true,
+    "runs_per_page": 500,
+    "log_every": 20,
+    "runs_output_filename": "ml-me_scaling_runs.pkl",
+    "histories_output_filename": "ml-me_scaling_histories.pkl"
+}
+2025-11-10 21:47:54 - INFO -
+2025-11-10 21:47:54 - INFO - >> Downloading runs, this will take a while (minutes)
+wandb: Currently logged in as: danielle-rothermel (ml-moe) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+2025-11-10 21:48:00 - INFO -  - total runs found: 517
+2025-11-10 21:48:00 - INFO - >> Serializing runs and maybe getting histories: False
+2025-11-10 21:48:07 - INFO - >> 20/517: 2025_08_21-08_24_43_test_finetune_DD-dolma1_7-10M_main_1Mtx1_--learning_rate=5e-05
+2025-11-10 21:48:12 - INFO - >> 40/517: 2025_08_21-08_24_43_test_finetune_DD-dolma1_7-150M_main_10Mtx1_--learning_rate=5e-06
+...
+2025-11-10 21:50:46 - INFO - >> Dumped runs data to: ./data/ml-moe_ft-scaling_runs.pkl
+2025-11-10 21:50:46 - INFO - >> Runs only, not dumping histories to: ./data/ml-moe_ft-scaling_histories.pkl
+```
+
+
+## Very Alpha: Postgres Version
+
+**It's very likely this won't currently work.** Download all runs from a Weights & Biases project:
+
+```bash
+uv run python src/dr_wandb/cli/postgres_download.py --entity your_entity --project your_project
+
+Options:
+  --entity TEXT        WandB entity (username or team name)
+  --project TEXT       WandB project name
+  --runs-only          Download only run metadata, skip training history
+  --force-refresh      Download all data, ignoring existing records
+  --db-url TEXT        PostgreSQL connection string
+  --output-dir TEXT    Directory for exported Parquet files
+  --help               Show help message and exit
+```
+
+The tool creates a PostgreSQL database, downloads experiment data, and exports Parquet files to the configured output directory. The tool tracks existing data and downloads only new or updated runs by default. A run is considered for update if:
+
+- It does not exist in the local database
+- Its state is "running" (indicating potential new data)
+
+Use `--force-refresh` to download all runs regardless of existing data.
+
+### Environment Variables
+
+The tool reads configuration from environment variables with the `DR_WANDB_` prefix and supports `.env` files:
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `DR_WANDB_ENTITY` | Weights & Biases entity name | None |
+| `DR_WANDB_PROJECT` | Weights & Biases project name | None |
+| `DR_WANDB_DATABASE_URL` | PostgreSQL connection string | `postgresql+psycopg2://localhost/wandb` |
+| `DR_WANDB_OUTPUT_DIR` | Directory for exported files | `./data` |
+
+### Database Configuration
+
+The PostgreSQL connection string follows the standard format:
+
+```
+postgresql+psycopg2://username:password@host:port/database_name
+```
+
+If the specified database does not exist, the tool will attempt to create it automatically.
+
+### Data Schema
+
+The tool generates the following files in the output directory:
+
+- `runs_metadata.parquet` - Complete run metadata including configurations, summaries, and system information
+- `runs_history.parquet` - Training metrics and logged values over time
+- `runs_metadata_{component}.parquet` - Component-specific files for config, summary, wandb_metadata, system_metrics, system_attrs, and sweep_info
+
+**Run Records**
+- **run_id**: Unique identifier for the experiment run
+- **run_name**: Human-readable name assigned to the run
+- **state**: Current state (finished, running, crashed, failed, killed)
+- **project**: Project name
+- **entity**: Entity name
+- **created_at**: Timestamp of run creation
+- **config**: Experiment configuration parameters (JSONB)
+- **summary**: Final metrics and outputs (JSONB)
+- **wandb_metadata**: Platform-specific metadata (JSONB)
+- **system_metrics**: Hardware and system information (JSONB)
+- **system_attrs**: Additional system attributes (JSONB)
+- **sweep_info**: Hyperparameter sweep information (JSONB)
+
+**Training History Records**
+- **run_id**: Reference to the parent run
+- **step**: Training step number
+- **timestamp**: Time of metric logging
+- **runtime**: Elapsed time since run start
+- **wandb_metadata**: Platform logging metadata (JSONB)
+- **metrics**: All logged metrics and values (JSONB, flattened in Parquet export)

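The Data Schema section above describes the Parquet files that the Postgres workflow exports. A short sketch of reading them back with pandas (pandas and pyarrow are already dependencies); the file names and the `run_id`/`run_name`/`state` columns come from the schema above, while the join itself is only an illustration:

```python
from pathlib import Path

import pandas as pd

out = Path("./data")  # matches the DR_WANDB_OUTPUT_DIR default

metadata = pd.read_parquet(out / "runs_metadata.parquet")  # one row per run
history = pd.read_parquet(out / "runs_history.parquet")    # one row per logged step

# Component-specific slices, e.g. just the run configs:
configs = pd.read_parquet(out / "runs_metadata_config.parquet")

# Attach run-level labels to the step-level history via run_id.
merged = history.merge(
    metadata[["run_id", "run_name", "state"]],
    on="run_id",
    how="left",
)
print(merged.head())
```
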
{dr_wandb-0.1.1 → dr_wandb-0.1.2}/pyproject.toml CHANGED
@@ -1,6 +1,6 @@
 [project]
 name = "dr-wandb"
-version = "0.1.1"
+version = "0.1.2"
 description = "Interact with wandb from python"
 readme = "README.md"
 authors = [
@@ -9,15 +9,14 @@ authors = [
 requires-python = ">=3.12"
 dependencies = [
     "pandas>=2.3.2",
-    "psycopg2>=2.9.10",
     "pyarrow>=21.0.0",
-    "
-    "sqlalchemy>=2.0.43",
+    "typer>=0.20.0",
     "wandb>=0.21.4",
+    "sqlalchemy>=2.0.43",
 ]
 
 [project.scripts]
-wandb-download = "dr_wandb.cli.download:
+wandb-download = "dr_wandb.cli.download:app"
 
 [build-system]
 requires = ["hatchling"]
@@ -26,14 +25,25 @@ build-backend = "hatchling.build"
 [tool.hatch.metadata]
 allow-direct-references = true
 
+[tool.hatch.build]
+exclude = [
+    "docs",
+    "docs/**",
+]
+
+
 [tool.hatch.build.targets.wheel]
 packages = ["src/dr_wandb"]
 
 
 [dependency-groups]
 dev = [
+    "dr-wandb",
     "pytest>=8.4.1",
 ]
+postgres = [
+    "pydantic-settings>=2.10.1",
+]
 
 [tool.ruff]
 include = [
@@ -154,3 +164,6 @@ seaborn = "sns"
 polars = "pl"
 lightning = "L"
 "jax.numpy" = "jnp"
+
+[tool.uv.sources]
+dr-wandb = { workspace = true }

dr_wandb-0.1.2/src/dr_wandb/cli/download.py ADDED
@@ -0,0 +1,97 @@
+from typing import Any
+import logging
+from pydantic import BaseModel, Field, computed_field
+from pathlib import Path
+import typer
+import pickle
+
+from dr_wandb.fetch import fetch_project_runs
+
+app = typer.Typer()
+
+class ProjDownloadConfig(BaseModel):
+    entity: str
+    project: str
+    output_dir: Path = Field(
+        default_factory=lambda: (
+            Path(__file__).parent.parent.parent.parent / "data"
+        )
+    )
+    runs_only: bool = False
+    runs_per_page: int = 500
+    log_every: int = 20
+
+    runs_output_filename: str = Field(
+        default_factory=lambda data: (
+            f"{data['entity']}_{data['project']}_runs.pkl"
+        )
+    )
+    histories_output_filename: str = Field(
+        default_factory=lambda data: (
+            f"{data['entity']}_{data['project']}_histories.pkl"
+        )
+    )
+
+    def progress_callback(self, run_index: int, total_runs: int, message: str)-> None:
+        if run_index % self.log_every == 0:
+            logging.info(f">> {run_index}/{total_runs}: {message}")
+
+
+    @computed_field
+    @property
+    def fetch_runs_cfg(self) -> dict[str, Any]:
+        return {
+            "entity": self.entity,
+            "project": self.project,
+            "runs_per_page": self.runs_per_page,
+            "progress_callback": self.progress_callback,
+            "include_history": not self.runs_only,
+        }
+
+def setup_logging(level: str = "INFO") -> None:
+    logging.basicConfig(
+        level=getattr(logging, level.upper()),
+        format="%(asctime)s - %(levelname)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+
+
+@app.command()
+def download_project(
+    entity: str,
+    project: str,
+    output_dir: str,
+    runs_only: bool = False,
+    runs_per_page: int = 500,
+    log_every: int = 20,
+) -> None:
+    setup_logging()
+    logging.info("\n:: Beginning Dr. Wandb Project Downloading Tool ::\n")
+
+    cfg = ProjDownloadConfig(
+        entity=entity,
+        project=project,
+        output_dir=output_dir,
+        runs_only=runs_only,
+        runs_per_page=runs_per_page,
+        log_every=log_every,
+    )
+    logging.info(str(cfg.model_dump_json(indent=4, exclude="fetch_runs_cfg")))
+    logging.info("")
+
+    runs, histories = fetch_project_runs(**cfg.fetch_runs_cfg)
+    runs_filename = f"{output_dir}/{cfg.runs_output_filename}"
+    histories_filename = f"{output_dir}/{cfg.histories_output_filename}"
+    with open(runs_filename, 'wb') as run_file:
+        pickle.dump(runs, run_file)
+    logging.info(f">> Dumped runs data to: {runs_filename}")
+    if not cfg.runs_only:
+        with open(histories_filename, 'wb') as hist_file:
+            pickle.dump(histories, hist_file)
+        logging.info(f">> Dumped histories data to: {histories_filename}")
+    else:
+        logging.info(f">> Runs only, not dumping histories to: {histories_filename}")
+
+
+if __name__ == "__main__":
+    app()

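The `wandb-download` command above is a thin Typer wrapper around `dr_wandb.fetch.fetch_project_runs`. A rough sketch of calling it directly from Python, using only the keyword arguments and return shape visible in `download_project` above; the entity/project values and the callback body are placeholders:

```python
import pickle

from dr_wandb.fetch import fetch_project_runs


def log_progress(run_index: int, total_runs: int, message: str) -> None:
    # Same signature as ProjDownloadConfig.progress_callback above.
    if run_index % 20 == 0:
        print(f">> {run_index}/{total_runs}: {message}")


runs, histories = fetch_project_runs(
    entity="your_entity",      # placeholder
    project="your_project",    # placeholder
    runs_per_page=500,
    progress_callback=log_progress,
    include_history=True,
)

with open("your_entity_your_project_runs.pkl", "wb") as f:
    pickle.dump(runs, f)
```
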
{dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/constants.py CHANGED
@@ -1,6 +1,7 @@
 from collections.abc import Callable
 from typing import Literal
 
+from sqlalchemy import String
 from sqlalchemy.orm import DeclarativeBase
 
 
@@ -17,4 +18,6 @@ WANDB_RUN_STATES = ["finished", "running", "crashed", "failed", "killed"]
 type RunState = Literal["finished", "running", "crashed", "failed", "killed"]
 type RunId = str
 
+Base.type_annotation_map = {RunId: String}
+
 type ProgressCallback = Callable[[int, int, str], None]

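For context on the change above: in SQLAlchemy 2.0, a `DeclarativeBase`'s `type_annotation_map` controls which SQL type a Python annotation resolves to, so registering `RunId: String` lets ORM models annotate columns as `Mapped[RunId]` and get a string column. A small self-contained sketch of that mechanism, assuming a recent SQLAlchemy 2.0 release that resolves PEP 695 `type` aliases through the map (the `RunsTable` model is illustrative, not from the package):

```python
from sqlalchemy import String, create_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column

type RunId = str  # mirrors the alias in constants.py


class Base(DeclarativeBase):
    # Declared up front here; constants.py assigns the map onto its existing Base.
    type_annotation_map = {RunId: String}


class RunsTable(Base):  # illustrative model, not part of dr_wandb
    __tablename__ = "runs"

    run_id: Mapped[RunId] = mapped_column(primary_key=True)  # resolves to VARCHAR
    run_name: Mapped[str] = mapped_column(String(256))


engine = create_engine("sqlite:///:memory:")
Base.metadata.create_all(engine)  # emits CREATE TABLE runs (run_id VARCHAR NOT NULL, ...)
```
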
{dr_wandb-0.1.1 → dr_wandb-0.1.2}/src/dr_wandb/fetch.py CHANGED
@@ -6,6 +6,7 @@ from collections.abc import Callable, Iterator
 from typing import Any
 
 import wandb
+import logging
 
 from dr_wandb.history_entry_record import HistoryEntryRecord
 from dr_wandb.run_record import RunRecord
@@ -62,9 +63,12 @@ def fetch_project_runs(
     runs: list[dict[str, Any]] = []
     histories: list[list[dict[str, Any]]] = []
 
+    logging.info(">> Downloading runs, this will take a while (minutes)")
     run_iter = list(_iterate_runs(entity, project, runs_per_page=runs_per_page))
     total = len(run_iter)
+    logging.info(f" - total runs found: {total}")
 
+    logging.info(f">> Serializing runs and maybe getting histories: {include_history}")
     for index, run in enumerate(run_iter, start=1):
         runs.append(serialize_run(run))
         if include_history: