evoscientist-0.0.1.dev4-py3-none-any.whl → evoscientist-0.1.0rc2-py3-none-any.whl
- EvoScientist/EvoScientist.py +25 -61
- EvoScientist/__init__.py +0 -19
- EvoScientist/backends.py +0 -26
- EvoScientist/cli.py +1365 -480
- EvoScientist/middleware.py +7 -56
- EvoScientist/skills/clip/SKILL.md +253 -0
- EvoScientist/skills/clip/references/applications.md +207 -0
- EvoScientist/skills/langgraph-docs/SKILL.md +36 -0
- EvoScientist/skills/tensorboard/SKILL.md +629 -0
- EvoScientist/skills/tensorboard/references/integrations.md +638 -0
- EvoScientist/skills/tensorboard/references/profiling.md +545 -0
- EvoScientist/skills/tensorboard/references/visualization.md +620 -0
- EvoScientist/skills/vllm/SKILL.md +364 -0
- EvoScientist/skills/vllm/references/optimization.md +226 -0
- EvoScientist/skills/vllm/references/quantization.md +284 -0
- EvoScientist/skills/vllm/references/server-deployment.md +255 -0
- EvoScientist/skills/vllm/references/troubleshooting.md +447 -0
- EvoScientist/stream/__init__.py +0 -25
- EvoScientist/stream/utils.py +16 -23
- EvoScientist/tools.py +2 -75
- {evoscientist-0.0.1.dev4.dist-info → evoscientist-0.1.0rc2.dist-info}/METADATA +8 -153
- {evoscientist-0.0.1.dev4.dist-info → evoscientist-0.1.0rc2.dist-info}/RECORD +26 -24
- evoscientist-0.1.0rc2.dist-info/entry_points.txt +2 -0
- EvoScientist/config.py +0 -274
- EvoScientist/llm/__init__.py +0 -21
- EvoScientist/llm/models.py +0 -99
- EvoScientist/memory.py +0 -715
- EvoScientist/onboard.py +0 -725
- EvoScientist/paths.py +0 -44
- EvoScientist/skills_manager.py +0 -391
- EvoScientist/stream/display.py +0 -604
- EvoScientist/stream/events.py +0 -415
- EvoScientist/stream/state.py +0 -343
- evoscientist-0.0.1.dev4.dist-info/entry_points.txt +0 -5
- {evoscientist-0.0.1.dev4.dist-info → evoscientist-0.1.0rc2.dist-info}/WHEEL +0 -0
- {evoscientist-0.0.1.dev4.dist-info → evoscientist-0.1.0rc2.dist-info}/licenses/LICENSE +0 -0
- {evoscientist-0.0.1.dev4.dist-info → evoscientist-0.1.0rc2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,629 @@ EvoScientist/skills/tensorboard/SKILL.md

---
name: tensorboard
description: Visualize training metrics, debug models with histograms, compare experiments, visualize model graphs, and profile performance with TensorBoard - Google's ML visualization toolkit
version: 1.0.0
author: Orchestra Research
license: MIT
tags: [MLOps, TensorBoard, Visualization, Training Metrics, Model Debugging, PyTorch, TensorFlow, Experiment Tracking, Performance Profiling]
dependencies: [tensorboard, torch, tensorflow]
---

# TensorBoard: Visualization Toolkit for ML

## When to Use This Skill

Use TensorBoard when you need to:
- **Visualize training metrics** like loss and accuracy over time
- **Debug models** with histograms and distributions
- **Compare experiments** across multiple runs
- **Visualize model graphs** and architecture
- **Project embeddings** to lower dimensions (t-SNE, PCA)
- **Track hyperparameter experiments**
- **Profile performance** and identify bottlenecks
- **Visualize images and text** during training

**Users**: 20M+ downloads/year | **GitHub Stars**: 27k+ | **License**: Apache 2.0

## Installation

```bash
# Install TensorBoard
pip install tensorboard

# PyTorch integration
pip install torch torchvision tensorboard

# TensorFlow integration (TensorBoard included)
pip install tensorflow

# Launch TensorBoard
tensorboard --logdir=runs
# Access at http://localhost:6006
```
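
The `tensorboard` package also ships notebook magics, so in Jupyter or Colab the dashboard can run inline (the `runs` directory below matches the example above):

```python
# In a notebook cell: load the extension, then embed the dashboard inline
%load_ext tensorboard
%tensorboard --logdir runs
```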

## Quick Start

### PyTorch

```python
from torch.utils.tensorboard import SummaryWriter

# Create writer
writer = SummaryWriter('runs/experiment_1')

# Training loop
for epoch in range(10):
    train_loss = train_epoch()
    val_acc = validate()

    # Log metrics
    writer.add_scalar('Loss/train', train_loss, epoch)
    writer.add_scalar('Accuracy/val', val_acc, epoch)

# Close writer
writer.close()

# Launch: tensorboard --logdir=runs
```

### TensorFlow/Keras

```python
import tensorflow as tf

# Create callback
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir='logs/fit',
    histogram_freq=1
)

# Train model
model.fit(
    x_train, y_train,
    epochs=10,
    validation_data=(x_val, y_val),
    callbacks=[tensorboard_callback]
)

# Launch: tensorboard --logdir=logs
```

## Core Concepts

### 1. SummaryWriter (PyTorch)

```python
from torch.utils.tensorboard import SummaryWriter

# Default directory: runs/CURRENT_DATETIME
writer = SummaryWriter()

# Custom directory
writer = SummaryWriter('runs/experiment_1')

# Custom comment (appended to the default directory name)
writer = SummaryWriter(comment='baseline')

# Log data (the step argument is named global_step)
writer.add_scalar('Loss/train', 0.5, global_step=0)
writer.add_scalar('Loss/train', 0.3, global_step=1)

# Flush and close
writer.flush()
writer.close()
```

### 2. Logging Scalars

```python
# PyTorch
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

for epoch in range(100):
    train_loss, train_acc = train()
    val_loss, val_acc = validate()

    # Log individual metrics
    writer.add_scalar('Loss/train', train_loss, epoch)
    writer.add_scalar('Loss/val', val_loss, epoch)
    writer.add_scalar('Accuracy/train', train_acc, epoch)
    writer.add_scalar('Accuracy/val', val_acc, epoch)

    # Learning rate
    lr = optimizer.param_groups[0]['lr']
    writer.add_scalar('Learning_rate', lr, epoch)

writer.close()
```

```python
# TensorFlow
import tensorflow as tf

train_summary_writer = tf.summary.create_file_writer('logs/train')
val_summary_writer = tf.summary.create_file_writer('logs/val')

for epoch in range(100):
    with train_summary_writer.as_default():
        tf.summary.scalar('loss', train_loss, step=epoch)
        tf.summary.scalar('accuracy', train_acc, step=epoch)

    with val_summary_writer.as_default():
        tf.summary.scalar('loss', val_loss, step=epoch)
        tf.summary.scalar('accuracy', val_acc, step=epoch)
```

### 3. Logging Multiple Scalars

```python
# PyTorch: Group related metrics
writer.add_scalars('Loss', {
    'train': train_loss,
    'validation': val_loss,
    'test': test_loss
}, epoch)

writer.add_scalars('Metrics', {
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1': f1_score
}, epoch)
```

Note that `add_scalars` spawns a separate event-file writer per dictionary key, so the log directory gains one subfolder per key; prefer individual `add_scalar` calls with a shared tag prefix when that matters.

### 4. Logging Images

```python
# PyTorch
import torch
from torchvision.utils import make_grid

# Single image
writer.add_image('Input/sample', img_tensor, epoch)

# Multiple images as grid
img_grid = make_grid(images[:64], nrow=8)
writer.add_image('Batch/inputs', img_grid, epoch)

# Predictions visualization
pred_grid = make_grid(predictions[:16], nrow=4)
writer.add_image('Predictions', pred_grid, epoch)
```

```python
# TensorFlow
import tensorflow as tf

with file_writer.as_default():
    # Up to 25 images per step; tensors are encoded as PNG internally
    tf.summary.image('Training samples', images, step=epoch, max_outputs=25)
```

### 5. Logging Histograms

```python
# PyTorch: Track weight distributions
for name, param in model.named_parameters():
    writer.add_histogram(name, param, epoch)

    # Track gradients (populated after loss.backward())
    if param.grad is not None:
        writer.add_histogram(f'{name}.grad', param.grad, epoch)

# Track activations
writer.add_histogram('Activations/relu1', activations, epoch)
```

```python
# TensorFlow
with file_writer.as_default():
    tf.summary.histogram('weights/layer1', layer1.kernel, step=epoch)
    tf.summary.histogram('activations/relu1', activations, step=epoch)
```

### 6. Logging Model Graph

```python
# PyTorch
import torch

model = MyModel()
dummy_input = torch.randn(1, 3, 224, 224)

writer.add_graph(model, dummy_input)
writer.close()
```

```python
# TensorFlow (automatic with Keras)
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir='logs',
    write_graph=True
)

model.fit(x, y, callbacks=[tensorboard_callback])
```

## Advanced Features

### Embedding Projector

Visualize high-dimensional data (embeddings, features) in 2D/3D.

```python
import torch
from torch.utils.tensorboard import SummaryWriter

# Get embeddings (e.g., word embeddings, image features)
embeddings = model.get_embeddings(data)  # Shape: (N, embedding_dim)

# Metadata (labels for each point)
metadata = ['class_1', 'class_2', 'class_1', ...]

# Images (optional, for image embeddings)
label_images = torch.stack([img1, img2, img3, ...])

# Log to TensorBoard
writer.add_embedding(
    embeddings,
    metadata=metadata,
    label_img=label_images,
    global_step=epoch
)
```

**In TensorBoard:**
- Navigate to "Projector" tab
- Choose PCA, t-SNE, or UMAP visualization
- Search, filter, and explore clusters
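
Since `model.get_embeddings` above is only a placeholder, here is a minimal self-contained sketch (random data, made-up run name) for trying the Projector tab:

```python
import torch
from torch.utils.tensorboard import SummaryWriter

# 100 random 64-dim points, tagged with five synthetic class labels
writer = SummaryWriter('runs/projector_demo')
embeddings = torch.randn(100, 64)
metadata = [f'class_{i % 5}' for i in range(100)]
writer.add_embedding(embeddings, metadata=metadata, global_step=0)
writer.close()
```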

### Hyperparameter Tuning

```python
from torch.utils.tensorboard import SummaryWriter

# Try different hyperparameters
for lr in [0.001, 0.01, 0.1]:
    for batch_size in [16, 32, 64]:
        # Create unique run directory
        writer = SummaryWriter(f'runs/lr{lr}_bs{batch_size}')

        # Train and log
        for epoch in range(10):
            loss = train(lr, batch_size)
            writer.add_scalar('Loss/train', loss, epoch)

        # Log hyperparameters with the final metrics, after training
        writer.add_hparams(
            {'lr': lr, 'batch_size': batch_size},
            {'hparam/accuracy': final_acc, 'hparam/loss': final_loss}
        )

        writer.close()

# Compare in TensorBoard's "HParams" tab
```

### Text Logging

```python
# PyTorch: Log text (e.g., model predictions, summaries)
writer.add_text('Predictions', f'Epoch {epoch}: {predictions}', epoch)
writer.add_text('Config', str(config), 0)

# Log markdown tables
markdown_table = """
| Metric | Value |
|--------|-------|
| Accuracy | 0.95 |
| F1 Score | 0.93 |
"""
writer.add_text('Results', markdown_table, epoch)
```

### PR Curves

Precision-recall curves for classification; `add_pr_curve` expects per-class probabilities in [0, 1].

```python
from torch.utils.tensorboard import SummaryWriter

# Get class probabilities and labels
predictions = model(test_data).softmax(dim=1)  # Shape: (N, num_classes)
labels = test_labels                           # Shape: (N,)

# Log PR curve for each class
for i in range(num_classes):
    writer.add_pr_curve(
        f'PR_curve/class_{i}',
        labels == i,
        predictions[:, i],
        global_step=epoch
    )
```
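
To sanity-check the PR curve display without a trained model, a binary curve can be logged from synthetic data (a throwaway sketch; all names are made up):

```python
import torch
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('runs/pr_demo')
labels = torch.randint(0, 2, (100,)).bool()  # ground-truth binary labels
probs = torch.rand(100)                      # predicted probabilities in [0, 1]
writer.add_pr_curve('PR_curve/binary', labels, probs, global_step=0)
writer.close()
```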

## Integration Examples

### PyTorch Training Loop

```python
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
from torchvision.utils import make_grid

# Setup
writer = SummaryWriter('runs/resnet_experiment')
model = ResNet50()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Log model graph
dummy_input = torch.randn(1, 3, 224, 224)
writer.add_graph(model, dummy_input)

# Training loop
for epoch in range(50):
    model.train()
    train_loss = 0.0
    train_correct = 0

    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        pred = output.argmax(dim=1)
        train_correct += pred.eq(target).sum().item()

        # Log batch metrics (every 100 batches)
        if batch_idx % 100 == 0:
            global_step = epoch * len(train_loader) + batch_idx
            writer.add_scalar('Loss/train_batch', loss.item(), global_step)

    # Epoch metrics
    train_loss /= len(train_loader)
    train_acc = train_correct / len(train_loader.dataset)

    # Validation
    model.eval()
    val_loss = 0.0
    val_correct = 0

    with torch.no_grad():
        for data, target in val_loader:
            output = model(data)
            val_loss += criterion(output, target).item()
            pred = output.argmax(dim=1)
            val_correct += pred.eq(target).sum().item()

    val_loss /= len(val_loader)
    val_acc = val_correct / len(val_loader.dataset)

    # Log epoch metrics
    writer.add_scalars('Loss', {'train': train_loss, 'val': val_loss}, epoch)
    writer.add_scalars('Accuracy', {'train': train_acc, 'val': val_acc}, epoch)

    # Log learning rate
    writer.add_scalar('Learning_rate', optimizer.param_groups[0]['lr'], epoch)

    # Log histograms (every 5 epochs)
    if epoch % 5 == 0:
        for name, param in model.named_parameters():
            writer.add_histogram(name, param, epoch)

    # Log sample inputs from the last validation batch (every 10 epochs)
    if epoch % 10 == 0:
        sample_images = data[:8]
        writer.add_image('Sample_inputs', make_grid(sample_images), epoch)

writer.close()
```

### TensorFlow/Keras Training

```python
import tensorflow as tf

# Define model
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# TensorBoard callback
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir='logs/fit',
    histogram_freq=1,          # Log histograms every epoch
    write_graph=True,          # Visualize model graph
    write_images=True,         # Visualize weights as images
    update_freq='epoch',       # Log metrics every epoch
    profile_batch='500,520',   # Profile batches 500-520
    embeddings_freq=1          # Log embeddings every epoch
)

# Train
model.fit(
    x_train, y_train,
    epochs=10,
    validation_data=(x_val, y_val),
    callbacks=[tensorboard_callback]
)
```

## Comparing Experiments

### Multiple Runs

```bash
# Run experiments with different configs
python train.py --lr 0.001 --logdir runs/exp1
python train.py --lr 0.01 --logdir runs/exp2
python train.py --lr 0.1 --logdir runs/exp3

# View all runs together
tensorboard --logdir=runs
```

**In TensorBoard:**
- All runs appear in the same dashboard
- Toggle runs on/off for comparison
- Use regex to filter run names
- Overlay charts to compare metrics
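
The commands above assume `train.py` exposes `--lr` and `--logdir` flags; a minimal (hypothetical) wiring of those flags might look like:

```python
import argparse
from torch.utils.tensorboard import SummaryWriter

parser = argparse.ArgumentParser()
parser.add_argument('--lr', type=float, default=0.001)
parser.add_argument('--logdir', type=str, default='runs/exp')
args = parser.parse_args()

# One writer per run directory; the config lands in the run's Text tab
writer = SummaryWriter(args.logdir)
writer.add_text('Config', f'lr={args.lr}', 0)
```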

### Organizing Experiments

Hierarchical log directories keep related runs together:

```
runs/
├── baseline/
│   ├── run_1/
│   └── run_2/
├── improved/
│   ├── run_1/
│   └── run_2/
└── final/
    └── run_1/
```

```python
# Log with hierarchy
writer = SummaryWriter('runs/baseline/run_1')
```

## Best Practices

### 1. Use Descriptive Run Names

```python
# ✅ Good: Descriptive names
from datetime import datetime
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter(f'runs/resnet50_lr0.001_bs32_{timestamp}')

# ❌ Bad: Auto-generated names
writer = SummaryWriter()  # Creates runs/Jan01_12-34-56_hostname
```

### 2. Group Related Metrics

```python
# ✅ Good: Grouped metrics
writer.add_scalar('Loss/train', train_loss, step)
writer.add_scalar('Loss/val', val_loss, step)
writer.add_scalar('Accuracy/train', train_acc, step)
writer.add_scalar('Accuracy/val', val_acc, step)

# ❌ Bad: Flat namespace
writer.add_scalar('train_loss', train_loss, step)
writer.add_scalar('val_loss', val_loss, step)
```

### 3. Log Regularly but Not Too Often

```python
# ✅ Good: Log epoch metrics always, batch metrics occasionally
for epoch in range(100):
    for batch_idx, (data, target) in enumerate(train_loader):
        loss = train_step(data, target)

        # Log every 100 batches
        if batch_idx % 100 == 0:
            global_step = epoch * len(train_loader) + batch_idx
            writer.add_scalar('Loss/batch', loss, global_step)

    # Always log epoch metrics
    writer.add_scalar('Loss/epoch', epoch_loss, epoch)

# ❌ Bad: Log every batch (creates huge log files)
for batch in train_loader:
    writer.add_scalar('Loss', loss, step)  # Too frequent
```

### 4. Close Writer When Done

```python
# ✅ Good: Use context manager
with SummaryWriter('runs/exp1') as writer:
    for epoch in range(10):
        writer.add_scalar('Loss', loss, epoch)
# Automatically closes

# Or manually
writer = SummaryWriter('runs/exp1')
# ... logging ...
writer.close()
```

### 5. Use Separate Writers for Train/Val

```python
# ✅ Good: Separate log directories
train_writer = SummaryWriter('runs/exp1/train')
val_writer = SummaryWriter('runs/exp1/val')

train_writer.add_scalar('loss', train_loss, epoch)
val_writer.add_scalar('loss', val_loss, epoch)
```

## Performance Profiling

### TensorFlow Profiler

```python
# Enable profiling
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir='logs',
    profile_batch='10,20'  # Profile batches 10-20
)

model.fit(x, y, callbacks=[tensorboard_callback])

# View in TensorBoard Profile tab
# Shows: GPU utilization, kernel stats, memory usage, bottlenecks
```
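
The Profile tab itself is served by a separate TensorBoard plugin, distributed as the `tensorboard_plugin_profile` package:

```bash
pip install tensorboard_plugin_profile
```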

### PyTorch Profiler

```python
import torch.profiler as profiler

with profiler.profile(
    activities=[
        profiler.ProfilerActivity.CPU,
        profiler.ProfilerActivity.CUDA
    ],
    on_trace_ready=profiler.tensorboard_trace_handler('./runs/profiler'),
    record_shapes=True,
    with_stack=True
) as prof:
    for batch in train_loader:
        loss = train_step(batch)
        prof.step()

# View in TensorBoard Profile tab
```
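
For longer jobs, a profiling `schedule` keeps traces small by sampling a few active steps per cycle. A sketch assuming the same `train_loader`/`train_step` as above (TensorBoard renders PyTorch traces via the separately installed `torch-tb-profiler` plugin):

```python
import torch.profiler as profiler

# pip install torch-tb-profiler  -- TensorBoard plugin that renders PyTorch traces
with profiler.profile(
    activities=[profiler.ProfilerActivity.CPU, profiler.ProfilerActivity.CUDA],
    schedule=profiler.schedule(wait=1, warmup=1, active=3, repeat=2),
    on_trace_ready=profiler.tensorboard_trace_handler('./runs/profiler'),
) as prof:
    for step, batch in enumerate(train_loader):
        if step >= (1 + 1 + 3) * 2:
            break  # two full wait/warmup/active cycles are enough
        loss = train_step(batch)
        prof.step()  # advance the profiling schedule
```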

## Resources

- **Documentation**: https://www.tensorflow.org/tensorboard
- **PyTorch Integration**: https://pytorch.org/docs/stable/tensorboard.html
- **GitHub**: https://github.com/tensorflow/tensorboard (27k+ stars)
- **TensorBoard.dev**: https://tensorboard.dev (share experiments publicly)

## See Also

- `references/visualization.md` - Comprehensive visualization guide
- `references/profiling.md` - Performance profiling patterns
- `references/integrations.md` - Framework-specific integration examples