wafer-core 0.1.38__py3-none-any.whl → 0.1.39__py3-none-any.whl
- wafer_core/lib/trace_compare/fusion_analyzer.py +2 -0
- wafer_core/rollouts/_logging/__init__.py +5 -1
- wafer_core/rollouts/_logging/logging_config.py +95 -3
- wafer_core/rollouts/_logging/sample_handler.py +66 -0
- wafer_core/rollouts/_pytui/__init__.py +114 -0
- wafer_core/rollouts/_pytui/app.py +809 -0
- wafer_core/rollouts/_pytui/console.py +291 -0
- wafer_core/rollouts/_pytui/renderer.py +210 -0
- wafer_core/rollouts/_pytui/spinner.py +73 -0
- wafer_core/rollouts/_pytui/terminal.py +489 -0
- wafer_core/rollouts/_pytui/text.py +470 -0
- wafer_core/rollouts/_pytui/theme.py +241 -0
- wafer_core/rollouts/evaluation.py +142 -177
- wafer_core/rollouts/progress_app.py +395 -0
- wafer_core/rollouts/tui/DESIGN.md +251 -115
- wafer_core/rollouts/tui/monitor.py +64 -20
- wafer_core/tools/compile/__init__.py +30 -0
- wafer_core/tools/compile/compiler.py +314 -0
- wafer_core/tools/compile/modal_compile.py +359 -0
- wafer_core/tools/compile/tests/__init__.py +1 -0
- wafer_core/tools/compile/tests/test_compiler.py +675 -0
- wafer_core/tools/compile/tests/test_data/utils.cuh +10 -0
- wafer_core/tools/compile/tests/test_data/vector_add.cu +7 -0
- wafer_core/tools/compile/tests/test_data/with_header.cu +9 -0
- wafer_core/tools/compile/tests/test_modal_integration.py +326 -0
- wafer_core/tools/compile/types.py +117 -0
- {wafer_core-0.1.38.dist-info → wafer_core-0.1.39.dist-info}/METADATA +1 -1
- {wafer_core-0.1.38.dist-info → wafer_core-0.1.39.dist-info}/RECORD +29 -12
- wafer_core/rollouts/events.py +0 -240
- wafer_core/rollouts/progress_display.py +0 -476
- wafer_core/utils/event_streaming.py +0 -63
- {wafer_core-0.1.38.dist-info → wafer_core-0.1.39.dist-info}/WHEEL +0 -0
wafer_core/rollouts/tui/DESIGN.md

@@ -3,152 +3,288 @@
 ## Goal

 Single pattern for GEPA, RL training, and normal eval:
-1. Process emits
-2.
-3.
+1. Process emits structured log records via Python `logging`
+2. Logging handlers route records to JSONL files (overview + per-sample)
+3. TUI tails files and renders progress
+4. Completely decoupled - process doesn't know if anyone is watching

-## Current
+## Current state (what's wrong)

-
-|------|---------------|-----|
-| RL Training | `remote_runner.py` wraps process + tails logs → stdout | `monitor.py` reads stdin |
-| Eval | `MultiProgress` embedded in `evaluate()` | N/A (MultiProgress IS the UI) |
-| GEPA | Nested `MultiProgress` instances fight | Broken |
+Three parallel emission paths that don't compose:

-
+- **`on_chunk` callbacks** (in-process) -> updates `MultiProgress` directly
+- **`EventEmitter`** (custom file writer) -> writes `events.jsonl`
+- **Python `logging`** (standard) -> consumed by `TrainingMonitor` when JSONL-formatted

-
-
-
-3. `MultiProgress` becomes one possible renderer of events
+`MultiProgress` is embedded inside `evaluate()`. `EventEmitter` is a second logging
+system with its own file handle, format, and context variable. The TUI monitor's
+`parse_jsonl_line()` sniffs line shape to figure out which system produced it.

-
-
-
-
-
-
-│ emit_event({"type": "modal_progress", "phase": "compile"})│
-│ emit_event({"type": "sample_end", "id": "001", ...})      │
-│                                                           │
-│   ↓ writes to                                             │
-│   {output_dir}/events.jsonl                               │
-└─────────────────────────────────────────────────────────────┘
-                │
-                │ tail -f (or pipe)
-                ▼
-┌─────────────────────────────────────────────────────────────┐
-│ TUI (rollouts.tui.watch)                                    │
-│                                                             │
-│ Renders based on event type:                                │
-│ - sample_start/end → progress bar                           │
-│ - turn → update turn count                                  │
-│ - modal_progress → show phase (compiling, checking, etc)    │
-│ - gepa_iteration → show GEPA progress                       │
-│ - metrics → show charts/sparklines                          │
-│ - log → route to appropriate pane                           │
-└─────────────────────────────────────────────────────────────┘
-```
+Result: summary dashboard only (turns, phase status). No drill-down into running
+samples. No token counts. No streaming output.
+
+## Target state
+
+Python logging is the single source of truth. `EventEmitter` and `MultiProgress` deleted.

-
+### Emission
+
+`on_chunk` callback body calls `logger.info()`/`logger.debug()` with structured `extra={}`:

 ```python
-
-
-
-
-
-
-
-
-
-
-
-# RL training events (from training/grpo.py)
-{"type": "rl_step", "step": 10, "reward_mean": 0.65, "loss": 0.023}
-{"type": "rl_checkpoint", "step": 100, "path": "/checkpoints/step_100"}
-
-# Generic log events (from any logger)
-{"type": "log", "logger": "kernelbench", "level": "INFO", "message": "..."}
+async def on_chunk_with_sample_id(event):
+    if isinstance(event, TextDelta):
+        logger.debug("text_delta", extra={"sample_id": sid, "turn": turn, "text": event.text})
+    elif isinstance(event, LLMCallEnd):
+        logger.info("llm_call", extra={"sample_id": sid, "turn": turn,
+            "duration_ms": event.duration_ms, "tokens_in": event.tokens_in,
+            "tokens_out": event.tokens_out, "provider": event.provider})
+    elif isinstance(event, ToolExecutionEnd):
+        logger.info("tool_execution", extra={"sample_id": sid, "turn": turn,
+            "tool_name": event.tool_name, "duration_ms": event.duration_ms})
 ```

-
+Wide events pattern: one rich record per meaningful unit of work (per turn, per tool call,
+per LLM call). High-cardinality fields (`sample_id`, `turn`), high dimensionality
+(tokens, duration, tool name, status).
+
+### Log levels as detail tiers
+
+- **INFO**: `sample_start`, `sample_end`, `turn`, `llm_call`, `tool_execution`, `modal_progress`
+- **DEBUG**: `text_delta`, `thinking_delta`, `assistant_message` (full content), `tool_result` (full output)

-###
+### Routing via logging handlers

-
-
-
-
+```
+Root logger handlers (configured via dictConfig at eval startup):
+
+1. JSONLFileHandler -> {output_dir}/events.jsonl
+   - Filter: INFO+ only
+   - Purpose: overview dashboard
+
+2. SampleRoutingHandler -> {output_dir}/samples/{sample_id}.jsonl
+   - Filter: must have sample_id in extra
+   - Level: ALL (including DEBUG text deltas)
+   - Purpose: per-sample drill-down and streaming
+
+3. StreamHandler(stderr) -> human-readable
+   - For verbose/debug mode
+
+4. QueueHandler wrapping 1-3 for non-blocking writes
+```
+
+`SampleRoutingHandler` is a custom `logging.Handler` that inspects
+`record.sample_id` and writes to the appropriate per-sample file.
+
+File handles are opened lazily on first write and closed eagerly when
+`sample_end` is seen. Peak open FDs = `max_concurrent` samples, not total.
+The `QueueHandler` serializes all writes, so close-then-delete is safe.

 ```python
-
-class EventEmitter:
-    """Writes structured events to JSONL file."""
-
+class SampleRoutingHandler(logging.Handler):
     def __init__(self, output_dir: Path):
-
-
-
-
-
-
+        super().__init__()
+        self.output_dir = output_dir
+        self._files: dict[str, TextIO] = {}
+
+    def emit(self, record: logging.LogRecord):
+        sample_id = getattr(record, "sample_id", None)
+        if sample_id is None:
+            return
+        if sample_id not in self._files:
+            path = self.output_dir / "samples" / f"{sample_id}.jsonl"
+            path.parent.mkdir(parents=True, exist_ok=True)
+            self._files[sample_id] = open(path, "a")
+        self._files[sample_id].write(self.format(record) + "\n")
+        self._files[sample_id].flush()
+        # Close file handle when sample is done
+        if record.getMessage() == "sample_end":
+            self._files[sample_id].close()
+            del self._files[sample_id]
+
+    def close(self):
+        """Close all open file handles on shutdown."""
+        for f in self._files.values():
+            f.close()
+        self._files.clear()
+        super().close()
 ```

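For concreteness, a minimal sketch of the wiring a `setup_eval_logging()` could do — shown programmatically rather than via `dictConfig` for brevity. `SampleRoutingHandler` and `JSONFormatter` are the names used above; everything else is illustrative, not code from this package:

```python
import atexit
import logging
import logging.handlers
import queue
from pathlib import Path


def setup_eval_logging(output_dir: Path) -> None:
    """Overview JSONL (INFO+) plus per-sample JSONL (all levels), behind a queue."""
    overview = logging.FileHandler(output_dir / "events.jsonl")
    overview.setLevel(logging.INFO)
    overview.setFormatter(JSONFormatter())  # assumed from rollouts/_logging/json_formatter.py

    per_sample = SampleRoutingHandler(output_dir)
    per_sample.setLevel(logging.DEBUG)
    per_sample.setFormatter(JSONFormatter())

    # Non-blocking writes: producers only enqueue; one listener thread does file I/O.
    q: queue.Queue = queue.Queue(-1)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.addHandler(logging.handlers.QueueHandler(q))

    listener = logging.handlers.QueueListener(
        q, overview, per_sample, respect_handler_level=True
    )
    listener.start()
    atexit.register(listener.stop)
```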
-###
+### Consumption
+
+TUI tails files. Fully decoupled from eval process.
+
+- **Overview**: tail `events.jsonl` -> dashboard with N samples, turns, tokens, current state (tail loop sketched below)
+- **Drill-down**: select a sample -> tail `samples/{sample_id}.jsonl` -> streaming text, tool calls, results
+- **Live streaming**: text deltas are DEBUG-level in per-sample files, TUI renders as streaming text
+- **Replay**: same files work for post-hoc viewing

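A minimal sketch of the tailing loop such a consumer could use — plain file-position polling, no inotify; `follow()` is a hypothetical helper, not code from this package:

```python
import json
import time
from pathlib import Path
from typing import Iterator


def follow(path: Path, poll_s: float = 0.1) -> Iterator[dict]:
    """Yield parsed JSONL records as they are appended, tail -f style."""
    with open(path) as f:
        while True:
            line = f.readline()
            if not line:
                time.sleep(poll_s)  # at EOF: wait for the writer to append more
                continue
            yield json.loads(line)

# Overview pane: every record's "msg" field names the event type.
# for record in follow(output_dir / "events.jsonl"):
#     handle_event(record["msg"], record)
```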
-
-2. Renders progress based on event types
-3. Can show multiple concurrent samples (like current MultiProgress)
-4. Can show GEPA iteration progress in header
-5. Can show RL training metrics/charts
+### JSONL record format

-
-# Usage
-python -m rollouts.tui.watch /path/to/output/events.jsonl
+All records use Python logging's structure with `extra` fields flattened:

-
-
+```jsonl
+{"timestamp": "2025-01-15T10:23:45Z", "level": "INFO", "logger": "rollouts.eval", "msg": "sample_start", "sample_id": "sample_0001", "name": "Square_matmul"}
+{"timestamp": "2025-01-15T10:23:46Z", "level": "INFO", "logger": "rollouts.eval", "msg": "turn", "sample_id": "sample_0001", "turn": 1, "status": "streaming"}
+{"timestamp": "2025-01-15T10:23:46Z", "level": "DEBUG", "logger": "rollouts.eval", "msg": "text_delta", "sample_id": "sample_0001", "turn": 1, "text": "Let me analyze"}
+{"timestamp": "2025-01-15T10:23:50Z", "level": "INFO", "logger": "rollouts.eval", "msg": "llm_call", "sample_id": "sample_0001", "turn": 1, "duration_ms": 3400, "tokens_in": 2000, "tokens_out": 1200, "provider": "anthropic", "model": "claude-sonnet-4-20250514"}
+{"timestamp": "2025-01-15T10:24:35Z", "level": "INFO", "logger": "rollouts.eval", "msg": "tool_execution", "sample_id": "sample_0001", "turn": 1, "tool_name": "bash", "duration_ms": 45000, "result_summary": "exit 0"}
+{"timestamp": "2025-01-15T10:24:36Z", "level": "INFO", "logger": "rollouts.eval", "msg": "modal_progress", "sample_id": "sample_0001", "phase": "compiling"}
+{"timestamp": "2025-01-15T10:30:00Z", "level": "INFO", "logger": "rollouts.eval", "msg": "sample_end", "sample_id": "sample_0001", "score": 0.85, "duration_s": 120, "turns_used": 15}
 ```

-
+One format. No type sniffing. `msg` field is the event type discriminator.
+
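With one format, the boundary parser collapses to a few lines. A sketch of what the simplified parser could look like (the tuple return shape is illustrative, not the monitor's actual signature):

```python
import json


def parse_jsonl_line(line: str) -> tuple[str, dict] | None:
    """One format: the "msg" field is the event type, everything else is payload."""
    try:
        record = json.loads(line)
    except json.JSONDecodeError:
        return None  # partial line from an in-progress write; caller retries
    if not isinstance(record, dict) or "msg" not in record:
        return None
    return record["msg"], record
```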
+## Design decisions
+
+### EventEmitter is NOT already using Python logging
+
+Despite having `logger = logging.getLogger(__name__)` at the top of `events.py`,
+`EventEmitter` writes JSON directly to its own file handle (`self._file.write(json.dumps(event))`).
+It has its own context variable (`_emitter_ctx`), its own file lifecycle, and its own format
+(`{"type": "sample_start", ...}` vs logging's `{"message": "...", "level": "INFO", ...}`).
+
+`LoggingEventEmitter` exists as a subclass that also logs, but the base class is a
+completely separate channel. Phase 2 is a genuine replacement, not just a consolidation.
+
+### Phase 2 scope: what actually changes in on_chunk
+
+The current `on_chunk_with_sample_id` (evaluation.py:1006) does three things per event:

-1.
-2.
-3.
+1. Updates `MultiProgress` via `progress.update_task()` (display)
+2. Calls `emit_event()` which writes to `EventEmitter`'s file handle (persistence)
+3. Wraps the event with `sample_id` and forwards to `base_on_chunk` (frontend streaming)

-
+The conversion:
+- (1) is deleted entirely - no more `MultiProgress`
+- (2) becomes `logger.info()`/`logger.debug()` calls with `extra={}`. The existing
+  event mapping logic (`LLMCallEnd` -> `llm_call`, `ToolExecutionEnd` -> `tool_execution`,
+  `TextEnd` -> `assistant_message`) is preserved, just targeting logging instead of `emit_event()`
+- (3) stays for now - `base_on_chunk` is used by `TUIFrontend` for interactive single-agent
+  mode. It can be removed later when the TUI reads from files instead of callbacks.
+- NEW: `logger.debug("text_delta", ...)` for streaming content (not emitted today)
+
+The `last_status` dedup logic and `current_turn` tracking stay - they're still needed
+to avoid flooding the log with redundant status updates.
+
+### TUI backward compatibility
+
+OK to break the old `events.jsonl` format. This is an internal tool, not a user-facing API.
+Old eval result directories will have the old format, but we don't need to support replaying
+them - the per-sample result JSON files and trajectories are the durable record.
+
+### Testing strategy
+
+- **Phase 1** (SampleRoutingHandler): Unit tests. Write log records with `extra={"sample_id": "x"}`,
+  assert correct files created, correct content, FD cleanup on `sample_end` (sketched below).
+- **Phase 2** (on_chunk conversion): Integration test. Run a small eval (1-2 samples, 2-3 turns),
+  assert `events.jsonl` and `samples/*.jsonl` have expected records with expected fields.
+  Compare against a snapshot of the current `emit_event` output to verify no data loss.
+- **Phase 3** (TUI): Manual verification. The TUI is a visual tool - automated tests for
+  ANSI rendering are brittle and low-value. Test the JSONL parsing logic (the `parse_jsonl_line`
+  equivalent) with unit tests against example records.
+
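A sketch of what that Phase 1 unit test could look like under pytest, exercising the handler as defined above (`tmp_path` is pytest's built-in fixture; the import location is an assumption):

```python
import logging


def test_sample_routing_handler(tmp_path):
    handler = SampleRoutingHandler(tmp_path)  # assumed importable from rollouts._logging
    handler.setFormatter(logging.Formatter("%(message)s"))  # plain messages suffice here

    logger = logging.getLogger("test.sample_routing")
    logger.setLevel(logging.DEBUG)
    logger.propagate = False
    logger.addHandler(handler)
    try:
        logger.info("sample_start", extra={"sample_id": "x"})
        logger.info("sample_end", extra={"sample_id": "x"})
        # sample_end should have closed and released the file handle
        assert handler._files == {}
    finally:
        logger.removeHandler(handler)
        handler.close()

    path = tmp_path / "samples" / "x.jsonl"
    assert path.read_text().splitlines() == ["sample_start", "sample_end"]
```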
+## Existing infrastructure
+
+Most of Phase 1 already exists in `rollouts/_logging/`:
+
+- `json_formatter.py`: `JSONFormatter` that flattens `extra` fields onto JSONL records.
+  Already strips builtin `LogRecord` attrs and includes only custom extras (sketched below).
+- `logging_config.py`: `setup_logging()` using `dictConfig` with `QueueHandler`+`QueueListener`
+  (mCoding pattern), `RotatingFileHandler` for bounded JSONL files, `atexit` cleanup.
+- `color_formatter.py`: ANSI color formatter for human-readable stderr output.
+
+What's missing: `SampleRoutingHandler` and a `setup_eval_logging()` that wires it
+into the existing `setup_logging()` config.
+
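For reference, the flattening idea in miniature — a sketch, not the actual `json_formatter.py`: diff the record's `__dict__` against the attributes every `LogRecord` carries, and treat the remainder as extras:

```python
import datetime as dt
import json
import logging

# Attributes present on every LogRecord; anything else arrived via extra={}.
_BUILTIN_ATTRS = set(vars(logging.LogRecord("", 0, "", 0, "", (), None))) | {"message", "asctime"}


class JSONFormatter(logging.Formatter):
    def format(self, record: logging.LogRecord) -> str:
        out = {
            "timestamp": dt.datetime.fromtimestamp(record.created, dt.timezone.utc).isoformat(),
            "level": record.levelname,
            "logger": record.name,
            "msg": record.getMessage(),
        }
        # Flatten custom extras (sample_id, turn, tokens_in, ...) onto the record.
        out.update({k: v for k, v in vars(record).items() if k not in _BUILTIN_ATTRS})
        return json.dumps(out)
```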
+## Implementation plan
+
+### Phase 1: Logging infrastructure
+
+- Write `SampleRoutingHandler` (routes records to per-sample files based on `extra["sample_id"]`)
+- Extend `setup_logging()` or write `setup_eval_logging(output_dir)` that adds the routing handler
+  to the existing dictConfig alongside the overview JSONL file handler
+- Tests for routing handler
+
+### Phase 2: Replace EventEmitter in evaluate_sample
+
+- Convert `on_chunk_with_sample_id` to use `logger.info()`/`logger.debug()` instead of `emit_event()` + `progress.update_task()`
+- Convert `sample_start`/`sample_end` emissions to logging calls
+- Remove `EventEmitter` creation from `evaluate()`
+- Remove `MultiProgress` creation from `evaluate()`
+- Call `setup_eval_logging()` at the start of `evaluate()` instead
+
+### Phase 3: Update TUI to consume new format
+
+- Update `TrainingMonitor.parse_jsonl_line()` to handle unified format (simplify)
+- Add drill-down: selecting a sample tails its per-sample JSONL file
+- Add token counts to overview display (data now available from `llm_call` events)
+- Update `ProgressDisplay` or replace with new consumer
+
+### Phase 4: Cleanup
+
+- Delete `EventEmitter` class and gut `events.py`
+- Delete `MultiProgress` class
+- Delete `ProgressDisplay`
+- Update eval configs that reference `show_progress`, `verbose`, `EventEmitter`
+
+## File structure (target)

 ```
 rollouts/
-
-
-
-
-
-
-
+  _logging/
+    json_formatter.py     # ALREADY EXISTS - JSONFormatter with extra flattening
+    color_formatter.py    # ALREADY EXISTS - ANSI color formatter
+    logging_config.py     # ALREADY EXISTS - setup_logging() with dictConfig + QueueHandler
+    sample_handler.py     # NEW - SampleRoutingHandler
+  evaluation.py           # Uses logger (no MultiProgress, no EventEmitter)
+  tui/
+    watch.py              # Main TUI entry point (tails JSONL files)
+    monitor.py            # TrainingMonitor (simplified parse_jsonl_line)
+    terminal.py           # Terminal abstraction
+    traces.py             # Trace viewer (drill-down into sample)
 ```

-##
+## Code style principles for this refactor
+
+From ~/research/docs/code_style/:
+
+**Wide events** (logging_sucks.md): One rich record per meaningful unit of work. High-cardinality
+fields (`sample_id`, `turn`) make records queryable. High dimensionality (tokens, duration,
+tool name) means you can answer debugging questions without a second search. Don't scatter
+context across 20 log lines - build the event throughout the request lifecycle, emit once.
+
+**Python logging** (mcoding_logging_dense.md): `dictConfig` for configuration, not code.
+Handlers on root logger, let propagation work. `extra={}` for structured fields. Custom
+JSON formatter writes `.jsonl`. `QueueHandler` for non-blocking writes. Library code
+shouldn't configure logging - the eval runner configures, rollouts framework just logs.
+
+**Functional core, imperative shell** (favorites.md, code_philosophy_reference.md):
+`SampleRoutingHandler` is imperative shell (manages file handles, does I/O).
+`JSONFormatter._prepare_log_dict()` is functional core (record -> dict, pure transform).
+`on_chunk` mapping events to log calls is pure mapping, no state.
+
+**Don't reuse until 2+ examples** (casey_semantic_compression): Don't pre-abstract the
+handler routing. Start with `SampleRoutingHandler` for evals. If GEPA or RL training
+need similar per-entity routing, extract the pattern then.

-
-
-
-4. Remove `MultiProgress` from evaluation internals
-5. Update docs to recommend TUI approach
+**Classes for resources, functions for orchestration** (code_philosophy_reference.md):
+`SampleRoutingHandler` is a class because it owns file handles (resource lifecycle).
+`setup_eval_logging()` is a function. The `on_chunk` -> `logger.info()` mapping is a function.

-
+**Parse at the boundary** (tiger_style): The TUI's JSONL parser is the boundary.
+Internally, records are just dicts with a known schema. The `msg` field is the event
+type discriminator - no polymorphic sniffing needed.

-
-
-
-- Could support both via `--events-to stdout` flag
+**Existing infra** (nmoe_experiment_tracking.md): We already have `QueueHandler`+`QueueListener`
+(nmoe doesn't), `JSONFormatter` with extra flattening, and `RotatingFileHandler` for bounded
+files. `SampleRoutingHandler` is the only new piece.

-
-- Keep for backwards compat, but events.jsonl is the primary output
-- Eventually deprecate on_chunk in favor of event file
+## References

-
-
-
+- Wide events / canonical log lines: https://loggingsucks.com/
+- Python logging best practices: mCoding (dictConfig, QueueHandler, JSON formatter, extras)
+- Code philosophy: ~/research/docs/code_style/favorites.md, code_philosophy_reference.md
+- nmoe analysis: ~/research/docs/code_style/nmoe_experiment_tracking.md
+- Existing logging infra: rollouts/_logging/ (json_formatter.py, logging_config.py)
+- Library code shouldn't configure logging - eval runner configures, rollouts framework just logs

wafer_core/rollouts/tui/monitor.py

@@ -267,35 +267,46 @@ class TrainingMonitor:
         self._needs_redraw = True

     def _handle_eval_event(self, event_type: str, data: dict) -> None:
-        """Handle eval/GEPA events and update progress state.
+        """Handle eval/GEPA events and update progress state.
+
+        Field names: sample_id, sample_name, eval_name (not id/name, which
+        conflict with LogRecord builtins).
+        """
+        sample_id = data.get("sample_id", "")
+
         if event_type == "eval_start":
-            self._eval_name = data.get("
+            self._eval_name = data.get("eval_name", "eval")
             self._eval_total = data.get("total", 0)

         elif event_type == "sample_start":
-            sample_id = data.get("id", "")
             self._eval_samples[sample_id] = {
-                "name": data.get("
+                "name": data.get("sample_name", sample_id),
                 "turn": 0,
                 "phase": "",
                 "score": None,
+                "tokens_in": 0,
+                "tokens_out": 0,
             }
             if sample_id not in self._eval_sample_order:
                 self._eval_sample_order.append(sample_id)

         elif event_type == "turn":
-            sample_id = data.get("id", "")
             if sample_id in self._eval_samples:
-
-
+                if "turn" in data:
+                    self._eval_samples[sample_id]["turn"] = data["turn"]
+                if "status" in data:
+                    self._eval_samples[sample_id]["status"] = data["status"]

         elif event_type == "modal_progress":
-            sample_id = data.get("id", "")
             if sample_id in self._eval_samples:
                 self._eval_samples[sample_id]["phase"] = data.get("phase", "")

+        elif event_type == "llm_call":
+            if sample_id in self._eval_samples:
+                self._eval_samples[sample_id]["tokens_in"] += data.get("tokens_in", 0)
+                self._eval_samples[sample_id]["tokens_out"] += data.get("tokens_out", 0)
+
         elif event_type == "sample_end":
-            sample_id = data.get("id", "")
             if sample_id in self._eval_samples:
                 self._eval_samples[sample_id]["score"] = data.get("score")
                 self._eval_samples[sample_id]["phase"] = ""

@@ -364,19 +375,36 @@ class TrainingMonitor:
                 extra=data,
             )

-        # Check if this is an eval event (from events.
-
+        # Check if this is an eval event (from events.jsonl via logging)
+        # The "message" field is the event type discriminator (e.g. "sample_start")
+        # and the logger is "wafer.eval.events"
+        eval_event_types = {
+            "eval_start",
+            "eval_end",
+            "sample_start",
+            "sample_end",
+            "turn",
+            "modal_progress",
+            "llm_call",
+            "tool_execution",
+            "assistant_message",
+            "gepa_start",
+            "gepa_iteration",
+            "gepa_accepted",
+            "gepa_rejected",
+        }
+        event_type = message if message in eval_event_types else ""
         if event_type:
             self._handle_eval_event(event_type, data)
             # Also create a log line for the pane
+            sample_id = data.get("sample_id", "")
             if event_type in ("sample_start", "sample_end", "modal_progress", "turn"):
-                sample_id = data.get("id", "")
                 sample = self._eval_samples.get(sample_id, {})
-
+                sample_name = sample.get("name", sample_id)[:20]
                 if event_type == "sample_start":
                     return LogLine(
                         logger="eval",
-                        message=f"▶ {
+                        message=f"▶ {sample_name} started",
                         level="INFO",
                         extra=data,
                     )

@@ -384,7 +412,7 @@ class TrainingMonitor:
                     score = data.get("score", 0)
                     return LogLine(
                         logger="eval",
-                        message=f"✓ {
+                        message=f"✓ {sample_name} score={score:.2f}",
                         level="INFO",
                         extra=data,
                     )

@@ -392,10 +420,22 @@ class TrainingMonitor:
                     phase = data.get("phase", "")
                     return LogLine(
                         logger="modal",
-                        message=f" {
+                        message=f" {sample_name}: {phase}",
                         level="DEBUG",
                         extra=data,
                     )
+            elif event_type == "llm_call":
+                tokens_in = data.get("tokens_in", 0)
+                tokens_out = data.get("tokens_out", 0)
+                duration = data.get("duration_ms", 0)
+                sample = self._eval_samples.get(sample_id, {})
+                sample_name = sample.get("name", sample_id)[:20]
+                return LogLine(
+                    logger="eval",
+                    message=f" {sample_name}: llm {tokens_in}→{tokens_out} tok ({duration:.0f}ms)",
+                    level="DEBUG",
+                    extra=data,
+                )
         elif event_type.startswith("gepa_"):
             if event_type == "gepa_iteration":
                 return LogLine(

@@ -734,17 +774,21 @@ class TrainingMonitor:
             turn = sample.get("turn", 0)
             status = sample.get("status", "")

+            tokens_in = sample.get("tokens_in", 0)
+            tokens_out = sample.get("tokens_out", 0)
+            tok_str = f" {tokens_in}→{tokens_out}tok" if tokens_in or tokens_out else ""
+
             if score is not None:
                 # Completed
                 color = GREEN if score > 0.5 else YELLOW if score > 0 else RED
-                status_str = f"{color}✓{RESET} T:{turn} score={score:.2f}"
+                status_str = f"{color}✓{RESET} T:{turn}{tok_str} score={score:.2f}"
             elif phase:
                 # Modal eval in progress
-                status_str = f"{CYAN}{phase}...{RESET}"
+                status_str = f"{CYAN}{phase}...{RESET}{tok_str}"
             elif status == "streaming":
-                status_str = f"{DIM}streaming...{RESET}"
+                status_str = f"{DIM}streaming...{RESET}{tok_str}"
             else:
-                status_str = f"{DIM}T:{turn}{RESET}"
+                status_str = f"{DIM}T:{turn}{tok_str}{RESET}"

             lines.append(f" {name} {status_str}"[:width])

wafer_core/tools/compile/__init__.py

@@ -0,0 +1,30 @@
+"""Cloud CUDA compiler - Godbolt for CUDA.
+
+Send CUDA C++ code, get PTX/SASS back for inspection.
+"""
+
+from wafer_core.tools.compile.compiler import (
+    compile_cuda_local,
+    compile_cuda_remote,
+    request_to_dict,
+    response_from_dict,
+)
+from wafer_core.tools.compile.types import (
+    CompileRequest,
+    CompileResponse,
+    OutputFormat,
+    VALID_ARCHITECTURES,
+)
+
+__all__ = [
+    # Types
+    "CompileRequest",
+    "CompileResponse",
+    "OutputFormat",
+    "VALID_ARCHITECTURES",
+    # Functions
+    "compile_cuda_local",
+    "compile_cuda_remote",
+    "request_to_dict",
+    "response_from_dict",
+]