wafer-core 0.1.38__py3-none-any.whl → 0.1.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. wafer_core/lib/trace_compare/fusion_analyzer.py +2 -0
  2. wafer_core/rollouts/_logging/__init__.py +5 -1
  3. wafer_core/rollouts/_logging/logging_config.py +95 -3
  4. wafer_core/rollouts/_logging/sample_handler.py +66 -0
  5. wafer_core/rollouts/_pytui/__init__.py +114 -0
  6. wafer_core/rollouts/_pytui/app.py +809 -0
  7. wafer_core/rollouts/_pytui/console.py +291 -0
  8. wafer_core/rollouts/_pytui/renderer.py +210 -0
  9. wafer_core/rollouts/_pytui/spinner.py +73 -0
  10. wafer_core/rollouts/_pytui/terminal.py +489 -0
  11. wafer_core/rollouts/_pytui/text.py +470 -0
  12. wafer_core/rollouts/_pytui/theme.py +241 -0
  13. wafer_core/rollouts/evaluation.py +142 -177
  14. wafer_core/rollouts/progress_app.py +395 -0
  15. wafer_core/rollouts/tui/DESIGN.md +251 -115
  16. wafer_core/rollouts/tui/monitor.py +64 -20
  17. wafer_core/tools/compile/__init__.py +30 -0
  18. wafer_core/tools/compile/compiler.py +314 -0
  19. wafer_core/tools/compile/modal_compile.py +359 -0
  20. wafer_core/tools/compile/tests/__init__.py +1 -0
  21. wafer_core/tools/compile/tests/test_compiler.py +675 -0
  22. wafer_core/tools/compile/tests/test_data/utils.cuh +10 -0
  23. wafer_core/tools/compile/tests/test_data/vector_add.cu +7 -0
  24. wafer_core/tools/compile/tests/test_data/with_header.cu +9 -0
  25. wafer_core/tools/compile/tests/test_modal_integration.py +326 -0
  26. wafer_core/tools/compile/types.py +117 -0
  27. {wafer_core-0.1.38.dist-info → wafer_core-0.1.39.dist-info}/METADATA +1 -1
  28. {wafer_core-0.1.38.dist-info → wafer_core-0.1.39.dist-info}/RECORD +29 -12
  29. wafer_core/rollouts/events.py +0 -240
  30. wafer_core/rollouts/progress_display.py +0 -476
  31. wafer_core/utils/event_streaming.py +0 -63
  32. {wafer_core-0.1.38.dist-info → wafer_core-0.1.39.dist-info}/WHEEL +0 -0
@@ -3,152 +3,288 @@
  ## Goal

  Single pattern for GEPA, RL training, and normal eval:
- 1. Process emits JSONL events to a file
- 2. TUI tails the file and renders progress
- 3. Completely decoupled - process doesn't know if anyone is watching
+ 1. Process emits structured log records via Python `logging`
+ 2. Logging handlers route records to JSONL files (overview + per-sample)
+ 3. TUI tails files and renders progress
+ 4. Completely decoupled - process doesn't know if anyone is watching

- ## Current State
+ ## Current state (what's wrong)

- | Mode | Event emission | TUI |
- |------|---------------|-----|
- | RL Training | `remote_runner.py` wraps process + tails logs → stdout | `monitor.py` reads stdin |
- | Eval | `MultiProgress` embedded in `evaluate()` | N/A (MultiProgress IS the UI) |
- | GEPA | Nested `MultiProgress` instances fight | Broken |
+ Three parallel emission paths that don't compose:

- ## Target State
+ - **`on_chunk` callbacks** (in-process) -> updates `MultiProgress` directly
+ - **`EventEmitter`** (custom file writer) -> writes `events.jsonl`
+ - **Python `logging`** (standard) -> consumed by `TrainingMonitor` when JSONL-formatted

- All modes:
- 1. Write JSONL events to `{output_dir}/events.jsonl`
- 2. TUI tails that file (or stdin if piped)
- 3. `MultiProgress` becomes one possible renderer of events
+ `MultiProgress` is embedded inside `evaluate()`. `EventEmitter` is a second logging
+ system with its own file handle, format, and context variable. The TUI monitor's
+ `parse_jsonl_line()` sniffs line shape to figure out which system produced it.

- ```
- ┌─────────────────────────────────────────────────────────────┐
- │ Any Process (eval, GEPA, RL training) │
- │ │
- │ emit_event({"type": "sample_start", "id": "001", ...}) │
- │ emit_event({"type": "turn", "id": "001", "turn": 1}) │
- │ emit_event({"type": "modal_progress", "phase": "compile"})│
- │ emit_event({"type": "sample_end", "id": "001", ...}) │
- │ │
- │ ↓ writes to │
- │ {output_dir}/events.jsonl │
- └─────────────────────────────────────────────────────────────┘
-
- │ tail -f (or pipe)
-
- ┌─────────────────────────────────────────────────────────────┐
- │ TUI (rollouts.tui.watch) │
- │ │
- │ Renders based on event type: │
- │ - sample_start/end → progress bar │
- │ - turn → update turn count │
- │ - modal_progress → show phase (compiling, checking, etc) │
- │ - gepa_iteration → show GEPA progress │
- │ - metrics → show charts/sparklines │
- │ - log → route to appropriate pane │
- └─────────────────────────────────────────────────────────────┘
- ```
+ Result: summary dashboard only (turns, phase status). No drill-down into running
+ samples. No token counts. No streaming output.
+
+ ## Target state
+
+ Python logging is the single source of truth. `EventEmitter` and `MultiProgress` deleted.

- ## Event Types
+ ### Emission
+
+ `on_chunk` callback body calls `logger.info()`/`logger.debug()` with structured `extra={}`:

  ```python
- # Core eval events (from evaluation.py)
- {"type": "sample_start", "id": "001", "name": "Square_matmul", "total": 10}
- {"type": "turn", "id": "001", "turn": 1, "status": "streaming"}
- {"type": "modal_progress", "id": "001", "phase": "compiling"}  # or correctness, benchmarking
- {"type": "sample_end", "id": "001", "score": 0.85, "time_sec": 45.2}
-
- # GEPA events (from prompt_optimization/engine.py)
- {"type": "gepa_iteration", "iteration": 3, "evals_used": 12, "evals_budget": 50, "best_score": 0.42}
- {"type": "gepa_accepted", "old_score": 0.40, "new_score": 0.42}
- {"type": "gepa_rejected", "old_score": 0.42, "new_score": 0.38}
-
- # RL training events (from training/grpo.py)
- {"type": "rl_step", "step": 10, "reward_mean": 0.65, "loss": 0.023}
- {"type": "rl_checkpoint", "step": 100, "path": "/checkpoints/step_100"}
-
- # Generic log events (from any logger)
- {"type": "log", "logger": "kernelbench", "level": "INFO", "message": "..."}
+ async def on_chunk_with_sample_id(event):
+     if isinstance(event, TextDelta):
+         logger.debug("text_delta", extra={"sample_id": sid, "turn": turn, "text": event.text})
+     elif isinstance(event, LLMCallEnd):
+         logger.info("llm_call", extra={"sample_id": sid, "turn": turn,
+                     "duration_ms": event.duration_ms, "tokens_in": event.tokens_in,
+                     "tokens_out": event.tokens_out, "provider": event.provider})
+     elif isinstance(event, ToolExecutionEnd):
+         logger.info("tool_execution", extra={"sample_id": sid, "turn": turn,
+                     "tool_name": event.tool_name, "duration_ms": event.duration_ms})
  ```
46
 
75
- ## Implementation Plan
47
+ Wide events pattern: one rich record per meaningful unit of work (per turn, per tool call,
48
+ per LLM call). High-cardinality fields (`sample_id`, `turn`), high dimensionality
49
+ (tokens, duration, tool name, status).
50
+
51
+ ### Log levels as detail tiers
52
+
53
+ - **INFO**: `sample_start`, `sample_end`, `turn`, `llm_call`, `tool_execution`, `modal_progress`
54
+ - **DEBUG**: `text_delta`, `thinking_delta`, `assistant_message` (full content), `tool_result` (full output)
76
55
 
77
- ### Phase 1: Event emitter (no UI changes)
56
+ ### Routing via logging handlers
78
57
 
79
- 1. Add `EventEmitter` class that writes JSONL to file
80
- 2. Wire into `evaluate()` - emit events alongside existing `on_chunk` callbacks
81
- 3. Wire into GEPA engine - emit `gepa_iteration`, `gepa_accepted`, etc.
82
- 4. Wire into RL training - emit `rl_step`, etc.
58
+ ```
59
+ Root logger handlers (configured via dictConfig at eval startup):
60
+
61
+ 1. JSONLFileHandler -> {output_dir}/events.jsonl
62
+ - Filter: INFO+ only
63
+ - Purpose: overview dashboard
64
+
65
+ 2. SampleRoutingHandler -> {output_dir}/samples/{sample_id}.jsonl
66
+ - Filter: must have sample_id in extra
67
+ - Level: ALL (including DEBUG text deltas)
68
+ - Purpose: per-sample drill-down and streaming
69
+
70
+ 3. StreamHandler(stderr) -> human-readable
71
+ - For verbose/debug mode
72
+
73
+ 4. QueueHandler wrapping 1-3 for non-blocking writes
74
+ ```
75
+
76
+ `SampleRoutingHandler` is a custom `logging.Handler` that inspects
77
+ `record.sample_id` and writes to the appropriate per-sample file.
78
+
79
+ File handles are opened lazily on first write and closed eagerly when
80
+ `sample_end` is seen. Peak open FDs = `max_concurrent` samples, not total.
81
+ The `QueueHandler` serializes all writes, so close-then-delete is safe.
83
82
 
84
83
  ```python
- # rollouts/events.py
- class EventEmitter:
-     """Writes structured events to JSONL file."""
-
+ class SampleRoutingHandler(logging.Handler):
      def __init__(self, output_dir: Path):
-         self.file = open(output_dir / "events.jsonl", "a")
-
-     def emit(self, event: dict) -> None:
-         event["timestamp"] = datetime.now().isoformat()
-         self.file.write(json.dumps(event) + "\n")
-         self.file.flush()
+         super().__init__()
+         self.output_dir = output_dir
+         self._files: dict[str, TextIO] = {}
+
+     def emit(self, record: logging.LogRecord):
+         sample_id = getattr(record, "sample_id", None)
+         if sample_id is None:
+             return
+         if sample_id not in self._files:
+             path = self.output_dir / "samples" / f"{sample_id}.jsonl"
+             path.parent.mkdir(parents=True, exist_ok=True)
+             self._files[sample_id] = open(path, "a")
+         self._files[sample_id].write(self.format(record) + "\n")
+         self._files[sample_id].flush()
+         # Close file handle when sample is done
+         if record.getMessage() == "sample_end":
+             self._files[sample_id].close()
+             del self._files[sample_id]
+
+     def close(self):
+         """Close all open file handles on shutdown."""
+         for f in self._files.values():
+             f.close()
+         self._files.clear()
+         super().close()
  ```
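For concreteness, here is a minimal sketch of the `setup_eval_logging()` wiring this document calls for, assuming the handler above plus the existing `JSONFormatter`. The dotted class paths are assumptions based on the file layout described later in the document, and the `QueueHandler`/`QueueListener` wrapping from `setup_logging()` is omitted for brevity; this is illustrative, not the package's code.

```python
# Illustrative only: wiring SampleRoutingHandler next to an overview JSONL handler.
# Module paths and formatter name are assumptions, not verified against the package.
import logging.config
from pathlib import Path


def setup_eval_logging(output_dir: Path) -> None:
    logging.config.dictConfig({
        "version": 1,
        "disable_existing_loggers": False,
        "formatters": {
            "jsonl": {"()": "wafer_core.rollouts._logging.json_formatter.JSONFormatter"},
        },
        "handlers": {
            # Overview file: INFO+ only, one events.jsonl per run.
            "overview": {
                "class": "logging.FileHandler",
                "filename": str(output_dir / "events.jsonl"),
                "level": "INFO",
                "formatter": "jsonl",
            },
            # Per-sample drill-down: everything, including DEBUG text deltas.
            "per_sample": {
                "()": "wafer_core.rollouts._logging.sample_handler.SampleRoutingHandler",
                "output_dir": output_dir,
                "level": "DEBUG",
                "formatter": "jsonl",
            },
        },
        "root": {"level": "DEBUG", "handlers": ["overview", "per_sample"]},
    })
```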

- ### Phase 2: TUI consumer
+ ### Consumption
+
+ TUI tails files. Fully decoupled from eval process.
+
+ - **Overview**: tail `events.jsonl` -> dashboard with N samples, turns, tokens, current state
+ - **Drill-down**: select a sample -> tail `samples/{sample_id}.jsonl` -> streaming text, tool calls, results
+ - **Live streaming**: text deltas are DEBUG-level in per-sample files, TUI renders as streaming text
+ - **Replay**: same files work for post-hoc viewing
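As an illustration of the consumption side (not code from the package; the real consumer is `watch.py`/`monitor.py`), a minimal tailer can poll the JSONL file and dispatch on the `msg` discriminator:

```python
# Hypothetical consumer sketch; function names here are illustrative only.
import json
import time
from pathlib import Path
from typing import Iterator


def tail_jsonl(path: Path, poll_s: float = 0.25) -> Iterator[dict]:
    """Yield parsed records as they are appended; unparseable lines are skipped."""
    with open(path, "r") as f:
        while True:
            line = f.readline()
            if not line:
                time.sleep(poll_s)
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError:
                continue  # partial line mid-write; ignore and keep tailing


def render(record: dict) -> None:
    # Dispatch on the event-type discriminator ("msg" in the examples below).
    msg = record.get("msg", "")
    if msg == "sample_start":
        print(f"▶ {record.get('name', record.get('sample_id'))} started")
    elif msg == "llm_call":
        print(f"  llm {record.get('tokens_in')}→{record.get('tokens_out')} tok")
    elif msg == "sample_end":
        print(f"✓ {record.get('sample_id')} score={record.get('score')}")
```

The same loop works for replay, since a finished run's files are just JSONL that has stopped growing.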

- 1. Create `rollouts.tui.watch` that tails events.jsonl
- 2. Renders progress based on event types
- 3. Can show multiple concurrent samples (like current MultiProgress)
- 4. Can show GEPA iteration progress in header
- 5. Can show RL training metrics/charts
+ ### JSONL record format

- ```bash
- # Usage
- python -m rollouts.tui.watch /path/to/output/events.jsonl
+ All records use Python logging's structure with `extra` fields flattened:

- # Or with auto-discovery
- python -m rollouts.tui.watch --latest # finds most recent run
+ ```jsonl
+ {"timestamp": "2025-01-15T10:23:45Z", "level": "INFO", "logger": "rollouts.eval", "msg": "sample_start", "sample_id": "sample_0001", "name": "Square_matmul"}
+ {"timestamp": "2025-01-15T10:23:46Z", "level": "INFO", "logger": "rollouts.eval", "msg": "turn", "sample_id": "sample_0001", "turn": 1, "status": "streaming"}
+ {"timestamp": "2025-01-15T10:23:46Z", "level": "DEBUG", "logger": "rollouts.eval", "msg": "text_delta", "sample_id": "sample_0001", "turn": 1, "text": "Let me analyze"}
+ {"timestamp": "2025-01-15T10:23:50Z", "level": "INFO", "logger": "rollouts.eval", "msg": "llm_call", "sample_id": "sample_0001", "turn": 1, "duration_ms": 3400, "tokens_in": 2000, "tokens_out": 1200, "provider": "anthropic", "model": "claude-sonnet-4-20250514"}
+ {"timestamp": "2025-01-15T10:24:35Z", "level": "INFO", "logger": "rollouts.eval", "msg": "tool_execution", "sample_id": "sample_0001", "turn": 1, "tool_name": "bash", "duration_ms": 45000, "result_summary": "exit 0"}
+ {"timestamp": "2025-01-15T10:24:36Z", "level": "INFO", "logger": "rollouts.eval", "msg": "modal_progress", "sample_id": "sample_0001", "phase": "compiling"}
+ {"timestamp": "2025-01-15T10:30:00Z", "level": "INFO", "logger": "rollouts.eval", "msg": "sample_end", "sample_id": "sample_0001", "score": 0.85, "duration_s": 120, "turns_used": 15}
  ```

- ### Phase 3: Remove embedded UI
+ One format. No type sniffing. `msg` field is the event type discriminator.
+
+ ## Design decisions
+
+ ### EventEmitter is NOT already using Python logging
+
+ Despite having `logger = logging.getLogger(__name__)` at the top of `events.py`,
+ `EventEmitter` writes JSON directly to its own file handle (`self._file.write(json.dumps(event))`).
+ It has its own context variable (`_emitter_ctx`), its own file lifecycle, and its own format
+ (`{"type": "sample_start", ...}` vs logging's `{"message": "...", "level": "INFO", ...}`).
+
+ `LoggingEventEmitter` exists as a subclass that also logs, but the base class is a
+ completely separate channel. Phase 2 is a genuine replacement, not just a consolidation.
+
+ ### Phase 2 scope: what actually changes in on_chunk
+
+ The current `on_chunk_with_sample_id` (evaluation.py:1006) does three things per event:

- 1. Remove `MultiProgress` from `evaluate()`
- 2. Remove nested progress displays from GEPA
- 3. All progress viewing goes through TUI
+ 1. Updates `MultiProgress` via `progress.update_task()` (display)
+ 2. Calls `emit_event()` which writes to `EventEmitter`'s file handle (persistence)
+ 3. Wraps the event with `sample_id` and forwards to `base_on_chunk` (frontend streaming)

- ## File Structure
+ The conversion:
+ - (1) is deleted entirely - no more `MultiProgress`
+ - (2) becomes `logger.info()`/`logger.debug()` calls with `extra={}`. The existing
+   event mapping logic (`LLMCallEnd` -> `llm_call`, `ToolExecutionEnd` -> `tool_execution`,
+   `TextEnd` -> `assistant_message`) is preserved, just targeting logging instead of `emit_event()`
+ - (3) stays for now - `base_on_chunk` is used by `TUIFrontend` for interactive single-agent
+   mode. It can be removed later when the TUI reads from files instead of callbacks.
+ - NEW: `logger.debug("text_delta", ...)` for streaming content (not emitted today)
+
+ The `last_status` dedup logic and `current_turn` tracking stay - they're still needed
+ to avoid flooding the log with redundant status updates.
+
+ ### TUI backward compatibility
+
+ OK to break the old `events.jsonl` format. This is an internal tool, not a user-facing API.
+ Old eval result directories will have the old format, but we don't need to support replaying
+ them - the per-sample result JSON files and trajectories are the durable record.
+
+ ### Testing strategy
+
+ - **Phase 1** (SampleRoutingHandler): Unit tests. Write log records with `extra={"sample_id": "x"}`,
+   assert correct files created, correct content, FD cleanup on `sample_end`.
+ - **Phase 2** (on_chunk conversion): Integration test. Run a small eval (1-2 samples, 2-3 turns),
+   assert `events.jsonl` and `samples/*.jsonl` have expected records with expected fields.
+   Compare against a snapshot of the current `emit_event` output to verify no data loss.
+ - **Phase 3** (TUI): Manual verification. The TUI is a visual tool - automated tests for
+   ANSI rendering are brittle and low-value. Test the JSONL parsing logic (the `parse_jsonl_line`
+   equivalent) with unit tests against example records.
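A Phase 1 unit test could look roughly like the sketch below. It is illustrative: the import path is an assumption based on the file list in this release, and it only exercises the routing and FD-cleanup behaviour described above.

```python
# Illustrative pytest-style test for SampleRoutingHandler routing and FD cleanup.
# Import path is assumed from this release's file list (rollouts/_logging/sample_handler.py).
import logging

from wafer_core.rollouts._logging.sample_handler import SampleRoutingHandler


def make_record(msg: str, sample_id: str) -> logging.LogRecord:
    record = logging.LogRecord(
        name="rollouts.eval", level=logging.INFO, pathname=__file__,
        lineno=0, msg=msg, args=(), exc_info=None,
    )
    record.sample_id = sample_id  # this is what extra={"sample_id": ...} produces
    return record


def test_routes_to_per_sample_file(tmp_path):
    handler = SampleRoutingHandler(tmp_path)
    handler.setFormatter(logging.Formatter("%(message)s"))

    handler.emit(make_record("sample_start", "sample_0001"))
    handler.emit(make_record("sample_end", "sample_0001"))

    lines = (tmp_path / "samples" / "sample_0001.jsonl").read_text().splitlines()
    assert lines == ["sample_start", "sample_end"]
    assert handler._files == {}  # file handle closed when sample_end was seen
    handler.close()
```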
+
+ ## Existing infrastructure
+
+ Most of Phase 1 already exists in `rollouts/_logging/`:
+
+ - `json_formatter.py`: `JSONFormatter` that flattens `extra` fields onto JSONL records.
+   Already strips builtin `LogRecord` attrs and includes only custom extras.
+ - `logging_config.py`: `setup_logging()` using `dictConfig` with `QueueHandler`+`QueueListener`
+   (mCoding pattern), `RotatingFileHandler` for bounded JSONL files, `atexit` cleanup.
+ - `color_formatter.py`: ANSI color formatter for human-readable stderr output.
+
+ What's missing: `SampleRoutingHandler` and a `setup_eval_logging()` that wires it
+ into the existing `setup_logging()` config.
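The package's actual `JSONFormatter` is not part of this diff, but the "flatten extras, strip builtin attrs" idea it describes amounts to something like the following sketch (class name and field choices here are assumptions):

```python
# Sketch of "flatten extra fields onto the JSONL record" - not the package's JSONFormatter.
import datetime as dt
import json
import logging

# Attributes every LogRecord has; anything else on the record came from extra={...}.
_BUILTIN_ATTRS = set(vars(logging.LogRecord("", 0, "", 0, "", (), None)))


class FlatJSONFormatter(logging.Formatter):
    def format(self, record: logging.LogRecord) -> str:
        payload = {
            "timestamp": dt.datetime.fromtimestamp(
                record.created, tz=dt.timezone.utc
            ).isoformat(),
            "level": record.levelname,
            "logger": record.name,
            "msg": record.getMessage(),
        }
        # Custom extras ride along at the top level of the record.
        payload.update(
            {k: v for k, v in vars(record).items() if k not in _BUILTIN_ATTRS}
        )
        return json.dumps(payload, default=str)
```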
+
+ ## Implementation plan
+
+ ### Phase 1: Logging infrastructure
+
+ - Write `SampleRoutingHandler` (routes records to per-sample files based on `extra["sample_id"]`)
+ - Extend `setup_logging()` or write `setup_eval_logging(output_dir)` that adds the routing handler
+   to the existing dictConfig alongside the overview JSONL file handler
+ - Tests for routing handler
+
+ ### Phase 2: Replace EventEmitter in evaluate_sample
+
+ - Convert `on_chunk_with_sample_id` to use `logger.info()`/`logger.debug()` instead of `emit_event()` + `progress.update_task()`
+ - Convert `sample_start`/`sample_end` emissions to logging calls
+ - Remove `EventEmitter` creation from `evaluate()`
+ - Remove `MultiProgress` creation from `evaluate()`
+ - Call `setup_eval_logging()` at the start of `evaluate()` instead
+
+ ### Phase 3: Update TUI to consume new format
+
+ - Update `TrainingMonitor.parse_jsonl_line()` to handle unified format (simplify)
+ - Add drill-down: selecting a sample tails its per-sample JSONL file
+ - Add token counts to overview display (data now available from `llm_call` events)
+ - Update `ProgressDisplay` or replace with new consumer
+
+ ### Phase 4: Cleanup
+
+ - Delete `EventEmitter` class and gut `events.py`
+ - Delete `MultiProgress` class
+ - Delete `ProgressDisplay`
+ - Update eval configs that reference `show_progress`, `verbose`, `EventEmitter`
+
+ ## File structure (target)

  ```
  rollouts/
- ├── events.py          # EventEmitter - writes JSONL
- ├── evaluation.py      # Uses EventEmitter (no MultiProgress)
- ├── tui/
- │   ├── watch.py       # Main TUI entry point (tails events.jsonl)
- │   ├── monitor.py     # TrainingMonitor (rename to EventRenderer?)
- │   ├── progress.py    # Progress bar rendering (extracted from MultiProgress)
- │   └── terminal.py    # Terminal abstraction
+ _logging/
+   json_formatter.py    # ALREADY EXISTS - JSONFormatter with extra flattening
+   color_formatter.py   # ALREADY EXISTS - ANSI color formatter
+   logging_config.py    # ALREADY EXISTS - setup_logging() with dictConfig + QueueHandler
+   sample_handler.py    # NEW - SampleRoutingHandler
+ evaluation.py          # Uses logger (no MultiProgress, no EventEmitter)
+ tui/
+   watch.py             # Main TUI entry point (tails JSONL files)
+   monitor.py           # TrainingMonitor (simplified parse_jsonl_line)
+   terminal.py          # Terminal abstraction
+   traces.py            # Trace viewer (drill-down into sample)
  ```

- ## Migration Path
+ ## Code style principles for this refactor
+
+ From ~/research/docs/code_style/:
+
+ **Wide events** (logging_sucks.md): One rich record per meaningful unit of work. High-cardinality
+ fields (`sample_id`, `turn`) make records queryable. High dimensionality (tokens, duration,
+ tool name) means you can answer debugging questions without a second search. Don't scatter
+ context across 20 log lines - build the event throughout the request lifecycle, emit once.
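To make "build the event throughout the lifecycle, emit once" concrete, a small sketch of the pattern (illustrative; the field values below are made up and the function is hypothetical):

```python
# Illustrative wide-event sketch: accumulate fields during one unit of work,
# then emit a single rich record. Field names follow the examples in this document.
import logging
import time

logger = logging.getLogger("rollouts.eval")


def run_turn(sample_id: str, turn: int) -> None:
    event: dict = {"sample_id": sample_id, "turn": turn, "tool_calls": 0}
    started = time.monotonic()

    # ... do the work, enriching the same dict as facts become known ...
    event["tokens_in"] = 2000       # hypothetical values
    event["tokens_out"] = 1200
    event["tool_calls"] += 1
    event["status"] = "ok"

    event["duration_ms"] = int((time.monotonic() - started) * 1000)
    # One wide record instead of a dozen scattered log lines.
    logger.info("turn", extra=event)
```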
+
+ **Python logging** (mcoding_logging_dense.md): `dictConfig` for configuration, not code.
+ Handlers on root logger, let propagation work. `extra={}` for structured fields. Custom
+ JSON formatter writes `.jsonl`. `QueueHandler` for non-blocking writes. Library code
+ shouldn't configure logging - the eval runner configures, rollouts framework just logs.
+
+ **Functional core, imperative shell** (favorites.md, code_philosophy_reference.md):
+ `SampleRoutingHandler` is imperative shell (manages file handles, does I/O).
+ `JSONFormatter._prepare_log_dict()` is functional core (record -> dict, pure transform).
+ `on_chunk` mapping events to log calls is pure mapping, no state.
+
+ **Don't reuse until 2+ examples** (casey_semantic_compression): Don't pre-abstract the
+ handler routing. Start with `SampleRoutingHandler` for evals. If GEPA or RL training
+ need similar per-entity routing, extract the pattern then.

- 1. Add EventEmitter alongside existing code (non-breaking)
- 2. Add `rollouts.tui.watch` (new capability)
- 3. Deprecate `show_progress` flag
- 4. Remove `MultiProgress` from evaluation internals
- 5. Update docs to recommend TUI approach
+ **Classes for resources, functions for orchestration** (code_philosophy_reference.md):
+ `SampleRoutingHandler` is a class because it owns file handles (resource lifecycle).
+ `setup_eval_logging()` is a function. The `on_chunk` -> `logger.info()` mapping is a function.

- ## Open Questions
+ **Parse at the boundary** (tiger_style): The TUI's JSONL parser is the boundary.
+ Internally, records are just dicts with a known schema. The `msg` field is the event
+ type discriminator - no polymorphic sniffing needed.

- 1. Should events go to stdout (like RL training) or file (like our GEPA sketch)?
-    - File is simpler for local runs
-    - Stdout works better for remote/piped scenarios
-    - Could support both via `--events-to stdout` flag
+ **Existing infra** (nmoe_experiment_tracking.md): We already have `QueueHandler`+`QueueListener`
+ (nmoe doesn't), `JSONFormatter` with extra flattening, and `RotatingFileHandler` for bounded
+ files. `SampleRoutingHandler` is the only new piece.

- 2. How to handle existing `on_chunk` callbacks?
-    - Keep for backwards compat, but events.jsonl is the primary output
-    - Eventually deprecate on_chunk in favor of event file
+ ## References

- 3. How to integrate with existing `TrainingMonitor` panes?
-    - Events with `type: "log"` get routed to panes by logger name
-    - Other event types rendered in dedicated progress section
+ - Wide events / canonical log lines: https://loggingsucks.com/
+ - Python logging best practices: mCoding (dictConfig, QueueHandler, JSON formatter, extras)
+ - Code philosophy: ~/research/docs/code_style/favorites.md, code_philosophy_reference.md
+ - nmoe analysis: ~/research/docs/code_style/nmoe_experiment_tracking.md
+ - Existing logging infra: rollouts/_logging/ (json_formatter.py, logging_config.py)
+ - Library code shouldn't configure logging - eval runner configures, rollouts framework just logs
@@ -267,35 +267,46 @@ class TrainingMonitor:
          self._needs_redraw = True

      def _handle_eval_event(self, event_type: str, data: dict) -> None:
-         """Handle eval/GEPA events and update progress state."""
+         """Handle eval/GEPA events and update progress state.
+
+         Field names: sample_id, sample_name, eval_name (not id/name, which
+         conflict with LogRecord builtins).
+         """
+         sample_id = data.get("sample_id", "")
+
          if event_type == "eval_start":
-             self._eval_name = data.get("name", "eval")
+             self._eval_name = data.get("eval_name", "eval")
              self._eval_total = data.get("total", 0)

          elif event_type == "sample_start":
-             sample_id = data.get("id", "")
              self._eval_samples[sample_id] = {
-                 "name": data.get("name", sample_id),
+                 "name": data.get("sample_name", sample_id),
                  "turn": 0,
                  "phase": "",
                  "score": None,
+                 "tokens_in": 0,
+                 "tokens_out": 0,
              }
              if sample_id not in self._eval_sample_order:
                  self._eval_sample_order.append(sample_id)

          elif event_type == "turn":
-             sample_id = data.get("id", "")
              if sample_id in self._eval_samples:
-                 self._eval_samples[sample_id]["turn"] = data.get("turn", 0)
-                 self._eval_samples[sample_id]["status"] = data.get("status", "")
+                 if "turn" in data:
+                     self._eval_samples[sample_id]["turn"] = data["turn"]
+                 if "status" in data:
+                     self._eval_samples[sample_id]["status"] = data["status"]

          elif event_type == "modal_progress":
-             sample_id = data.get("id", "")
              if sample_id in self._eval_samples:
                  self._eval_samples[sample_id]["phase"] = data.get("phase", "")

+         elif event_type == "llm_call":
+             if sample_id in self._eval_samples:
+                 self._eval_samples[sample_id]["tokens_in"] += data.get("tokens_in", 0)
+                 self._eval_samples[sample_id]["tokens_out"] += data.get("tokens_out", 0)
+
          elif event_type == "sample_end":
-             sample_id = data.get("id", "")
              if sample_id in self._eval_samples:
                  self._eval_samples[sample_id]["score"] = data.get("score")
                  self._eval_samples[sample_id]["phase"] = ""
@@ -364,19 +375,36 @@ class TrainingMonitor:
                  extra=data,
              )

-         # Check if this is an eval event (from events.py EventEmitter)
-         event_type = data.get("type", "")
+         # Check if this is an eval event (from events.jsonl via logging)
+         # The "message" field is the event type discriminator (e.g. "sample_start")
+         # and the logger is "wafer.eval.events"
+         eval_event_types = {
+             "eval_start",
+             "eval_end",
+             "sample_start",
+             "sample_end",
+             "turn",
+             "modal_progress",
+             "llm_call",
+             "tool_execution",
+             "assistant_message",
+             "gepa_start",
+             "gepa_iteration",
+             "gepa_accepted",
+             "gepa_rejected",
+         }
+         event_type = message if message in eval_event_types else ""
          if event_type:
              self._handle_eval_event(event_type, data)
              # Also create a log line for the pane
+             sample_id = data.get("sample_id", "")
              if event_type in ("sample_start", "sample_end", "modal_progress", "turn"):
-                 sample_id = data.get("id", "")
                  sample = self._eval_samples.get(sample_id, {})
-                 name = sample.get("name", sample_id)[:20]
+                 sample_name = sample.get("name", sample_id)[:20]
                  if event_type == "sample_start":
                      return LogLine(
                          logger="eval",
-                         message=f"▶ {name} started",
+                         message=f"▶ {sample_name} started",
                          level="INFO",
                          extra=data,
                      )
@@ -384,7 +412,7 @@ class TrainingMonitor:
                      score = data.get("score", 0)
                      return LogLine(
                          logger="eval",
-                         message=f"✓ {name} score={score:.2f}",
+                         message=f"✓ {sample_name} score={score:.2f}",
                          level="INFO",
                          extra=data,
                      )
@@ -392,10 +420,22 @@ class TrainingMonitor:
                      phase = data.get("phase", "")
                      return LogLine(
                          logger="modal",
-                         message=f" {name}: {phase}",
+                         message=f" {sample_name}: {phase}",
                          level="DEBUG",
                          extra=data,
                      )
+             elif event_type == "llm_call":
+                 tokens_in = data.get("tokens_in", 0)
+                 tokens_out = data.get("tokens_out", 0)
+                 duration = data.get("duration_ms", 0)
+                 sample = self._eval_samples.get(sample_id, {})
+                 sample_name = sample.get("name", sample_id)[:20]
+                 return LogLine(
+                     logger="eval",
+                     message=f" {sample_name}: llm {tokens_in}→{tokens_out} tok ({duration:.0f}ms)",
+                     level="DEBUG",
+                     extra=data,
+                 )
              elif event_type.startswith("gepa_"):
                  if event_type == "gepa_iteration":
                      return LogLine(
@@ -734,17 +774,21 @@ class TrainingMonitor:
              turn = sample.get("turn", 0)
              status = sample.get("status", "")

+             tokens_in = sample.get("tokens_in", 0)
+             tokens_out = sample.get("tokens_out", 0)
+             tok_str = f" {tokens_in}→{tokens_out}tok" if tokens_in or tokens_out else ""
+
              if score is not None:
                  # Completed
                  color = GREEN if score > 0.5 else YELLOW if score > 0 else RED
-                 status_str = f"{color}✓{RESET} T:{turn} score={score:.2f}"
+                 status_str = f"{color}✓{RESET} T:{turn}{tok_str} score={score:.2f}"
              elif phase:
                  # Modal eval in progress
-                 status_str = f"{CYAN}{phase}...{RESET}"
+                 status_str = f"{CYAN}{phase}...{RESET}{tok_str}"
              elif status == "streaming":
-                 status_str = f"{DIM}streaming...{RESET}"
+                 status_str = f"{DIM}streaming...{RESET}{tok_str}"
              else:
-                 status_str = f"{DIM}T:{turn}{RESET}"
+                 status_str = f"{DIM}T:{turn}{tok_str}{RESET}"

              lines.append(f" {name} {status_str}"[:width])

@@ -0,0 +1,30 @@
+ """Cloud CUDA compiler - Godbolt for CUDA.
+
+ Send CUDA C++ code, get PTX/SASS back for inspection.
+ """
+
+ from wafer_core.tools.compile.compiler import (
+     compile_cuda_local,
+     compile_cuda_remote,
+     request_to_dict,
+     response_from_dict,
+ )
+ from wafer_core.tools.compile.types import (
+     CompileRequest,
+     CompileResponse,
+     OutputFormat,
+     VALID_ARCHITECTURES,
+ )
+
+ __all__ = [
+     # Types
+     "CompileRequest",
+     "CompileResponse",
+     "OutputFormat",
+     "VALID_ARCHITECTURES",
+     # Functions
+     "compile_cuda_local",
+     "compile_cuda_remote",
+     "request_to_dict",
+     "response_from_dict",
+ ]
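For orientation, a hypothetical usage sketch of the new module. The diff does not show the bodies of `types.py` or `compiler.py`, so every field and parameter name below is an assumption; only the exported names come from the `__init__.py` above.

```python
# Hypothetical usage only - field and parameter names are guesses, not the
# package's documented API (CompileRequest's actual fields are not in this diff).
from wafer_core.tools.compile import (
    CompileRequest,
    OutputFormat,
    VALID_ARCHITECTURES,
    compile_cuda_local,
)

print(VALID_ARCHITECTURES)  # presumably the supported GPU architectures; exact values unknown

# Assumed shape: CUDA source text in, PTX/SASS text back for a chosen architecture.
request = CompileRequest(
    source="__global__ void add(float* a) { a[0] += 1.0f; }",
    arch="sm_90",                    # assumed field name
    output_format=OutputFormat.PTX,  # assumed enum member
)
response = compile_cuda_local(request)  # assumed to take a CompileRequest
print(response.output)                  # assumed field on CompileResponse
```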