wafer-core 0.1.38__py3-none-any.whl → 0.1.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. wafer_core/lib/trace_compare/fusion_analyzer.py +2 -0
  2. wafer_core/rollouts/_logging/__init__.py +5 -1
  3. wafer_core/rollouts/_logging/logging_config.py +95 -3
  4. wafer_core/rollouts/_logging/sample_handler.py +66 -0
  5. wafer_core/rollouts/_pytui/__init__.py +114 -0
  6. wafer_core/rollouts/_pytui/app.py +809 -0
  7. wafer_core/rollouts/_pytui/console.py +291 -0
  8. wafer_core/rollouts/_pytui/renderer.py +210 -0
  9. wafer_core/rollouts/_pytui/spinner.py +73 -0
  10. wafer_core/rollouts/_pytui/terminal.py +489 -0
  11. wafer_core/rollouts/_pytui/text.py +470 -0
  12. wafer_core/rollouts/_pytui/theme.py +241 -0
  13. wafer_core/rollouts/evaluation.py +142 -177
  14. wafer_core/rollouts/progress_app.py +395 -0
  15. wafer_core/rollouts/tui/DESIGN.md +251 -115
  16. wafer_core/rollouts/tui/monitor.py +64 -20
  17. wafer_core/tools/compile/__init__.py +30 -0
  18. wafer_core/tools/compile/compiler.py +314 -0
  19. wafer_core/tools/compile/modal_compile.py +359 -0
  20. wafer_core/tools/compile/tests/__init__.py +1 -0
  21. wafer_core/tools/compile/tests/test_compiler.py +675 -0
  22. wafer_core/tools/compile/tests/test_data/utils.cuh +10 -0
  23. wafer_core/tools/compile/tests/test_data/vector_add.cu +7 -0
  24. wafer_core/tools/compile/tests/test_data/with_header.cu +9 -0
  25. wafer_core/tools/compile/tests/test_modal_integration.py +326 -0
  26. wafer_core/tools/compile/types.py +117 -0
  27. {wafer_core-0.1.38.dist-info → wafer_core-0.1.39.dist-info}/METADATA +1 -1
  28. {wafer_core-0.1.38.dist-info → wafer_core-0.1.39.dist-info}/RECORD +29 -12
  29. wafer_core/rollouts/events.py +0 -240
  30. wafer_core/rollouts/progress_display.py +0 -476
  31. wafer_core/utils/event_streaming.py +0 -63
  32. {wafer_core-0.1.38.dist-info → wafer_core-0.1.39.dist-info}/WHEEL +0 -0
@@ -3,152 +3,288 @@
  ## Goal

  Single pattern for GEPA, RL training, and normal eval:
- 1. Process emits JSONL events to a file
- 2. TUI tails the file and renders progress
- 3. Completely decoupled - process doesn't know if anyone is watching
+ 1. Process emits structured log records via Python `logging`
+ 2. Logging handlers route records to JSONL files (overview + per-sample)
+ 3. TUI tails files and renders progress
+ 4. Completely decoupled - process doesn't know if anyone is watching

- ## Current State
+ ## Current state (what's wrong)

- | Mode | Event emission | TUI |
- |------|---------------|-----|
- | RL Training | `remote_runner.py` wraps process + tails logs → stdout | `monitor.py` reads stdin |
- | Eval | `MultiProgress` embedded in `evaluate()` | N/A (MultiProgress IS the UI) |
- | GEPA | Nested `MultiProgress` instances fight | Broken |
+ Three parallel emission paths that don't compose:

- ## Target State
+ - **`on_chunk` callbacks** (in-process) -> updates `MultiProgress` directly
+ - **`EventEmitter`** (custom file writer) -> writes `events.jsonl`
+ - **Python `logging`** (standard) -> consumed by `TrainingMonitor` when JSONL-formatted

- All modes:
- 1. Write JSONL events to `{output_dir}/events.jsonl`
- 2. TUI tails that file (or stdin if piped)
- 3. `MultiProgress` becomes one possible renderer of events
+ `MultiProgress` is embedded inside `evaluate()`. `EventEmitter` is a second logging
+ system with its own file handle, format, and context variable. The TUI monitor's
+ `parse_jsonl_line()` sniffs line shape to figure out which system produced it.

- ```
- ┌─────────────────────────────────────────────────────────────┐
- │ Any Process (eval, GEPA, RL training) │
- │ │
- │ emit_event({"type": "sample_start", "id": "001", ...}) │
- │ emit_event({"type": "turn", "id": "001", "turn": 1}) │
- │ emit_event({"type": "modal_progress", "phase": "compile"})│
- │ emit_event({"type": "sample_end", "id": "001", ...}) │
- │ │
- │ ↓ writes to │
- │ {output_dir}/events.jsonl │
- └─────────────────────────────────────────────────────────────┘
-
- │ tail -f (or pipe)
-
- ┌─────────────────────────────────────────────────────────────┐
- │ TUI (rollouts.tui.watch) │
- │ │
- │ Renders based on event type: │
- │ - sample_start/end → progress bar │
- │ - turn → update turn count │
- │ - modal_progress → show phase (compiling, checking, etc) │
- │ - gepa_iteration → show GEPA progress │
- │ - metrics → show charts/sparklines │
- │ - log → route to appropriate pane │
- └─────────────────────────────────────────────────────────────┘
- ```
+ Result: summary dashboard only (turns, phase status). No drill-down into running
+ samples. No token counts. No streaming output.
+
+ ## Target state
+
+ Python logging is the single source of truth. `EventEmitter` and `MultiProgress` deleted.

- ## Event Types
+ ### Emission
+
+ `on_chunk` callback body calls `logger.info()`/`logger.debug()` with structured `extra={}`:

  ```python
- # Core eval events (from evaluation.py)
- {"type": "sample_start", "id": "001", "name": "Square_matmul", "total": 10}
- {"type": "turn", "id": "001", "turn": 1, "status": "streaming"}
- {"type": "modal_progress", "id": "001", "phase": "compiling"}  # or correctness, benchmarking
- {"type": "sample_end", "id": "001", "score": 0.85, "time_sec": 45.2}
-
- # GEPA events (from prompt_optimization/engine.py)
- {"type": "gepa_iteration", "iteration": 3, "evals_used": 12, "evals_budget": 50, "best_score": 0.42}
- {"type": "gepa_accepted", "old_score": 0.40, "new_score": 0.42}
- {"type": "gepa_rejected", "old_score": 0.42, "new_score": 0.38}
-
- # RL training events (from training/grpo.py)
- {"type": "rl_step", "step": 10, "reward_mean": 0.65, "loss": 0.023}
- {"type": "rl_checkpoint", "step": 100, "path": "/checkpoints/step_100"}
-
- # Generic log events (from any logger)
- {"type": "log", "logger": "kernelbench", "level": "INFO", "message": "..."}
+ async def on_chunk_with_sample_id(event):
+     if isinstance(event, TextDelta):
+         logger.debug("text_delta", extra={"sample_id": sid, "turn": turn, "text": event.text})
+     elif isinstance(event, LLMCallEnd):
+         logger.info("llm_call", extra={"sample_id": sid, "turn": turn,
+                     "duration_ms": event.duration_ms, "tokens_in": event.tokens_in,
+                     "tokens_out": event.tokens_out, "provider": event.provider})
+     elif isinstance(event, ToolExecutionEnd):
+         logger.info("tool_execution", extra={"sample_id": sid, "turn": turn,
+                     "tool_name": event.tool_name, "duration_ms": event.duration_ms})
  ```
46
 
75
- ## Implementation Plan
47
+ Wide events pattern: one rich record per meaningful unit of work (per turn, per tool call,
48
+ per LLM call). High-cardinality fields (`sample_id`, `turn`), high dimensionality
49
+ (tokens, duration, tool name, status).
50
+
51
+ ### Log levels as detail tiers
52
+
53
+ - **INFO**: `sample_start`, `sample_end`, `turn`, `llm_call`, `tool_execution`, `modal_progress`
54
+ - **DEBUG**: `text_delta`, `thinking_delta`, `assistant_message` (full content), `tool_result` (full output)
76
55
 
77
- ### Phase 1: Event emitter (no UI changes)
56
+ ### Routing via logging handlers
78
57
 
79
- 1. Add `EventEmitter` class that writes JSONL to file
80
- 2. Wire into `evaluate()` - emit events alongside existing `on_chunk` callbacks
81
- 3. Wire into GEPA engine - emit `gepa_iteration`, `gepa_accepted`, etc.
82
- 4. Wire into RL training - emit `rl_step`, etc.
58
+ ```
59
+ Root logger handlers (configured via dictConfig at eval startup):
60
+
61
+ 1. JSONLFileHandler -> {output_dir}/events.jsonl
62
+ - Filter: INFO+ only
63
+ - Purpose: overview dashboard
64
+
65
+ 2. SampleRoutingHandler -> {output_dir}/samples/{sample_id}.jsonl
66
+ - Filter: must have sample_id in extra
67
+ - Level: ALL (including DEBUG text deltas)
68
+ - Purpose: per-sample drill-down and streaming
69
+
70
+ 3. StreamHandler(stderr) -> human-readable
71
+ - For verbose/debug mode
72
+
73
+ 4. QueueHandler wrapping 1-3 for non-blocking writes
74
+ ```
75
+
76
+ `SampleRoutingHandler` is a custom `logging.Handler` that inspects
77
+ `record.sample_id` and writes to the appropriate per-sample file.
78
+
79
+ File handles are opened lazily on first write and closed eagerly when
80
+ `sample_end` is seen. Peak open FDs = `max_concurrent` samples, not total.
81
+ The `QueueHandler` serializes all writes, so close-then-delete is safe.
83
82
 
84
83
  ```python
- # rollouts/events.py
- class EventEmitter:
-     """Writes structured events to JSONL file."""
-
+ class SampleRoutingHandler(logging.Handler):
      def __init__(self, output_dir: Path):
-         self.file = open(output_dir / "events.jsonl", "a")
-
-     def emit(self, event: dict) -> None:
-         event["timestamp"] = datetime.now().isoformat()
-         self.file.write(json.dumps(event) + "\n")
-         self.file.flush()
+         super().__init__()
+         self.output_dir = output_dir
+         self._files: dict[str, TextIO] = {}
+
+     def emit(self, record: logging.LogRecord):
+         sample_id = getattr(record, "sample_id", None)
+         if sample_id is None:
+             return
+         if sample_id not in self._files:
+             path = self.output_dir / "samples" / f"{sample_id}.jsonl"
+             path.parent.mkdir(parents=True, exist_ok=True)
+             self._files[sample_id] = open(path, "a")
+         self._files[sample_id].write(self.format(record) + "\n")
+         self._files[sample_id].flush()
+         # Close file handle when sample is done
+         if record.getMessage() == "sample_end":
+             self._files[sample_id].close()
+             del self._files[sample_id]
+
+     def close(self):
+         """Close all open file handles on shutdown."""
+         for f in self._files.values():
+             f.close()
+         self._files.clear()
+         super().close()
  ```
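For concreteness, here is a minimal sketch of the `setup_eval_logging()` wiring this document calls for, assuming the handler above plus the existing `JSONFormatter`. The dotted class paths are assumptions based on the file layout described later in the document, and the `QueueHandler`/`QueueListener` wrapping from `setup_logging()` is omitted for brevity; this is illustrative, not the package's code.

```python
# Illustrative only: wiring SampleRoutingHandler next to an overview JSONL handler.
# Module paths and formatter name are assumptions, not verified against the package.
import logging.config
from pathlib import Path


def setup_eval_logging(output_dir: Path) -> None:
    logging.config.dictConfig({
        "version": 1,
        "disable_existing_loggers": False,
        "formatters": {
            "jsonl": {"()": "wafer_core.rollouts._logging.json_formatter.JSONFormatter"},
        },
        "handlers": {
            # Overview file: INFO+ only, one events.jsonl per run.
            "overview": {
                "class": "logging.FileHandler",
                "filename": str(output_dir / "events.jsonl"),
                "level": "INFO",
                "formatter": "jsonl",
            },
            # Per-sample drill-down: everything, including DEBUG text deltas.
            "per_sample": {
                "()": "wafer_core.rollouts._logging.sample_handler.SampleRoutingHandler",
                "output_dir": output_dir,
                "level": "DEBUG",
                "formatter": "jsonl",
            },
        },
        "root": {"level": "DEBUG", "handlers": ["overview", "per_sample"]},
    })
```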

- ### Phase 2: TUI consumer
+ ### Consumption
+
+ TUI tails files. Fully decoupled from eval process.
+
+ - **Overview**: tail `events.jsonl` -> dashboard with N samples, turns, tokens, current state
+ - **Drill-down**: select a sample -> tail `samples/{sample_id}.jsonl` -> streaming text, tool calls, results
+ - **Live streaming**: text deltas are DEBUG-level in per-sample files, TUI renders as streaming text
+ - **Replay**: same files work for post-hoc viewing
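As an illustration of the consumption side (not code from the package; the real consumer is `watch.py`/`monitor.py`), a minimal tailer can poll the JSONL file and dispatch on the `msg` discriminator:

```python
# Hypothetical consumer sketch; function names here are illustrative only.
import json
import time
from pathlib import Path
from typing import Iterator


def tail_jsonl(path: Path, poll_s: float = 0.25) -> Iterator[dict]:
    """Yield parsed records as they are appended; unparseable lines are skipped."""
    with open(path, "r") as f:
        while True:
            line = f.readline()
            if not line:
                time.sleep(poll_s)
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError:
                continue  # partial line mid-write; ignore and keep tailing


def render(record: dict) -> None:
    # Dispatch on the event-type discriminator ("msg" in the examples below).
    msg = record.get("msg", "")
    if msg == "sample_start":
        print(f"▶ {record.get('name', record.get('sample_id'))} started")
    elif msg == "llm_call":
        print(f"  llm {record.get('tokens_in')}→{record.get('tokens_out')} tok")
    elif msg == "sample_end":
        print(f"✓ {record.get('sample_id')} score={record.get('score')}")
```

The same loop works for replay, since a finished run's files are just JSONL that has stopped growing.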

- 1. Create `rollouts.tui.watch` that tails events.jsonl
- 2. Renders progress based on event types
- 3. Can show multiple concurrent samples (like current MultiProgress)
- 4. Can show GEPA iteration progress in header
- 5. Can show RL training metrics/charts
+ ### JSONL record format

- ```bash
- # Usage
- python -m rollouts.tui.watch /path/to/output/events.jsonl
+ All records use Python logging's structure with `extra` fields flattened:

- # Or with auto-discovery
- python -m rollouts.tui.watch --latest # finds most recent run
+ ```jsonl
+ {"timestamp": "2025-01-15T10:23:45Z", "level": "INFO", "logger": "rollouts.eval", "msg": "sample_start", "sample_id": "sample_0001", "name": "Square_matmul"}
+ {"timestamp": "2025-01-15T10:23:46Z", "level": "INFO", "logger": "rollouts.eval", "msg": "turn", "sample_id": "sample_0001", "turn": 1, "status": "streaming"}
+ {"timestamp": "2025-01-15T10:23:46Z", "level": "DEBUG", "logger": "rollouts.eval", "msg": "text_delta", "sample_id": "sample_0001", "turn": 1, "text": "Let me analyze"}
+ {"timestamp": "2025-01-15T10:23:50Z", "level": "INFO", "logger": "rollouts.eval", "msg": "llm_call", "sample_id": "sample_0001", "turn": 1, "duration_ms": 3400, "tokens_in": 2000, "tokens_out": 1200, "provider": "anthropic", "model": "claude-sonnet-4-20250514"}
+ {"timestamp": "2025-01-15T10:24:35Z", "level": "INFO", "logger": "rollouts.eval", "msg": "tool_execution", "sample_id": "sample_0001", "turn": 1, "tool_name": "bash", "duration_ms": 45000, "result_summary": "exit 0"}
+ {"timestamp": "2025-01-15T10:24:36Z", "level": "INFO", "logger": "rollouts.eval", "msg": "modal_progress", "sample_id": "sample_0001", "phase": "compiling"}
+ {"timestamp": "2025-01-15T10:30:00Z", "level": "INFO", "logger": "rollouts.eval", "msg": "sample_end", "sample_id": "sample_0001", "score": 0.85, "duration_s": 120, "turns_used": 15}
  ```

- ### Phase 3: Remove embedded UI
+ One format. No type sniffing. `msg` field is the event type discriminator.
+
+ ## Design decisions
+
+ ### EventEmitter is NOT already using Python logging
+
+ Despite having `logger = logging.getLogger(__name__)` at the top of `events.py`,
+ `EventEmitter` writes JSON directly to its own file handle (`self._file.write(json.dumps(event))`).
+ It has its own context variable (`_emitter_ctx`), its own file lifecycle, and its own format
+ (`{"type": "sample_start", ...}` vs logging's `{"message": "...", "level": "INFO", ...}`).
+
+ `LoggingEventEmitter` exists as a subclass that also logs, but the base class is a
+ completely separate channel. Phase 2 is a genuine replacement, not just a consolidation.
+
+ ### Phase 2 scope: what actually changes in on_chunk
+
+ The current `on_chunk_with_sample_id` (evaluation.py:1006) does three things per event:

- 1. Remove `MultiProgress` from `evaluate()`
- 2. Remove nested progress displays from GEPA
- 3. All progress viewing goes through TUI
+ 1. Updates `MultiProgress` via `progress.update_task()` (display)
+ 2. Calls `emit_event()` which writes to `EventEmitter`'s file handle (persistence)
+ 3. Wraps the event with `sample_id` and forwards to `base_on_chunk` (frontend streaming)

- ## File Structure
+ The conversion:
+ - (1) is deleted entirely - no more `MultiProgress`
+ - (2) becomes `logger.info()`/`logger.debug()` calls with `extra={}`. The existing
+   event mapping logic (`LLMCallEnd` -> `llm_call`, `ToolExecutionEnd` -> `tool_execution`,
+   `TextEnd` -> `assistant_message`) is preserved, just targeting logging instead of `emit_event()`
+ - (3) stays for now - `base_on_chunk` is used by `TUIFrontend` for interactive single-agent
+   mode. It can be removed later when the TUI reads from files instead of callbacks.
+ - NEW: `logger.debug("text_delta", ...)` for streaming content (not emitted today)
+
+ The `last_status` dedup logic and `current_turn` tracking stay - they're still needed
+ to avoid flooding the log with redundant status updates.
+
+ ### TUI backward compatibility
+
+ OK to break the old `events.jsonl` format. This is an internal tool, not a user-facing API.
+ Old eval result directories will have the old format, but we don't need to support replaying
+ them - the per-sample result JSON files and trajectories are the durable record.
+
+ ### Testing strategy
+
+ - **Phase 1** (SampleRoutingHandler): Unit tests. Write log records with `extra={"sample_id": "x"}`,
+   assert correct files created, correct content, FD cleanup on `sample_end`.
+ - **Phase 2** (on_chunk conversion): Integration test. Run a small eval (1-2 samples, 2-3 turns),
+   assert `events.jsonl` and `samples/*.jsonl` have expected records with expected fields.
+   Compare against a snapshot of the current `emit_event` output to verify no data loss.
+ - **Phase 3** (TUI): Manual verification. The TUI is a visual tool - automated tests for
+   ANSI rendering are brittle and low-value. Test the JSONL parsing logic (the `parse_jsonl_line`
+   equivalent) with unit tests against example records.
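A Phase 1 unit test could look roughly like the sketch below. It is illustrative: the import path is an assumption based on the file list in this release, and it only exercises the routing and FD-cleanup behaviour described above.

```python
# Illustrative pytest-style test for SampleRoutingHandler routing and FD cleanup.
# Import path is assumed from this release's file list (rollouts/_logging/sample_handler.py).
import logging

from wafer_core.rollouts._logging.sample_handler import SampleRoutingHandler


def make_record(msg: str, sample_id: str) -> logging.LogRecord:
    record = logging.LogRecord(
        name="rollouts.eval", level=logging.INFO, pathname=__file__,
        lineno=0, msg=msg, args=(), exc_info=None,
    )
    record.sample_id = sample_id  # this is what extra={"sample_id": ...} produces
    return record


def test_routes_to_per_sample_file(tmp_path):
    handler = SampleRoutingHandler(tmp_path)
    handler.setFormatter(logging.Formatter("%(message)s"))

    handler.emit(make_record("sample_start", "sample_0001"))
    handler.emit(make_record("sample_end", "sample_0001"))

    lines = (tmp_path / "samples" / "sample_0001.jsonl").read_text().splitlines()
    assert lines == ["sample_start", "sample_end"]
    assert handler._files == {}  # file handle closed when sample_end was seen
    handler.close()
```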
+
+ ## Existing infrastructure
+
+ Most of Phase 1 already exists in `rollouts/_logging/`:
+
+ - `json_formatter.py`: `JSONFormatter` that flattens `extra` fields onto JSONL records.
+   Already strips builtin `LogRecord` attrs and includes only custom extras.
+ - `logging_config.py`: `setup_logging()` using `dictConfig` with `QueueHandler`+`QueueListener`
+   (mCoding pattern), `RotatingFileHandler` for bounded JSONL files, `atexit` cleanup.
+ - `color_formatter.py`: ANSI color formatter for human-readable stderr output.
+
+ What's missing: `SampleRoutingHandler` and a `setup_eval_logging()` that wires it
+ into the existing `setup_logging()` config.
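The package's actual `JSONFormatter` is not part of this diff, but the "flatten extras, strip builtin attrs" idea it describes amounts to something like the following sketch (class name and field choices here are assumptions):

```python
# Sketch of "flatten extra fields onto the JSONL record" - not the package's JSONFormatter.
import datetime as dt
import json
import logging

# Attributes every LogRecord has; anything else on the record came from extra={...}.
_BUILTIN_ATTRS = set(vars(logging.LogRecord("", 0, "", 0, "", (), None)))


class FlatJSONFormatter(logging.Formatter):
    def format(self, record: logging.LogRecord) -> str:
        payload = {
            "timestamp": dt.datetime.fromtimestamp(
                record.created, tz=dt.timezone.utc
            ).isoformat(),
            "level": record.levelname,
            "logger": record.name,
            "msg": record.getMessage(),
        }
        # Custom extras ride along at the top level of the record.
        payload.update(
            {k: v for k, v in vars(record).items() if k not in _BUILTIN_ATTRS}
        )
        return json.dumps(payload, default=str)
```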
+
+ ## Implementation plan
+
+ ### Phase 1: Logging infrastructure
+
+ - Write `SampleRoutingHandler` (routes records to per-sample files based on `extra["sample_id"]`)
+ - Extend `setup_logging()` or write `setup_eval_logging(output_dir)` that adds the routing handler
+   to the existing dictConfig alongside the overview JSONL file handler
+ - Tests for routing handler
+
+ ### Phase 2: Replace EventEmitter in evaluate_sample
+
+ - Convert `on_chunk_with_sample_id` to use `logger.info()`/`logger.debug()` instead of `emit_event()` + `progress.update_task()`
+ - Convert `sample_start`/`sample_end` emissions to logging calls
+ - Remove `EventEmitter` creation from `evaluate()`
+ - Remove `MultiProgress` creation from `evaluate()`
+ - Call `setup_eval_logging()` at the start of `evaluate()` instead
+
+ ### Phase 3: Update TUI to consume new format
+
+ - Update `TrainingMonitor.parse_jsonl_line()` to handle unified format (simplify)
+ - Add drill-down: selecting a sample tails its per-sample JSONL file
+ - Add token counts to overview display (data now available from `llm_call` events)
+ - Update `ProgressDisplay` or replace with new consumer
+
+ ### Phase 4: Cleanup
+
+ - Delete `EventEmitter` class and gut `events.py`
+ - Delete `MultiProgress` class
+ - Delete `ProgressDisplay`
+ - Update eval configs that reference `show_progress`, `verbose`, `EventEmitter`
+
+ ## File structure (target)

  ```
  rollouts/
- ├── events.py          # EventEmitter - writes JSONL
- ├── evaluation.py      # Uses EventEmitter (no MultiProgress)
- ├── tui/
- │   ├── watch.py       # Main TUI entry point (tails events.jsonl)
- │   ├── monitor.py     # TrainingMonitor (rename to EventRenderer?)
- │   ├── progress.py    # Progress bar rendering (extracted from MultiProgress)
- │   └── terminal.py    # Terminal abstraction
+ _logging/
+   json_formatter.py    # ALREADY EXISTS - JSONFormatter with extra flattening
+   color_formatter.py   # ALREADY EXISTS - ANSI color formatter
+   logging_config.py    # ALREADY EXISTS - setup_logging() with dictConfig + QueueHandler
+   sample_handler.py    # NEW - SampleRoutingHandler
+ evaluation.py          # Uses logger (no MultiProgress, no EventEmitter)
+ tui/
+   watch.py             # Main TUI entry point (tails JSONL files)
+   monitor.py           # TrainingMonitor (simplified parse_jsonl_line)
+   terminal.py          # Terminal abstraction
+   traces.py            # Trace viewer (drill-down into sample)
  ```

- ## Migration Path
+ ## Code style principles for this refactor
+
+ From ~/research/docs/code_style/:
+
+ **Wide events** (logging_sucks.md): One rich record per meaningful unit of work. High-cardinality
+ fields (`sample_id`, `turn`) make records queryable. High dimensionality (tokens, duration,
+ tool name) means you can answer debugging questions without a second search. Don't scatter
+ context across 20 log lines - build the event throughout the request lifecycle, emit once.
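To make "build the event throughout the lifecycle, emit once" concrete, a small sketch of the pattern (illustrative; the field values below are made up and the function is hypothetical):

```python
# Illustrative wide-event sketch: accumulate fields during one unit of work,
# then emit a single rich record. Field names follow the examples in this document.
import logging
import time

logger = logging.getLogger("rollouts.eval")


def run_turn(sample_id: str, turn: int) -> None:
    event: dict = {"sample_id": sample_id, "turn": turn, "tool_calls": 0}
    started = time.monotonic()

    # ... do the work, enriching the same dict as facts become known ...
    event["tokens_in"] = 2000       # hypothetical values
    event["tokens_out"] = 1200
    event["tool_calls"] += 1
    event["status"] = "ok"

    event["duration_ms"] = int((time.monotonic() - started) * 1000)
    # One wide record instead of a dozen scattered log lines.
    logger.info("turn", extra=event)
```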
+
+ **Python logging** (mcoding_logging_dense.md): `dictConfig` for configuration, not code.
+ Handlers on root logger, let propagation work. `extra={}` for structured fields. Custom
+ JSON formatter writes `.jsonl`. `QueueHandler` for non-blocking writes. Library code
+ shouldn't configure logging - the eval runner configures, rollouts framework just logs.
+
+ **Functional core, imperative shell** (favorites.md, code_philosophy_reference.md):
+ `SampleRoutingHandler` is imperative shell (manages file handles, does I/O).
+ `JSONFormatter._prepare_log_dict()` is functional core (record -> dict, pure transform).
+ `on_chunk` mapping events to log calls is pure mapping, no state.
+
+ **Don't reuse until 2+ examples** (casey_semantic_compression): Don't pre-abstract the
+ handler routing. Start with `SampleRoutingHandler` for evals. If GEPA or RL training
+ need similar per-entity routing, extract the pattern then.

- 1. Add EventEmitter alongside existing code (non-breaking)
- 2. Add `rollouts.tui.watch` (new capability)
- 3. Deprecate `show_progress` flag
- 4. Remove `MultiProgress` from evaluation internals
- 5. Update docs to recommend TUI approach
+ **Classes for resources, functions for orchestration** (code_philosophy_reference.md):
+ `SampleRoutingHandler` is a class because it owns file handles (resource lifecycle).
+ `setup_eval_logging()` is a function. The `on_chunk` -> `logger.info()` mapping is a function.

- ## Open Questions
+ **Parse at the boundary** (tiger_style): The TUI's JSONL parser is the boundary.
+ Internally, records are just dicts with a known schema. The `msg` field is the event
+ type discriminator - no polymorphic sniffing needed.

- 1. Should events go to stdout (like RL training) or file (like our GEPA sketch)?
-    - File is simpler for local runs
-    - Stdout works better for remote/piped scenarios
-    - Could support both via `--events-to stdout` flag
+ **Existing infra** (nmoe_experiment_tracking.md): We already have `QueueHandler`+`QueueListener`
+ (nmoe doesn't), `JSONFormatter` with extra flattening, and `RotatingFileHandler` for bounded
+ files. `SampleRoutingHandler` is the only new piece.

- 2. How to handle existing `on_chunk` callbacks?
-    - Keep for backwards compat, but events.jsonl is the primary output
-    - Eventually deprecate on_chunk in favor of event file
+ ## References

- 3. How to integrate with existing `TrainingMonitor` panes?
-    - Events with `type: "log"` get routed to panes by logger name
-    - Other event types rendered in dedicated progress section
+ - Wide events / canonical log lines: https://loggingsucks.com/
+ - Python logging best practices: mCoding (dictConfig, QueueHandler, JSON formatter, extras)
+ - Code philosophy: ~/research/docs/code_style/favorites.md, code_philosophy_reference.md
+ - nmoe analysis: ~/research/docs/code_style/nmoe_experiment_tracking.md
+ - Existing logging infra: rollouts/_logging/ (json_formatter.py, logging_config.py)
+ - Library code shouldn't configure logging - eval runner configures, rollouts framework just logs
@@ -267,35 +267,46 @@ class TrainingMonitor:
          self._needs_redraw = True

      def _handle_eval_event(self, event_type: str, data: dict) -> None:
-         """Handle eval/GEPA events and update progress state."""
+         """Handle eval/GEPA events and update progress state.
+
+         Field names: sample_id, sample_name, eval_name (not id/name, which
+         conflict with LogRecord builtins).
+         """
+         sample_id = data.get("sample_id", "")
+
          if event_type == "eval_start":
-             self._eval_name = data.get("name", "eval")
+             self._eval_name = data.get("eval_name", "eval")
              self._eval_total = data.get("total", 0)

          elif event_type == "sample_start":
-             sample_id = data.get("id", "")
              self._eval_samples[sample_id] = {
-                 "name": data.get("name", sample_id),
+                 "name": data.get("sample_name", sample_id),
                  "turn": 0,
                  "phase": "",
                  "score": None,
+                 "tokens_in": 0,
+                 "tokens_out": 0,
              }
              if sample_id not in self._eval_sample_order:
                  self._eval_sample_order.append(sample_id)

          elif event_type == "turn":
-             sample_id = data.get("id", "")
              if sample_id in self._eval_samples:
-                 self._eval_samples[sample_id]["turn"] = data.get("turn", 0)
-                 self._eval_samples[sample_id]["status"] = data.get("status", "")
+                 if "turn" in data:
+                     self._eval_samples[sample_id]["turn"] = data["turn"]
+                 if "status" in data:
+                     self._eval_samples[sample_id]["status"] = data["status"]

          elif event_type == "modal_progress":
-             sample_id = data.get("id", "")
              if sample_id in self._eval_samples:
                  self._eval_samples[sample_id]["phase"] = data.get("phase", "")

+         elif event_type == "llm_call":
+             if sample_id in self._eval_samples:
+                 self._eval_samples[sample_id]["tokens_in"] += data.get("tokens_in", 0)
+                 self._eval_samples[sample_id]["tokens_out"] += data.get("tokens_out", 0)
+
          elif event_type == "sample_end":
-             sample_id = data.get("id", "")
              if sample_id in self._eval_samples:
                  self._eval_samples[sample_id]["score"] = data.get("score")
                  self._eval_samples[sample_id]["phase"] = ""
@@ -364,19 +375,36 @@ class TrainingMonitor:
                  extra=data,
              )

-         # Check if this is an eval event (from events.py EventEmitter)
-         event_type = data.get("type", "")
+         # Check if this is an eval event (from events.jsonl via logging)
+         # The "message" field is the event type discriminator (e.g. "sample_start")
+         # and the logger is "wafer.eval.events"
+         eval_event_types = {
+             "eval_start",
+             "eval_end",
+             "sample_start",
+             "sample_end",
+             "turn",
+             "modal_progress",
+             "llm_call",
+             "tool_execution",
+             "assistant_message",
+             "gepa_start",
+             "gepa_iteration",
+             "gepa_accepted",
+             "gepa_rejected",
+         }
+         event_type = message if message in eval_event_types else ""
          if event_type:
              self._handle_eval_event(event_type, data)
              # Also create a log line for the pane
+             sample_id = data.get("sample_id", "")
              if event_type in ("sample_start", "sample_end", "modal_progress", "turn"):
-                 sample_id = data.get("id", "")
                  sample = self._eval_samples.get(sample_id, {})
-                 name = sample.get("name", sample_id)[:20]
+                 sample_name = sample.get("name", sample_id)[:20]
                  if event_type == "sample_start":
                      return LogLine(
                          logger="eval",
-                         message=f"▶ {name} started",
+                         message=f"▶ {sample_name} started",
                          level="INFO",
                          extra=data,
                      )
@@ -384,7 +412,7 @@ class TrainingMonitor:
                      score = data.get("score", 0)
                      return LogLine(
                          logger="eval",
-                         message=f"✓ {name} score={score:.2f}",
+                         message=f"✓ {sample_name} score={score:.2f}",
                          level="INFO",
                          extra=data,
                      )
@@ -392,10 +420,22 @@ class TrainingMonitor:
                      phase = data.get("phase", "")
                      return LogLine(
                          logger="modal",
-                         message=f" {name}: {phase}",
+                         message=f" {sample_name}: {phase}",
                          level="DEBUG",
                          extra=data,
                      )
+             elif event_type == "llm_call":
+                 tokens_in = data.get("tokens_in", 0)
+                 tokens_out = data.get("tokens_out", 0)
+                 duration = data.get("duration_ms", 0)
+                 sample = self._eval_samples.get(sample_id, {})
+                 sample_name = sample.get("name", sample_id)[:20]
+                 return LogLine(
+                     logger="eval",
+                     message=f" {sample_name}: llm {tokens_in}→{tokens_out} tok ({duration:.0f}ms)",
+                     level="DEBUG",
+                     extra=data,
+                 )
              elif event_type.startswith("gepa_"):
                  if event_type == "gepa_iteration":
                      return LogLine(
@@ -734,17 +774,21 @@ class TrainingMonitor:
              turn = sample.get("turn", 0)
              status = sample.get("status", "")

+             tokens_in = sample.get("tokens_in", 0)
+             tokens_out = sample.get("tokens_out", 0)
+             tok_str = f" {tokens_in}→{tokens_out}tok" if tokens_in or tokens_out else ""
+
              if score is not None:
                  # Completed
                  color = GREEN if score > 0.5 else YELLOW if score > 0 else RED
-                 status_str = f"{color}✓{RESET} T:{turn} score={score:.2f}"
+                 status_str = f"{color}✓{RESET} T:{turn}{tok_str} score={score:.2f}"
              elif phase:
                  # Modal eval in progress
-                 status_str = f"{CYAN}{phase}...{RESET}"
+                 status_str = f"{CYAN}{phase}...{RESET}{tok_str}"
              elif status == "streaming":
-                 status_str = f"{DIM}streaming...{RESET}"
+                 status_str = f"{DIM}streaming...{RESET}{tok_str}"
              else:
-                 status_str = f"{DIM}T:{turn}{RESET}"
+                 status_str = f"{DIM}T:{turn}{tok_str}{RESET}"

              lines.append(f" {name} {status_str}"[:width])

@@ -0,0 +1,30 @@
+ """Cloud CUDA compiler - Godbolt for CUDA.
+
+ Send CUDA C++ code, get PTX/SASS back for inspection.
+ """
+
+ from wafer_core.tools.compile.compiler import (
+     compile_cuda_local,
+     compile_cuda_remote,
+     request_to_dict,
+     response_from_dict,
+ )
+ from wafer_core.tools.compile.types import (
+     CompileRequest,
+     CompileResponse,
+     OutputFormat,
+     VALID_ARCHITECTURES,
+ )
+
+ __all__ = [
+     # Types
+     "CompileRequest",
+     "CompileResponse",
+     "OutputFormat",
+     "VALID_ARCHITECTURES",
+     # Functions
+     "compile_cuda_local",
+     "compile_cuda_remote",
+     "request_to_dict",
+     "response_from_dict",
+ ]
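For orientation, a hypothetical usage sketch of the new module. The diff does not show the bodies of `types.py` or `compiler.py`, so every field and parameter name below is an assumption; only the exported names come from the `__init__.py` above.

```python
# Hypothetical usage only - field and parameter names are guesses, not the
# package's documented API (CompileRequest's actual fields are not in this diff).
from wafer_core.tools.compile import (
    CompileRequest,
    OutputFormat,
    VALID_ARCHITECTURES,
    compile_cuda_local,
)

print(VALID_ARCHITECTURES)  # presumably the supported GPU architectures; exact values unknown

# Assumed shape: CUDA source text in, PTX/SASS text back for a chosen architecture.
request = CompileRequest(
    source="__global__ void add(float* a) { a[0] += 1.0f; }",
    arch="sm_90",                    # assumed field name
    output_format=OutputFormat.PTX,  # assumed enum member
)
response = compile_cuda_local(request)  # assumed to take a CompileRequest
print(response.output)                  # assumed field on CompileResponse
```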