wafer-core 0.1.38__py3-none-any.whl → 0.1.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. wafer_core/lib/trace_compare/fusion_analyzer.py +2 -0
  2. wafer_core/rollouts/_logging/__init__.py +5 -1
  3. wafer_core/rollouts/_logging/logging_config.py +95 -3
  4. wafer_core/rollouts/_logging/sample_handler.py +66 -0
  5. wafer_core/rollouts/_pytui/__init__.py +114 -0
  6. wafer_core/rollouts/_pytui/app.py +809 -0
  7. wafer_core/rollouts/_pytui/console.py +291 -0
  8. wafer_core/rollouts/_pytui/renderer.py +210 -0
  9. wafer_core/rollouts/_pytui/spinner.py +73 -0
  10. wafer_core/rollouts/_pytui/terminal.py +489 -0
  11. wafer_core/rollouts/_pytui/text.py +470 -0
  12. wafer_core/rollouts/_pytui/theme.py +241 -0
  13. wafer_core/rollouts/evaluation.py +142 -177
  14. wafer_core/rollouts/progress_app.py +395 -0
  15. wafer_core/rollouts/tui/DESIGN.md +251 -115
  16. wafer_core/rollouts/tui/monitor.py +64 -20
  17. wafer_core/tools/compile/__init__.py +30 -0
  18. wafer_core/tools/compile/compiler.py +314 -0
  19. wafer_core/tools/compile/modal_compile.py +359 -0
  20. wafer_core/tools/compile/tests/__init__.py +1 -0
  21. wafer_core/tools/compile/tests/test_compiler.py +675 -0
  22. wafer_core/tools/compile/tests/test_data/utils.cuh +10 -0
  23. wafer_core/tools/compile/tests/test_data/vector_add.cu +7 -0
  24. wafer_core/tools/compile/tests/test_data/with_header.cu +9 -0
  25. wafer_core/tools/compile/tests/test_modal_integration.py +326 -0
  26. wafer_core/tools/compile/types.py +117 -0
  27. {wafer_core-0.1.38.dist-info → wafer_core-0.1.39.dist-info}/METADATA +1 -1
  28. {wafer_core-0.1.38.dist-info → wafer_core-0.1.39.dist-info}/RECORD +29 -12
  29. wafer_core/rollouts/events.py +0 -240
  30. wafer_core/rollouts/progress_display.py +0 -476
  31. wafer_core/utils/event_streaming.py +0 -63
  32. {wafer_core-0.1.38.dist-info → wafer_core-0.1.39.dist-info}/WHEEL +0 -0

wafer_core/rollouts/evaluation.py
@@ -17,6 +17,7 @@ from typing import Any
 
 import trio
 
+from ._logging.logging_config import EvalLoggingContext, setup_eval_logging
 from .agents import run_agent
 from .dtypes import (
     Actor,
@@ -29,16 +30,19 @@ from .dtypes import (
     Score,
     StopReason,
     StreamChunk,
+    TextDelta,
     TextEnd,
+    ThinkingDelta,
     ToolExecutionEnd,
     Trajectory,
 )
-from .events import EventEmitter, emit_event
-from .progress import MultiProgress
 from .training.types import Sample, Status
 
 logger = logging.getLogger(__name__)
 
+# Logger for structured eval events — handlers configured by setup_eval_logging()
+_event_logger = logging.getLogger("wafer.eval.events")
+
 
 # ── Runtime Context ───────────────────────────────────────────────────────────
 
@@ -56,7 +60,6 @@ class EvalRuntime:
     config: EvalConfig
     api_limiter: trio.CapacityLimiter | None = None
     tool_limiter: trio.CapacityLimiter | None = None
-    progress: MultiProgress | None = None
 
 
 # JSON-like recursive type for sanitize_api_keys
@@ -187,17 +190,12 @@ async def _evaluate_batch(
     Used for incremental report writing.
     """
     config = runtime.config
-    progress = runtime.progress
     results: list[Sample] = []
     # Lock for thread-safe results access during concurrent execution
     results_lock = trio.Lock()
 
     async def run_one(sample_id: str, sample_data: dict[str, Any]) -> Sample:
         """Evaluate a single sample."""
-        task_name = sample_data.get("name", sample_id)
-        if progress:
-            progress.add_task(sample_id, name=task_name)
-
         # Get environment: prefer direct environment, fall back to factory
         if config.environment is not None:
             env = config.environment
@@ -212,17 +210,6 @@ async def _evaluate_batch(
             environment=env,
         )
 
-        # Mark task complete
-        if progress:
-            reward = result.score.reward if result.score else 0.0
-            success = result.metadata.get("status") == "success"
-            if success:
-                message = f"reward={reward:.2f}"
-            else:
-                error = result.metadata.get("error", "failed")
-                message = error[:30] if len(error) > 30 else error
-            progress.complete_task(sample_id, success=success, message=message)
-
         return result
 
     if config.max_concurrent == 1:
@@ -477,11 +464,8 @@ def _strip_ansi(text: str) -> str:
 
 
 def _truncate(text: str, max_len: int = 2000) -> str:
-    """Truncate long text and strip ANSI codes."""
-    text = _strip_ansi(text)
-    if len(text) <= max_len:
-        return text
-    return text[:max_len] + f"\n\n... (truncated, {len(text)} chars total)"
+    """Strip ANSI codes from text."""
+    return _strip_ansi(text)
 
 
 def _format_message_content(content: Any) -> str:
@@ -642,7 +626,9 @@ class EvalReport:
     timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
     git_info: dict[str, Any] = field(default_factory=_get_git_info)
     config_path: str | None = None  # Path to config file relative to repo root
-    metadata: dict[str, Any] | None = None  # Custom metadata (waferbench_category, github_runner, etc.)
+    metadata: dict[str, Any] | None = (
+        None  # Custom metadata (waferbench_category, github_runner, etc.)
+    )
 
     async def save(self, output_dir: Path) -> None:
         """Save evaluation results to directory."""
@@ -974,7 +960,6 @@ async def evaluate_sample(
     """
     # Unpack runtime for convenience
     config = runtime.config
-    progress = runtime.progress
 
     # Prepare initial messages from sample
     initial_messages = config.prepare_messages(sample_data)
@@ -1004,73 +989,101 @@ async def evaluate_sample(
     async def on_chunk_with_sample_id(event: object) -> None:
         nonlocal last_status, current_turn
 
-        # Update MultiProgress on various events for granular status
         status = _get_progress_status_for_event(event)
-        turn = _get_turn_from_event(event)
-
-        if progress is not None:
-            if status is not None or turn is not None:
-                progress.update_task(
-                    sample_id,
-                    turn=turn if turn is not None else None,
-                    status=status if status is not None else None,
-                )
 
-        # Emit to file for TUI - only on status changes to avoid flooding
+        # Emit to JSONL files via logging overview (INFO+) and per-sample (all levels)
         if isinstance(event, StreamChunk):
             if event.type == "turn_start":
                 turn_num = event.data.get("turn", 0)
                 current_turn[sample_id] = turn_num
-                emit_event("turn", id=sample_id, turn=turn_num, status="waiting")
+                _event_logger.info(
+                    "turn",
+                    extra={
+                        "sample_id": sample_id,
+                        "turn": turn_num,
+                        "status": "waiting",
+                    },
+                )
                 last_status[sample_id] = "waiting"
             elif event.type == "modal_progress":
-                emit_event("modal_progress", id=sample_id, phase=event.data.get("phase", ""))
+                _event_logger.info(
+                    "modal_progress",
+                    extra={
+                        "sample_id": sample_id,
+                        "phase": event.data.get("phase", ""),
+                    },
+                )
 
-        # Emit status changes for LLM events (streaming, thinking, tool calls)
+        # Emit status changes (dedup to avoid flooding)
         if status is not None and status != last_status.get(sample_id):
-            emit_event("turn", id=sample_id, status=status)
+            _event_logger.info("turn", extra={"sample_id": sample_id, "status": status})
             last_status[sample_id] = status
 
-        # Wide events: emit detailed timing for performance analysis
+        # Wide events: detailed timing for performance analysis
         sample_turn = current_turn.get(sample_id, 0)
        if isinstance(event, LLMCallEnd):
-            emit_event(
+            _event_logger.info(
                 "llm_call",
-                id=sample_id,
-                turn=sample_turn,
-                duration_ms=round(event.duration_ms, 1),
-                provider=event.provider,
-                model=event.model,
-                tokens_in=event.tokens_in,
-                tokens_out=event.tokens_out,
-                status=event.status,
-                error=event.error,
+                extra={
+                    "sample_id": sample_id,
+                    "turn": sample_turn,
+                    "duration_ms": round(event.duration_ms, 1),
+                    "provider": event.provider,
+                    "model": event.model,
+                    "tokens_in": event.tokens_in,
+                    "tokens_out": event.tokens_out,
+                    "status": event.status,
+                    "error": event.error,
+                },
             )
         elif isinstance(event, ToolExecutionEnd):
-            emit_event(
+            _event_logger.info(
                 "tool_execution",
-                id=sample_id,
-                turn=sample_turn,
-                tool_name=event.tool_name,
-                duration_ms=round(event.duration_ms, 1),
-                status=event.status,
-                is_error=event.is_error,
-                result_summary=event.result_summary,
+                extra={
+                    "sample_id": sample_id,
+                    "turn": sample_turn,
+                    "tool_name": event.tool_name,
+                    "duration_ms": round(event.duration_ms, 1),
+                    "status": event.status,
+                    "is_error": event.is_error,
+                    "result_summary": event.result_summary,
+                },
             )
         elif isinstance(event, TextEnd):
-            # Emit assistant message content for observability
-            # Truncate long content to avoid bloating events file
+            # Truncate for events.jsonl (INFO), full content in per-sample (also INFO)
             content = event.content
             truncated = len(content) > 2000
             if truncated:
                 content = content[:2000] + "..."
-            emit_event(
+            _event_logger.info(
                 "assistant_message",
-                id=sample_id,
-                turn=sample_turn,
-                content=content,
-                content_length=len(event.content),
-                truncated=truncated,
+                extra={
+                    "sample_id": sample_id,
+                    "turn": sample_turn,
+                    "content": content,
+                    "content_length": len(event.content),
+                    "truncated": truncated,
+                },
+            )
+
+        # DEBUG: streaming deltas — per-sample files only (filtered out of events.jsonl)
+        elif isinstance(event, TextDelta):
+            _event_logger.debug(
+                "text_delta",
+                extra={
+                    "sample_id": sample_id,
+                    "turn": sample_turn,
+                    "text": event.delta,
+                },
+            )
+        elif isinstance(event, ThinkingDelta):
+            _event_logger.debug(
+                "thinking_delta",
+                extra={
+                    "sample_id": sample_id,
+                    "turn": sample_turn,
+                    "text": event.delta,
+                },
             )
 
         # Wrap event with sample_id and forward to base handler
@@ -1107,13 +1120,13 @@ async def evaluate_sample(
         )
     )
 
-    # Also emit to file for TUI (if emitter configured)
+    # Emit sample_start for progress display
     # TODO: Retry logic can emit multiple sample_start events for the same sample_id
     # without a corresponding sample_end, causing progress display to show 100/100
     # while a sample is still running. Either emit sample_end before retry, or
     # don't emit sample_start on retries. See: chiraag/supabase-eval-traces PR #504
     sample_name = sample_data.get("name", sample_id)
-    emit_event("sample_start", id=sample_id, name=sample_name)
+    _event_logger.info("sample_start", extra={"sample_id": sample_id, "sample_name": sample_name})
 
     # Run agent with error handling
     result = await _run_agent_with_error_handling(initial_state, run_config, sample_id)
@@ -1202,8 +1215,7 @@ async def evaluate_sample(
         )
     )
 
-    # Also emit to file for TUI (if emitter configured)
-    emit_event("sample_end", id=sample_id, score=reward)
+    _event_logger.info("sample_end", extra={"sample_id": sample_id, "score": reward})
 
     return sample
 
@@ -1296,13 +1308,26 @@ async def evaluate(
     logger.info(f"max concurrent: {config.max_concurrent}")
     logger.debug("=" * 50)
 
-    # Initialize event emitter for TUI progress (writes to events.jsonl)
-    # This is separate from MultiProgress - events go to file for external TUI
-    emitter: EventEmitter | None = None
+    # Set up eval logging: events.jsonl (overview) + samples/{id}.jsonl (per-sample)
+    eval_logging: EvalLoggingContext | None = None
     if config.output_dir:
-        emitter = EventEmitter(output_dir=config.output_dir)
-        emitter.as_context()  # Make available via get_emitter()
-        emitter.emit("eval_start", name=config.eval_name, total=len(samples_to_eval))
+        eval_logging = setup_eval_logging(config.output_dir)
+        _event_logger.info(
+            "eval_start",
+            extra={
+                "eval_name": config.eval_name,
+                "total": len(samples_to_eval),
+            },
+        )
+
+    # Progress display: pytui subprocess that reads events.jsonl
+    # Requires output_dir since it reads events.jsonl from there
+    progress_ctx = None
+    if config.show_progress and config.output_dir:
+        from .progress_app import progress_display
+
+        progress_ctx = progress_display(output_dir=config.output_dir)
+        progress_ctx.__enter__()
 
     # Evaluate samples (with concurrency control)
     results: list[Sample] = []
@@ -1329,18 +1354,6 @@ async def evaluate(
         )
         last_report_count = len(all_results)
 
-    # Initialize progress display for sample-level tracking
-    # MultiProgress shows each concurrent sample with turn-by-turn updates
-    progress: MultiProgress | None = None
-    if config.show_progress:
-        progress = MultiProgress(
-            total=len(samples_to_eval),
-            desc=config.eval_name,
-            unit="sample",
-            verbose=config.verbose,  # verbose=True shows INFO logs, False shows only WARNING+
-        )
-        progress.__enter__()
-
     # Create two-level concurrency limiters if configured
     api_limiter = (
         trio.CapacityLimiter(config.max_api_concurrent)
@@ -1358,7 +1371,6 @@ async def evaluate(
         config=config,
         api_limiter=api_limiter,
         tool_limiter=tool_limiter,
-        progress=progress,
     )
 
     # Run initial evaluation batch with incremental report callback
@@ -1386,9 +1398,10 @@ async def evaluate(
         else:
             raise
 
-    # Close progress display
-    if progress:
-        progress.__exit__(None, None, None)
+    # Close progress TUI before any print() calls
+    if progress_ctx:
+        progress_ctx.__exit__(None, None, None)
+        progress_ctx = None
 
     # Write final partial report and upload if interrupted
     if interrupted and config.output_dir:
@@ -1409,10 +1422,17 @@ async def evaluate(
             print("Upload complete.")
         except Exception as e:
             print(f"Upload failed: {e}")
-        # Close emitter and exit
-        if emitter:
-            emitter.emit("eval_end", name=config.eval_name, total=len(results), interrupted=True)
-            emitter.close()
+        # Emit eval_end and clean up logging
+        if eval_logging:
+            _event_logger.info(
+                "eval_end",
+                extra={
+                    "eval_name": config.eval_name,
+                    "total": len(results),
+                    "interrupted": True,
+                },
+            )
+            eval_logging.teardown()
         # Exit cleanly - don't re-raise to avoid big traceback
         print(f"Partial results saved to {config.output_dir}")
         print("Resume with: --resume", config.output_dir)
@@ -1435,21 +1455,16 @@ async def evaluate(
             f"Retrying {len(failed_samples)} failed samples "
             f"(attempt {retry_attempt + 1}/{config.max_sample_retries}, waiting {wait_seconds}s)"
         )
-        if progress:
-            progress.log(retry_msg)
-        else:
-            logger.info(retry_msg)
+        logger.info(retry_msg)
         await trio.sleep(wait_seconds)
 
         # Remove failed samples and retry
         failed_ids = {sid for sid, _ in failed_samples}
         results = [r for r in results if r.id not in failed_ids]
-        # Create runtime without progress for retries (no incremental reports during retry)
         retry_runtime = EvalRuntime(
             config=config,
             api_limiter=api_limiter,
             tool_limiter=tool_limiter,
-            progress=None,
         )
         retry_results = await _evaluate_batch(failed_samples, retry_runtime)
         results.extend(retry_results)
@@ -1471,10 +1486,7 @@ async def evaluate(
         retry_result_msg = (
             f"Retry {retry_attempt + 1}: {succeeded} succeeded, {still_failed} still failing"
         )
-        if progress:
-            progress.log(retry_result_msg)
-        else:
-            logger.info(retry_result_msg)
+        logger.info(retry_result_msg)
 
     # Compute summary metrics
     summary_metrics = compute_summary_metrics(results)
@@ -1517,10 +1529,21 @@ async def evaluate(
         else:
             logger.info(f"{key}: {value}")
 
-    # Close event emitter
-    if emitter:
-        emitter.emit("eval_end", name=config.eval_name, total=len(results))
-        emitter.close()
+    # Emit eval_end and clean up logging (before progress TUI so it sees eval_end)
+    if eval_logging:
+        _event_logger.info(
+            "eval_end",
+            extra={
+                "eval_name": config.eval_name,
+                "total": len(results),
+            },
+        )
+        eval_logging.teardown()
+
+    # Close progress TUI
+    if progress_ctx:
+        progress_ctx.__exit__(None, None, None)
+        progress_ctx = None
 
     return report
 
@@ -1571,7 +1594,7 @@ def compute_summary_metrics(results: list[Sample]) -> dict[str, float]:
             median_val = (sorted_values[n // 2 - 1] + sorted_values[n // 2]) / 2
         else:
             median_val = sorted_values[n // 2]
-
+
         summary[f"mean_{metric_name}"] = mean_val
         summary[f"median_{metric_name}"] = median_val
         summary[f"min_{metric_name}"] = min(values)
@@ -1800,75 +1823,22 @@ def get_api_key(provider: str = "anthropic") -> str | None:
     return None
 
 
-def run_with_progress(
-    eval_fn: Callable[[Any], Any],
-    config: Any,
-    output_dir: Path,
-    quiet_config_fn: Callable[[Any], Any] | None = None,
-    async_wrapper: Callable[[Callable, Any], Callable[[], Any]] | None = None,
-) -> dict[str, Any]:
-    """Run evaluation with progress display TUI.
-
-    Wraps an async eval function with the progress_display context manager,
-    which redirects stdout/stderr to output.log and renders a TUI.
-
-    Args:
-        eval_fn: Async evaluation function that takes config and returns results
-        config: Configuration object
-        output_dir: Directory for output files (events.jsonl, output.log)
-        quiet_config_fn: Optional function to create a quiet version of config
-            (disables internal verbose/show_progress flags)
-        async_wrapper: Optional wrapper for async runtime compatibility (e.g., trio_asyncio).
-            Takes (eval_fn, config) and returns an async callable for trio.run().
-
-    Returns:
-        Results dict from eval_fn
-
-    Example:
-        def my_quiet_config(config):
-            return replace(config, run=replace(config.run, verbose=False, show_progress=False))
-
-        result = run_with_progress(
-            evaluate_my_task,
-            config,
-            config.output.output_dir,
-            quiet_config_fn=my_quiet_config,
-        )
-    """
-    from .progress_display import progress_display
-
-    # Apply quiet config transformation if provided
-    run_config = quiet_config_fn(config) if quiet_config_fn else config
-
-    with progress_display(output_dir=output_dir):
-        if async_wrapper:
-            result = trio.run(async_wrapper(eval_fn, run_config))
-        else:
-            result = trio.run(eval_fn, run_config)
-        assert result is not None, "Evaluation was cancelled"
-
-    return result
-
-
 def run_eval(
     eval_fn: Callable[[Any], Any],
     config: Any,
     output_dir: Path,
-    show_progress: bool = False,
-    quiet_config_fn: Callable[[Any], Any] | None = None,
     print_summary_fn: Callable[[dict[str, Any], Path], None] | None = None,
     async_wrapper: Callable[[Callable, Any], Callable[[], Any]] | None = None,
 ) -> dict[str, Any]:
     """Standard entry point for running evaluations.
 
-    Handles the common pattern of optionally wrapping eval in progress display.
+    Runs the async eval function via trio.run(). Progress display is handled
+    internally by evaluate() based on config.show_progress.
 
     Args:
         eval_fn: Async evaluation function that takes config and returns results
         config: Configuration object
         output_dir: Directory for output files
-        show_progress: Whether to show progress TUI
-        quiet_config_fn: Optional function to create quiet config for progress mode
         print_summary_fn: Optional function to print summary after completion
         async_wrapper: Optional wrapper for async runtime compatibility (e.g., trio_asyncio).
             Takes (eval_fn, config) and returns an async callable for trio.run().
@@ -1883,8 +1853,6 @@ def run_eval(
             eval_fn=evaluate_my_task,
             config=config,
             output_dir=config.output.output_dir,
-            show_progress=config.run.show_progress,
-            quiet_config_fn=lambda c: replace(c, run=replace(c.run, verbose=False, show_progress=False)),
             print_summary_fn=print_my_summary,
         )
 
@@ -1904,14 +1872,11 @@ def run_eval(
             async_wrapper=asyncio_compat_wrapper,
         )
     """
-    if show_progress:
-        result = run_with_progress(eval_fn, config, output_dir, quiet_config_fn, async_wrapper)
+    if async_wrapper:
+        result = trio.run(async_wrapper(eval_fn, config))
     else:
-        if async_wrapper:
-            result = trio.run(async_wrapper(eval_fn, config))
-        else:
-            result = trio.run(eval_fn, config)
-        assert result is not None, "Evaluation was cancelled"
+        result = trio.run(eval_fn, config)
+    assert result is not None, "Evaluation was cancelled"
 
     if print_summary_fn:
         print_summary_fn(result, output_dir)