PyPI - EvoScientist - Versions diffs - 0.0.1.dev2__py3-none-any.whl - Mend

EvoScientist 0.0.1.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (107) hide show

EvoScientist/stream/formatter.py ADDED Viewed

@@ -0,0 +1,168 @@
+"""
+ToolResultFormatter - content-aware tool result formatting with Rich.
+Detects content type (success/error/json/markdown/text) and formats accordingly.
+"""
+import json
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any, List
+from rich.panel import Panel
+from rich.syntax import Syntax
+from rich.text import Text
+from rich.markdown import Markdown
+from .utils import SUCCESS_PREFIX, FAILURE_PREFIX, is_success as _is_success, truncate
+class ContentType(Enum):
+    """Content type categories assigned by ToolResultFormatter.detect_type."""
+    # Output starting with SUCCESS_PREFIX whose body is not JSON.
+    SUCCESS = "success"
+    # Output starting with FAILURE_PREFIX, or containing a traceback/error marker.
+    ERROR = "error"
+    # Output (or the body of a successful result) that parses as a JSON object or array.
+    JSON = "json"
+    # Output starting with '#' or containing Markdown markers such as code fences.
+    MARKDOWN = "markdown"
+    # Fallback when none of the other categories match.
+    TEXT = "text"
+@dataclass
+class FormattedResult:
+    """Formatted result container returned by ToolResultFormatter.format."""
+    # Category chosen by detect_type; it selects which formatter built `elements`.
+    content_type: ContentType
+    elements: List[Any]  # Rich renderable elements
+    # Outcome reported by is_success for the raw content; defaults to True.
+    success: bool = True
+class ToolResultFormatter:
+    """Tool result formatter with content type detection.
+    Usage:
+        formatter = ToolResultFormatter()
+        result = formatter.format("execute", output, max_length=800)
+        for elem in result.elements:
+            console.print(elem)
+    """
+    def detect_type(self, content: str) -> ContentType:
+        """Classify tool output, checking in priority order.
+        Status prefixes win over everything else; after that JSON, error
+        markers and Markdown markers are tried before falling back to TEXT.
+        Surrounding whitespace is ignored for the purpose of detection.
+        """
+        content = content.strip()
+        if content.startswith(SUCCESS_PREFIX):
+            # A successful result whose body is valid JSON is shown as JSON.
+            if self._is_json(self._extract_body(content)):
+                return ContentType.JSON
+            return ContentType.SUCCESS
+        if content.startswith(FAILURE_PREFIX):
+            return ContentType.ERROR
+        if self._is_json(content):
+            return ContentType.JSON
+        if self._is_error(content):
+            return ContentType.ERROR
+        if self._is_markdown(content):
+            return ContentType.MARKDOWN
+        return ContentType.TEXT
+    def is_success(self, content: str) -> bool:
+        """Check if content indicates successful execution."""
+        return _is_success(content)
+    def format(self, name: str, content: str, max_length: int = 800) -> FormattedResult:
+        """Format a tool result based on its detected content type.
+        Args:
+            name: Tool name shown in the title or label of the output.
+            content: Raw tool output.
+            max_length: Limit handed to truncate() for the displayed text.
+        Returns:
+            FormattedResult holding the detected type, the Rich renderables
+            and the success flag computed from the raw content.
+        """
+        content_type = self.detect_type(content)
+        success = self.is_success(content)
+        formatter_map = {
+            ContentType.SUCCESS: self._format_success,
+            ContentType.ERROR: self._format_error,
+            ContentType.JSON: self._format_json,
+            ContentType.MARKDOWN: self._format_markdown,
+            ContentType.TEXT: self._format_text,
+        }
+        # Unknown categories fall back to plain-text rendering.
+        formatter = formatter_map.get(content_type, self._format_text)
+        elements = formatter(name, content, max_length)
+        return FormattedResult(content_type=content_type, elements=elements, success=success)
+    def _extract_body(self, content: str) -> str:
+        """Return everything after the second newline, stripped ("" if absent).
+        NOTE(review): this skips the status line plus one further line;
+        presumably the producer emits a separator line there - confirm.
+        """
+        lines = content.split("\n", 2)
+        return lines[2].strip() if len(lines) > 2 else ""
+    def _is_json(self, content: str) -> bool:
+        """Return True when content is a parseable JSON object or array."""
+        content = content.strip()
+        if not content:
+            return False
+        # Cheap bracket check first so ordinary text never reaches the parser.
+        looks_like_object = content.startswith('{') and content.endswith('}')
+        looks_like_array = content.startswith('[') and content.endswith(']')
+        if not (looks_like_object or looks_like_array):
+            return False
+        try:
+            json.loads(content)
+        except (json.JSONDecodeError, ValueError):
+            return False
+        return True
+    def _is_error(self, content: str) -> bool:
+        """Return True when content contains a traceback or error marker."""
+        error_patterns = [
+            'Traceback (most recent call last)',
+            'Exception:',
+            'Error:',
+        ]
+        return any(pattern in content for pattern in error_patterns)
+    def _is_markdown(self, content: str) -> bool:
+        """Return True when content starts with '#' or contains a Markdown marker."""
+        # '- **' is not listed separately: any text containing it also contains '**'.
+        md_patterns = ('```', '**', '##')
+        return content.startswith('#') or any(p in content for p in md_patterns)
+    def _format_success(self, name: str, content: str, max_length: int) -> List[Any]:
+        """Render successful output as a green panel titled "<name> OK"."""
+        display = truncate(content, max_length)
+        return [Panel(
+            Text(display, style="green"),
+            title=f"{name} OK",
+            border_style="green",
+        )]
+    def _format_error(self, name: str, content: str, max_length: int) -> List[Any]:
+        """Render failed output as a red panel titled "<name> FAILED"."""
+        display = truncate(content, max_length)
+        return [Panel(
+            Text(display, style="red"),
+            title=f"{name} FAILED",
+            border_style="red",
+        )]
+    def _format_json(self, name: str, content: str, max_length: int) -> List[Any]:
+        """Render JSON output pretty-printed with syntax highlighting.
+        Falls back to plain-text rendering of the original content when the
+        payload cannot be parsed.
+        """
+        # Normalise whitespace the same way detect_type does. Without this a
+        # prefixed result with leading whitespace is classified as JSON but
+        # misses the prefix check below and ends up rendered as plain text.
+        json_content = content.strip()
+        if json_content.startswith(SUCCESS_PREFIX):
+            json_content = self._extract_body(json_content)
+        try:
+            data = json.loads(json_content)
+            formatted = json.dumps(data, indent=2, ensure_ascii=False)
+        except (json.JSONDecodeError, ValueError):
+            return self._format_text(name, content, max_length)
+        formatted = truncate(formatted, max_length)
+        return [
+            Text(f"{name} OK", style="cyan bold"),
+            Syntax(formatted, "json", theme="monokai", line_numbers=False),
+        ]
+    def _format_markdown(self, name: str, content: str, max_length: int) -> List[Any]:
+        """Render Markdown output inside a panel titled with the tool name."""
+        display = truncate(content, max_length)
+        return [Panel(
+            Markdown(display),
+            title=f"{name}",
+            border_style="cyan dim",
+        )]
+    def _format_text(self, name: str, content: str, max_length: int) -> List[Any]:
+        """Render plain output as a bold label followed by dimmed text."""
+        display = truncate(content, max_length)
+        return [
+            Text(f"{name}:", style="cyan bold"),
+            Text(f"   {display}", style="dim"),
+        ]

EvoScientist/stream/tracker.py ADDED Viewed

@@ -0,0 +1,115 @@
+"""
+ToolCallTracker - manages incremental JSON parsing for tool parameters.
+Handles tool_use blocks where arguments arrive in fragments via input_json_delta.
+"""
+import json
+from dataclasses import dataclass, field
+from typing import Dict, Optional
+@dataclass
+class ToolCallInfo:
+    """Tool call information tracked by ToolCallTracker."""
+    id: str  # identifier; ToolCallTracker uses it as the dictionary key
+    name: str  # tool name; ToolCallTracker.update stores "" when none is given
+    args: Dict = field(default_factory=dict)  # arguments, replaced once the JSON buffer is parsed
+    emitted: bool = False  # set by mark_emitted() / emit_all_pending()
+    args_complete: bool = False  # set by update(args_complete=True) or finalize_all()
+    _json_buffer: str = ""  # raw JSON fragments accumulated until finalize_all() parses them
+class ToolCallTracker:
+    """Tool call tracker for incremental argument parsing.
+    Usage:
+        tracker = ToolCallTracker()
+        tracker.update(tool_id, name="execute")
+        tracker.append_json_delta('{"command')
+        tracker.append_json_delta('": "ls"}')
+        tracker.finalize_all()
+        info = tracker.get(tool_id)
+        yield emitter.tool_call(info.name, info.args)
+    """
+    def __init__(self):
+        # Calls keyed by tool id; dict insertion order is the order in which
+        # get_all() and get_pending() return them.
+        self._calls: Dict[str, ToolCallInfo] = {}
+        # Id of the most recently created call; JSON deltas are routed to it.
+        self._last_tool_id: Optional[str] = None
+    def update(
+        self,
+        tool_id: str,
+        name: Optional[str] = None,
+        args: Optional[Dict] = None,
+        args_complete: bool = False,
+    ) -> None:
+        """Update tool call info (accumulative).
+        Args:
+            tool_id: Identifier of the call to create or update.
+            name: Tool name; an empty value never overwrites a stored name.
+            args: Arguments; an empty value never overwrites stored arguments.
+            args_complete: Marks the arguments as final. It is only ever
+                raised to True here, never reset to False.
+        """
+        info = self._calls.get(tool_id)
+        if info is None:
+            self._calls[tool_id] = ToolCallInfo(
+                id=tool_id,
+                name=name or "",
+                args=args or {},
+                args_complete=args_complete,
+            )
+            # Only a newly created call becomes the target for JSON deltas.
+            self._last_tool_id = tool_id
+            return
+        if name:
+            info.name = name
+        if args:
+            info.args = args
+        if args_complete:
+            info.args_complete = True
+    def append_json_delta(self, partial_json: str, index: int = 0) -> None:
+        """Accumulate input_json_delta fragment.
+        The fragment is appended to the buffer of the most recently created
+        tool call. `index` is accepted for interface compatibility but is not
+        used for routing. Fragments arriving before any call exists are dropped.
+        """
+        tool_id = self._last_tool_id
+        if tool_id and tool_id in self._calls:
+            self._calls[tool_id]._json_buffer += partial_json
+    def finalize_all(self) -> None:
+        """Finalize all tool calls: parse accumulated JSON and mark complete."""
+        for info in self._calls.values():
+            if info._json_buffer:
+                try:
+                    parsed = json.loads(info._json_buffer)
+                except json.JSONDecodeError:
+                    # Best effort: keep whatever arguments were already stored.
+                    parsed = None
+                # `args` is a dict that consumers read with .get(); a buffer
+                # that decodes to a list, string or number must not replace it.
+                if isinstance(parsed, dict):
+                    info.args = parsed
+                info._json_buffer = ""
+            info.args_complete = True
+    def is_ready(self, tool_id: str) -> bool:
+        """Check if a tool call is ready to emit (has name and not yet emitted)."""
+        info = self._calls.get(tool_id)
+        if info is None:
+            return False
+        return bool(info.name) and not info.emitted
+    def get_all(self) -> list[ToolCallInfo]:
+        """Get all tool calls."""
+        return list(self._calls.values())
+    def mark_emitted(self, tool_id: str) -> None:
+        """Mark a tool call as emitted; unknown ids are ignored."""
+        if tool_id in self._calls:
+            self._calls[tool_id].emitted = True
+    def get(self, tool_id: str) -> Optional[ToolCallInfo]:
+        """Get tool call info by ID, or None when the id is unknown."""
+        return self._calls.get(tool_id)
+    def get_pending(self) -> list[ToolCallInfo]:
+        """Get all unemitted tool calls."""
+        return [info for info in self._calls.values() if not info.emitted]
+    def emit_all_pending(self) -> list[ToolCallInfo]:
+        """Emit all pending tool calls and mark them."""
+        pending = self.get_pending()
+        for info in pending:
+            info.emitted = True
+        return pending
+    def clear(self) -> None:
+        """Clear all tracked tool calls."""
+        self._calls.clear()
+        # Drop the delta target as well so no stale id outlives its call.
+        self._last_tool_id = None

EvoScientist/stream/utils.py ADDED Viewed

@@ -0,0 +1,255 @@
+"""
+Stream utility functions and constants.
+Provides tool status indicators, display limits, and formatting helpers
+adapted for deepagents tool names.
+"""
+import sys
+from pathlib import PurePath
+from enum import Enum
+# === Status marker constants ===
+# Leading markers on tool output. is_success() below and the formatter module
+# (which imports both names) test for them with str.startswith after stripping
+# surrounding whitespace.
+SUCCESS_PREFIX = "[OK]"
+FAILURE_PREFIX = "[FAILED]"
+# === Tool status indicators ===
+class ToolStatus(str, Enum):
+    """Tool execution status indicators.
+    Each member's value is the glyph shown for that status; the trailing
+    comments name the colour associated with it.
+    NOTE(review): RUNNING, SUCCESS and ERROR are all assigned the same
+    filled-circle glyph. Enum treats members with equal values as aliases,
+    so SUCCESS and ERROR resolve to the RUNNING member and the three cannot
+    be told apart by identity, equality or dict lookup. Giving them distinct
+    values would fix this but changes what `.value` returns - confirm how
+    callers use it before changing.
+    """
+    RUNNING = "\u25cf"   # Running - yellow
+    SUCCESS = "\u25cf"   # Success - green
+    ERROR = "\u25cf"     # Failed - red
+    PENDING = "\u25cb"   # Pending - gray
+def get_status_symbol(status: ToolStatus) -> str:
+    """Return the glyph for *status*, or an ASCII stand-in on non-UTF terminals.
+    The Unicode glyph (the member's value) is used when the name of
+    sys.stdout's encoding contains "utf"; any failure while inspecting the
+    encoding selects the ASCII table instead.
+    """
+    try:
+        encoding = sys.stdout.encoding
+        unicode_ok = encoding and 'utf' in encoding.lower()
+    except Exception:
+        unicode_ok = False
+    if unicode_ok:
+        return status.value
+    # NOTE(review): ToolStatus.RUNNING, SUCCESS and ERROR share one value and
+    # are therefore the same enum member, so the first three keys collapse
+    # into a single entry and the last one listed ("x") wins for all three.
+    ascii_symbols = {
+        ToolStatus.RUNNING: "*",
+        ToolStatus.SUCCESS: "+",
+        ToolStatus.ERROR: "x",
+        ToolStatus.PENDING: "-",
+    }
+    return ascii_symbols.get(status, "?")
+# === Display limit constants ===
+class DisplayLimits:
+    """Display length limits.
+    Plain integer class attributes grouped by what is being displayed.
+    NOTE(review): presumably character counts passed as `max_length` to
+    truncate() - confirm against the callers.
+    """
+    # Thinking output: while streaming / once final.
+    THINKING_STREAM = 1000
+    THINKING_FINAL = 2000
+    # Tool arguments: inline form / expanded form.
+    ARGS_INLINE = 100
+    ARGS_FORMATTED = 300
+    # Tool results: while streaming / once final / upper bound.
+    TOOL_RESULT_STREAM = 500
+    TOOL_RESULT_FINAL = 800
+    TOOL_RESULT_MAX = 2000
+def has_args(args) -> bool:
+    """Report whether *args* carries content.
+    Only None and the empty dict count as "no args"; other falsy values
+    such as an empty list are still reported as present.
+    """
+    if args is None:
+        return False
+    return args != {}
+def is_success(content: str) -> bool:
+    """Decide whether tool output reports a successful execution.
+    An explicit status prefix settles the question; without one the output
+    counts as a failure only if it contains a traceback or error marker.
+    """
+    text = content.strip()
+    if text.startswith(SUCCESS_PREFIX):
+        return True
+    if text.startswith(FAILURE_PREFIX):
+        return False
+    failure_markers = (
+        'Traceback (most recent call last)',
+        'Exception:',
+        'Error:',
+    )
+    for marker in failure_markers:
+        if marker in text:
+            return False
+    return True
+def truncate(content: str, max_length: int, suffix: str = "\n... (truncated)") -> str:
+    """Truncate content to specified length.
+    Args:
+        content: Text to shorten.
+        max_length: Number of leading characters to keep; negative values
+            are treated as 0.
+        suffix: Marker appended after the kept characters when text was cut.
+            It is added on top of max_length, so a truncated result is
+            longer than max_length by len(suffix).
+    Returns:
+        content unchanged when it fits, otherwise the kept prefix plus suffix.
+    """
+    # Clamp so a negative limit cannot turn into an end-relative slice
+    # (content[:-n]) that would keep most of the text.
+    limit = max(max_length, 0)
+    if len(content) <= limit:
+        return content
+    return content[:limit] + suffix
+# === Compact formatting for deepagents tools ===
+def _shorten_path(path: str, max_len: int = 40) -> str:
+    """Abbreviate an over-long path to ".../<parent>/<name>" for display.
+    Paths within *max_len*, and paths with at most two components, are
+    returned unchanged.
+    """
+    if len(path) > max_len:
+        components = PurePath(path).parts
+        if len(components) > 2:
+            # Keep only the last two components behind an ellipsis marker.
+            return ".../" + "/".join(components[-2:])
+    return path
+def _arg_text(value) -> str:
+    """Return *value* as a string, mapping None to "" so len()/strip() are safe."""
+    if value is None:
+        return ""
+    return value if isinstance(value, str) else str(value)
+def _clip_text(value, limit: int) -> str:
+    """Return *value* as text, cut to *limit* characters ending in "..."."""
+    text = _arg_text(value)
+    if len(text) > limit:
+        return text[:limit - 3] + "..."
+    return text
+def format_tool_compact(name: str, args: dict | None) -> str:
+    """Format as compact tool call string: ToolName(key_arg).
+    Adapted for deepagents tool names: execute, read_file, write_file,
+    edit_file, grep, glob, ls, write_todos, read_todos, task, load_skill,
+    tavily_search, think_tool.
+    Args:
+        name: Tool name; matched case-insensitively against the known tools.
+        args: Tool arguments. None or an empty mapping yields "name()".
+            Values that are None or not strings are converted to text
+            instead of raising.
+    Returns:
+        A one-line summary. Unknown tools show their first two arguments.
+    """
+    if not args:
+        return f"{name}()"
+    name_lower = name.lower()
+    # Shell execution
+    if name_lower == "execute":
+        cmd = _clip_text(args.get("command", ""), 50)
+        return f"execute({cmd})"
+    # File operations. "path" is tried first; "file_path" is accepted as a
+    # fallback spelling for the same argument.
+    if name_lower in ("read_file", "write_file", "edit_file"):
+        raw_path = args.get("path", args.get("file_path", ""))
+        path = _shorten_path(_arg_text(raw_path))
+        return f"{name_lower}({path})"
+    # Search operations
+    if name_lower == "glob":
+        pattern = _clip_text(args.get("pattern", ""), 40)
+        return f"glob({pattern})"
+    if name_lower == "grep":
+        pattern = _clip_text(args.get("pattern", ""), 30)
+        path = args.get("path", ".")
+        return f"grep({pattern}, {path})"
+    # Directory listing
+    if name_lower == "ls":
+        path = args.get("path", ".")
+        return f"ls({path})"
+    # Todo management
+    if name_lower == "write_todos":
+        todos = args.get("todos", [])
+        if isinstance(todos, list):
+            return f"write_todos({len(todos)} items)"
+        return "write_todos(...)"
+    if name_lower == "read_todos":
+        return "read_todos()"
+    # Sub-agent delegation — display as "Cooking with {agent}" instead of "task()"
+    if name_lower == "task":
+        sa_type = _arg_text(args.get("subagent_type", "")).strip()
+        raw_desc = args.get("description", args.get("task", ""))
+        # Strip first, then shorten, so the limit applies to visible text.
+        task_desc = _clip_text(_arg_text(raw_desc).strip(), 50)
+        # Fallback label if no subagent_type
+        label = f"Cooking with {sa_type or 'sub-agent'}"
+        if task_desc:
+            return f"{label} — {task_desc}"
+        return label
+    # Skills
+    if name_lower == "load_skill":
+        skill_name = args.get("skill_name", args.get("name", ""))
+        return f"load_skill({skill_name})"
+    # Web search
+    if name_lower in ("tavily_search", "internet_search"):
+        query = _clip_text(args.get("query", ""), 40)
+        return f"{name}({query})"
+    # Think/reflection
+    if name_lower == "think_tool":
+        reflection = _clip_text(args.get("reflection", ""), 40)
+        return f"think_tool({reflection})"
+    # Default: show first few params
+    params = [
+        f"{key}={_clip_text(str(value), 20)}"
+        for key, value in list(args.items())[:2]
+    ]
+    params_str = _clip_text(", ".join(params), 50)
+    return f"{name}({params_str})"
+def format_tree_output(lines: list[str], max_lines: int = 5, indent: str = "  ") -> str:
+    """Render lines as an indented tree-style block.
+    The first shown line is marked with a corner glyph, the following ones
+    are aligned beneath it, and a summary line reports how many lines were
+    left out when there are more than *max_lines*.
+    Example:
+        └ On branch main
+          Your branch is up to date
+          ... +16 lines
+    """
+    if not lines:
+        return ""
+    corner = "\u2514"
+    rendered = [
+        f"{indent}{corner if position == 0 else ' '} {text}"
+        for position, text in enumerate(lines[:max_lines])
+    ]
+    hidden = len(lines) - max_lines
+    if hidden > 0:
+        rendered.append(f"{indent}  ... +{hidden} lines")
+    return "\n".join(rendered)
+def count_lines(content: str) -> int:
+    """Count number of lines in content.
+    Leading and trailing whitespace (including blank lines) is ignored.
+    Empty, None and whitespace-only content all count as zero lines.
+    """
+    stripped = content.strip() if content else ""
+    # Check after stripping: "".split("\n") yields [""], which would
+    # otherwise report one line for whitespace-only input.
+    if not stripped:
+        return 0
+    return len(stripped.split("\n"))
+def truncate_with_line_hint(content: str, max_lines: int = 5) -> tuple[str, int]:
+    """Keep at most *max_lines* lines and report how many were dropped.
+    Returns:
+        (text, remaining): the stripped content and 0 when it fits,
+        otherwise the first *max_lines* lines and the number of lines cut.
+    """
+    body = content.strip()
+    rows = body.split("\n")
+    overflow = len(rows) - max_lines
+    if overflow <= 0:
+        return body, 0
+    return "\n".join(rows[:max_lines]), overflow

EvoScientist/subagent.yaml ADDED Viewed

@@ -0,0 +1,147 @@
+planner-agent:
+  description: "Plan experiments: stages, success signals, and dependencies (no web search, no implementation)."
+  tools: [think_tool]
+  system_prompt: |
+    You are the planner-agent. You do NOT implement code. You create and update experimental plans
+    that are practical to run locally.
+    You may be invoked in two modes:
+    1) PLAN MODE: produce an initial experimental plan.
+    2) REFLECTION MODE: update the plan based on stage results.
+    The caller should start the task with either:
+    - MODE: PLAN
+    - MODE: REFLECTION
+    If MODE is not specified, assume PLAN.
+    PLAN MODE output (Markdown):
+    1) Assumptions & scope
+    2) Stages (numbered). For each stage include:
+       - goal
+       - success signals (metrics/thresholds or qualitative checks)
+       - what to run (scripts/commands at a high level)
+       - expected artifacts (tables/plots/logs)
+    3) Dependencies (data, compute, environment)
+    4) Iteration triggers (when to change dataset/model/objective)
+    5) Evaluation protocol (splits, primary metrics, baselines) and data quality checks
+    6) Environment preflight (GPU/CUDA/VRAM/disk) and required dependencies (pip packages)
+    REFLECTION MODE output (JSON only, no extra text):
+    {
+      "completed": ["..."],
+      "unmet_success_signals": ["..."],
+      "skill_suggestions": ["..."],
+      "stage_modifications": [
+        {"stage": "Stage name or index", "change": "What to adjust and why"}
+      ],
+      "new_stages": [
+        {
+          "title": "...",
+          "goal": "...",
+          "success_signals": ["..."],
+          "what_to_run": ["..."],
+          "expected_artifacts": ["..."]
+        }
+      ],
+      "todo_updates": ["..."]
+    }
+    Empty arrays are valid. If no changes are needed, return the JSON with empty arrays.
+    "skill_suggestions" must contain skill ids from SKILL.md frontmatter ("name:").
+    Keep the structure flexible (not rigid templates). If model size is unspecified, default to
+    <=7B-class models and lightweight baselines.
+research-agent:
+  description: "Web research for methods/baselines/datasets (one topic at a time, return actionable notes + sources)."
+  tools: [tavily_search, think_tool]
+  system_prompt_ref: RESEARCHER_INSTRUCTIONS
+code-agent:
+  description: "Implement experiment code and runnable scripts; keep changes minimal and reproducible."
+  tools: [think_tool]
+  system_prompt: |
+    You are the code-agent. Implement experiment code in the workspace and keep changes minimal,
+    reproducible, and easy to run.
+    Guidelines:
+    - Prefer small scripts and clear entry points.
+    - Record exact commands to run and where outputs are written.
+    - Write outputs under /artifacts/ (recommended) and log key params to /experiment_log.md (optional).
+    - Do not modify /skills/.
+    - If a relevant local skill exists, load it (load_skill) and follow it instead of reinventing.
+    - Before heavy runs, confirm GPU/CUDA/VRAM availability and required packages.
+    - Suggested preflight commands:
+      - nvidia-smi
+      - python -c "import torch; print(torch.cuda.is_available(), torch.version.cuda, torch.cuda.get_device_name(0))"
+    When responding, include:
+    - Files changed
+    - Commands to run
+    - Output paths
+    - Any remaining issues/next steps
+debug-agent:
+  description: "Debug runtime failures and fix bugs with minimal, verifiable patches."
+  tools: [think_tool]
+  system_prompt: |
+    You are the debug-agent. Reproduce failures, identify root causes, apply minimal fixes, and provide
+    concise diagnostics.
+    Guidelines:
+    - Prefer small, safe changes.
+    - Explain the root cause in one paragraph.
+    - Provide how to reproduce and how to verify the fix.
+    - Do not modify /skills/.
+    - If a relevant local skill exists, load it (load_skill) and use it as a checklist.
+    When responding, include:
+    - Root cause
+    - Fix summary (files/changes)
+    - Repro steps
+    - Verification steps
+data-analysis-agent:
+  description: "Analyze experiment outputs: compute metrics, make plots, summarize insights."
+  tools: [think_tool]
+  system_prompt: |
+    You are the data-analysis-agent. Analyze experiment outputs, compute metrics, and create
+    publication-friendly plots.
+    Guidelines:
+    - Do not invent numbers; compute from files or state what is missing.
+    - Save figures/tables under /artifacts/ (recommended) and reference paths.
+    - Summarize insights and provide 1-3 recommended next experiments.
+    - If a relevant local skill exists (evaluation, logging, plotting), load it (load_skill) and follow it.
+    - Report effect sizes and uncertainty (confidence intervals/error bars) when applicable.
+    - Apply multiple-testing corrections when comparing many conditions.
+    - Distinguish exploratory vs confirmatory findings.
+    When responding, include:
+    - Metrics computed (with definitions)
+    - Figures/tables produced (paths)
+    - Interpretation and next steps
+writing-agent:
+  description: "Draft a paper-ready Markdown experiment report (no fabricated results/citations)."
+  tools: [think_tool]
+  system_prompt: |
+    You are the writing-agent. Draft a clear Markdown experimental report suitable for later paper writing.
+    Guidelines:
+    - Use the experiment plan, logs, and artifacts. Reference file paths for figures/tables.
+    - Do not fabricate results or citations.
+    - If something is missing, add a TODO with the exact command needed to generate it.
+    - If a relevant local skill exists (e.g., evaluation/reporting conventions), load it (load_skill) and apply it.
+    - Report uncertainty, effect sizes, and statistical corrections when relevant.
+    - Include negative results and clear limitations.
+    - Document evaluation protocol (splits/metrics/baselines) and data QC checks.
+    Preferred sections:
+    1) Summary & goals
+    2) Experiment plan (stages + success signals)
+    3) Setup (data, model, environment, parameters)
+    4) Baselines and comparisons
+    5) Results (with artifact paths)
+    6) Analysis, limitations, and next steps
+    7) Sources (only if web research was used)