PyPI - epi-recorder - Versions diffs - 2.1.3__py3-none-any.whl → 2.2.0__py3-none-any.whl - Mend

epi-recorder 2.1.3-py3-none-any.whl → 2.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

epi_analyzer/__init__.py +9 -0
epi_analyzer/detector.py +337 -0
epi_cli/__init__.py +4 -0
epi_cli/__main__.py +4 -0
epi_cli/chat.py +21 -3
epi_cli/debug.py +107 -0
epi_cli/keys.py +4 -0
epi_cli/ls.py +5 -1
epi_cli/main.py +8 -0
epi_cli/record.py +4 -0
epi_cli/run.py +12 -4
epi_cli/verify.py +4 -0
epi_cli/view.py +4 -0
epi_core/__init__.py +5 -1
epi_core/container.py +68 -55
epi_core/redactor.py +4 -0
epi_core/schemas.py +6 -2
epi_core/serialize.py +4 -0
epi_core/storage.py +186 -0
epi_core/trust.py +4 -0
epi_recorder/__init__.py +5 -1
epi_recorder/api.py +28 -2
epi_recorder/async_api.py +151 -0
epi_recorder/bootstrap.py +4 -0
epi_recorder/environment.py +4 -0
epi_recorder/patcher.py +33 -13
epi_recorder/test_import.py +2 -0
epi_recorder/test_script.py +2 -0
epi_recorder-2.2.0.dist-info/METADATA +162 -0
epi_recorder-2.2.0.dist-info/RECORD +38 -0
{epi_recorder-2.1.3.dist-info → epi_recorder-2.2.0.dist-info}/WHEEL +1 -1
{epi_recorder-2.1.3.dist-info → epi_recorder-2.2.0.dist-info}/licenses/LICENSE +4 -29
{epi_recorder-2.1.3.dist-info → epi_recorder-2.2.0.dist-info}/top_level.txt +1 -0
epi_viewer_static/app.js +38 -7
epi_viewer_static/crypto.js +3 -0
epi_viewer_static/index.html +4 -2
epi_viewer_static/viewer_lite.css +3 -1
epi_postinstall.py +0 -197
epi_recorder-2.1.3.dist-info/METADATA +0 -577
epi_recorder-2.1.3.dist-info/RECORD +0 -34
{epi_recorder-2.1.3.dist-info → epi_recorder-2.2.0.dist-info}/entry_points.txt +0 -0

epi_analyzer/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""EPI Analyzer package - Agent mistake detection"""
+from .detector import MistakeDetector
+__all__ = ['MistakeDetector']

epi_analyzer/detector.py ADDED Viewed

@@ -0,0 +1,337 @@
+"""
+EPI Agent Mistake Detector
+AI-powered analysis of agent execution to identify bugs:
+- Infinite loops (same tool called repeatedly with errors)
+- Hallucinations (confident LLM output followed by tool failures)
+- Inefficiency (excessive token usage, repeated work)
+- Repetitive patterns (agent redoing same queries)
+"""
+import json
+import sqlite3
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+from difflib import SequenceMatcher
+class MistakeDetector:
+    """
+    Heuristic agent bug detection for EPI recordings.
+    Steps are loaded once at construction time. analyze() runs four pattern
+    checks (loops, hallucinations, inefficiency, repetition) over them and
+    get_summary() renders the findings as plain text.
+    """
+    def __init__(self, epi_file: str):
+        """
+        Initialize detector with an EPI recording file.
+        Args:
+            epi_file: Path to a .epi archive, a recording directory, a
+                steps.jsonl file, or a SQLite database.
+        Raises:
+            FileNotFoundError: If no loadable recording is found at the path.
+        """
+        self.epi_path = Path(epi_file)
+        self.steps = self._load_steps()
+        self.mistakes: List[Dict] = []
+    def _load_steps(self) -> List[Dict]:
+        """
+        Load steps from the recording, trying each supported layout in turn.
+        Order: .epi ZIP archive, directory containing steps.jsonl or a
+        *_temp.db, a .jsonl file, then SQLite database candidates.
+        Returns:
+            List of normalized step dicts (id, index, type, content, timestamp).
+        Raises:
+            FileNotFoundError: If none of the layouts yields steps.
+        """
+        import tempfile
+        import zipfile
+        # If it's a ZIP file (.epi), unpack it first
+        if self.epi_path.is_file() and self.epi_path.suffix == '.epi':
+            try:
+                if zipfile.is_zipfile(self.epi_path):
+                    # TemporaryDirectory removes the extracted files when the
+                    # block exits; both loaders read everything into memory
+                    # before returning, so nothing refers to the files later.
+                    with tempfile.TemporaryDirectory() as tmp:
+                        temp_dir = Path(tmp)
+                        with zipfile.ZipFile(self.epi_path, 'r') as zf:
+                            zf.extractall(temp_dir)
+                        # Look for steps.jsonl in extracted content
+                        steps_file = temp_dir / "steps.jsonl"
+                        if steps_file.exists():
+                            return self._load_from_jsonl(steps_file)
+                        # Also check for SQLite db
+                        for db_file in temp_dir.glob("*.db"):
+                            try:
+                                return self._load_from_sqlite(db_file)
+                            except Exception:
+                                continue  # Not a usable database; try the next
+            except Exception:
+                # Deliberate best-effort: an unreadable archive falls through
+                # to the remaining loading strategies below.
+                pass
+        # Try loading from steps.jsonl in directory
+        if self.epi_path.is_dir():
+            jsonl_path = self.epi_path / "steps.jsonl"
+            if jsonl_path.exists():
+                return self._load_from_jsonl(jsonl_path)
+            # Check for temp databases
+            temp_dbs = list(self.epi_path.glob("*_temp.db"))
+            if temp_dbs:
+                return self._load_from_sqlite(temp_dbs[0])
+        # Try as JSONL file directly
+        if self.epi_path.suffix == '.jsonl':
+            return self._load_from_jsonl(self.epi_path)
+        # Try as SQLite database
+        db_paths = [
+            self.epi_path,
+            self.epi_path.with_suffix('.epi.db'),
+            self.epi_path / 'recording.db'
+        ]
+        for db_path in db_paths:
+            if db_path.exists():
+                try:
+                    return self._load_from_sqlite(db_path)
+                except Exception:
+                    continue
+        raise FileNotFoundError(f"No valid .epi file found at {self.epi_path}")
+    def _load_from_jsonl(self, path: Path) -> List[Dict]:
+        """
+        Load steps from a JSONL file, one JSON object per non-blank line.
+        The step id is the physical line number (blank lines included), and
+        the recording's 'kind' field is exposed as 'type'.
+        """
+        steps = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for i, line in enumerate(f):
+                if not line.strip():
+                    continue
+                step = json.loads(line)
+                steps.append({
+                    'id': i,
+                    'index': step.get('index', i),
+                    'type': step.get('kind', 'unknown'),
+                    'content': step.get('content', {}),
+                    'timestamp': step.get('timestamp', '')
+                })
+        return steps
+    def _load_from_sqlite(self, db_path: Path) -> List[Dict]:
+        """
+        Load steps from the 'steps' table of a SQLite database.
+        Columns are read positionally: id, index, type, content, timestamp.
+        Content stored as a JSON string is decoded.
+        """
+        conn = sqlite3.connect(str(db_path))
+        try:
+            rows = conn.execute('SELECT * FROM steps ORDER BY id').fetchall()
+        finally:
+            # Always release the handle, even when the query fails (e.g. the
+            # file is not a database or has no 'steps' table).
+            conn.close()
+        steps = []
+        for row in rows:
+            content = json.loads(row[3]) if isinstance(row[3], str) else row[3]
+            steps.append({
+                'id': row[0],
+                'index': row[1],
+                'type': row[2],
+                'content': content,
+                'timestamp': row[4] if len(row) > 4 else None
+            })
+        return steps
+    @staticmethod
+    def _step_type(step: Dict) -> str:
+        """Return the step type lower-cased, or '' when missing or not a string."""
+        value = step.get('type')
+        return value.lower() if isinstance(value, str) else ''
+    @staticmethod
+    def _step_content(step: Dict) -> Dict:
+        """Return the step content when it is a dict, otherwise an empty dict."""
+        value = step.get('content')
+        return value if isinstance(value, dict) else {}
+    @staticmethod
+    def _as_text(value: Any) -> str:
+        """
+        Coerce a message/model field to plain text.
+        None becomes '', a list of parts is joined from each part's 'text'
+        entry (or str() of non-dict parts), anything else goes through str().
+        """
+        if isinstance(value, str):
+            return value
+        if value is None:
+            return ''
+        if isinstance(value, list):
+            parts = [
+                part.get('text') if isinstance(part, dict) else part
+                for part in value
+            ]
+            return ' '.join(str(part) for part in parts if part)
+        return str(value)
+    @classmethod
+    def _user_texts(cls, content: Dict) -> List[str]:
+        """Return the text of every user-role message in a request payload."""
+        messages = content.get('messages')
+        if not isinstance(messages, list):
+            return []
+        return [
+            cls._as_text(msg.get('content'))
+            for msg in messages
+            if isinstance(msg, dict) and msg.get('role') == 'user'
+        ]
+    def analyze(self) -> List[Dict]:
+        """
+        Run all detection patterns.
+        Findings are rebuilt from scratch on every call, so calling analyze()
+        repeatedly does not duplicate entries.
+        Returns:
+            The list of detected mistakes (also stored in self.mistakes).
+        """
+        self.mistakes = []
+        self._detect_infinite_loops()
+        self._detect_hallucinations()
+        self._detect_inefficiency()
+        self._detect_repetitive_patterns()
+        return self.mistakes
+    def _detect_infinite_loops(self):
+        """
+        Detect an agent stuck sending near-identical requests.
+        Looks at the last 5 LLM steps, takes the last user message of each
+        (first 100 characters) and flags a loop when at least 3 such messages
+        exist and consecutive ones are on average more than 80% similar.
+        """
+        window = 5
+        llm_steps = [s for s in self.steps if 'llm' in self._step_type(s)]
+        if len(llm_steps) < window:
+            return
+        recent = llm_steps[-window:]
+        patterns = []
+        for step in recent:
+            user_texts = self._user_texts(self._step_content(step))
+            # Empty messages are skipped: two empty strings compare as 100%
+            # similar and would produce a false positive.
+            if user_texts and user_texts[-1]:
+                patterns.append(user_texts[-1][:100])
+        if len(patterns) < 3:
+            return
+        similarities = [
+            self._calculate_similarity(prev, cur)
+            for prev, cur in zip(patterns, patterns[1:])
+        ]
+        avg_similarity = sum(similarities) / len(similarities)
+        if avg_similarity > 0.8:  # 80% similar
+            self.mistakes.append({
+                'type': 'INFINITE_LOOP',
+                'severity': 'CRITICAL',
+                'step': recent[-1]['id'],
+                # Report the number of requests actually compared.
+                'explanation': f'Agent appears stuck in a loop - repeated similar requests {len(patterns)} times',
+                'fix': 'Add max_iterations limit or better error handling',
+                'cost_impact': 'High - stuck in loop burning API credits',
+                'pattern_similarity': f'{avg_similarity:.0%}'
+            })
+    def _detect_hallucinations(self):
+        """
+        Detect LLM responses that finished normally but were followed by errors.
+        A response step from provider 'openai' or 'google' whose first choice
+        has finish_reason 'stop' (or none recorded) is flagged when any of the
+        next three steps has 'error' in its type.
+        """
+        for i, step in enumerate(self.steps[:-1]):
+            if 'llm.response' not in self._step_type(step):
+                continue
+            content = self._step_content(step)
+            if content.get('provider') not in ('openai', 'google'):
+                continue
+            # Slicing clamps at the end of the list, so no bounds check needed
+            errors = [
+                s for s in self.steps[i + 1:i + 4]
+                if 'error' in self._step_type(s)
+            ]
+            if not errors:
+                continue
+            choices = content.get('choices')
+            if not isinstance(choices, list) or not choices:
+                continue
+            first_choice = choices[0]
+            if not isinstance(first_choice, dict):
+                continue
+            if first_choice.get('finish_reason', 'stop') != 'stop':
+                continue
+            message = first_choice.get('message')
+            raw_text = message.get('content') if isinstance(message, dict) else None
+            # content can be present but None (e.g. tool-call responses);
+            # _as_text turns that into '' instead of failing on the slice.
+            response_text = self._as_text(raw_text)[:150]
+            self.mistakes.append({
+                'type': 'HALLUCINATION',
+                'severity': 'HIGH',
+                'step': step['id'],
+                'explanation': 'LLM generated confident output but subsequent operations failed',
+                'details': f"LLM said: {response_text}...",
+                'error_step': errors[0]['id'],
+                'fix': 'Add output validation or use function calling with strict schemas'
+            })
+    def _detect_inefficiency(self):
+        """
+        Detect expensive executions relative to the size of the workflow.
+        Flags: more than 10,000 tokens in fewer than 5 steps, an estimated
+        cost above $0.50, or any 'gpt-4' model call in fewer than 3 steps.
+        """
+        llm_responses = [
+            s for s in self.steps if 'llm.response' in self._step_type(s)
+        ]
+        if not llm_responses:
+            return
+        total_tokens = 0
+        gpt4_calls = 0
+        for step in llm_responses:
+            content = self._step_content(step)
+            usage = content.get('usage')
+            if isinstance(usage, dict):
+                tokens = usage.get('total_tokens')
+                # Ignore missing/None/non-numeric counts instead of crashing
+                if isinstance(tokens, (int, float)):
+                    total_tokens += tokens
+            if 'gpt-4' in self._as_text(content.get('model')).lower():
+                gpt4_calls += 1
+        step_count = len(self.steps)
+        # Red flags
+        flags = []
+        if total_tokens > 10000 and step_count < 5:
+            flags.append(f"High token usage ({total_tokens:,} tokens) for simple workflow")
+        # Estimate cost (rough)
+        # GPT-4: ~$0.03/1K input, ~$0.06/1K output - use avg $0.045/1K
+        estimated_cost = (total_tokens / 1000) * 0.045
+        if estimated_cost > 0.50:
+            flags.append(f"Expensive execution (~${estimated_cost:.2f})")
+        # Check for model inefficiency (using GPT-4 when GPT-3.5 would work)
+        if gpt4_calls > 0 and step_count < 3:
+            flags.append(f"Using GPT-4 for simple task ({gpt4_calls} calls)")
+        if not flags:
+            return
+        self.mistakes.append({
+            'type': 'INEFFICIENT',
+            'severity': 'MEDIUM',
+            'step': llm_responses[-1]['id'],
+            'explanation': '; '.join(flags),
+            'metrics': {
+                'total_tokens': total_tokens,
+                'estimated_cost': round(estimated_cost, 2),
+                'step_count': step_count,
+                'llm_calls': len(llm_responses)
+            },
+            'fix': 'Consider using GPT-3.5-turbo or caching responses'
+        })
+    def _detect_repetitive_patterns(self):
+        """
+        Detect the agent redoing the same work.
+        Requires at least 10 steps and 3 LLM requests. Compares the first user
+        message (first 100 characters) of every request pairwise and reports
+        only the first pair that is more than 70% similar.
+        """
+        if len(self.steps) < 10:
+            return
+        llm_requests = [
+            s for s in self.steps if 'llm.request' in self._step_type(s)
+        ]
+        if len(llm_requests) < 3:
+            return
+        # (step id, first user message) per request; empty messages are
+        # skipped because they would all compare as identical.
+        queries = []
+        for step in llm_requests:
+            user_texts = self._user_texts(self._step_content(step))
+            if user_texts and user_texts[0]:
+                queries.append((step['id'], user_texts[0][:100]))
+        # Find similar queries
+        for i, (first_id, first_text) in enumerate(queries):
+            for second_id, second_text in queries[i + 1:]:
+                if self._calculate_similarity(first_text, second_text) > 0.7:  # 70% similar
+                    self.mistakes.append({
+                        'type': 'REPETITIVE_PATTERN',
+                        'severity': 'LOW',
+                        'step': second_id,
+                        'explanation': f'Similar query repeated (steps {first_id} and {second_id})',
+                        'pattern': f'"{first_text[:50]}..."',
+                        'fix': 'Implement memory/caching to avoid redundant LLM calls'
+                    })
+                    return  # Only report first instance
+    def _calculate_similarity(self, a: str, b: str) -> float:
+        """Return the SequenceMatcher ratio (0.0-1.0) between two strings."""
+        return SequenceMatcher(None, a, b).ratio()
+    def get_summary(self) -> str:
+        """
+        Human-readable summary of detected mistakes.
+        Returns:
+            A single OK line when nothing was found, otherwise a header with
+            per-severity counts followed by one entry per mistake.
+        """
+        if not self.mistakes:
+            return "[OK] No obvious mistakes detected"
+        # Count by severity
+        counts = {'CRITICAL': 0, 'HIGH': 0, 'MEDIUM': 0, 'LOW': 0}
+        for m in self.mistakes:
+            severity = m.get('severity')
+            if severity in counts:
+                counts[severity] += 1
+        lines = [
+            f"[!] Found {len(self.mistakes)} issue(s):",
+            f"   {counts['CRITICAL']} Critical, {counts['HIGH']} High, "
+            f"{counts['MEDIUM']} Medium, {counts['LOW']} Low severity",
+            ""
+        ]
+        markers = {
+            'CRITICAL': '[!!!]',
+            'HIGH': '[!!]',
+            'MEDIUM': '[!]',
+            'LOW': '[-]'
+        }
+        # Show details for each mistake
+        for i, m in enumerate(self.mistakes, 1):
+            # A missing severity renders as LOW; an unknown one as '[?]'
+            severity_marker = markers.get(m.get('severity', 'LOW'), '[?]')
+            lines.append(f"{i}. {severity_marker} [{m.get('severity')}] {m.get('type')} at Step {m.get('step')}")
+            lines.append(f"   -> {m.get('explanation')}")
+            if 'fix' in m:
+                lines.append(f"   -> Fix: {m['fix']}")
+            lines.append("")
+        return '\n'.join(lines)

epi_cli/__init__.py CHANGED Viewed

@@ -3,3 +3,7 @@ EPI CLI - Command-line interface for EPI operations.
 """
 __version__ = "1.0.0-keystone"

epi_cli/__main__.py CHANGED Viewed

@@ -10,3 +10,7 @@ from epi_cli.main import cli_main
 if __name__ == "__main__":
     cli_main()

epi_cli/chat.py CHANGED Viewed

@@ -45,6 +45,7 @@ def load_steps_from_epi(epi_path: Path) -> list:
 def chat(
     epi_file: Path = typer.Argument(..., help="Path to .epi file to chat with"),
+    query: str = typer.Option(None, "--query", "-q", help="Single question (non-interactive mode)"),
     model: str = typer.Option("gemini-2.0-flash", "--model", "-m", help="Gemini model to use")
 ):
     """
@@ -52,8 +53,9 @@ def chat(
     Ask natural language questions about what happened in your recording.
-    Example:
-        epi chat my_recording.epi
+    Examples:
+        epi chat my_recording.epi                    # Interactive mode
+        epi chat my_recording.epi -q "What happened?"  # Single question
     """
     # Resolve path
     if not epi_file.exists():
@@ -150,7 +152,19 @@ When answering questions:
     ))
     console.print()
-    # Chat loop
+    # Non-interactive mode: answer single question and exit
+    if query:
+        try:
+            full_prompt = f"{context}\n\nUser question: {query}"
+            response = chat_session.send_message(full_prompt)
+            console.print("[bold green]AI:[/bold green]")
+            console.print(Markdown(response.text))
+            return
+        except Exception as e:
+            console.print(f"[red]Error:[/red] {e}")
+            raise typer.Exit(1)
+    # Interactive chat loop
     while True:
         try:
             question = Prompt.ask("[bold cyan]You[/bold cyan]")
@@ -191,3 +205,7 @@ When answering questions:
             console.print(f"[red]Error:[/red] {e}")
             console.print("[dim]Try asking a different question.[/dim]")
             console.print()

epi_cli/debug.py ADDED Viewed

@@ -0,0 +1,107 @@
+"""
+EPI Debug Command - AI-powered agent mistake detection.
+Analyzes .epi recordings to find:
+- Infinite loops
+- Hallucinations
+- Inefficiencies
+- Repetitive patterns
+"""
+import json
+from pathlib import Path
+import typer
+from rich.console import Console
+from rich.panel import Panel
+from epi_analyzer.detector import MistakeDetector
+# Shared Rich console used for all output of the debug command.
+console = Console()
+# Typer sub-application for "debug"; epi_cli/main.py mounts it via add_typer().
+app = typer.Typer(name="debug", help="Debug AI agent recordings for mistakes")
+@app.callback(invoke_without_command=True)
+def debug(
+    ctx: typer.Context,
+    epi_file: Path = typer.Argument(..., help="Path to .epi recording file or directory"),
+    output_json: bool = typer.Option(False, "--json", help="Output as JSON"),
+    export: Path = typer.Option(None, "--export", help="Export report to file"),
+    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed analysis"),
+):
+    """
+    Analyze agent execution for mistakes and inefficiencies.
+    This command uses AI-powered analysis to detect:
+    - Infinite loops (same tool called repeatedly)
+    - Hallucinations (LLM confident but wrong)
+    - Inefficiencies (excessive token usage)
+    - Repetitive patterns (redundant work)
+    Examples:
+        epi debug agent_session.epi
+        epi debug recording_dir/ --json
+        epi debug agent.epi --export report.txt
+    """
+    # Exit codes: 0 = no critical findings, 1 = critical findings,
+    # 2 = recording not found, 3 = any other failure during analysis/output.
+    console.print(f"Analyzing [cyan]{epi_file}[/cyan]...")
+    try:
+        # Run analysis
+        detector = MistakeDetector(str(epi_file))
+        mistakes = detector.analyze()
+        # Prepare output
+        if output_json:
+            output = json.dumps(mistakes, indent=2)
+        else:
+            output = detector.get_summary()
+            if verbose and mistakes:
+                # Add detailed metrics for each mistake
+                details = ["\nDetailed Analysis:"]
+                for i, m in enumerate(mistakes, 1):
+                    details.append(f"\n{i}. {m.get('type')} (Step {m.get('step')})")
+                    details.extend(
+                        f"   {key}: {value}"
+                        for key, value in m.items()
+                        if key not in ('type', 'step')
+                    )
+                output += "\n".join(details)
+        # Display or export
+        if export:
+            export.write_text(output, encoding='utf-8')
+            console.print(f"\nReport saved to [green]{export}[/green]")
+        else:
+            # The report embeds recorded text (LLM output, user queries) that
+            # may contain "[...]" sequences; print it literally rather than
+            # letting Rich interpret it as console markup.
+            console.print(f"\n{output}", markup=False)
+        critical_count = sum(1 for m in mistakes if m.get('severity') == 'CRITICAL')
+        # Show actionable summary if mistakes found
+        if critical_count > 0 and not output_json:
+            console.print(
+                Panel(
+                    f"[bold red]WARNING: {critical_count} CRITICAL issue(s) detected![/bold red]\n\n"
+                    "These issues can cause your agent to fail or waste resources.\n"
+                    "Review the suggestions above to fix them.",
+                    title="Action Required",
+                    border_style="red"
+                )
+            )
+        # Exit code: 1 if critical mistakes found
+        if critical_count > 0:
+            raise typer.Exit(code=1)
+        console.print("\nAnalysis complete")
+    except typer.Exit:
+        # typer.Exit derives from RuntimeError, so without this clause the
+        # generic handler below would swallow the intentional exit code 1
+        # and report it as an analysis error with exit code 3.
+        raise
+    except FileNotFoundError as e:
+        console.print(f"[red]ERROR: File not found:[/red] {e}")
+        raise typer.Exit(code=2)
+    except Exception as e:
+        console.print(f"[red]ERROR analyzing file:[/red] {e}")
+        if verbose:
+            import traceback
+            console.print(traceback.format_exc())
+        raise typer.Exit(code=3)

epi_cli/keys.py CHANGED Viewed

@@ -270,3 +270,7 @@ def print_keys_table(keys: list[dict[str, str]]) -> None:
         )
     console.print(table)

epi_cli/ls.py CHANGED Viewed

@@ -157,4 +157,8 @@ def ls(
     console.print()
     console.print(table)
     console.print()
-    console.print(f"[dim]Tip: View a recording with 'epi view <name>'[/dim]")
+    console.print(f"[dim]Tip: View a recording with 'epi view <name>'[/dim]")

epi_cli/main.py CHANGED Viewed

@@ -126,6 +126,10 @@ app.command(name="ls", help="List local recordings (./epi-recordings/)")(ls_comm
 from epi_cli.chat import chat as chat_command
 app.command(name="chat", help="Chat with your evidence file using AI")(chat_command)
+# NEW: debug command (v2.2.0 - AI-powered mistake detection)
+from epi_cli.debug import app as debug_app
+app.add_typer(debug_app, name="debug", help="Debug AI agent recordings for mistakes")
 # Phase 1: keys command (for manual key management)
 @app.command()
 def keys(
@@ -320,3 +324,7 @@ def cli_main():
 if __name__ == "__main__":
     cli_main()

epi_cli/record.py CHANGED Viewed

@@ -197,3 +197,7 @@ def record(
     # Exit with child return code
     raise typer.Exit(rc)

epi_cli/run.py CHANGED Viewed

@@ -323,7 +323,7 @@ def run(
         km = KeyManager()
         priv = km.load_private_key("default")
-        # Read manifest from ZIP
+        # Extract, sign, and repack with new viewer
         import json as _json
         with zipfile.ZipFile(out, "r") as zf:
             raw = zf.read("manifest.json").decode("utf-8")
@@ -336,14 +336,18 @@ def run(
         sm = _sign(m, priv, "default")
         signed_json = sm.model_dump_json(indent=2)
-        # Replace manifest in ZIP
+        # Regenerate viewer.html with signed manifest and steps
+        viewer_html = EPIContainer._create_embedded_viewer(temp_workspace, sm)
+        # Replace manifest AND viewer in ZIP
         temp_zip = out.with_suffix(".epi.tmp")
         with zipfile.ZipFile(out, "r") as zf_in:
             with zipfile.ZipFile(temp_zip, "w", zipfile.ZIP_DEFLATED) as zf_out:
                 for item in zf_in.namelist():
-                    if item != "manifest.json":
+                    if item not in ("manifest.json", "viewer.html"):
                         zf_out.writestr(item, zf_in.read(item))
                 zf_out.writestr("manifest.json", signed_json)
+                zf_out.writestr("viewer.html", viewer_html)
         temp_zip.replace(out)
         signed = True
@@ -394,4 +398,8 @@ def run(
         raise typer.Exit(rc)
     if not verified and not no_verify:
         raise typer.Exit(1)
-    raise typer.Exit(0)
+    raise typer.Exit(0)

epi_cli/verify.py CHANGED Viewed

@@ -217,3 +217,7 @@ def print_trust_report(report: dict, epi_file: Path, verbose: bool = False):
     console.print("\n")
     console.print(panel)
     console.print("")

epi_cli/view.py CHANGED Viewed

@@ -127,3 +127,7 @@ def view(
     except Exception as e:
         console.print(f"[red][FAIL] Error:[/red] {e}")
         raise typer.Exit(1)

epi_core/__init__.py CHANGED Viewed

@@ -2,7 +2,7 @@
 EPI Core - Core data structures, serialization, and container management.
 """
-__version__ = "2.1.3"
+__version__ = "2.2.0"
 from epi_core.schemas import ManifestModel, StepModel
 from epi_core.serialize import get_canonical_hash
@@ -12,3 +12,7 @@ __all__ = [
     "StepModel",
     "get_canonical_hash",
 ]

epi-recorder 2.1.3__py3-none-any.whl → 2.2.0__py3-none-any.whl

epi-recorder 2.1.3-py3-none-any.whl → 2.2.0-py3-none-any.whl