PyPI - epi-recorder - Versions diffs - 1.0.0__py3-none-any.whl - Mend

epi-recorder 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

epi_cli/__init__.py +5 -0
epi_cli/keys.py +272 -0
epi_cli/main.py +106 -0
epi_cli/record.py +192 -0
epi_cli/verify.py +219 -0
epi_cli/view.py +74 -0
epi_core/__init__.py +14 -0
epi_core/container.py +336 -0
epi_core/redactor.py +266 -0
epi_core/schemas.py +112 -0
epi_core/serialize.py +131 -0
epi_core/trust.py +236 -0
epi_recorder/__init__.py +21 -0
epi_recorder/api.py +389 -0
epi_recorder/bootstrap.py +58 -0
epi_recorder/environment.py +216 -0
epi_recorder/patcher.py +356 -0
epi_recorder-1.0.0.dist-info/METADATA +503 -0
epi_recorder-1.0.0.dist-info/RECORD +25 -0
epi_recorder-1.0.0.dist-info/WHEEL +5 -0
epi_recorder-1.0.0.dist-info/entry_points.txt +2 -0
epi_recorder-1.0.0.dist-info/licenses/LICENSE +201 -0
epi_recorder-1.0.0.dist-info/top_level.txt +4 -0
epi_viewer_static/app.js +267 -0
epi_viewer_static/index.html +77 -0

epi_core/redactor.py ADDED Viewed

@@ -0,0 +1,266 @@
+"""
+EPI Core Redactor - Automatic secret redaction for security.
+Provides regex-based pattern matching to automatically remove sensitive
+information like API keys, tokens, and credentials from captured data.
+"""
+import re
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+import json
+# Default redaction patterns (security-first).
+# Each entry is (regex source, human-readable description). Redactor compiles
+# every entry with re.IGNORECASE and applies them to strings in list order.
+DEFAULT_REDACTION_PATTERNS = [
+    # OpenAI API keys
+    (r'sk-[a-zA-Z0-9]{48}', 'OpenAI API key'),
+    (r'sk-proj-[a-zA-Z0-9_-]{48,}', 'OpenAI Project API key'),
+    # Anthropic API keys
+    (r'sk-ant-[a-zA-Z0-9_-]{95,}', 'Anthropic API key'),
+    # Google/Gemini API keys
+    (r'AIza[a-zA-Z0-9_-]{35}', 'Google API key'),
+    # Generic Bearer tokens
+    (r'Bearer\s+[a-zA-Z0-9_\-\.]{20,}', 'Bearer token'),
+    # AWS credentials
+    (r'AKIA[0-9A-Z]{16}', 'AWS Access Key'),
+    (r'aws_secret_access_key\s*=\s*[a-zA-Z0-9/+=]{40}', 'AWS Secret Key'),
+    # GitHub tokens
+    (r'ghp_[a-zA-Z0-9]{36}', 'GitHub Personal Access Token'),
+    (r'gho_[a-zA-Z0-9]{36}', 'GitHub OAuth Token'),
+    # Generic API keys (common patterns)
+    (r'api[_-]?key["\']?\s*[:=]\s*["\']?([a-zA-Z0-9_\-]{32,})', 'Generic API key'),
+    (r'apikey["\']?\s*[:=]\s*["\']?([a-zA-Z0-9_\-]{32,})', 'Generic API key'),
+    # JWT tokens
+    (r'eyJ[a-zA-Z0-9_-]{10,}\.[a-zA-Z0-9_-]{10,}\.[a-zA-Z0-9_-]{10,}', 'JWT token'),
+    # Database connection strings
+    # Both "postgres://" and "postgresql://" are valid URI schemes
+    (r'postgres(?:ql)?://[^:]+:[^@]+@[^/]+', 'PostgreSQL connection string'),
+    (r'mysql://[^:]+:[^@]+@[^/]+', 'MySQL connection string'),
+    # "mongodb+srv://" is the DNS seed list form of the MongoDB URI
+    (r'mongodb(?:\+srv)?://[^:]+:[^@]+@[^/]+', 'MongoDB connection string'),
+    # Private keys (PEM format): bare PKCS#8 plus the common typed headers
+    (r'-----BEGIN\s+(?:(?:RSA|EC|DSA|OPENSSH|ENCRYPTED)\s+)?PRIVATE\s+KEY-----[\s\S]+?-----END\s+(?:(?:RSA|EC|DSA|OPENSSH|ENCRYPTED)\s+)?PRIVATE\s+KEY-----', 'Private key'),
+]
+# Environment variable names to redact (compared against upper-cased dict keys)
+REDACT_ENV_VARS = {
+    'OPENAI_API_KEY',
+    'ANTHROPIC_API_KEY',
+    'GOOGLE_API_KEY',
+    'GEMINI_API_KEY',
+    'AWS_ACCESS_KEY_ID',
+    'AWS_SECRET_ACCESS_KEY',
+    'GITHUB_TOKEN',
+    'API_KEY',
+    'SECRET_KEY',
+    'DATABASE_URL',
+    'DB_PASSWORD',
+    'PASSWORD',
+    'SECRET',
+}
+# Text substituted for every redacted value or regex match
+REDACTION_PLACEHOLDER = "***REDACTED***"
+class Redactor:
+    """
+    Redacts sensitive information using configurable regex patterns.
+    Automatically removes API keys, tokens, credentials, and other secrets
+    from captured workflow data.
+    Attributes:
+        enabled: When False, redact() and redact_dict_keys() return their input unchanged
+        patterns: Compiled (regex, description) pairs applied to every string value
+        env_vars_to_redact: Upper-case key names whose values are always replaced
+    """
+    def __init__(self, config_path: Path | None = None, enabled: bool = True):
+        """
+        Initialize redactor with optional custom configuration.
+        Args:
+            config_path: Optional path to config.toml with custom patterns
+            enabled: Whether redaction is enabled (default: True)
+        """
+        self.enabled = enabled
+        self.patterns: List[Tuple[re.Pattern, str]] = []
+        self.env_vars_to_redact = REDACT_ENV_VARS.copy()
+        # Compile default patterns
+        for pattern_str, description in DEFAULT_REDACTION_PATTERNS:
+            try:
+                compiled = re.compile(pattern_str, re.IGNORECASE)
+            except re.error as e:
+                # Skip invalid patterns (should not happen with defaults)
+                print(f"Warning: Invalid pattern '{pattern_str}': {e}")
+            else:
+                self.patterns.append((compiled, description))
+        # Load custom config if provided; a path that does not exist is ignored
+        if config_path and config_path.exists():
+            self._load_config(config_path)
+    def _load_config(self, config_path: Path):
+        """
+        Load custom redaction patterns from TOML config.
+        Reads the [redaction] table: each [[redaction.patterns]] entry adds a
+        regex, and redaction.env_vars adds key names. Any failure (including
+        tomllib being unavailable before Python 3.11) is reported with a
+        printed warning and otherwise ignored.
+        NOTE: the "enabled" key of the [redaction] table is not read here.
+        Args:
+            config_path: Path to config.toml
+        """
+        try:
+            import tomllib  # Python 3.11+
+            with open(config_path, 'rb') as f:
+                config = tomllib.load(f)
+            redaction = config.get('redaction', {})
+            # Load custom patterns
+            for pattern_dict in redaction.get('patterns', []):
+                pattern_str = pattern_dict.get('pattern')
+                description = pattern_dict.get('description', 'Custom pattern')
+                if not pattern_str:
+                    continue
+                try:
+                    compiled = re.compile(pattern_str, re.IGNORECASE)
+                except re.error as e:
+                    print(f"Warning: Invalid custom pattern '{pattern_str}': {e}")
+                else:
+                    self.patterns.append((compiled, description))
+            # Load custom env vars
+            env_vars = redaction.get('env_vars')
+            if env_vars is not None:
+                # A bare string would otherwise be split into single characters
+                if isinstance(env_vars, str):
+                    env_vars = [env_vars]
+                # Keys are compared via key.upper(), so store names upper-cased
+                self.env_vars_to_redact.update(str(name).upper() for name in env_vars)
+        except Exception as e:
+            # Best effort: a broken config must never prevent default redaction
+            print(f"Warning: Could not load config from {config_path}: {e}")
+    def redact(self, data: Any) -> Tuple[Any, int]:
+        """
+        Redact sensitive information from data.
+        Recursively processes dicts, lists, and strings to find and replace
+        sensitive patterns with REDACTION_PLACEHOLDER. Dicts and lists are
+        rebuilt as new objects; any other type is returned as-is.
+        Args:
+            data: Data to redact (dict, list, str, or primitive)
+        Returns:
+            tuple: (redacted_data, redaction_count)
+        """
+        if not self.enabled:
+            return data, 0
+        redaction_count = 0
+        if isinstance(data, dict):
+            redacted_dict = {}
+            for key, value in data.items():
+                # Only string keys can name a sensitive env var; other key
+                # types (e.g. ints) are passed through to the recursive case
+                if isinstance(key, str) and key.upper() in self.env_vars_to_redact:
+                    redacted_dict[key] = REDACTION_PLACEHOLDER
+                    redaction_count += 1
+                else:
+                    redacted_value, count = self.redact(value)
+                    redacted_dict[key] = redacted_value
+                    redaction_count += count
+            return redacted_dict, redaction_count
+        if isinstance(data, list):
+            redacted_list = []
+            for item in data:
+                redacted_item, count = self.redact(item)
+                redacted_list.append(redacted_item)
+                redaction_count += count
+            return redacted_list, redaction_count
+        if isinstance(data, str):
+            redacted_str = data
+            for pattern, _description in self.patterns:
+                # subn replaces and counts in a single scan of the string
+                redacted_str, hits = pattern.subn(REDACTION_PLACEHOLDER, redacted_str)
+                redaction_count += hits
+            return redacted_str, redaction_count
+        # Primitive types (int, float, bool, None) and anything else
+        return data, 0
+    def redact_dict_keys(self, data: Dict[str, Any], sensitive_keys: set[str]) -> Tuple[Dict[str, Any], int]:
+        """
+        Redact specific dictionary keys by name.
+        Only the top level of data is inspected; values are not recursed into.
+        Args:
+            data: Dictionary to redact
+            sensitive_keys: Set of key names to redact (case-insensitive)
+        Returns:
+            tuple: (redacted_dict, redaction_count)
+        """
+        if not self.enabled:
+            return data, 0
+        redacted_dict = {}
+        redaction_count = 0
+        sensitive_keys_lower = {k.lower() for k in sensitive_keys}
+        for key, value in data.items():
+            # Non-string keys cannot match a name and are copied unchanged
+            if isinstance(key, str) and key.lower() in sensitive_keys_lower:
+                redacted_dict[key] = REDACTION_PLACEHOLDER
+                redaction_count += 1
+            else:
+                redacted_dict[key] = value
+        return redacted_dict, redaction_count
+def create_default_config(config_path: Path) -> None:
+    """
+    Write the starter EPI configuration file to config_path.
+    Missing parent directories are created before the file is written.
+    Args:
+        config_path: Destination of the generated config file
+    """
+    # Make sure the target directory exists before writing into it
+    target_dir = config_path.parent
+    target_dir.mkdir(parents=True, exist_ok=True)
+    default_toml = """# EPI Configuration
+# Redaction patterns for automatic secret removal
+[redaction]
+# Whether redaction is enabled (true by default)
+enabled = true
+# Additional custom patterns (regex)
+# Example:
+# [[redaction.patterns]]
+# pattern = "my_secret_[a-zA-Z0-9]{20}"
+# description = "My custom secret"
+# Additional environment variable names to redact
+# env_vars = ["MY_SECRET_VAR", "CUSTOM_TOKEN"]
+"""
+    config_path.write_text(default_toml)
+def get_default_redactor() -> Redactor:
+    """
+    Get a redactor with default configuration.
+    Attempts to load ~/.epi/config.toml, creating it with default content
+    first when it is missing. Falls back to the built-in defaults when the
+    home directory cannot be resolved or the file cannot be created.
+    Returns:
+        Redactor: Configured redactor instance
+    """
+    try:
+        config_path = Path.home() / ".epi" / "config.toml"
+    except RuntimeError:
+        # Path.home() raises when no home directory can be determined;
+        # redaction must still work, so use the built-in patterns only
+        return Redactor()
+    # Create default config if it doesn't exist
+    if not config_path.exists():
+        try:
+            create_default_config(config_path)
+        except OSError:
+            # Read-only or otherwise unwritable location: use defaults
+            pass
+    return Redactor(config_path=config_path if config_path.exists() else None)

epi_core/schemas.py ADDED Viewed

@@ -0,0 +1,112 @@
+"""
+EPI Core Schemas - Pydantic models for manifest and steps.
+"""
+from datetime import datetime
+from typing import Any, Dict, Optional
+from uuid import UUID, uuid4
+from pydantic import BaseModel, ConfigDict, Field
+class ManifestModel(BaseModel):
+    """
+    Top-level header record of an .epi file.
+    Plays the role a catalog plays in a PDF: it carries the workflow
+    metadata, the per-file hashes, and the cryptographic signature.
+    """
+    spec_version: str = Field(description="EPI specification version", default="1.0-keystone")
+    workflow_id: UUID = Field(
+        description="Unique identifier for this workflow execution",
+        default_factory=uuid4,
+    )
+    created_at: datetime = Field(
+        description="Timestamp when the .epi file was created (UTC)",
+        default_factory=datetime.utcnow,
+    )
+    cli_command: Optional[str] = Field(
+        description="The command-line invocation that produced this workflow",
+        default=None,
+    )
+    env_snapshot_hash: Optional[str] = Field(
+        description="SHA-256 hash of env.json (environment snapshot)",
+        default=None,
+    )
+    file_manifest: Dict[str, str] = Field(
+        description="Mapping of file paths to their SHA-256 hashes for integrity verification",
+        default_factory=dict,
+    )
+    signature: Optional[str] = Field(
+        description="Ed25519 signature of the canonical CBOR hash of this manifest (excluding signature field)",
+        default=None,
+    )
+    # Sample payload attached to the generated JSON schema
+    model_config = ConfigDict(json_schema_extra={
+        "example": {
+            "spec_version": "1.0-keystone",
+            "workflow_id": "550e8400-e29b-41d4-a716-446655440000",
+            "created_at": "2025-01-15T10:30:00Z",
+            "cli_command": "epi record --out demo.epi -- python train.py",
+            "env_snapshot_hash": "a3c5f...",
+            "file_manifest": {
+                "steps.jsonl": "b4d6e...",
+                "env.json": "a3c5f...",
+                "artifacts/output.txt": "c7f8a...",
+            },
+            "signature": "ed25519:3a4b5c6d...",
+        },
+    })
+class StepModel(BaseModel):
+    """
+    One recorded event on a workflow timeline.
+    Steps are stored one per line in steps.jsonl (NDJSON format) and are
+    meant to be treated as immutable once written.
+    """
+    index: int = Field(description="Sequential step number (0-indexed)")
+    timestamp: datetime = Field(
+        description="Timestamp when this step occurred (UTC)",
+        default_factory=datetime.utcnow,
+    )
+    kind: str = Field(description="Step type: shell.command, python.call, llm.request, llm.response, file.write, security.redaction")
+    content: Dict[str, Any] = Field(description="Step-specific data (command, output, prompt, response, etc.)")
+    # Sample payload attached to the generated JSON schema
+    model_config = ConfigDict(json_schema_extra={
+        "example": {
+            "index": 0,
+            "timestamp": "2025-01-15T10:30:00Z",
+            "kind": "llm.request",
+            "content": {
+                "provider": "openai",
+                "model": "gpt-4",
+                "prompt": "Explain quantum computing",
+                "parameters": {"temperature": 0.7},
+            },
+        },
+    })

epi_core/serialize.py ADDED Viewed

@@ -0,0 +1,131 @@
+"""
+EPI Core Serialization - Canonical CBOR hashing for tamper-evident records.
+This module provides deterministic serialization using CBOR (RFC 8949) with
+canonical encoding to ensure identical hashes across platforms and time.
+"""
+import hashlib
+from datetime import datetime
+from typing import Any
+from uuid import UUID
+import cbor2
+from pydantic import BaseModel
+def _cbor_default_encoder(encoder, value: Any) -> None:
+    """
+    Custom CBOR encoder for datetime and UUID types.
+    Ensures consistent encoding across different Python environments:
+    - datetime objects are encoded as ISO 8601 strings with a "Z" suffix;
+      timezone-aware values are converted to UTC first, naive values are
+      taken to already be UTC, and microseconds are dropped
+    - UUID objects are encoded as their canonical string representation
+    Args:
+        encoder: CBOR encoder instance
+        value: Value to encode
+    Raises:
+        ValueError: If value type cannot be encoded
+    """
+    if isinstance(value, datetime):
+        offset = value.utcoffset()
+        if offset is not None:
+            # Shift aware datetimes to UTC wall time; otherwise isoformat()
+            # would emit e.g. "+02:00" and the appended "Z" would be wrong
+            value = (value - offset).replace(tzinfo=None)
+        # Remove microseconds for stability
+        normalized_dt = value.replace(microsecond=0)
+        # Encode as ISO 8601 string with Z suffix for UTC
+        encoder.encode(normalized_dt.isoformat() + "Z")
+    elif isinstance(value, UUID):
+        # Use canonical UUID string representation
+        encoder.encode(str(value))
+    else:
+        raise ValueError(f"Cannot encode type {type(value)} to CBOR")
+def get_canonical_hash(model: BaseModel, exclude_fields: set[str] | None = None) -> str:
+    """
+    Compute a deterministic SHA-256 hash of a Pydantic model using canonical CBOR encoding.
+    This function ensures:
+    1. Identical hashes across different Python versions and platforms
+    2. Key ordering independence (CBOR canonical encoding sorts keys)
+    3. Deterministic encoding of datetime/UUID types
+    4. Tamper-evident records (any modification changes the hash)
+    Datetimes are hashed at whole-second precision as UTC: timezone-aware
+    values are converted to UTC, naive values are taken to already be UTC.
+    Args:
+        model: Pydantic model instance to hash
+        exclude_fields: Optional set of top-level field names to exclude from
+                       hashing (useful for excluding signature fields)
+    Returns:
+        str: Hexadecimal SHA-256 hash (64 characters)
+    Example:
+        >>> from epi_core.schemas import ManifestModel
+        >>> manifest = ManifestModel(cli_command="epi record --out test.epi")
+        >>> hash1 = get_canonical_hash(manifest)
+        >>> # Rebuild the same data; workflow_id and created_at are copied
+        >>> # because their defaults are generated fresh for every instance
+        >>> manifest2 = ManifestModel(**manifest.model_dump())
+        >>> hash2 = get_canonical_hash(manifest2)
+        >>> assert hash1 == hash2  # Hashes are identical
+    """
+    # Convert model to dict
+    model_dict = model.model_dump()
+    # Normalize datetime and UUID fields to strings
+    def normalize_value(value: Any) -> Any:
+        if isinstance(value, datetime):
+            offset = value.utcoffset()
+            if offset is not None:
+                # Shift aware datetimes to UTC wall time so the "Z" suffix
+                # below is truthful and equal instants hash identically
+                value = (value - offset).replace(tzinfo=None)
+            # Remove microseconds and convert to ISO 8601 string with Z suffix
+            normalized_dt = value.replace(microsecond=0)
+            return normalized_dt.isoformat() + "Z"
+        elif isinstance(value, UUID):
+            # Convert UUID to canonical string representation
+            return str(value)
+        elif isinstance(value, dict):
+            return {k: normalize_value(v) for k, v in value.items()}
+        elif isinstance(value, list):
+            return [normalize_value(item) for item in value]
+        else:
+            return value
+    model_dict = normalize_value(model_dict)
+    if exclude_fields:
+        for field in exclude_fields:
+            model_dict.pop(field, None)
+    # Encode to canonical CBOR
+    # canonical=True ensures:
+    # - Keys are sorted lexicographically
+    # - Minimal encoding is used
+    # - Deterministic representation
+    cbor_bytes = cbor2.dumps(
+        model_dict,
+        canonical=True,
+        default=_cbor_default_encoder
+    )
+    # Compute SHA-256 hash
+    hash_obj = hashlib.sha256(cbor_bytes)
+    return hash_obj.hexdigest()
+def verify_hash(model: BaseModel, expected_hash: str, exclude_fields: set[str] | None = None) -> bool:
+    """
+    Check a model against a previously computed canonical hash.
+    The model is re-hashed with get_canonical_hash and the result is
+    compared to expected_hash by plain string equality.
+    Args:
+        model: Pydantic model instance to verify
+        expected_hash: Expected hexadecimal SHA-256 hash
+        exclude_fields: Optional set of field names to exclude from hashing
+    Returns:
+        bool: True if hashes match, False otherwise
+    Example:
+        >>> manifest = ManifestModel(cli_command="epi record --out test.epi")
+        >>> expected = get_canonical_hash(manifest)
+        >>> assert verify_hash(manifest, expected) == True
+    """
+    return get_canonical_hash(model, exclude_fields) == expected_hash