epi-recorder 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
epi_core/redactor.py ADDED
@@ -0,0 +1,266 @@
1
+ """
2
+ EPI Core Redactor - Automatic secret redaction for security.
3
+
4
+ Provides regex-based pattern matching to automatically remove sensitive
5
+ information like API keys, tokens, and credentials from captured data.
6
+ """
7
+
8
+ import re
9
+ from pathlib import Path
10
+ from typing import Any, Dict, List, Tuple
11
+ import json
12
+
13
+
14
# Default redaction patterns (security-first).
# Each entry is (regex source, human-readable description). The regex text is
# compiled by Redactor with re.IGNORECASE, so character classes and literal
# prefixes below also match in the other letter case.
DEFAULT_REDACTION_PATTERNS = [
    # OpenAI API keys
    (r'sk-[a-zA-Z0-9]{48}', 'OpenAI API key'),
    (r'sk-proj-[a-zA-Z0-9_-]{48,}', 'OpenAI Project API key'),

    # Anthropic API keys
    (r'sk-ant-[a-zA-Z0-9_-]{95,}', 'Anthropic API key'),

    # Google/Gemini API keys
    (r'AIza[a-zA-Z0-9_-]{35}', 'Google API key'),

    # Generic Bearer tokens
    (r'Bearer\s+[a-zA-Z0-9_\-\.]{20,}', 'Bearer token'),

    # AWS credentials
    (r'AKIA[0-9A-Z]{16}', 'AWS Access Key'),
    (r'aws_secret_access_key\s*=\s*[a-zA-Z0-9/+=]{40}', 'AWS Secret Key'),

    # GitHub tokens
    (r'ghp_[a-zA-Z0-9]{36}', 'GitHub Personal Access Token'),
    (r'gho_[a-zA-Z0-9]{36}', 'GitHub OAuth Token'),

    # Generic API keys (common patterns)
    (r'api[_-]?key["\']?\s*[:=]\s*["\']?([a-zA-Z0-9_\-]{32,})', 'Generic API key'),
    (r'apikey["\']?\s*[:=]\s*["\']?([a-zA-Z0-9_\-]{32,})', 'Generic API key'),

    # JWT tokens
    (r'eyJ[a-zA-Z0-9_-]{10,}\.[a-zA-Z0-9_-]{10,}\.[a-zA-Z0-9_-]{10,}', 'JWT token'),

    # Database connection strings.
    # Both the "postgres" and "postgresql" URL schemes are accepted, as is the
    # "mongodb+srv" variant, so credentials in either spelling are removed.
    (r'postgres(?:ql)?://[^:]+:[^@]+@[^/]+', 'PostgreSQL connection string'),
    (r'mysql://[^:]+:[^@]+@[^/]+', 'MySQL connection string'),
    (r'mongodb(?:\+srv)?://[^:]+:[^@]+@[^/]+', 'MongoDB connection string'),

    # Private keys (PEM format).
    # The optional label covers the usual PEM private-key headers, not only RSA.
    (
        r'-----BEGIN\s+(?:(?:RSA|EC|DSA|OPENSSH|ENCRYPTED)\s+)?PRIVATE\s+KEY-----'
        r'[\s\S]+?'
        r'-----END\s+(?:(?:RSA|EC|DSA|OPENSSH|ENCRYPTED)\s+)?PRIVATE\s+KEY-----',
        'Private key',
    ),
]

# Environment variable names to redact.
# Dictionary keys are upper-cased before being looked up in this set, so every
# entry must be upper case.
REDACT_ENV_VARS = {
    'OPENAI_API_KEY',
    'ANTHROPIC_API_KEY',
    'GOOGLE_API_KEY',
    'GEMINI_API_KEY',
    'AWS_ACCESS_KEY_ID',
    'AWS_SECRET_ACCESS_KEY',
    'GITHUB_TOKEN',
    'API_KEY',
    'SECRET_KEY',
    'DATABASE_URL',
    'DB_PASSWORD',
    'PASSWORD',
    'SECRET',
}

# Text substituted for every redacted value or matched secret.
REDACTION_PLACEHOLDER = "***REDACTED***"
71
+
72
+
73
class Redactor:
    """
    Redacts sensitive information using configurable regex patterns.

    Strings are scanned with every compiled pattern and each match is replaced
    by REDACTION_PLACEHOLDER. Dictionary values whose key (upper-cased) is in
    ``env_vars_to_redact`` are replaced wholesale, whatever their type.

    Attributes:
        enabled: When False, ``redact`` and ``redact_dict_keys`` return their
            input untouched with a count of 0.
        patterns: List of ``(compiled_regex, description)`` pairs, defaults
            first, followed by any custom patterns from the config file.
        env_vars_to_redact: Upper-case key names whose values are always
            redacted.
    """

    def __init__(self, config_path: Path | None = None, enabled: bool = True):
        """
        Initialize redactor with optional custom configuration.

        Args:
            config_path: Optional path to config.toml with custom patterns.
                Ignored when it is None or the file does not exist.
            enabled: Whether redaction is enabled (default: True)
        """
        self.enabled = enabled
        self.patterns: List[Tuple[re.Pattern, str]] = []
        self.env_vars_to_redact = REDACT_ENV_VARS.copy()

        # Compile default patterns
        for pattern_str, description in DEFAULT_REDACTION_PATTERNS:
            try:
                compiled = re.compile(pattern_str, re.IGNORECASE)
            except re.error as e:
                # Skip invalid patterns (should not happen with defaults)
                print(f"Warning: Invalid pattern '{pattern_str}': {e}")
            else:
                self.patterns.append((compiled, description))

        # Load custom config if provided
        if config_path and config_path.exists():
            self._load_config(config_path)

    def _load_config(self, config_path: Path):
        """
        Load custom redaction patterns and key names from a TOML config.

        Reads ``[[redaction.patterns]]`` entries (``pattern`` plus optional
        ``description``) and the ``redaction.env_vars`` list. Any failure is
        reported with a printed warning and never raised, so a broken config
        leaves the defaults in place.

        NOTE: an ``enabled`` key under ``[redaction]`` is not read here; only
        the constructor argument controls ``self.enabled``.

        Args:
            config_path: Path to config.toml
        """
        try:
            import tomllib  # Python 3.11+

            with open(config_path, 'rb') as f:
                config = tomllib.load(f)

            redaction = config.get('redaction', {})
            if not isinstance(redaction, dict):
                redaction = {}

            # Load custom patterns
            for pattern_dict in redaction.get('patterns', []):
                pattern_str = pattern_dict.get('pattern')
                description = pattern_dict.get('description', 'Custom pattern')

                if not pattern_str:
                    continue
                try:
                    compiled = re.compile(pattern_str, re.IGNORECASE)
                except re.error as e:
                    print(f"Warning: Invalid custom pattern '{pattern_str}': {e}")
                else:
                    self.patterns.append((compiled, description))

            # Load custom env vars
            env_vars = redaction.get('env_vars')
            if env_vars:
                # A bare string would otherwise be added one character at a time.
                if isinstance(env_vars, str):
                    env_vars = [env_vars]
                # Keys are upper-cased before lookup in redact(), so names must
                # be stored upper-cased or they can never match.
                self.env_vars_to_redact.update(str(name).upper() for name in env_vars)

        except Exception as e:
            print(f"Warning: Could not load config from {config_path}: {e}")

    def redact(self, data: Any) -> Tuple[Any, int]:
        """
        Redact sensitive information from data.

        Recursively processes dicts, lists, and strings to find and replace
        sensitive patterns with REDACTION_PLACEHOLDER. Containers are rebuilt;
        the input is not modified. Any other type is returned unchanged.

        Args:
            data: Data to redact (dict, list, str, or primitive)

        Returns:
            tuple: (redacted_data, redaction_count)
        """
        if not self.enabled:
            return data, 0

        if isinstance(data, dict):
            redacted_dict = {}
            redaction_count = 0
            for key, value in data.items():
                # Only string keys can name a sensitive variable; non-string
                # keys (ints, tuples, ...) have no .upper() and are recursed.
                if isinstance(key, str) and key.upper() in self.env_vars_to_redact:
                    redacted_dict[key] = REDACTION_PLACEHOLDER
                    redaction_count += 1
                else:
                    redacted_dict[key], count = self.redact(value)
                    redaction_count += count
            return redacted_dict, redaction_count

        if isinstance(data, list):
            redacted_list = []
            redaction_count = 0
            for item in data:
                redacted_item, count = self.redact(item)
                redacted_list.append(redacted_item)
                redaction_count += count
            return redacted_list, redaction_count

        if isinstance(data, str):
            redacted_str = data
            redaction_count = 0
            for pattern, _description in self.patterns:
                # subn replaces and counts in a single scan; the count equals
                # the number of non-overlapping matches.
                redacted_str, count = pattern.subn(REDACTION_PLACEHOLDER, redacted_str)
                redaction_count += count
            return redacted_str, redaction_count

        # Primitive types (int, float, bool, None) and anything else
        return data, 0

    def redact_dict_keys(self, data: Dict[str, Any], sensitive_keys: set[str]) -> Tuple[Dict[str, Any], int]:
        """
        Redact specific dictionary keys by name.

        Only the top level of ``data`` is inspected; values under other keys
        are copied as-is and are not pattern-scanned.

        Args:
            data: Dictionary to redact
            sensitive_keys: Set of key names to redact (case-insensitive)

        Returns:
            tuple: (redacted_dict, redaction_count)
        """
        if not self.enabled:
            return data, 0

        sensitive_keys_lower = {k.lower() for k in sensitive_keys}
        redacted_dict = {}
        redaction_count = 0

        for key, value in data.items():
            # Non-string keys cannot match a name and are passed through.
            if isinstance(key, str) and key.lower() in sensitive_keys_lower:
                redacted_dict[key] = REDACTION_PLACEHOLDER
                redaction_count += 1
            else:
                redacted_dict[key] = value

        return redacted_dict, redaction_count
218
+
219
+
220
def create_default_config(config_path: Path) -> None:
    """
    Create default configuration file with redaction patterns.

    Missing parent directories are created. An existing file at
    ``config_path`` is overwritten. The file is written as UTF-8, the encoding
    TOML requires, instead of the platform's locale encoding.

    Args:
        config_path: Path where config should be created

    Raises:
        OSError: If the directory or file cannot be created.
    """
    config_content = """# EPI Configuration
# Redaction patterns for automatic secret removal

[redaction]
# Whether redaction is enabled (true by default)
enabled = true

# Additional custom patterns (regex)
# Example:
# [[redaction.patterns]]
# pattern = "my_secret_[a-zA-Z0-9]{20}"
# description = "My custom secret"

# Additional environment variable names to redact
# env_vars = ["MY_SECRET_VAR", "CUSTOM_TOKEN"]
"""

    config_path.parent.mkdir(parents=True, exist_ok=True)
    config_path.write_text(config_content, encoding="utf-8")
246
+
247
+
248
def get_default_redactor() -> Redactor:
    """
    Get a redactor with default configuration.

    Attempts to load ~/.epi/config.toml if it exists. When the file is
    missing, a default one is written first (best effort), so calling this
    function may create ``~/.epi/config.toml`` as a side effect.

    Returns:
        Redactor: Configured redactor instance. Falls back to a redactor with
        only the built-in patterns when the home directory cannot be resolved
        or the config file cannot be created.
    """
    try:
        config_path = Path.home() / ".epi" / "config.toml"
    except RuntimeError:
        # Path.home() raises RuntimeError when no home directory can be
        # determined; keep the "use defaults" promise instead of crashing.
        return Redactor()

    # Create default config if it doesn't exist
    if not config_path.exists():
        try:
            create_default_config(config_path)
        except Exception:
            pass  # Fail silently, use defaults

    return Redactor(config_path=config_path if config_path.exists() else None)
epi_core/schemas.py ADDED
@@ -0,0 +1,112 @@
1
+ """
2
+ EPI Core Schemas - Pydantic models for manifest and steps.
3
+ """
4
+
5
+ from datetime import datetime
6
+ from typing import Any, Dict, Optional
7
+ from uuid import UUID, uuid4
8
+
9
+ from pydantic import BaseModel, ConfigDict, Field
10
+
11
+
12
class ManifestModel(BaseModel):
    """
    Global header of an .epi file (comparable to a PDF catalog).

    Carries the workflow metadata, the SHA-256 hash of every packaged file,
    and an optional signature over the manifest itself.
    """

    # Example payload exposed through the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "spec_version": "1.0-keystone",
                "workflow_id": "550e8400-e29b-41d4-a716-446655440000",
                "created_at": "2025-01-15T10:30:00Z",
                "cli_command": "epi record --out demo.epi -- python train.py",
                "env_snapshot_hash": "a3c5f...",
                "file_manifest": {
                    "steps.jsonl": "b4d6e...",
                    "env.json": "a3c5f...",
                    "artifacts/output.txt": "c7f8a...",
                },
                "signature": "ed25519:3a4b5c6d...",
            }
        }
    )

    spec_version: str = Field(default="1.0-keystone", description="EPI specification version")

    # A fresh random UUID is generated for every manifest unless one is given.
    workflow_id: UUID = Field(default_factory=uuid4, description="Unique identifier for this workflow execution")

    # datetime.utcnow yields a naive datetime (no tzinfo attached).
    created_at: datetime = Field(default_factory=datetime.utcnow, description="Timestamp when the .epi file was created (UTC)")

    cli_command: Optional[str] = Field(default=None, description="The command-line invocation that produced this workflow")

    env_snapshot_hash: Optional[str] = Field(default=None, description="SHA-256 hash of env.json (environment snapshot)")

    file_manifest: Dict[str, str] = Field(default_factory=dict, description="Mapping of file paths to their SHA-256 hashes for integrity verification")

    signature: Optional[str] = Field(default=None, description="Ed25519 signature of the canonical CBOR hash of this manifest (excluding signature field)")
72
+
73
+
74
class StepModel(BaseModel):
    """
    One recorded event in a workflow timeline.

    Steps are stored one per line in steps.jsonl (NDJSON format) and are
    meant to be treated as immutable once written.
    """

    # Example payload exposed through the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "index": 0,
                "timestamp": "2025-01-15T10:30:00Z",
                "kind": "llm.request",
                "content": {
                    "provider": "openai",
                    "model": "gpt-4",
                    "prompt": "Explain quantum computing",
                    "parameters": {"temperature": 0.7},
                },
            }
        }
    )

    # Required: position of the step within the timeline.
    index: int = Field(description="Sequential step number (0-indexed)")

    # datetime.utcnow yields a naive datetime (no tzinfo attached).
    timestamp: datetime = Field(default_factory=datetime.utcnow, description="Timestamp when this step occurred (UTC)")

    # Required: free-form string; the description lists the known kinds.
    kind: str = Field(description="Step type: shell.command, python.call, llm.request, llm.response, file.write, security.redaction")

    # Required: payload whose shape depends on ``kind``.
    content: Dict[str, Any] = Field(description="Step-specific data (command, output, prompt, response, etc.)")
epi_core/serialize.py ADDED
@@ -0,0 +1,131 @@
1
+ """
2
+ EPI Core Serialization - Canonical CBOR hashing for tamper-evident records.
3
+
4
+ This module provides deterministic serialization using CBOR (RFC 8949) with
5
+ canonical encoding to ensure identical hashes across platforms and time.
6
+ """
7
+
8
+ import hashlib
9
+ from datetime import datetime
10
+ from typing import Any
11
+ from uuid import UUID
12
+
13
+ import cbor2
14
+ from pydantic import BaseModel
15
+
16
+
17
+ def _cbor_default_encoder(encoder, value: Any) -> None:
18
+ """
19
+ Custom CBOR encoder for datetime and UUID types.
20
+
21
+ Ensures consistent encoding across different Python environments:
22
+ - datetime objects are encoded as ISO 8601 strings (UTC)
23
+ - UUID objects are encoded as their canonical string representation
24
+
25
+ Args:
26
+ encoder: CBOR encoder instance
27
+ value: Value to encode
28
+
29
+ Raises:
30
+ ValueError: If value type cannot be encoded
31
+ """
32
+ if isinstance(value, datetime):
33
+ # Remove microseconds for stability
34
+ normalized_dt = value.replace(microsecond=0)
35
+ # Encode as ISO 8601 string with Z suffix for UTC
36
+ encoder.encode(normalized_dt.isoformat() + "Z")
37
+ elif isinstance(value, UUID):
38
+ # Use canonical UUID string representation
39
+ encoder.encode(str(value))
40
+ else:
41
+ raise ValueError(f"Cannot encode type {type(value)} to CBOR")
42
+
43
+
44
def get_canonical_hash(model: BaseModel, exclude_fields: set[str] | None = None) -> str:
    """
    Compute a deterministic SHA-256 hash of a Pydantic model using canonical CBOR encoding.

    The model is dumped to a dict, datetime and UUID values (including those
    nested in dicts and lists) are converted to strings, the excluded
    top-level fields are dropped, and the result is encoded with
    ``cbor2.dumps(..., canonical=True)`` before hashing.

    Datetimes are rendered as ISO 8601 with a ``Z`` suffix and second
    precision. Timezone-aware values are converted to UTC first, so a naive
    UTC timestamp and the same instant carrying tzinfo hash identically;
    naive values are taken to already be in UTC.

    Args:
        model: Pydantic model instance to hash
        exclude_fields: Optional set of top-level field names to exclude from
            hashing (useful for excluding signature fields)

    Returns:
        str: Hexadecimal SHA-256 hash (64 characters)

    Example:
        >>> from epi_core.schemas import ManifestModel
        >>> manifest = ManifestModel(cli_command="epi record --out test.epi")
        >>> hash1 = get_canonical_hash(manifest, exclude_fields={"signature"})
        >>> hash2 = get_canonical_hash(manifest, exclude_fields={"signature"})
        >>> assert hash1 == hash2  # Same model, same hash
    """
    # Convert model to dict
    model_dict = model.model_dump()

    # Normalize datetime and UUID fields to strings
    def normalize_value(value: Any) -> Any:
        if isinstance(value, datetime):
            # Shift aware datetimes to UTC wall-clock time and drop tzinfo, so
            # isoformat() never emits an offset in front of the "Z" suffix.
            offset = value.utcoffset()
            if offset is not None:
                value = (value - offset).replace(tzinfo=None)
            # Remove microseconds and convert to ISO 8601 string with Z suffix
            return value.replace(microsecond=0).isoformat() + "Z"
        if isinstance(value, UUID):
            # Convert UUID to canonical string representation
            return str(value)
        if isinstance(value, dict):
            return {k: normalize_value(v) for k, v in value.items()}
        if isinstance(value, list):
            return [normalize_value(item) for item in value]
        return value

    model_dict = normalize_value(model_dict)

    if exclude_fields:
        for field in exclude_fields:
            model_dict.pop(field, None)

    # Encode to canonical CBOR
    # canonical=True ensures:
    # - Keys are sorted lexicographically
    # - Minimal encoding is used
    # - Deterministic representation
    # The default hook only sees values normalize_value did not reach.
    cbor_bytes = cbor2.dumps(
        model_dict,
        canonical=True,
        default=_cbor_default_encoder
    )

    # Compute SHA-256 hash
    return hashlib.sha256(cbor_bytes).hexdigest()
111
+
112
+
113
def verify_hash(model: BaseModel, expected_hash: str, exclude_fields: set[str] | None = None) -> bool:
    """
    Check a model's canonical hash against an expected digest.

    The hash is recomputed with get_canonical_hash using the same
    ``exclude_fields`` and compared to ``expected_hash`` with plain string
    equality (the comparison is case-sensitive).

    Args:
        model: Pydantic model instance to verify
        expected_hash: Expected hexadecimal SHA-256 hash
        exclude_fields: Optional set of field names to exclude from hashing

    Returns:
        bool: True if hashes match, False otherwise

    Example:
        >>> manifest = ManifestModel(cli_command="epi record --out test.epi")
        >>> expected = get_canonical_hash(manifest)
        >>> assert verify_hash(manifest, expected) == True
    """
    return get_canonical_hash(model, exclude_fields) == expected_hash