agmem-0.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agmem-0.1.1.dist-info/METADATA +656 -0
- agmem-0.1.1.dist-info/RECORD +67 -0
- agmem-0.1.1.dist-info/WHEEL +5 -0
- agmem-0.1.1.dist-info/entry_points.txt +2 -0
- agmem-0.1.1.dist-info/licenses/LICENSE +21 -0
- agmem-0.1.1.dist-info/top_level.txt +1 -0
- memvcs/__init__.py +9 -0
- memvcs/cli.py +178 -0
- memvcs/commands/__init__.py +23 -0
- memvcs/commands/add.py +258 -0
- memvcs/commands/base.py +23 -0
- memvcs/commands/blame.py +169 -0
- memvcs/commands/branch.py +110 -0
- memvcs/commands/checkout.py +101 -0
- memvcs/commands/clean.py +76 -0
- memvcs/commands/clone.py +91 -0
- memvcs/commands/commit.py +174 -0
- memvcs/commands/daemon.py +267 -0
- memvcs/commands/diff.py +157 -0
- memvcs/commands/fsck.py +203 -0
- memvcs/commands/garden.py +107 -0
- memvcs/commands/graph.py +151 -0
- memvcs/commands/init.py +61 -0
- memvcs/commands/log.py +103 -0
- memvcs/commands/mcp.py +59 -0
- memvcs/commands/merge.py +88 -0
- memvcs/commands/pull.py +65 -0
- memvcs/commands/push.py +143 -0
- memvcs/commands/reflog.py +52 -0
- memvcs/commands/remote.py +51 -0
- memvcs/commands/reset.py +98 -0
- memvcs/commands/search.py +163 -0
- memvcs/commands/serve.py +54 -0
- memvcs/commands/show.py +125 -0
- memvcs/commands/stash.py +97 -0
- memvcs/commands/status.py +112 -0
- memvcs/commands/tag.py +117 -0
- memvcs/commands/test.py +132 -0
- memvcs/commands/tree.py +156 -0
- memvcs/core/__init__.py +21 -0
- memvcs/core/config_loader.py +245 -0
- memvcs/core/constants.py +12 -0
- memvcs/core/diff.py +380 -0
- memvcs/core/gardener.py +466 -0
- memvcs/core/hooks.py +151 -0
- memvcs/core/knowledge_graph.py +381 -0
- memvcs/core/merge.py +474 -0
- memvcs/core/objects.py +323 -0
- memvcs/core/pii_scanner.py +343 -0
- memvcs/core/refs.py +447 -0
- memvcs/core/remote.py +278 -0
- memvcs/core/repository.py +522 -0
- memvcs/core/schema.py +414 -0
- memvcs/core/staging.py +227 -0
- memvcs/core/storage/__init__.py +72 -0
- memvcs/core/storage/base.py +359 -0
- memvcs/core/storage/gcs.py +308 -0
- memvcs/core/storage/local.py +182 -0
- memvcs/core/storage/s3.py +369 -0
- memvcs/core/test_runner.py +371 -0
- memvcs/core/vector_store.py +313 -0
- memvcs/integrations/__init__.py +5 -0
- memvcs/integrations/mcp_server.py +267 -0
- memvcs/integrations/web_ui/__init__.py +1 -0
- memvcs/integrations/web_ui/server.py +352 -0
- memvcs/utils/__init__.py +9 -0
- memvcs/utils/helpers.py +178 -0
memvcs/core/objects.py
ADDED
@@ -0,0 +1,323 @@
"""
Object storage system for agmem.

Implements Git-style content-addressable storage with blob, tree, and commit objects.
"""

import hashlib
import json
import os
import zlib
from pathlib import Path
from typing import Optional, Dict, List, Any, Union
from dataclasses import dataclass, asdict
from datetime import datetime


def _valid_object_hash(hash_id: str) -> bool:
    """Return True if hash_id is safe for object paths (hex, 4-64 chars)."""
    if not hash_id or len(hash_id) < 4 or len(hash_id) > 64:
        return False
    return all(c in '0123456789abcdef' for c in hash_id.lower())


class ObjectStore:
    """Content-addressable object storage system."""

    def __init__(self, objects_dir: Path):
        self.objects_dir = Path(objects_dir)
        self._ensure_directories()

    def _ensure_directories(self):
        """Create object storage directories."""
        for obj_type in ['blob', 'tree', 'commit', 'tag']:
            (self.objects_dir / obj_type).mkdir(parents=True, exist_ok=True)

    def _get_object_path(self, hash_id: str, obj_type: str) -> Path:
        """Get storage path for an object. Validates hash_id to prevent path traversal."""
        if not _valid_object_hash(hash_id):
            raise ValueError(f"Invalid object hash: {hash_id!r}")
        prefix = hash_id[:2]
        suffix = hash_id[2:]
        return self.objects_dir / obj_type / prefix / suffix

    def _compute_hash(self, content: bytes, obj_type: str) -> str:
        """Compute SHA-256 hash of content with type header."""
        header = f"{obj_type} {len(content)}\0".encode()
        full_content = header + content
        return hashlib.sha256(full_content).hexdigest()

    def store(self, content: bytes, obj_type: str) -> str:
        """
        Store content and return its hash ID.

        Args:
            content: Raw bytes to store
            obj_type: Type of object ('blob', 'tree', 'commit', 'tag')

        Returns:
            SHA-256 hash ID of stored object
        """
        hash_id = self._compute_hash(content, obj_type)
        obj_path = self._get_object_path(hash_id, obj_type)

        # Don't store if already exists (deduplication)
        if obj_path.exists():
            return hash_id

        # Create directory if needed
        obj_path.parent.mkdir(parents=True, exist_ok=True)

        # Compress and store
        header = f"{obj_type} {len(content)}\0".encode()
        full_content = header + content
        compressed = zlib.compress(full_content)

        obj_path.write_bytes(compressed)
        return hash_id

    def retrieve(self, hash_id: str, obj_type: str) -> Optional[bytes]:
        """
        Retrieve content by hash ID.

        Args:
            hash_id: SHA-256 hash of the object
            obj_type: Type of object

        Returns:
            Raw bytes content or None if not found
        """
        obj_path = self._get_object_path(hash_id, obj_type)

        if not obj_path.exists():
            return None

        # Decompress and extract content
        compressed = obj_path.read_bytes()
        full_content = zlib.decompress(compressed)

        # Parse header
        null_idx = full_content.index(b'\0')
        header = full_content[:null_idx].decode()
        content = full_content[null_idx + 1:]

        return content

    def exists(self, hash_id: str, obj_type: str) -> bool:
        """Check if an object exists. Returns False for invalid hash (no raise)."""
        if not _valid_object_hash(hash_id):
            return False
        obj_path = self._get_object_path(hash_id, obj_type)
        return obj_path.exists()

    def delete(self, hash_id: str, obj_type: str) -> bool:
        """Delete an object. Returns True if deleted, False if not found."""
        obj_path = self._get_object_path(hash_id, obj_type)
        if obj_path.exists():
            obj_path.unlink()
            # Clean up empty parent directories
            if not any(obj_path.parent.iterdir()):
                obj_path.parent.rmdir()
            return True
        return False

    def list_objects(self, obj_type: str) -> List[str]:
        """List all objects of a given type."""
        obj_dir = self.objects_dir / obj_type
        if not obj_dir.exists():
            return []

        hashes = []
        for prefix_dir in obj_dir.iterdir():
            if prefix_dir.is_dir():
                for suffix_file in prefix_dir.iterdir():
                    hash_id = prefix_dir.name + suffix_file.name
                    hashes.append(hash_id)
        return hashes

    def get_size(self, hash_id: str, obj_type: str) -> int:
        """Get the compressed size of an object."""
        obj_path = self._get_object_path(hash_id, obj_type)
        if obj_path.exists():
            return obj_path.stat().st_size
        return 0


@dataclass
class Blob:
    """Blob object for storing raw memory content."""
    content: bytes

    def store(self, store: ObjectStore) -> str:
        """Store this blob and return its hash."""
        return store.store(self.content, 'blob')

    @staticmethod
    def load(store: ObjectStore, hash_id: str) -> Optional['Blob']:
        """Load a blob from storage."""
        content = store.retrieve(hash_id, 'blob')
        if content is not None:
            return Blob(content=content)
        return None


@dataclass
class TreeEntry:
    """Entry in a tree object."""
    mode: str  # '100644' for file, '040000' for directory
    obj_type: str  # 'blob' or 'tree'
    hash: str
    name: str
    path: str = ""  # Relative path within tree


@dataclass
class Tree:
    """Tree object for storing directory structure."""
    entries: List[TreeEntry]

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            'type': 'tree',
            'entries': [
                {
                    'mode': e.mode,
                    'type': e.obj_type,
                    'hash': e.hash,
                    'name': e.name,
                    'path': e.path
                }
                for e in self.entries
            ]
        }

    def to_bytes(self) -> bytes:
        """Serialize to bytes."""
        return json.dumps(self.to_dict(), sort_keys=True).encode()

    def store(self, store: ObjectStore) -> str:
        """Store this tree and return its hash."""
        return store.store(self.to_bytes(), 'tree')

    @staticmethod
    def load(store: ObjectStore, hash_id: str) -> Optional['Tree']:
        """Load a tree from storage."""
        content = store.retrieve(hash_id, 'tree')
        if content is None:
            return None

        data = json.loads(content)
        entries = [
            TreeEntry(
                mode=e['mode'],
                obj_type=e['type'],
                hash=e['hash'],
                name=e['name'],
                path=e.get('path', '')
            )
            for e in data.get('entries', [])
        ]
        return Tree(entries=entries)

    def get_entry(self, name: str) -> Optional[TreeEntry]:
        """Get an entry by name."""
        for entry in self.entries:
            if entry.name == name:
                return entry
        return None


@dataclass
class Commit:
    """Commit object for storing memory snapshots."""
    tree: str  # Hash of tree object
    parents: List[str]  # Hashes of parent commits
    author: str
    timestamp: str
    message: str
    metadata: Dict[str, Any]  # Additional metadata

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            'type': 'commit',
            'tree': self.tree,
            'parents': self.parents,
            'author': self.author,
            'timestamp': self.timestamp,
            'message': self.message,
            'metadata': self.metadata
        }

    def to_bytes(self) -> bytes:
        """Serialize to bytes."""
        return json.dumps(self.to_dict(), sort_keys=True).encode()

    def store(self, store: ObjectStore) -> str:
        """Store this commit and return its hash."""
        return store.store(self.to_bytes(), 'commit')

    @staticmethod
    def load(store: ObjectStore, hash_id: str) -> Optional['Commit']:
        """Load a commit from storage."""
        content = store.retrieve(hash_id, 'commit')
        if content is None:
            return None

        data = json.loads(content)
        return Commit(
            tree=data['tree'],
            parents=data.get('parents', []),
            author=data['author'],
            timestamp=data['timestamp'],
            message=data['message'],
            metadata=data.get('metadata', {})
        )

    def short_hash(self, store: ObjectStore) -> str:
        """Get short hash for display."""
        full_hash = self.store(store)
        return full_hash[:8]


@dataclass
class Tag:
    """Tag object for marking specific commits."""
    name: str
    commit_hash: str
    message: str
    timestamp: str

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            'type': 'tag',
            'name': self.name,
            'commit_hash': self.commit_hash,
            'message': self.message,
            'timestamp': self.timestamp
        }

    def to_bytes(self) -> bytes:
        """Serialize to bytes."""
        return json.dumps(self.to_dict(), sort_keys=True).encode()

    def store(self, store: ObjectStore) -> str:
        """Store this tag and return its hash."""
        return store.store(self.to_bytes(), 'tag')

    @staticmethod
    def load(store: ObjectStore, hash_id: str) -> Optional['Tag']:
        """Load a tag from storage."""
        content = store.retrieve(hash_id, 'tag')
        if content is None:
            return None

        data = json.loads(content)
        return Tag(
            name=data['name'],
            commit_hash=data['commit_hash'],
            message=data['message'],
            timestamp=data['timestamp']
        )
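For illustration, the following is a minimal usage sketch (not part of the package) showing how ObjectStore, Blob, Tree, and Commit compose; it assumes the classes are importable from memvcs.core.objects and uses a hypothetical on-disk location.

# --- Illustrative usage sketch; not shipped in the wheel. Assumes:
# from memvcs.core.objects import ObjectStore, Blob, Tree, TreeEntry, Commit
from pathlib import Path
from datetime import datetime, timezone

store = ObjectStore(Path("/tmp/agmem-demo/objects"))  # hypothetical location

# Blobs hold raw memory content; identical bytes deduplicate to one object.
blob_hash = Blob(content=b"remember: user prefers dark mode\n").store(store)

# A tree maps names to blob/tree hashes; a commit snapshots a tree.
tree_hash = Tree(entries=[
    TreeEntry(mode="100644", obj_type="blob", hash=blob_hash,
              name="preferences.md", path="preferences.md"),
]).store(store)

commit_hash = Commit(
    tree=tree_hash,
    parents=[],
    author="demo-agent",
    timestamp=datetime.now(timezone.utc).isoformat(),
    message="Add preference memory",
    metadata={},
).store(store)

# Hashes are SHA-256 over b"<type> <size>\0" + content (Git-style), so a
# round-trip load by hash returns the same data.
assert Commit.load(store, commit_hash).tree == tree_hash
assert Blob.load(store, blob_hash).content == b"remember: user prefers dark mode\n"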
memvcs/core/pii_scanner.py
ADDED
@@ -0,0 +1,343 @@
"""
PII (Personally Identifiable Information) scanner for agmem.

Scans staged files for sensitive information before commit.
"""

import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Dict, Any, Optional

# IPs to ignore (localhost / internal); not reported as PII
IP_FALSE_POSITIVES = frozenset(['127.0.0.1', '0.0.0.0', '192.168.0.1', '10.0.0.1'])


@dataclass
class PIIIssue:
    """A detected PII issue."""
    filepath: str
    line_number: int
    issue_type: str
    description: str
    matched_text: str  # Partially redacted
    severity: str = "high"  # "high", "medium", "low"


@dataclass
class PIIScanResult:
    """Result of scanning for PII."""
    has_issues: bool
    issues: List[PIIIssue] = field(default_factory=list)
    scanned_files: int = 0

    def add_issue(self, issue: PIIIssue):
        self.issues.append(issue)
        self.has_issues = True


class PIIScanner:
    """
    Scanner for detecting PII and secrets in memory files.

    Detects:
    - API keys and tokens
    - Credit card numbers
    - Email addresses
    - Social Security Numbers
    - Phone numbers
    - IP addresses
    - Private keys
    - Database connection strings
    """

    # Patterns for detecting various types of PII and secrets
    PATTERNS = {
        'api_key': {
            'pattern': re.compile(
                r'(?i)'
                r'(?:api[_-]?key|apikey|api[_-]?secret|api[_-]?token|'
                r'auth[_-]?token|access[_-]?token|bearer[_-]?token|'
                r'secret[_-]?key|private[_-]?key|password|passwd|pwd)'
                r'\s*[:=]\s*["\']?([a-zA-Z0-9_\-]{16,})["\']?',
                re.MULTILINE
            ),
            'description': 'API key or secret token detected',
            'severity': 'high'
        },
        'aws_key': {
            'pattern': re.compile(r'(?:AKIA|ABIA|ACCA|ASIA)[A-Z0-9]{16}'),
            'description': 'AWS access key detected',
            'severity': 'high'
        },
        'aws_secret': {
            'pattern': re.compile(
                r'(?i)aws[_-]?secret[_-]?(?:access[_-]?)?key\s*[:=]\s*["\']?([a-zA-Z0-9+/]{40})["\']?'
            ),
            'description': 'AWS secret access key detected',
            'severity': 'high'
        },
        'private_key': {
            'pattern': re.compile(
                r'-----BEGIN (?:RSA |DSA |EC |OPENSSH )?PRIVATE KEY-----'
            ),
            'description': 'Private key detected',
            'severity': 'high'
        },
        'credit_card': {
            'pattern': re.compile(
                r'\b(?:4[0-9]{12}(?:[0-9]{3})?|'  # Visa
                r'5[1-5][0-9]{14}|'  # Mastercard
                r'3[47][0-9]{13}|'  # Amex
                r'6(?:011|5[0-9]{2})[0-9]{12})\b'  # Discover
            ),
            'description': 'Credit card number detected',
            'severity': 'high'
        },
        'ssn': {
            'pattern': re.compile(r'\b[0-9]{3}-[0-9]{2}-[0-9]{4}\b'),
            'description': 'Social Security Number detected',
            'severity': 'high'
        },
        'email': {
            'pattern': re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'),
            'description': 'Email address detected',
            'severity': 'medium'
        },
        'phone': {
            'pattern': re.compile(
                r'\b(?:\+?1[-.\s]?)?\(?[2-9][0-9]{2}\)?[-.\s]?[2-9][0-9]{2}[-.\s]?[0-9]{4}\b'
            ),
            'description': 'Phone number detected',
            'severity': 'medium'
        },
        'ip_address': {
            'pattern': re.compile(
                r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}'
                r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b'
            ),
            'description': 'IP address detected',
            'severity': 'low'
        },
        'database_url': {
            'pattern': re.compile(
                r'(?i)(?:postgres|mysql|mongodb|redis)://[^\s"\'"]+',
                re.MULTILINE
            ),
            'description': 'Database connection string detected',
            'severity': 'high'
        },
        'jwt': {
            'pattern': re.compile(r'eyJ[A-Za-z0-9_-]+\.eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+'),
            'description': 'JWT token detected',
            'severity': 'high'
        },
        'github_token': {
            'pattern': re.compile(r'(?:ghp|gho|ghu|ghs|ghr)_[A-Za-z0-9_]{36,}'),
            'description': 'GitHub token detected',
            'severity': 'high'
        },
        'slack_token': {
            'pattern': re.compile(r'xox[baprs]-[0-9]+-[0-9]+-[a-zA-Z0-9]+'),
            'description': 'Slack token detected',
            'severity': 'high'
        },
        'stripe_key': {
            'pattern': re.compile(r'(?:sk|pk)_(?:test|live)_[a-zA-Z0-9]{24,}'),
            'description': 'Stripe API key detected',
            'severity': 'high'
        }
    }

    # Files/patterns to skip
    SKIP_PATTERNS = [
        r'\.git/',
        r'\.mem/',
        r'node_modules/',
        r'__pycache__/',
        r'\.pyc$',
        r'\.pyo$',
    ]

    @classmethod
    def _redact(cls, text: str, keep: int = 4) -> str:
        """Partially redact sensitive text for display."""
        if len(text) <= keep * 2:
            return '*' * len(text)
        return text[:keep] + '*' * (len(text) - keep * 2) + text[-keep:]

    @classmethod
    def _should_skip(cls, filepath: str) -> bool:
        """Check if file should be skipped."""
        for pattern in cls.SKIP_PATTERNS:
            if re.search(pattern, filepath):
                return True
        return False

    @classmethod
    def scan_content(cls, content: str, filepath: str) -> List[PIIIssue]:
        """
        Scan content for PII.

        Args:
            content: File content to scan
            filepath: Path to the file (for reporting)

        Returns:
            List of PIIIssue objects
        """
        issues = []
        lines = content.split('\n')

        for line_num, line in enumerate(lines, 1):
            for pii_type, config in cls.PATTERNS.items():
                matches = config['pattern'].finditer(line)
                for match in matches:
                    matched_text = match.group(0)

                    # Skip common false positives
                    if cls._is_false_positive(pii_type, matched_text, line):
                        continue

                    issues.append(PIIIssue(
                        filepath=filepath,
                        line_number=line_num,
                        issue_type=pii_type,
                        description=config['description'],
                        matched_text=cls._redact(matched_text),
                        severity=config['severity']
                    ))

        return issues

    @classmethod
    def _is_false_positive(cls, pii_type: str, matched_text: str, line: str) -> bool:
        """Check for common false positives."""
        lower_line = line.lower()

        # Skip example/placeholder values
        if any(x in lower_line for x in ['example', 'placeholder', 'your_', 'xxx', 'sample']):
            return True

        # Skip comments that are likely documentation
        if line.strip().startswith('#') and 'example' in lower_line:
            return True

        if pii_type == 'ip_address':
            if matched_text in IP_FALSE_POSITIVES:
                return True
            # Skip version numbers that look like IPs
            if 'version' in lower_line or 'v.' in lower_line:
                return True

        # Email false positives
        if pii_type == 'email':
            # Skip example domains
            if any(x in matched_text for x in ['example.com', 'test.com', 'localhost']):
                return True

        return False

    @classmethod
    def scan_file(cls, filepath: Path) -> List[PIIIssue]:
        """
        Scan a file for PII.

        Args:
            filepath: Path to the file

        Returns:
            List of PIIIssue objects
        """
        if cls._should_skip(str(filepath)):
            return []

        try:
            content = filepath.read_text(encoding='utf-8', errors='ignore')
            return cls.scan_content(content, str(filepath))
        except Exception:
            return []

    @classmethod
    def _get_blob_hash_from_staged(cls, file_info: Any) -> Optional[str]:
        """Get blob hash from StagedFile or dict (staging returns Dict[str, StagedFile])."""
        if hasattr(file_info, 'blob_hash'):
            return file_info.blob_hash
        if isinstance(file_info, dict):
            return file_info.get('blob_hash') or file_info.get('hash')
        return None

    @classmethod
    def scan_staged_files(cls, repo, staged_files: Dict[str, Any]) -> PIIScanResult:
        """
        Scan staged files for PII.

        Args:
            repo: Repository instance
            staged_files: Dict of staged files with their info

        Returns:
            PIIScanResult with any issues found
        """
        from .objects import Blob

        result = PIIScanResult(has_issues=False)

        for filepath, file_info in staged_files.items():
            if cls._should_skip(filepath):
                continue

            result.scanned_files += 1

            blob_hash = PIIScanner._get_blob_hash_from_staged(file_info)
            if not blob_hash:
                continue

            blob = Blob.load(repo.object_store, blob_hash)
            if not blob:
                continue

            try:
                content = blob.content.decode('utf-8', errors='ignore')
            except Exception:
                continue

            # Scan content
            issues = cls.scan_content(content, filepath)
            for issue in issues:
                result.add_issue(issue)

        return result

    @classmethod
    def scan_directory(cls, directory: Path, recursive: bool = True) -> PIIScanResult:
        """
        Scan a directory for PII.

        Args:
            directory: Directory to scan
            recursive: Whether to scan recursively

        Returns:
            PIIScanResult with any issues found
        """
        result = PIIScanResult(has_issues=False)

        if recursive:
            files = directory.rglob('*')
        else:
            files = directory.glob('*')

        for filepath in files:
            if not filepath.is_file():
                continue

            if cls._should_skip(str(filepath)):
                continue

            result.scanned_files += 1
            issues = cls.scan_file(filepath)
            for issue in issues:
                result.add_issue(issue)

        return result
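For illustration, a small sketch (not part of the package) of running scan_content on an in-memory string, assuming PIIScanner is importable from memvcs.core.pii_scanner; file names and sample values below are hypothetical.

# --- Illustrative usage sketch; not shipped in the wheel. Assumes:
# from memvcs.core.pii_scanner import PIIScanner
sample = (
    "contact: alice@example.com\n"      # suppressed: 'example' placeholder rule
    "owner: bob@acme.io\n"              # reported: email (medium severity)
    "aws_key = AKIAABCDEFGHIJKLMNOP\n"  # reported: aws_key (high severity)
    "host: 127.0.0.1\n"                 # suppressed: listed in IP_FALSE_POSITIVES
)

issues = PIIScanner.scan_content(sample, "notes/demo.md")
for issue in issues:
    print(f"{issue.filepath}:{issue.line_number} [{issue.severity}] "
          f"{issue.issue_type}: {issue.matched_text}")

# Expected output (matches are partially redacted by _redact):
# notes/demo.md:2 [medium] email: bob@***e.io
# notes/demo.md:3 [high] aws_key: AKIA************MNOP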