agmem-0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. agmem-0.1.1.dist-info/METADATA +656 -0
  2. agmem-0.1.1.dist-info/RECORD +67 -0
  3. agmem-0.1.1.dist-info/WHEEL +5 -0
  4. agmem-0.1.1.dist-info/entry_points.txt +2 -0
  5. agmem-0.1.1.dist-info/licenses/LICENSE +21 -0
  6. agmem-0.1.1.dist-info/top_level.txt +1 -0
  7. memvcs/__init__.py +9 -0
  8. memvcs/cli.py +178 -0
  9. memvcs/commands/__init__.py +23 -0
  10. memvcs/commands/add.py +258 -0
  11. memvcs/commands/base.py +23 -0
  12. memvcs/commands/blame.py +169 -0
  13. memvcs/commands/branch.py +110 -0
  14. memvcs/commands/checkout.py +101 -0
  15. memvcs/commands/clean.py +76 -0
  16. memvcs/commands/clone.py +91 -0
  17. memvcs/commands/commit.py +174 -0
  18. memvcs/commands/daemon.py +267 -0
  19. memvcs/commands/diff.py +157 -0
  20. memvcs/commands/fsck.py +203 -0
  21. memvcs/commands/garden.py +107 -0
  22. memvcs/commands/graph.py +151 -0
  23. memvcs/commands/init.py +61 -0
  24. memvcs/commands/log.py +103 -0
  25. memvcs/commands/mcp.py +59 -0
  26. memvcs/commands/merge.py +88 -0
  27. memvcs/commands/pull.py +65 -0
  28. memvcs/commands/push.py +143 -0
  29. memvcs/commands/reflog.py +52 -0
  30. memvcs/commands/remote.py +51 -0
  31. memvcs/commands/reset.py +98 -0
  32. memvcs/commands/search.py +163 -0
  33. memvcs/commands/serve.py +54 -0
  34. memvcs/commands/show.py +125 -0
  35. memvcs/commands/stash.py +97 -0
  36. memvcs/commands/status.py +112 -0
  37. memvcs/commands/tag.py +117 -0
  38. memvcs/commands/test.py +132 -0
  39. memvcs/commands/tree.py +156 -0
  40. memvcs/core/__init__.py +21 -0
  41. memvcs/core/config_loader.py +245 -0
  42. memvcs/core/constants.py +12 -0
  43. memvcs/core/diff.py +380 -0
  44. memvcs/core/gardener.py +466 -0
  45. memvcs/core/hooks.py +151 -0
  46. memvcs/core/knowledge_graph.py +381 -0
  47. memvcs/core/merge.py +474 -0
  48. memvcs/core/objects.py +323 -0
  49. memvcs/core/pii_scanner.py +343 -0
  50. memvcs/core/refs.py +447 -0
  51. memvcs/core/remote.py +278 -0
  52. memvcs/core/repository.py +522 -0
  53. memvcs/core/schema.py +414 -0
  54. memvcs/core/staging.py +227 -0
  55. memvcs/core/storage/__init__.py +72 -0
  56. memvcs/core/storage/base.py +359 -0
  57. memvcs/core/storage/gcs.py +308 -0
  58. memvcs/core/storage/local.py +182 -0
  59. memvcs/core/storage/s3.py +369 -0
  60. memvcs/core/test_runner.py +371 -0
  61. memvcs/core/vector_store.py +313 -0
  62. memvcs/integrations/__init__.py +5 -0
  63. memvcs/integrations/mcp_server.py +267 -0
  64. memvcs/integrations/web_ui/__init__.py +1 -0
  65. memvcs/integrations/web_ui/server.py +352 -0
  66. memvcs/utils/__init__.py +9 -0
  67. memvcs/utils/helpers.py +178 -0
memvcs/core/objects.py ADDED
@@ -0,0 +1,323 @@
+ """
+ Object storage system for agmem.
+
+ Implements Git-style content-addressable storage with blob, tree, and commit objects.
+ """
+
+ import hashlib
+ import json
+ import os
+ import zlib
+ from pathlib import Path
+ from typing import Optional, Dict, List, Any, Union
+ from dataclasses import dataclass, asdict
+ from datetime import datetime
+
+
+ def _valid_object_hash(hash_id: str) -> bool:
+     """Return True if hash_id is safe for object paths (hex, 4-64 chars)."""
+     if not hash_id or len(hash_id) < 4 or len(hash_id) > 64:
+         return False
+     return all(c in '0123456789abcdef' for c in hash_id.lower())
+
+
+ class ObjectStore:
+     """Content-addressable object storage system."""
+
+     def __init__(self, objects_dir: Path):
+         self.objects_dir = Path(objects_dir)
+         self._ensure_directories()
+
+     def _ensure_directories(self):
+         """Create object storage directories."""
+         for obj_type in ['blob', 'tree', 'commit', 'tag']:
+             (self.objects_dir / obj_type).mkdir(parents=True, exist_ok=True)
+
+     def _get_object_path(self, hash_id: str, obj_type: str) -> Path:
+         """Get storage path for an object. Validates hash_id to prevent path traversal."""
+         if not _valid_object_hash(hash_id):
+             raise ValueError(f"Invalid object hash: {hash_id!r}")
+         prefix = hash_id[:2]
+         suffix = hash_id[2:]
+         return self.objects_dir / obj_type / prefix / suffix
+
+     def _compute_hash(self, content: bytes, obj_type: str) -> str:
+         """Compute SHA-256 hash of content with type header."""
+         header = f"{obj_type} {len(content)}\0".encode()
+         full_content = header + content
+         return hashlib.sha256(full_content).hexdigest()
+
+     def store(self, content: bytes, obj_type: str) -> str:
+         """
+         Store content and return its hash ID.
+
+         Args:
+             content: Raw bytes to store
+             obj_type: Type of object ('blob', 'tree', 'commit', 'tag')
+
+         Returns:
+             SHA-256 hash ID of stored object
+         """
+         hash_id = self._compute_hash(content, obj_type)
+         obj_path = self._get_object_path(hash_id, obj_type)
+
+         # Don't store if already exists (deduplication)
+         if obj_path.exists():
+             return hash_id
+
+         # Create directory if needed
+         obj_path.parent.mkdir(parents=True, exist_ok=True)
+
+         # Compress and store
+         header = f"{obj_type} {len(content)}\0".encode()
+         full_content = header + content
+         compressed = zlib.compress(full_content)
+
+         obj_path.write_bytes(compressed)
+         return hash_id
+
+     def retrieve(self, hash_id: str, obj_type: str) -> Optional[bytes]:
+         """
+         Retrieve content by hash ID.
+
+         Args:
+             hash_id: SHA-256 hash of the object
+             obj_type: Type of object
+
+         Returns:
+             Raw bytes content or None if not found
+         """
+         obj_path = self._get_object_path(hash_id, obj_type)
+
+         if not obj_path.exists():
+             return None
+
+         # Decompress and extract content
+         compressed = obj_path.read_bytes()
+         full_content = zlib.decompress(compressed)
+
+         # Parse header
+         null_idx = full_content.index(b'\0')
+         header = full_content[:null_idx].decode()
+         content = full_content[null_idx + 1:]
+
+         return content
+
+     def exists(self, hash_id: str, obj_type: str) -> bool:
+         """Check if an object exists. Returns False for invalid hash (no raise)."""
+         if not _valid_object_hash(hash_id):
+             return False
+         obj_path = self._get_object_path(hash_id, obj_type)
+         return obj_path.exists()
+
+     def delete(self, hash_id: str, obj_type: str) -> bool:
+         """Delete an object. Returns True if deleted, False if not found."""
+         obj_path = self._get_object_path(hash_id, obj_type)
+         if obj_path.exists():
+             obj_path.unlink()
+             # Clean up empty parent directories
+             if not any(obj_path.parent.iterdir()):
+                 obj_path.parent.rmdir()
+             return True
+         return False
+
+     def list_objects(self, obj_type: str) -> List[str]:
+         """List all objects of a given type."""
+         obj_dir = self.objects_dir / obj_type
+         if not obj_dir.exists():
+             return []
+
+         hashes = []
+         for prefix_dir in obj_dir.iterdir():
+             if prefix_dir.is_dir():
+                 for suffix_file in prefix_dir.iterdir():
+                     hash_id = prefix_dir.name + suffix_file.name
+                     hashes.append(hash_id)
+         return hashes
+
+     def get_size(self, hash_id: str, obj_type: str) -> int:
+         """Get the compressed size of an object."""
+         obj_path = self._get_object_path(hash_id, obj_type)
+         if obj_path.exists():
+             return obj_path.stat().st_size
+         return 0
+
+
+ @dataclass
+ class Blob:
+     """Blob object for storing raw memory content."""
+     content: bytes
+
+     def store(self, store: ObjectStore) -> str:
+         """Store this blob and return its hash."""
+         return store.store(self.content, 'blob')
+
+     @staticmethod
+     def load(store: ObjectStore, hash_id: str) -> Optional['Blob']:
+         """Load a blob from storage."""
+         content = store.retrieve(hash_id, 'blob')
+         if content is not None:
+             return Blob(content=content)
+         return None
+
+
+ @dataclass
+ class TreeEntry:
+     """Entry in a tree object."""
+     mode: str  # '100644' for file, '040000' for directory
+     obj_type: str  # 'blob' or 'tree'
+     hash: str
+     name: str
+     path: str = ""  # Relative path within tree
+
+
+ @dataclass
+ class Tree:
+     """Tree object for storing directory structure."""
+     entries: List[TreeEntry]
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary for serialization."""
+         return {
+             'type': 'tree',
+             'entries': [
+                 {
+                     'mode': e.mode,
+                     'type': e.obj_type,
+                     'hash': e.hash,
+                     'name': e.name,
+                     'path': e.path
+                 }
+                 for e in self.entries
+             ]
+         }
+
+     def to_bytes(self) -> bytes:
+         """Serialize to bytes."""
+         return json.dumps(self.to_dict(), sort_keys=True).encode()
+
+     def store(self, store: ObjectStore) -> str:
+         """Store this tree and return its hash."""
+         return store.store(self.to_bytes(), 'tree')
+
+     @staticmethod
+     def load(store: ObjectStore, hash_id: str) -> Optional['Tree']:
+         """Load a tree from storage."""
+         content = store.retrieve(hash_id, 'tree')
+         if content is None:
+             return None
+
+         data = json.loads(content)
+         entries = [
+             TreeEntry(
+                 mode=e['mode'],
+                 obj_type=e['type'],
+                 hash=e['hash'],
+                 name=e['name'],
+                 path=e.get('path', '')
+             )
+             for e in data.get('entries', [])
+         ]
+         return Tree(entries=entries)
+
+     def get_entry(self, name: str) -> Optional[TreeEntry]:
+         """Get an entry by name."""
+         for entry in self.entries:
+             if entry.name == name:
+                 return entry
+         return None
+
+
+ @dataclass
+ class Commit:
+     """Commit object for storing memory snapshots."""
+     tree: str  # Hash of tree object
+     parents: List[str]  # Hashes of parent commits
+     author: str
+     timestamp: str
+     message: str
+     metadata: Dict[str, Any]  # Additional metadata
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary for serialization."""
+         return {
+             'type': 'commit',
+             'tree': self.tree,
+             'parents': self.parents,
+             'author': self.author,
+             'timestamp': self.timestamp,
+             'message': self.message,
+             'metadata': self.metadata
+         }
+
+     def to_bytes(self) -> bytes:
+         """Serialize to bytes."""
+         return json.dumps(self.to_dict(), sort_keys=True).encode()
+
+     def store(self, store: ObjectStore) -> str:
+         """Store this commit and return its hash."""
+         return store.store(self.to_bytes(), 'commit')
+
+     @staticmethod
+     def load(store: ObjectStore, hash_id: str) -> Optional['Commit']:
+         """Load a commit from storage."""
+         content = store.retrieve(hash_id, 'commit')
+         if content is None:
+             return None
+
+         data = json.loads(content)
+         return Commit(
+             tree=data['tree'],
+             parents=data.get('parents', []),
+             author=data['author'],
+             timestamp=data['timestamp'],
+             message=data['message'],
+             metadata=data.get('metadata', {})
+         )
+
+     def short_hash(self, store: ObjectStore) -> str:
+         """Get short hash for display."""
+         full_hash = self.store(store)
+         return full_hash[:8]
+
+
+ @dataclass
+ class Tag:
+     """Tag object for marking specific commits."""
+     name: str
+     commit_hash: str
+     message: str
+     timestamp: str
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary for serialization."""
+         return {
+             'type': 'tag',
+             'name': self.name,
+             'commit_hash': self.commit_hash,
+             'message': self.message,
+             'timestamp': self.timestamp
+         }
+
+     def to_bytes(self) -> bytes:
+         """Serialize to bytes."""
+         return json.dumps(self.to_dict(), sort_keys=True).encode()
+
+     def store(self, store: ObjectStore) -> str:
+         """Store this tag and return its hash."""
+         return store.store(self.to_bytes(), 'tag')
+
+     @staticmethod
+     def load(store: ObjectStore, hash_id: str) -> Optional['Tag']:
+         """Load a tag from storage."""
+         content = store.retrieve(hash_id, 'tag')
+         if content is None:
+             return None
+
+         data = json.loads(content)
+         return Tag(
+             name=data['name'],
+             commit_hash=data['commit_hash'],
+             message=data['message'],
+             timestamp=data['timestamp']
+         )
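To make the storage model above concrete, here is a minimal usage sketch (not part of the released wheel) that round-trips a blob, a tree, and a commit through an ObjectStore. The temporary path, author string, file name, and content are illustrative assumptions only; the class and method names come from memvcs/core/objects.py above.

    # Illustrative only: exercises the classes from memvcs/core/objects.py above.
    from datetime import datetime, timezone
    from pathlib import Path

    from memvcs.core.objects import Blob, Commit, ObjectStore, Tree, TreeEntry

    store = ObjectStore(Path("/tmp/agmem-demo/objects"))  # hypothetical location

    # Store raw content as a blob; the hash covers "<type> <len>\0" + content.
    blob_hash = Blob(content=b"User prefers concise answers.").store(store)

    # Reference the blob from a tree entry, then snapshot the tree in a commit.
    tree_hash = Tree(entries=[
        TreeEntry(mode='100644', obj_type='blob', hash=blob_hash, name='preferences.md')
    ]).store(store)
    commit_hash = Commit(
        tree=tree_hash,
        parents=[],
        author="agent@example.com",
        timestamp=datetime.now(timezone.utc).isoformat(),
        message="Initial memory snapshot",
        metadata={},
    ).store(store)

    # Objects are content-addressed, so identical content is stored only once.
    assert store.exists(commit_hash, 'commit')
    assert Blob.load(store, blob_hash).content == b"User prefers concise answers."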
memvcs/core/pii_scanner.py ADDED
@@ -0,0 +1,343 @@
+ """
+ PII (Personally Identifiable Information) scanner for agmem.
+
+ Scans staged files for sensitive information before commit.
+ """
+
+ import re
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import List, Dict, Any, Optional
+
+ # IPs to ignore (localhost / internal); not reported as PII
+ IP_FALSE_POSITIVES = frozenset(['127.0.0.1', '0.0.0.0', '192.168.0.1', '10.0.0.1'])
+
+
+ @dataclass
+ class PIIIssue:
+     """A detected PII issue."""
+     filepath: str
+     line_number: int
+     issue_type: str
+     description: str
+     matched_text: str  # Partially redacted
+     severity: str = "high"  # "high", "medium", "low"
+
+
+ @dataclass
+ class PIIScanResult:
+     """Result of scanning for PII."""
+     has_issues: bool
+     issues: List[PIIIssue] = field(default_factory=list)
+     scanned_files: int = 0
+
+     def add_issue(self, issue: PIIIssue):
+         self.issues.append(issue)
+         self.has_issues = True
+
+
+ class PIIScanner:
+     """
+     Scanner for detecting PII and secrets in memory files.
+
+     Detects:
+     - API keys and tokens
+     - Credit card numbers
+     - Email addresses
+     - Social Security Numbers
+     - Phone numbers
+     - IP addresses
+     - Private keys
+     - Database connection strings
+     """
+
+     # Patterns for detecting various types of PII and secrets
+     PATTERNS = {
+         'api_key': {
+             'pattern': re.compile(
+                 r'(?i)'
+                 r'(?:api[_-]?key|apikey|api[_-]?secret|api[_-]?token|'
+                 r'auth[_-]?token|access[_-]?token|bearer[_-]?token|'
+                 r'secret[_-]?key|private[_-]?key|password|passwd|pwd)'
+                 r'\s*[:=]\s*["\']?([a-zA-Z0-9_\-]{16,})["\']?',
+                 re.MULTILINE
+             ),
+             'description': 'API key or secret token detected',
+             'severity': 'high'
+         },
+         'aws_key': {
+             'pattern': re.compile(r'(?:AKIA|ABIA|ACCA|ASIA)[A-Z0-9]{16}'),
+             'description': 'AWS access key detected',
+             'severity': 'high'
+         },
+         'aws_secret': {
+             'pattern': re.compile(
+                 r'(?i)aws[_-]?secret[_-]?(?:access[_-]?)?key\s*[:=]\s*["\']?([a-zA-Z0-9+/]{40})["\']?'
+             ),
+             'description': 'AWS secret access key detected',
+             'severity': 'high'
+         },
+         'private_key': {
+             'pattern': re.compile(
+                 r'-----BEGIN (?:RSA |DSA |EC |OPENSSH )?PRIVATE KEY-----'
+             ),
+             'description': 'Private key detected',
+             'severity': 'high'
+         },
+         'credit_card': {
+             'pattern': re.compile(
+                 r'\b(?:4[0-9]{12}(?:[0-9]{3})?|'  # Visa
+                 r'5[1-5][0-9]{14}|'  # Mastercard
+                 r'3[47][0-9]{13}|'  # Amex
+                 r'6(?:011|5[0-9]{2})[0-9]{12})\b'  # Discover
+             ),
+             'description': 'Credit card number detected',
+             'severity': 'high'
+         },
+         'ssn': {
+             'pattern': re.compile(r'\b[0-9]{3}-[0-9]{2}-[0-9]{4}\b'),
+             'description': 'Social Security Number detected',
+             'severity': 'high'
+         },
+         'email': {
+             'pattern': re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'),
+             'description': 'Email address detected',
+             'severity': 'medium'
+         },
+         'phone': {
+             'pattern': re.compile(
+                 r'\b(?:\+?1[-.\s]?)?\(?[2-9][0-9]{2}\)?[-.\s]?[2-9][0-9]{2}[-.\s]?[0-9]{4}\b'
+             ),
+             'description': 'Phone number detected',
+             'severity': 'medium'
+         },
+         'ip_address': {
+             'pattern': re.compile(
+                 r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}'
+                 r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b'
+             ),
+             'description': 'IP address detected',
+             'severity': 'low'
+         },
+         'database_url': {
+             'pattern': re.compile(
+                 r'(?i)(?:postgres|mysql|mongodb|redis)://[^\s"\'"]+',
+                 re.MULTILINE
+             ),
+             'description': 'Database connection string detected',
+             'severity': 'high'
+         },
+         'jwt': {
+             'pattern': re.compile(r'eyJ[A-Za-z0-9_-]+\.eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+'),
+             'description': 'JWT token detected',
+             'severity': 'high'
+         },
+         'github_token': {
+             'pattern': re.compile(r'(?:ghp|gho|ghu|ghs|ghr)_[A-Za-z0-9_]{36,}'),
+             'description': 'GitHub token detected',
+             'severity': 'high'
+         },
+         'slack_token': {
+             'pattern': re.compile(r'xox[baprs]-[0-9]+-[0-9]+-[a-zA-Z0-9]+'),
+             'description': 'Slack token detected',
+             'severity': 'high'
+         },
+         'stripe_key': {
+             'pattern': re.compile(r'(?:sk|pk)_(?:test|live)_[a-zA-Z0-9]{24,}'),
+             'description': 'Stripe API key detected',
+             'severity': 'high'
+         }
+     }
+
+     # Files/patterns to skip
+     SKIP_PATTERNS = [
+         r'\.git/',
+         r'\.mem/',
+         r'node_modules/',
+         r'__pycache__/',
+         r'\.pyc$',
+         r'\.pyo$',
+     ]
+
+     @classmethod
+     def _redact(cls, text: str, keep: int = 4) -> str:
+         """Partially redact sensitive text for display."""
+         if len(text) <= keep * 2:
+             return '*' * len(text)
+         return text[:keep] + '*' * (len(text) - keep * 2) + text[-keep:]
+
+     @classmethod
+     def _should_skip(cls, filepath: str) -> bool:
+         """Check if file should be skipped."""
+         for pattern in cls.SKIP_PATTERNS:
+             if re.search(pattern, filepath):
+                 return True
+         return False
+
+     @classmethod
+     def scan_content(cls, content: str, filepath: str) -> List[PIIIssue]:
+         """
+         Scan content for PII.
+
+         Args:
+             content: File content to scan
+             filepath: Path to the file (for reporting)
+
+         Returns:
+             List of PIIIssue objects
+         """
+         issues = []
+         lines = content.split('\n')
+
+         for line_num, line in enumerate(lines, 1):
+             for pii_type, config in cls.PATTERNS.items():
+                 matches = config['pattern'].finditer(line)
+                 for match in matches:
+                     matched_text = match.group(0)
+
+                     # Skip common false positives
+                     if cls._is_false_positive(pii_type, matched_text, line):
+                         continue
+
+                     issues.append(PIIIssue(
+                         filepath=filepath,
+                         line_number=line_num,
+                         issue_type=pii_type,
+                         description=config['description'],
+                         matched_text=cls._redact(matched_text),
+                         severity=config['severity']
+                     ))
+
+         return issues
+
+     @classmethod
+     def _is_false_positive(cls, pii_type: str, matched_text: str, line: str) -> bool:
+         """Check for common false positives."""
+         lower_line = line.lower()
+
+         # Skip example/placeholder values
+         if any(x in lower_line for x in ['example', 'placeholder', 'your_', 'xxx', 'sample']):
+             return True
+
+         # Skip comments that are likely documentation
+         if line.strip().startswith('#') and 'example' in lower_line:
+             return True
+
+         if pii_type == 'ip_address':
+             if matched_text in IP_FALSE_POSITIVES:
+                 return True
+             # Skip version numbers that look like IPs
+             if 'version' in lower_line or 'v.' in lower_line:
+                 return True
+
+         # Email false positives
+         if pii_type == 'email':
+             # Skip example domains
+             if any(x in matched_text for x in ['example.com', 'test.com', 'localhost']):
+                 return True
+
+         return False
+
+     @classmethod
+     def scan_file(cls, filepath: Path) -> List[PIIIssue]:
+         """
+         Scan a file for PII.
+
+         Args:
+             filepath: Path to the file
+
+         Returns:
+             List of PIIIssue objects
+         """
+         if cls._should_skip(str(filepath)):
+             return []
+
+         try:
+             content = filepath.read_text(encoding='utf-8', errors='ignore')
+             return cls.scan_content(content, str(filepath))
+         except Exception:
+             return []
+
+     @classmethod
+     def _get_blob_hash_from_staged(cls, file_info: Any) -> Optional[str]:
+         """Get blob hash from StagedFile or dict (staging returns Dict[str, StagedFile])."""
+         if hasattr(file_info, 'blob_hash'):
+             return file_info.blob_hash
+         if isinstance(file_info, dict):
+             return file_info.get('blob_hash') or file_info.get('hash')
+         return None
+
+     @classmethod
+     def scan_staged_files(cls, repo, staged_files: Dict[str, Any]) -> PIIScanResult:
+         """
+         Scan staged files for PII.
+
+         Args:
+             repo: Repository instance
+             staged_files: Dict of staged files with their info
+
+         Returns:
+             PIIScanResult with any issues found
+         """
+         from .objects import Blob
+
+         result = PIIScanResult(has_issues=False)
+
+         for filepath, file_info in staged_files.items():
+             if cls._should_skip(filepath):
+                 continue
+
+             result.scanned_files += 1
+
+             blob_hash = PIIScanner._get_blob_hash_from_staged(file_info)
+             if not blob_hash:
+                 continue
+
+             blob = Blob.load(repo.object_store, blob_hash)
+             if not blob:
+                 continue
+
+             try:
+                 content = blob.content.decode('utf-8', errors='ignore')
+             except Exception:
+                 continue
+
+             # Scan content
+             issues = cls.scan_content(content, filepath)
+             for issue in issues:
+                 result.add_issue(issue)
+
+         return result
+
+     @classmethod
+     def scan_directory(cls, directory: Path, recursive: bool = True) -> PIIScanResult:
+         """
+         Scan a directory for PII.
+
+         Args:
+             directory: Directory to scan
+             recursive: Whether to scan recursively
+
+         Returns:
+             PIIScanResult with any issues found
+         """
+         result = PIIScanResult(has_issues=False)
+
+         if recursive:
+             files = directory.rglob('*')
+         else:
+             files = directory.glob('*')
+
+         for filepath in files:
+             if not filepath.is_file():
+                 continue
+
+             if cls._should_skip(str(filepath)):
+                 continue
+
+             result.scanned_files += 1
+             issues = cls.scan_file(filepath)
+             for issue in issues:
+                 result.add_issue(issue)
+
+         return result
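For reference, a minimal sketch (not part of the released wheel) of driving the scanner above directly on in-memory text. The sample content and file path are made up, and the token value is deliberately fake; the class and method names come from memvcs/core/pii_scanner.py above.

    # Illustrative only: exercises PIIScanner from memvcs/core/pii_scanner.py above.
    from memvcs.core.pii_scanner import PIIScanner

    sample = "contact: alice@corp.io\napi_token = 'abcd1234abcd1234abcd1234'\n"
    issues = PIIScanner.scan_content(sample, "memory/session-notes.md")

    for issue in issues:
        # matched_text is already partially redacted by _redact().
        print(f"{issue.filepath}:{issue.line_number} [{issue.severity}] "
              f"{issue.description}: {issue.matched_text}")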