phi-scan 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
phi_scan/__init__.py ADDED
@@ -0,0 +1,4 @@
1
"""PhiScan — HIPAA & FHIR compliant PHI/PII scanner for CI/CD pipelines."""

# Package version — keep in sync with the distribution metadata (release tags).
__version__ = "0.3.0"
# Canonical distribution name as published to the package registry.
__app_name__ = "phi-scan"
phi_scan/audit.py ADDED
@@ -0,0 +1,495 @@
1
+ """SQLite audit logging — HIPAA-compliant immutable scan event storage.
2
+
3
+ Audit records are INSERT-only. No UPDATE or DELETE operations are ever issued.
4
+ HIPAA (45 CFR §164.530(j)) requires audit logs to be retained for a minimum of
5
+ six years. Corrections are new INSERT rows referencing the original entry —
6
+ never modifications to existing rows.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import datetime
12
+ import hashlib
13
+ import json
14
+ import logging
15
+ import sqlite3
16
+ import subprocess
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+ from phi_scan import __version__
21
+ from phi_scan.constants import AUDIT_SCHEMA_VERSION
22
+ from phi_scan.exceptions import AuditLogError, SchemaMigrationError
23
+ from phi_scan.logging_config import get_logger
24
+ from phi_scan.models import ScanFinding, ScanResult
25
+
26
# Public API of this module, in alphabetical order.
__all__ = [
    "create_audit_schema",
    "get_last_scan",
    "get_schema_version",
    "insert_scan_event",
    "migrate_schema",
    "query_recent_scans",
]

# Module-scoped logger under the project's "audit" logging namespace.
_logger: logging.Logger = get_logger("audit")

# ---------------------------------------------------------------------------
# Log and error message templates
# ---------------------------------------------------------------------------

_SYMLINK_DATABASE_PATH_ERROR: str = (
    "Audit database path {path!r} is a symlink — symlinks are prohibited "
    "to prevent log-redirection attacks"
)
_SCHEMA_DOWNGRADE_ERROR: str = (
    "Cannot downgrade audit schema from version {from_version} to {to_version}"
)
_UNKNOWN_MIGRATION_ERROR: str = (
    "No migration path exists from schema version {from_version} "
    "to {to_version} — add the SQL to _MIGRATIONS"
)
_SCHEMA_VERSION_MISSING_ERROR: str = "schema_meta table exists but the schema_version key is absent"
_DATABASE_ERROR: str = "Audit database operation failed: {detail}"

# ---------------------------------------------------------------------------
# Implementation constants
# ---------------------------------------------------------------------------

_SCAN_EVENTS_TABLE: str = "scan_events"
_SCHEMA_META_TABLE: str = "schema_meta"
_SCHEMA_VERSION_KEY: str = "schema_version"
_CREATED_AT_KEY: str = "created_at"
# Fallback labels used when git metadata cannot be determined.
_UNKNOWN_REPOSITORY: str = "unknown"
_UNKNOWN_BRANCH: str = "unknown"
# SQLite has no native BOOLEAN type — booleans are persisted as INTEGER 1/0.
_BOOLEAN_TRUE: int = 1
_BOOLEAN_FALSE: int = 0
# WAL journal mode permits concurrent readers while a single writer appends.
_PRAGMA_WAL_MODE: str = "PRAGMA journal_mode=WAL"
_LAST_SCAN_LIMIT: int = 1
_GIT_SUBPROCESS_TIMEOUT_SECONDS: int = 5
# Git args are fully hardcoded tuples — shell=False is implicit (list form),
# no user input is interpolated, so subprocess injection is not possible.
_GIT_BRANCH_ARGS: tuple[str, ...] = ("git", "branch", "--show-current")
_GIT_TOPLEVEL_ARGS: tuple[str, ...] = ("git", "rev-parse", "--show-toplevel")

# SQL DDL — table names are module-level constants, not user input; f-strings are safe.
_CREATE_SCAN_EVENTS_SQL: str = f"""
    CREATE TABLE IF NOT EXISTS {_SCAN_EVENTS_TABLE} (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        timestamp TEXT NOT NULL,
        scanner_version TEXT NOT NULL,
        repository_hash TEXT NOT NULL,
        branch_hash TEXT NOT NULL,
        files_scanned INTEGER NOT NULL,
        findings_count INTEGER NOT NULL,
        findings_json TEXT NOT NULL,
        is_clean INTEGER NOT NULL,
        scan_duration REAL NOT NULL
    )
"""
_CREATE_SCHEMA_META_SQL: str = f"""
    CREATE TABLE IF NOT EXISTS {_SCHEMA_META_TABLE} (
        key TEXT PRIMARY KEY,
        value TEXT NOT NULL
    )
"""
# OR IGNORE: seeding metadata on every startup must not overwrite existing keys.
_INSERT_META_SQL: str = f"INSERT OR IGNORE INTO {_SCHEMA_META_TABLE} (key, value) VALUES (?, ?)"
_UPSERT_SCHEMA_VERSION_SQL: str = (
    f"INSERT INTO {_SCHEMA_META_TABLE} (key, value) VALUES (?, ?)"
    f" ON CONFLICT(key) DO UPDATE SET value = excluded.value"
)
_INSERT_SCAN_EVENT_SQL: str = f"""
    INSERT INTO {_SCAN_EVENTS_TABLE}
        (timestamp, scanner_version, repository_hash, branch_hash,
         files_scanned, findings_count, findings_json, is_clean, scan_duration)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
"""
_SELECT_RECENT_SCANS_SQL: str = (
    f"SELECT * FROM {_SCAN_EVENTS_TABLE} WHERE timestamp >= ? ORDER BY timestamp DESC"
)
_SELECT_LAST_SCAN_SQL: str = (
    f"SELECT * FROM {_SCAN_EVENTS_TABLE} ORDER BY id DESC LIMIT {_LAST_SCAN_LIMIT}"
)
_SELECT_SCHEMA_VERSION_SQL: str = f"SELECT value FROM {_SCHEMA_META_TABLE} WHERE key = ?"
_CREATE_SCAN_EVENTS_TIMESTAMP_INDEX_SQL: str = (
    f"CREATE INDEX IF NOT EXISTS idx_scan_events_timestamp ON {_SCAN_EVENTS_TABLE} (timestamp DESC)"
)

# Migration map: from_version → SQL to advance the schema by one version.
# Add entries here when AUDIT_SCHEMA_VERSION is incremented. Never remove entries
# — they must remain to support upgrading older databases.
_MIGRATIONS: dict[int, str] = {}
122
+
123
+
124
+ # ---------------------------------------------------------------------------
125
+ # Public API
126
+ # ---------------------------------------------------------------------------
127
+
128
+
129
def create_audit_schema(database_path: Path) -> None:
    """Create the audit schema if it does not already exist.

    Idempotent — safe to call on every startup. Initialises both the
    ``scan_events`` table and the ``schema_meta`` table, then seeds
    ``schema_version`` and ``created_at`` metadata keys.

    Args:
        database_path: Path to the SQLite audit database file. The parent
            directory is created automatically if it does not exist.

    Raises:
        AuditLogError: If database_path is a symlink, or if the database
            cannot be opened or written to.
    """
    created_at = _get_current_timestamp()
    db = _open_database(database_path)
    try:
        # Tables first, then the index, then the seed metadata rows.
        for ddl in (
            _CREATE_SCAN_EVENTS_SQL,
            _CREATE_SCAN_EVENTS_TIMESTAMP_INDEX_SQL,
            _CREATE_SCHEMA_META_SQL,
        ):
            db.execute(ddl)
        seed_rows = (
            (_SCHEMA_VERSION_KEY, str(AUDIT_SCHEMA_VERSION)),
            (_CREATED_AT_KEY, created_at),
        )
        db.executemany(_INSERT_META_SQL, seed_rows)
        db.commit()
    except sqlite3.Error as error:
        db.rollback()
        raise AuditLogError(_DATABASE_ERROR.format(detail=error)) from error
    finally:
        db.close()
158
+
159
+
160
def _assemble_scan_event_row(scan_result: ScanResult) -> tuple[str | int | float, ...]:
    """Build the parameter tuple for one ``scan_events`` INSERT.

    The repository path and branch name are SHA-256 hashed before storage —
    both can be PHI-revealing and are never persisted in plaintext.
    """

    def _digest(text: str) -> str:
        return hashlib.sha256(text.encode()).hexdigest()

    findings = scan_result.findings
    return (
        _get_current_timestamp(),
        __version__,
        _digest(_get_current_repository_path()),
        _digest(_get_current_branch()),
        scan_result.files_scanned,
        len(findings),
        _serialize_findings(findings),
        _BOOLEAN_TRUE if scan_result.is_clean else _BOOLEAN_FALSE,
        scan_result.scan_duration,
    )
174
+
175
+
176
def insert_scan_event(database_path: Path, scan_result: ScanResult) -> None:
    """Record a completed scan as an immutable audit entry.

    findings_json stores only value_hash and metadata fields — raw detected
    values and code_context (which may contain raw PHI) are never persisted.
    repository_hash, branch_hash, and file_path_hash store SHA-256 digests
    — paths and branch names can be PHI-revealing (e.g. a branch named
    feature/patient-john-doe-ssn-fix or a repo at /home/patient_records).

    Args:
        database_path: Path to the SQLite audit database file.
        scan_result: The completed scan result to record.

    Raises:
        AuditLogError: If the database cannot be written to.
    """
    # Assemble the row before touching the database so a hashing failure
    # never leaves an open connection behind.
    row = _assemble_scan_event_row(scan_result)
    db = _open_database(database_path)
    try:
        db.execute(_INSERT_SCAN_EVENT_SQL, row)
        db.commit()
    except sqlite3.Error as error:
        db.rollback()
        raise AuditLogError(_DATABASE_ERROR.format(detail=error)) from error
    finally:
        db.close()
202
+
203
+
204
def query_recent_scans(database_path: Path, lookback_days: int) -> list[dict[str, Any]]:
    """Return scan events recorded within the last ``lookback_days`` days.

    Args:
        database_path: Path to the SQLite audit database file.
        lookback_days: Number of days back to include in the results.

    Returns:
        List of scan event rows as dicts, ordered by timestamp descending.

    Raises:
        AuditLogError: If the database cannot be read.
    """
    # ISO 8601 timestamps sort lexicographically, so a string comparison in
    # SQL is a correct time comparison.
    window = datetime.timedelta(days=lookback_days)
    cutoff = (datetime.datetime.now(datetime.UTC) - window).isoformat()
    db = _open_database(database_path)
    try:
        rows = db.execute(_SELECT_RECENT_SCANS_SQL, (cutoff,)).fetchall()
        return [dict(row) for row in rows]
    except sqlite3.Error as error:
        raise AuditLogError(_DATABASE_ERROR.format(detail=error)) from error
    finally:
        db.close()
228
+
229
+
230
def get_last_scan(database_path: Path) -> dict[str, Any] | None:
    """Return the most recent scan event, or None if no scans exist.

    Args:
        database_path: Path to the SQLite audit database file.

    Returns:
        The most recent scan event row as a dict, or None.

    Raises:
        AuditLogError: If the database cannot be read.
    """
    db = _open_database(database_path)
    try:
        fetched = db.execute(_SELECT_LAST_SCAN_SQL).fetchone()
        return None if fetched is None else dict(fetched)
    except sqlite3.Error as error:
        raise AuditLogError(_DATABASE_ERROR.format(detail=error)) from error
    finally:
        db.close()
251
+
252
+
253
def get_schema_version(database_path: Path) -> int:
    """Return the schema version stored in the database.

    Args:
        database_path: Path to the SQLite audit database file.

    Returns:
        The integer schema version read from schema_meta.

    Raises:
        AuditLogError: If the database cannot be read, the key is absent,
            or the stored value is not a valid integer.
    """
    connection = _open_database(database_path)
    try:
        cursor = connection.execute(_SELECT_SCHEMA_VERSION_SQL, (_SCHEMA_VERSION_KEY,))
        row = cursor.fetchone()
        if row is None:
            raise AuditLogError(_SCHEMA_VERSION_MISSING_ERROR)
        return int(row[0])
    except ValueError as parse_error:
        # Fix: a corrupted (non-integer) stored value previously escaped as a
        # raw ValueError from int(), violating the documented AuditLogError
        # contract for this function. Wrap it like any other read failure.
        raise AuditLogError(_DATABASE_ERROR.format(detail=parse_error)) from parse_error
    except sqlite3.Error as db_error:
        raise AuditLogError(_DATABASE_ERROR.format(detail=db_error)) from db_error
    finally:
        connection.close()
276
+
277
+
278
def migrate_schema(database_path: Path, from_version: int, to_version: int) -> None:
    """Advance the database schema from from_version to to_version.

    Applies sequential migrations from _MIGRATIONS. Each migration step
    advances the version by one. Downgrading is not supported.

    Args:
        database_path: Path to the SQLite audit database file.
        from_version: The current schema version in the database.
        to_version: The target schema version to migrate to.

    Raises:
        SchemaMigrationError: If from_version > to_version, or if no
            migration SQL exists for a required step.
        AuditLogError: If the database cannot be written to.
    """
    # Nothing to do when already at the target version.
    if from_version == to_version:
        return
    if from_version > to_version:
        message = _SCHEMA_DOWNGRADE_ERROR.format(
            from_version=from_version, to_version=to_version
        )
        raise SchemaMigrationError(message)
    db = _open_database(database_path)
    try:
        _apply_migration_steps(db, from_version, to_version)
        db.commit()
    except sqlite3.Error as error:
        db.rollback()
        raise AuditLogError(_DATABASE_ERROR.format(detail=error)) from error
    finally:
        db.close()
309
+
310
+
311
+ # ---------------------------------------------------------------------------
312
+ # Private helpers
313
+ # ---------------------------------------------------------------------------
314
+
315
+
316
def _apply_migration_steps(
    connection: sqlite3.Connection, from_version: int, to_version: int
) -> None:
    """Execute sequential migration SQL steps from from_version up to to_version.

    Args:
        connection: Open database connection to execute migrations on.
        from_version: The starting schema version.
        to_version: The target schema version.

    Raises:
        SchemaMigrationError: If no migration SQL exists for a required step.
    """
    for step in range(from_version, to_version):
        migration_sql = _MIGRATIONS.get(step)
        if migration_sql is None:
            raise SchemaMigrationError(
                _UNKNOWN_MIGRATION_ERROR.format(
                    from_version=step,
                    to_version=step + 1,
                )
            )
        connection.execute(migration_sql)
        # Record the new version after each step so a failure mid-sequence
        # leaves an accurate version marker for the caller's transaction.
        connection.execute(_UPSERT_SCHEMA_VERSION_SQL, (_SCHEMA_VERSION_KEY, str(step + 1)))
342
+
343
+
344
def _reject_symlink_database_path(database_path: Path) -> None:
    """Raise AuditLogError if database_path is a symlink.

    A symlinked database path could allow an attacker to redirect audit log
    writes to an arbitrary location, destroying HIPAA immutability guarantees.

    Args:
        database_path: The path to validate.

    Raises:
        AuditLogError: If database_path is a symlink.
    """
    if not database_path.is_symlink():
        return
    raise AuditLogError(_SYMLINK_DATABASE_PATH_ERROR.format(path=database_path))
358
+
359
+
360
def _ensure_database_parent_exists(database_path: Path) -> None:
    """Create the parent directory of database_path if it does not exist.

    Args:
        database_path: Path to the SQLite file whose parent must exist.

    Raises:
        AuditLogError: If the parent directory cannot be created.
    """
    parent_directory = database_path.parent
    try:
        parent_directory.mkdir(parents=True, exist_ok=True)
    except OSError as error:
        raise AuditLogError(_DATABASE_ERROR.format(detail=error)) from error
373
+
374
+
375
def _open_database(database_path: Path) -> sqlite3.Connection:
    """Open and configure a SQLite connection to the audit database.

    Args:
        database_path: Path to the SQLite file to open or create.

    Returns:
        An open sqlite3.Connection with row_factory and WAL mode configured.

    Raises:
        AuditLogError: If the path is a symlink, the parent directory cannot
            be created, or the database cannot be opened or configured.
    """
    # Security note (TOCTOU): a race exists between is_symlink() and
    # sqlite3.connect() — a filesystem-writing attacker could swap in a
    # symlink during that window. Closing the race requires opening the file
    # descriptor with O_NOFOLLOW before handing it to SQLite, which needs a
    # platform branch (os.O_NOFOLLOW is undefined on Windows). Residual risk
    # is low in the intended CI/CD context where the audit directory is not
    # world-writable. Tracked for Phase 5 hardening.
    _reject_symlink_database_path(database_path)
    _ensure_database_parent_exists(database_path)
    try:
        connection = sqlite3.connect(str(database_path))
    except sqlite3.Error as open_error:
        raise AuditLogError(_DATABASE_ERROR.format(detail=open_error)) from open_error
    # Configuration failures must not leak the already-open connection.
    try:
        connection.row_factory = sqlite3.Row
        connection.execute(_PRAGMA_WAL_MODE)
    except sqlite3.Error as pragma_error:
        connection.close()
        raise AuditLogError(_DATABASE_ERROR.format(detail=pragma_error)) from pragma_error
    return connection
409
+
410
+
411
def _get_current_timestamp() -> str:
    """Return the current UTC time as an ISO 8601 string.

    Returns:
        ISO 8601 formatted timestamp with timezone offset.
    """
    now_utc = datetime.datetime.now(datetime.UTC)
    return now_utc.isoformat()
418
+
419
+
420
def _serialize_findings(findings: tuple[ScanFinding, ...]) -> str:
    """Serialise findings to a JSON string for audit storage.

    Only fields that cannot contain raw PHI are included. ``code_context``
    is deliberately excluded — it stores surrounding source lines that may
    contain the detected value in plaintext. ``file_path`` is stored as a
    SHA-256 hash (``file_path_hash``) — paths can be PHI-revealing (e.g.
    patient_ssn_export.csv) and must not be persisted in plaintext.

    Args:
        findings: The findings tuple from a completed ScanResult.

    Returns:
        A JSON array string safe for storage in the audit database.
    """

    def _to_record(finding: ScanFinding) -> dict[str, Any]:
        path_digest = hashlib.sha256(str(finding.file_path).encode()).hexdigest()
        return {
            "file_path_hash": path_digest,
            "line_number": finding.line_number,
            "entity_type": finding.entity_type,
            "hipaa_category": finding.hipaa_category.value,
            "confidence": finding.confidence,
            "detection_layer": finding.detection_layer,
            "value_hash": finding.value_hash,
            "severity": finding.severity.value,
            "remediation_hint": finding.remediation_hint,
        }

    return json.dumps([_to_record(finding) for finding in findings])
450
+
451
+
452
def _get_current_branch() -> str:
    """Return the current git branch name, or 'unknown' if unavailable.

    Returns:
        The branch name string, or _UNKNOWN_BRANCH on any failure.
    """
    try:
        result = subprocess.run(
            _GIT_BRANCH_ARGS,
            capture_output=True,
            text=True,
            timeout=_GIT_SUBPROCESS_TIMEOUT_SECONDS,
        )
    except (OSError, subprocess.TimeoutExpired) as git_error:
        # Log only the error type — branch names can embed PHI (e.g. feature/patient-john-doe).
        _logger.warning("Could not determine git branch: %s", type(git_error).__name__)
        return _UNKNOWN_BRANCH
    if result.returncode != 0:
        return _UNKNOWN_BRANCH
    branch = result.stdout.strip()
    return branch or _UNKNOWN_BRANCH
472
+
473
+
474
def _get_current_repository_path() -> str:
    """Return the git repository root path, or the current directory if unavailable.

    Returns:
        Absolute path string of the repository root or CWD.
    """
    try:
        result = subprocess.run(
            _GIT_TOPLEVEL_ARGS,
            capture_output=True,
            text=True,
            timeout=_GIT_SUBPROCESS_TIMEOUT_SECONDS,
        )
        if result.returncode == 0:
            return result.stdout.strip()
    except (OSError, subprocess.TimeoutExpired) as git_error:
        # Log only the error type — repository paths can embed PHI (e.g. /home/patient_records/).
        _logger.warning("Could not determine git repository path: %s", type(git_error).__name__)
    # Fallback: the working directory. Path.cwd() follows symlinks on most
    # platforms, but the returned path is SHA-256 hashed before storage, so
    # no plaintext PHI is persisted even if a symlinked CWD returns an
    # attacker-influenced path.
    return str(Path.cwd())