phi-scan 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phi_scan/__init__.py +4 -0
- phi_scan/audit.py +495 -0
- phi_scan/baseline.py +650 -0
- phi_scan/cache.py +499 -0
- phi_scan/cli.py +1722 -0
- phi_scan/compliance.py +1 -0
- phi_scan/config.py +415 -0
- phi_scan/constants.py +700 -0
- phi_scan/detection_coordinator.py +491 -0
- phi_scan/diff.py +213 -0
- phi_scan/exceptions.py +135 -0
- phi_scan/fhir_recognizer.py +296 -0
- phi_scan/fixer.py +757 -0
- phi_scan/hashing.py +80 -0
- phi_scan/help_text.py +364 -0
- phi_scan/hl7_scanner.py +284 -0
- phi_scan/logging_config.py +186 -0
- phi_scan/models.py +525 -0
- phi_scan/nlp_detector.py +336 -0
- phi_scan/notifier.py +1 -0
- phi_scan/output.py +2322 -0
- phi_scan/plugin_api.py +1 -0
- phi_scan/py.typed +0 -0
- phi_scan/regex_detector.py +1334 -0
- phi_scan/report.py +1 -0
- phi_scan/scanner.py +868 -0
- phi_scan/suppression.py +203 -0
- phi_scan-0.3.0.dist-info/METADATA +158 -0
- phi_scan-0.3.0.dist-info/RECORD +32 -0
- phi_scan-0.3.0.dist-info/WHEEL +4 -0
- phi_scan-0.3.0.dist-info/entry_points.txt +2 -0
- phi_scan-0.3.0.dist-info/licenses/LICENSE +21 -0
phi_scan/__init__.py
ADDED
phi_scan/audit.py
ADDED
|
@@ -0,0 +1,495 @@
|
|
|
1
|
+
"""SQLite audit logging — HIPAA-compliant immutable scan event storage.
|
|
2
|
+
|
|
3
|
+
Audit records are INSERT-only. No UPDATE or DELETE operations are ever issued.
|
|
4
|
+
HIPAA (45 CFR §164.530(j)) requires audit logs to be retained for a minimum of
|
|
5
|
+
six years. Corrections are new INSERT rows referencing the original entry —
|
|
6
|
+
never modifications to existing rows.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import datetime
|
|
12
|
+
import hashlib
|
|
13
|
+
import json
|
|
14
|
+
import logging
|
|
15
|
+
import sqlite3
|
|
16
|
+
import subprocess
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
from phi_scan import __version__
|
|
21
|
+
from phi_scan.constants import AUDIT_SCHEMA_VERSION
|
|
22
|
+
from phi_scan.exceptions import AuditLogError, SchemaMigrationError
|
|
23
|
+
from phi_scan.logging_config import get_logger
|
|
24
|
+
from phi_scan.models import ScanFinding, ScanResult
|
|
25
|
+
|
|
26
|
+
__all__ = [
    "create_audit_schema",
    "get_last_scan",
    "get_schema_version",
    "insert_scan_event",
    "migrate_schema",
    "query_recent_scans",
]

# Module logger, namespaced under the package logging hierarchy.
_logger: logging.Logger = get_logger("audit")

# ---------------------------------------------------------------------------
# Log and error message templates
# ---------------------------------------------------------------------------

# str.format templates — placeholders are filled at raise/log time.
_SYMLINK_DATABASE_PATH_ERROR: str = (
    "Audit database path {path!r} is a symlink — symlinks are prohibited "
    "to prevent log-redirection attacks"
)
_SCHEMA_DOWNGRADE_ERROR: str = (
    "Cannot downgrade audit schema from version {from_version} to {to_version}"
)
_UNKNOWN_MIGRATION_ERROR: str = (
    "No migration path exists from schema version {from_version} "
    "to {to_version} — add the SQL to _MIGRATIONS"
)
_SCHEMA_VERSION_MISSING_ERROR: str = "schema_meta table exists but the schema_version key is absent"
_DATABASE_ERROR: str = "Audit database operation failed: {detail}"

# ---------------------------------------------------------------------------
# Implementation constants
# ---------------------------------------------------------------------------

_SCAN_EVENTS_TABLE: str = "scan_events"  # append-only audit event rows
_SCHEMA_META_TABLE: str = "schema_meta"  # key/value schema metadata
_SCHEMA_VERSION_KEY: str = "schema_version"
_CREATED_AT_KEY: str = "created_at"
_UNKNOWN_REPOSITORY: str = "unknown"
_UNKNOWN_BRANCH: str = "unknown"
# SQLite has no BOOLEAN type; is_clean is stored as 0/1 integers.
_BOOLEAN_TRUE: int = 1
_BOOLEAN_FALSE: int = 0
_PRAGMA_WAL_MODE: str = "PRAGMA journal_mode=WAL"
_LAST_SCAN_LIMIT: int = 1
_GIT_SUBPROCESS_TIMEOUT_SECONDS: int = 5
# Git args are fully hardcoded tuples — shell=False is implicit (list form),
# no user input is interpolated, so subprocess injection is not possible.
_GIT_BRANCH_ARGS: tuple[str, ...] = ("git", "branch", "--show-current")
_GIT_TOPLEVEL_ARGS: tuple[str, ...] = ("git", "rev-parse", "--show-toplevel")

# SQL DDL — table names are module-level constants, not user input; f-strings are safe.
_CREATE_SCAN_EVENTS_SQL: str = f"""
CREATE TABLE IF NOT EXISTS {_SCAN_EVENTS_TABLE} (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    timestamp TEXT NOT NULL,
    scanner_version TEXT NOT NULL,
    repository_hash TEXT NOT NULL,
    branch_hash TEXT NOT NULL,
    files_scanned INTEGER NOT NULL,
    findings_count INTEGER NOT NULL,
    findings_json TEXT NOT NULL,
    is_clean INTEGER NOT NULL,
    scan_duration REAL NOT NULL
)
"""
_CREATE_SCHEMA_META_SQL: str = f"""
CREATE TABLE IF NOT EXISTS {_SCHEMA_META_TABLE} (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
)
"""
# INSERT OR IGNORE: seeding metadata is idempotent across repeated startups.
_INSERT_META_SQL: str = f"INSERT OR IGNORE INTO {_SCHEMA_META_TABLE} (key, value) VALUES (?, ?)"
_UPSERT_SCHEMA_VERSION_SQL: str = (
    f"INSERT INTO {_SCHEMA_META_TABLE} (key, value) VALUES (?, ?)"
    f" ON CONFLICT(key) DO UPDATE SET value = excluded.value"
)
_INSERT_SCAN_EVENT_SQL: str = f"""
INSERT INTO {_SCAN_EVENTS_TABLE}
    (timestamp, scanner_version, repository_hash, branch_hash,
     files_scanned, findings_count, findings_json, is_clean, scan_duration)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
"""
_SELECT_RECENT_SCANS_SQL: str = (
    f"SELECT * FROM {_SCAN_EVENTS_TABLE} WHERE timestamp >= ? ORDER BY timestamp DESC"
)
_SELECT_LAST_SCAN_SQL: str = (
    f"SELECT * FROM {_SCAN_EVENTS_TABLE} ORDER BY id DESC LIMIT {_LAST_SCAN_LIMIT}"
)
_SELECT_SCHEMA_VERSION_SQL: str = f"SELECT value FROM {_SCHEMA_META_TABLE} WHERE key = ?"
_CREATE_SCAN_EVENTS_TIMESTAMP_INDEX_SQL: str = (
    f"CREATE INDEX IF NOT EXISTS idx_scan_events_timestamp ON {_SCAN_EVENTS_TABLE} (timestamp DESC)"
)

# Migration map: from_version → SQL to advance the schema by one version.
# Add entries here when AUDIT_SCHEMA_VERSION is incremented. Never remove entries
# — they must remain to support upgrading older databases.
_MIGRATIONS: dict[int, str] = {}
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
# ---------------------------------------------------------------------------
|
|
125
|
+
# Public API
|
|
126
|
+
# ---------------------------------------------------------------------------
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def create_audit_schema(database_path: Path) -> None:
    """Initialise the audit schema, doing nothing if it already exists.

    Idempotent — safe to run on every startup. Creates the ``scan_events``
    table (plus its timestamp index) and the ``schema_meta`` table, then seeds
    the ``schema_version`` and ``created_at`` metadata keys via
    ``INSERT OR IGNORE`` so existing values are never overwritten.

    Args:
        database_path: Path to the SQLite audit database file. The parent
            directory is created automatically if it does not exist.

    Raises:
        AuditLogError: If database_path is a symlink, or if the database
            cannot be opened or written to.
    """
    # Capture the timestamp before touching the database so the created_at
    # seed reflects schema-creation time, not commit time.
    seeded_at = _get_current_timestamp()
    db = _open_database(database_path)
    try:
        for ddl_statement in (
            _CREATE_SCAN_EVENTS_SQL,
            _CREATE_SCAN_EVENTS_TIMESTAMP_INDEX_SQL,
            _CREATE_SCHEMA_META_SQL,
        ):
            db.execute(ddl_statement)
        db.executemany(
            _INSERT_META_SQL,
            (
                (_SCHEMA_VERSION_KEY, str(AUDIT_SCHEMA_VERSION)),
                (_CREATED_AT_KEY, seeded_at),
            ),
        )
        db.commit()
    except sqlite3.Error as db_error:
        db.rollback()
        raise AuditLogError(_DATABASE_ERROR.format(detail=db_error)) from db_error
    finally:
        db.close()
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _assemble_scan_event_row(scan_result: ScanResult) -> tuple[str | int | float, ...]:
    """Build the parameter tuple for ``_INSERT_SCAN_EVENT_SQL``.

    Repository path and branch name are SHA-256 hashed before inclusion —
    only digests are persisted, never the plaintext values.

    Args:
        scan_result: The completed scan result to flatten into a row.

    Returns:
        Values in the column order expected by the insert statement.
    """

    def _digest(text: str) -> str:
        # Hex SHA-256 of the UTF-8 encoding; matches the column's hash format.
        return hashlib.sha256(text.encode()).hexdigest()

    return (
        _get_current_timestamp(),
        __version__,
        _digest(_get_current_repository_path()),
        _digest(_get_current_branch()),
        scan_result.files_scanned,
        len(scan_result.findings),
        _serialize_findings(scan_result.findings),
        _BOOLEAN_TRUE if scan_result.is_clean else _BOOLEAN_FALSE,
        scan_result.scan_duration,
    )
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def insert_scan_event(database_path: Path, scan_result: ScanResult) -> None:
    """Append one immutable audit row recording a completed scan.

    findings_json stores only value_hash and metadata fields — raw detected
    values and code_context (which may contain raw PHI) are never persisted.
    repository_hash, branch_hash, and file_path_hash store SHA-256 digests
    — paths and branch names can be PHI-revealing (e.g. a branch named
    feature/patient-john-doe-ssn-fix or a repo at /home/patient_records).

    Args:
        database_path: Path to the SQLite audit database file.
        scan_result: The completed scan result to record.

    Raises:
        AuditLogError: If the database cannot be written to.
    """
    # Assemble the row first: if git subprocess calls or serialisation fail,
    # no database connection is ever opened.
    event_row = _assemble_scan_event_row(scan_result)
    db = _open_database(database_path)
    try:
        db.execute(_INSERT_SCAN_EVENT_SQL, event_row)
        db.commit()
    except sqlite3.Error as db_error:
        db.rollback()
        raise AuditLogError(_DATABASE_ERROR.format(detail=db_error)) from db_error
    finally:
        db.close()
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def query_recent_scans(database_path: Path, lookback_days: int) -> list[dict[str, Any]]:
    """Fetch scan events from the last ``lookback_days`` days.

    Args:
        database_path: Path to the SQLite audit database file.
        lookback_days: Number of days back to include in the results.

    Returns:
        List of scan event rows as dicts, ordered by timestamp descending.

    Raises:
        AuditLogError: If the database cannot be read.
    """
    # Cutoff is computed in UTC and compared lexically against the stored
    # ISO 8601 timestamps, which sort chronologically.
    lookback_window = datetime.timedelta(days=lookback_days)
    cutoff_iso = (datetime.datetime.now(datetime.UTC) - lookback_window).isoformat()
    db = _open_database(database_path)
    try:
        event_rows = db.execute(_SELECT_RECENT_SCANS_SQL, (cutoff_iso,)).fetchall()
        return [dict(event_row) for event_row in event_rows]
    except sqlite3.Error as db_error:
        raise AuditLogError(_DATABASE_ERROR.format(detail=db_error)) from db_error
    finally:
        db.close()
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def get_last_scan(database_path: Path) -> dict[str, Any] | None:
    """Fetch the most recent scan event, or None if no scans exist.

    Args:
        database_path: Path to the SQLite audit database file.

    Returns:
        The most recent scan event row as a dict, or None.

    Raises:
        AuditLogError: If the database cannot be read.
    """
    db = _open_database(database_path)
    try:
        # Highest rowid == most recently inserted; the query is LIMITed to 1.
        latest_row = db.execute(_SELECT_LAST_SCAN_SQL).fetchone()
        if latest_row is None:
            return None
        return dict(latest_row)
    except sqlite3.Error as db_error:
        raise AuditLogError(_DATABASE_ERROR.format(detail=db_error)) from db_error
    finally:
        db.close()
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def get_schema_version(database_path: Path) -> int:
    """Read the schema version recorded in the database.

    Args:
        database_path: Path to the SQLite audit database file.

    Returns:
        The integer schema version read from schema_meta.

    Raises:
        AuditLogError: If the database cannot be read or the key is absent.
    """
    db = _open_database(database_path)
    try:
        version_row = db.execute(
            _SELECT_SCHEMA_VERSION_SQL, (_SCHEMA_VERSION_KEY,)
        ).fetchone()
        # A missing key is an AuditLogError, not sqlite3.Error — it propagates
        # straight past the except clause below.
        if version_row is None:
            raise AuditLogError(_SCHEMA_VERSION_MISSING_ERROR)
        return int(version_row[0])
    except sqlite3.Error as db_error:
        raise AuditLogError(_DATABASE_ERROR.format(detail=db_error)) from db_error
    finally:
        db.close()
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def migrate_schema(database_path: Path, from_version: int, to_version: int) -> None:
    """Upgrade the database schema from from_version to to_version.

    Applies sequential migrations from _MIGRATIONS. Each migration step
    advances the version by one. Downgrading is not supported.

    Args:
        database_path: Path to the SQLite audit database file.
        from_version: The current schema version in the database.
        to_version: The target schema version to migrate to.

    Raises:
        SchemaMigrationError: If from_version > to_version, or if no
            migration SQL exists for a required step.
        AuditLogError: If the database cannot be written to.
    """
    # Guard clauses: nothing to do, or an unsupported downgrade request.
    if from_version == to_version:
        return
    if to_version < from_version:
        detail = _SCHEMA_DOWNGRADE_ERROR.format(
            from_version=from_version, to_version=to_version
        )
        raise SchemaMigrationError(detail)
    db = _open_database(database_path)
    try:
        _apply_migration_steps(db, from_version, to_version)
        db.commit()
    except sqlite3.Error as db_error:
        db.rollback()
        raise AuditLogError(_DATABASE_ERROR.format(detail=db_error)) from db_error
    finally:
        db.close()
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
# ---------------------------------------------------------------------------
|
|
312
|
+
# Private helpers
|
|
313
|
+
# ---------------------------------------------------------------------------
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def _apply_migration_steps(
    connection: sqlite3.Connection, from_version: int, to_version: int
) -> None:
    """Run one migration SQL statement per version step, in order.

    Args:
        connection: Open database connection to execute migrations on.
        from_version: The starting schema version.
        to_version: The target schema version.

    Raises:
        SchemaMigrationError: If no migration SQL exists for a required step.
    """
    # Each step N applies _MIGRATIONS[N] and records version N+1 in
    # schema_meta before moving on, so a partial failure is resumable.
    for step_version in range(from_version, to_version):
        migration_sql = _MIGRATIONS.get(step_version)
        if migration_sql is None:
            raise SchemaMigrationError(
                _UNKNOWN_MIGRATION_ERROR.format(
                    from_version=step_version,
                    to_version=step_version + 1,
                )
            )
        connection.execute(migration_sql)
        connection.execute(
            _UPSERT_SCHEMA_VERSION_SQL,
            (_SCHEMA_VERSION_KEY, str(step_version + 1)),
        )
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def _reject_symlink_database_path(database_path: Path) -> None:
    """Refuse to operate on a symlinked audit database path.

    A symlinked database path could allow an attacker to redirect audit log
    writes to an arbitrary location, destroying HIPAA immutability guarantees.

    Args:
        database_path: The path to validate.

    Raises:
        AuditLogError: If database_path is a symlink.
    """
    if not database_path.is_symlink():
        return
    raise AuditLogError(_SYMLINK_DATABASE_PATH_ERROR.format(path=database_path))
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def _ensure_database_parent_exists(database_path: Path) -> None:
    """Make sure the directory that will hold the database file exists.

    Args:
        database_path: Path to the SQLite file whose parent must exist.

    Raises:
        AuditLogError: If the parent directory cannot be created.
    """
    try:
        # exist_ok keeps this idempotent; parents handles nested paths.
        database_path.parent.mkdir(parents=True, exist_ok=True)
    except OSError as mkdir_error:
        raise AuditLogError(_DATABASE_ERROR.format(detail=mkdir_error)) from mkdir_error
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def _open_database(database_path: Path) -> sqlite3.Connection:
    """Open the audit database, configured with Row access and WAL journaling.

    Args:
        database_path: Path to the SQLite file to open or create.

    Returns:
        An open sqlite3.Connection with row_factory and WAL mode configured.

    Raises:
        AuditLogError: If the path is a symlink, the parent directory cannot
            be created, or the database cannot be opened or configured.
    """
    # Security: a TOCTOU race exists between the is_symlink() check and
    # sqlite3.connect() — an attacker with filesystem write access could swap
    # the path for a symlink inside that window. The complete fix is opening
    # a file descriptor with O_NOFOLLOW (Linux/macOS) and handing the fd to
    # SQLite, which eliminates the window entirely; Windows lacks
    # os.O_NOFOLLOW, so that fix needs a platform branch or a ctypes shim.
    # Residual risk is low in the intended CI/CD context where the audit
    # database directory is not world-writable. Tracked for Phase 5 hardening.
    _reject_symlink_database_path(database_path)
    _ensure_database_parent_exists(database_path)
    try:
        audit_connection = sqlite3.connect(str(database_path))
    except sqlite3.Error as open_error:
        raise AuditLogError(_DATABASE_ERROR.format(detail=open_error)) from open_error
    # Configuration happens after a successful connect; on failure the
    # half-configured connection must be closed before re-raising.
    try:
        audit_connection.row_factory = sqlite3.Row
        audit_connection.execute(_PRAGMA_WAL_MODE)
    except sqlite3.Error as config_error:
        audit_connection.close()
        raise AuditLogError(_DATABASE_ERROR.format(detail=config_error)) from config_error
    return audit_connection
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
def _get_current_timestamp() -> str:
|
|
412
|
+
"""Return the current UTC time as an ISO 8601 string.
|
|
413
|
+
|
|
414
|
+
Returns:
|
|
415
|
+
ISO 8601 formatted timestamp with timezone offset.
|
|
416
|
+
"""
|
|
417
|
+
return datetime.datetime.now(datetime.UTC).isoformat()
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def _serialize_findings(findings: tuple[ScanFinding, ...]) -> str:
|
|
421
|
+
"""Serialise findings to a JSON string for audit storage.
|
|
422
|
+
|
|
423
|
+
Only fields that cannot contain raw PHI are included. ``code_context``
|
|
424
|
+
is deliberately excluded — it stores surrounding source lines that may
|
|
425
|
+
contain the detected value in plaintext. ``file_path`` is stored as a
|
|
426
|
+
SHA-256 hash (``file_path_hash``) — paths can be PHI-revealing (e.g.
|
|
427
|
+
patient_ssn_export.csv) and must not be persisted in plaintext.
|
|
428
|
+
|
|
429
|
+
Args:
|
|
430
|
+
findings: The findings tuple from a completed ScanResult.
|
|
431
|
+
|
|
432
|
+
Returns:
|
|
433
|
+
A JSON array string safe for storage in the audit database.
|
|
434
|
+
"""
|
|
435
|
+
serialized_findings = [
|
|
436
|
+
{
|
|
437
|
+
"file_path_hash": hashlib.sha256(str(finding.file_path).encode()).hexdigest(),
|
|
438
|
+
"line_number": finding.line_number,
|
|
439
|
+
"entity_type": finding.entity_type,
|
|
440
|
+
"hipaa_category": finding.hipaa_category.value,
|
|
441
|
+
"confidence": finding.confidence,
|
|
442
|
+
"detection_layer": finding.detection_layer,
|
|
443
|
+
"value_hash": finding.value_hash,
|
|
444
|
+
"severity": finding.severity.value,
|
|
445
|
+
"remediation_hint": finding.remediation_hint,
|
|
446
|
+
}
|
|
447
|
+
for finding in findings
|
|
448
|
+
]
|
|
449
|
+
return json.dumps(serialized_findings)
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
def _get_current_branch() -> str:
    """Return the checked-out git branch name, or 'unknown' if unavailable.

    Returns:
        The branch name string, or _UNKNOWN_BRANCH on any failure.
    """
    try:
        git_result = subprocess.run(
            _GIT_BRANCH_ARGS,
            capture_output=True,
            text=True,
            timeout=_GIT_SUBPROCESS_TIMEOUT_SECONDS,
        )
    except (OSError, subprocess.TimeoutExpired) as git_error:
        # Log only the error type — branch names can embed PHI
        # (e.g. feature/patient-john-doe).
        _logger.warning("Could not determine git branch: %s", type(git_error).__name__)
        return _UNKNOWN_BRANCH
    if git_result.returncode != 0:
        return _UNKNOWN_BRANCH
    # Detached HEAD yields empty stdout; fall back to the sentinel.
    branch_name = git_result.stdout.strip()
    return branch_name or _UNKNOWN_BRANCH
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
def _get_current_repository_path() -> str:
    """Return the git repository root path, or the current directory if unavailable.

    Returns:
        Absolute path string of the repository root or CWD.
    """
    try:
        git_result = subprocess.run(
            _GIT_TOPLEVEL_ARGS,
            capture_output=True,
            text=True,
            timeout=_GIT_SUBPROCESS_TIMEOUT_SECONDS,
        )
    except (OSError, subprocess.TimeoutExpired) as git_error:
        # Log only the error type — repository paths can embed PHI
        # (e.g. /home/patient_records/).
        _logger.warning(
            "Could not determine git repository path: %s", type(git_error).__name__
        )
    else:
        if git_result.returncode == 0:
            return git_result.stdout.strip()
    # Path.cwd() follows symlinks on most platforms. The returned path is
    # SHA-256 hashed before storage, so no plaintext PHI is persisted even
    # if a symlinked CWD returns an attacker-influenced path.
    return str(Path.cwd())
|