sdsa 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdsa-1.1.0/PKG-INFO +20 -0
- sdsa-1.1.0/pyproject.toml +50 -0
- sdsa-1.1.0/setup.cfg +4 -0
- sdsa-1.1.0/src/sdsa/__init__.py +1 -0
- sdsa-1.1.0/src/sdsa/anonymize/__init__.py +0 -0
- sdsa-1.1.0/src/sdsa/anonymize/policy.py +92 -0
- sdsa-1.1.0/src/sdsa/anonymize/primitives.py +195 -0
- sdsa-1.1.0/src/sdsa/api/__init__.py +0 -0
- sdsa-1.1.0/src/sdsa/api/routes.py +352 -0
- sdsa-1.1.0/src/sdsa/cli.py +82 -0
- sdsa-1.1.0/src/sdsa/core/__init__.py +0 -0
- sdsa-1.1.0/src/sdsa/core/config.py +121 -0
- sdsa-1.1.0/src/sdsa/core/logging.py +63 -0
- sdsa-1.1.0/src/sdsa/core/session.py +182 -0
- sdsa-1.1.0/src/sdsa/detect/__init__.py +0 -0
- sdsa-1.1.0/src/sdsa/detect/pii.py +191 -0
- sdsa-1.1.0/src/sdsa/detect/schema.py +58 -0
- sdsa-1.1.0/src/sdsa/dp/__init__.py +0 -0
- sdsa-1.1.0/src/sdsa/dp/accountant.py +25 -0
- sdsa-1.1.0/src/sdsa/dp/laplace.py +88 -0
- sdsa-1.1.0/src/sdsa/frontend/app.js +1118 -0
- sdsa-1.1.0/src/sdsa/frontend/index.html +369 -0
- sdsa-1.1.0/src/sdsa/frontend/style.css +1153 -0
- sdsa-1.1.0/src/sdsa/ingest.py +389 -0
- sdsa-1.1.0/src/sdsa/kanon/__init__.py +0 -0
- sdsa-1.1.0/src/sdsa/kanon/enforce.py +77 -0
- sdsa-1.1.0/src/sdsa/main.py +76 -0
- sdsa-1.1.0/src/sdsa/pipeline.py +263 -0
- sdsa-1.1.0/src/sdsa/policy_config.py +148 -0
- sdsa-1.1.0/src/sdsa/preflight.py +279 -0
- sdsa-1.1.0/src/sdsa/report.py +103 -0
- sdsa-1.1.0/src/sdsa/validate/__init__.py +0 -0
- sdsa-1.1.0/src/sdsa/validate/metrics.py +144 -0
- sdsa-1.1.0/src/sdsa.egg-info/PKG-INFO +20 -0
- sdsa-1.1.0/src/sdsa.egg-info/SOURCES.txt +50 -0
- sdsa-1.1.0/src/sdsa.egg-info/dependency_links.txt +1 -0
- sdsa-1.1.0/src/sdsa.egg-info/entry_points.txt +2 -0
- sdsa-1.1.0/src/sdsa.egg-info/requires.txt +16 -0
- sdsa-1.1.0/src/sdsa.egg-info/top_level.txt +1 -0
- sdsa-1.1.0/tests/test_anonymize.py +140 -0
- sdsa-1.1.0/tests/test_api.py +522 -0
- sdsa-1.1.0/tests/test_cli.py +71 -0
- sdsa-1.1.0/tests/test_core.py +144 -0
- sdsa-1.1.0/tests/test_deployment.py +92 -0
- sdsa-1.1.0/tests/test_detect.py +77 -0
- sdsa-1.1.0/tests/test_dp.py +118 -0
- sdsa-1.1.0/tests/test_ingest.py +206 -0
- sdsa-1.1.0/tests/test_kanon.py +74 -0
- sdsa-1.1.0/tests/test_pipeline.py +254 -0
- sdsa-1.1.0/tests/test_policy_config.py +50 -0
- sdsa-1.1.0/tests/test_preflight.py +136 -0
- sdsa-1.1.0/tests/test_validate.py +19 -0
sdsa-1.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sdsa
|
|
3
|
+
Version: 1.1.0
|
|
4
|
+
Summary: Secure Data Sanitization App backend
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: fastapi>=0.115
|
|
7
|
+
Requires-Dist: uvicorn[standard]>=0.30
|
|
8
|
+
Requires-Dist: python-multipart>=0.0.9
|
|
9
|
+
Requires-Dist: polars>=1.0
|
|
10
|
+
Requires-Dist: pyarrow>=16
|
|
11
|
+
Requires-Dist: opendp>=0.11
|
|
12
|
+
Requires-Dist: phonenumbers>=8.13
|
|
13
|
+
Requires-Dist: chardet>=5.2
|
|
14
|
+
Requires-Dist: pydantic>=2.8
|
|
15
|
+
Requires-Dist: python-dateutil>=2.9
|
|
16
|
+
Provides-Extra: dev
|
|
17
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
18
|
+
Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
|
|
19
|
+
Requires-Dist: httpx>=0.27; extra == "dev"
|
|
20
|
+
Requires-Dist: ruff>=0.6; extra == "dev"
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "sdsa"
|
|
3
|
+
version = "1.1.0"
|
|
4
|
+
description = "Secure Data Sanitization App backend"
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
dependencies = [
|
|
7
|
+
"fastapi>=0.115",
|
|
8
|
+
"uvicorn[standard]>=0.30",
|
|
9
|
+
"python-multipart>=0.0.9",
|
|
10
|
+
"polars>=1.0",
|
|
11
|
+
"pyarrow>=16",
|
|
12
|
+
"opendp>=0.11",
|
|
13
|
+
"phonenumbers>=8.13",
|
|
14
|
+
"chardet>=5.2",
|
|
15
|
+
"pydantic>=2.8",
|
|
16
|
+
"python-dateutil>=2.9",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
[project.scripts]
|
|
20
|
+
sdsa-server = "sdsa.cli:main"
|
|
21
|
+
|
|
22
|
+
[project.optional-dependencies]
|
|
23
|
+
dev = [
|
|
24
|
+
"pytest>=8.0",
|
|
25
|
+
"pytest-asyncio>=0.24",
|
|
26
|
+
"httpx>=0.27",
|
|
27
|
+
"ruff>=0.6",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[build-system]
|
|
31
|
+
requires = ["setuptools>=68"]
|
|
32
|
+
build-backend = "setuptools.build_meta"
|
|
33
|
+
|
|
34
|
+
[tool.setuptools.packages.find]
|
|
35
|
+
where = ["src"]
|
|
36
|
+
|
|
37
|
+
[tool.setuptools.package-data]
|
|
38
|
+
sdsa = ["frontend/*.html", "frontend/*.css", "frontend/*.js"]
|
|
39
|
+
|
|
40
|
+
[tool.pytest.ini_options]
|
|
41
|
+
pythonpath = ["src"]
|
|
42
|
+
testpaths = ["tests"]
|
|
43
|
+
asyncio_mode = "auto"
|
|
44
|
+
filterwarnings = [
|
|
45
|
+
"ignore:Field name \"schema\".*:UserWarning",
|
|
46
|
+
]
|
|
47
|
+
|
|
48
|
+
[tool.ruff]
|
|
49
|
+
line-length = 100
|
|
50
|
+
target-version = "py311"
|
sdsa-1.1.0/setup.cfg
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Package version string (same value as the 1.1.0 declared in the project metadata).
__version__ = "1.1.0"
|
|
File without changes
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""Column policy model + apply function.
|
|
2
|
+
|
|
3
|
+
A ColumnPolicy describes how one column should be transformed. The pipeline
|
|
4
|
+
iterates over policies and invokes the matching primitive or DP mechanism.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import Any, Literal
|
|
9
|
+
|
|
10
|
+
import polars as pl
|
|
11
|
+
from pydantic import BaseModel, Field, field_validator
|
|
12
|
+
|
|
13
|
+
from . import primitives as prim
|
|
14
|
+
|
|
15
|
+
# Closed set of action names a column policy may request. The grouping
# comments are for readers only; declaration order is unchanged.
Action = Literal[
    # leave the column as-is
    "retain",
    # direct-identifier transforms
    "mask",
    "hash",
    "tokenize",
    "redact",
    # generalization transforms
    "numeric_bin",
    "date_truncate",
    "string_truncate",
    # handled by the separate DP pass
    "dp_laplace",
    # remove the column entirely
    "drop",
]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class PolicyApplicationError(ValueError):
    """Raised when a column policy cannot be applied (unknown action or bad params)."""
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ColumnPolicy(BaseModel):
    # Describes how one column should be transformed.
    #   column              - target column name; must be non-empty
    #   action              - one of the `Action` literals
    #   params              - action-specific options (e.g. `bin_width`)
    #   is_quasi_identifier - flag defaulting to False; not read in this module
    column: str = Field(min_length=1)
    action: Action
    params: dict[str, Any] = Field(default_factory=dict)
    is_quasi_identifier: bool = False

    # For dp_laplace: caller must supply `epsilon` and column bounds `lower`/`upper`.

    @field_validator("column")
    @classmethod
    def validate_column(cls, value: str) -> str:
        """Reject names containing line breaks or NUL, and names over 200 chars."""
        forbidden = ("\n", "\r", "\x00")
        if any(ch in value for ch in forbidden):
            raise ValueError("column names must not contain newlines or null bytes")
        if len(value) > 200:
            raise ValueError("column names must not exceed 200 characters")
        return value
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def apply_policy(df: pl.DataFrame, policy: ColumnPolicy, hmac_key: bytes) -> pl.DataFrame:
|
|
46
|
+
"""Apply a single non-DP policy to a DataFrame. DP is applied separately."""
|
|
47
|
+
col = policy.column
|
|
48
|
+
if col not in df.columns:
|
|
49
|
+
return df
|
|
50
|
+
s = df[col]
|
|
51
|
+
action = policy.action
|
|
52
|
+
p = policy.params
|
|
53
|
+
|
|
54
|
+
try:
|
|
55
|
+
if action == "retain":
|
|
56
|
+
return df
|
|
57
|
+
if action == "drop":
|
|
58
|
+
return df.drop(col)
|
|
59
|
+
if action == "mask":
|
|
60
|
+
out = prim.mask(s, keep_prefix=p.get("keep_prefix", 0),
|
|
61
|
+
keep_suffix=p.get("keep_suffix", 0),
|
|
62
|
+
mask_char=p.get("mask_char", "*"))
|
|
63
|
+
elif action == "hash":
|
|
64
|
+
out = prim.hmac_hash(s, hmac_key)
|
|
65
|
+
elif action == "tokenize":
|
|
66
|
+
out = prim.tokenize(s, hmac_key, prefix=p.get("prefix", "tok_"))
|
|
67
|
+
elif action == "redact":
|
|
68
|
+
out = prim.redact(s, replacement=p.get("replacement", "[REDACTED]"))
|
|
69
|
+
elif action == "numeric_bin":
|
|
70
|
+
if "bin_width" not in p:
|
|
71
|
+
raise PolicyApplicationError(
|
|
72
|
+
f"column '{col}' with action 'numeric_bin' requires param 'bin_width'"
|
|
73
|
+
)
|
|
74
|
+
out = prim.numeric_bin(s, bin_width=float(p["bin_width"]))
|
|
75
|
+
elif action == "date_truncate":
|
|
76
|
+
out = prim.date_truncate(s, granularity=p.get("granularity", "month"))
|
|
77
|
+
elif action == "string_truncate":
|
|
78
|
+
out = prim.string_truncate(s, keep=int(p.get("keep", 3)),
|
|
79
|
+
pad_char=p.get("pad_char", "*"))
|
|
80
|
+
elif action == "dp_laplace":
|
|
81
|
+
# Applied by the DP pass, not here.
|
|
82
|
+
return df
|
|
83
|
+
else:
|
|
84
|
+
raise PolicyApplicationError(f"unknown action {action}")
|
|
85
|
+
except PolicyApplicationError:
|
|
86
|
+
raise
|
|
87
|
+
except (KeyError, TypeError, ValueError) as e:
|
|
88
|
+
raise PolicyApplicationError(
|
|
89
|
+
f"invalid params for column '{col}' action '{action}': {e}"
|
|
90
|
+
) from e
|
|
91
|
+
|
|
92
|
+
return df.with_columns(out.alias(col))
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
"""Per-column anonymization primitives.
|
|
2
|
+
|
|
3
|
+
Each function takes a Polars Series and returns a transformed Series.
|
|
4
|
+
Row count is preserved except for suppression (done by the k-anonymity step).
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from decimal import Decimal, ROUND_FLOOR
|
|
9
|
+
import hashlib
|
|
10
|
+
import hmac
|
|
11
|
+
import math
|
|
12
|
+
import secrets
|
|
13
|
+
from datetime import date, datetime
|
|
14
|
+
|
|
15
|
+
import polars as pl
|
|
16
|
+
|
|
17
|
+
# --- direct-identifier primitives --------------------------------------------
|
|
18
|
+
|
|
19
|
+
def mask(series: pl.Series, keep_prefix: int = 0, keep_suffix: int = 0,
         mask_char: str = "*") -> pl.Series:
    """Mask each value, optionally leaving a visible prefix and/or suffix.

    Nulls and empty strings pass through. For any non-empty value at least one
    character is always masked: when keep_prefix + keep_suffix would cover the
    whole value, both are scaled down proportionally (rounding down) so a
    short value such as "hi" with keep_prefix=5 cannot leak unchanged.

    Raises:
        ValueError: if keep_prefix or keep_suffix is negative, or mask_char is empty.
    """
    if keep_prefix < 0 or keep_suffix < 0:
        raise ValueError("keep_prefix and keep_suffix must be >= 0")
    if not mask_char:
        raise ValueError("mask_char must be a non-empty string")

    def _mask_one(value):
        if value is None:
            return None
        text = str(value)
        length = len(text)
        if length == 0:
            return text

        head, tail = keep_prefix, keep_suffix
        requested = head + tail
        if requested >= length:
            # Privacy invariant: shrink so that head + tail <= length - 1.
            if requested > 0:
                ratio = max(length - 1, 0) / requested
                head = int(head * ratio)
                tail = int(tail * ratio)
            else:
                head = tail = 0
        # Defensive clamps so the slices below can never overlap.
        head = min(head, length)
        tail = min(tail, max(length - head, 0))

        pieces = [text[:head], mask_char * (length - head - tail)]
        if tail:
            pieces.append(text[length - tail:])
        return "".join(pieces)

    return series.map_elements(_mask_one, return_dtype=pl.Utf8)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def hmac_hash(series: pl.Series, key: bytes) -> pl.Series:
    """Replace each non-null value with the first 16 hex chars of its HMAC-SHA256.

    The digest is keyed with ``key``, so precomputed (rainbow) tables do not
    apply. Values are stringified and UTF-8 encoded first; nulls stay null.
    """
    def _digest_prefix(value):
        if value is None:
            return None
        mac = hmac.new(key, digestmod=hashlib.sha256)
        mac.update(str(value).encode("utf-8"))
        return mac.hexdigest()[:16]

    return series.map_elements(_digest_prefix, return_dtype=pl.Utf8)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def tokenize(series: pl.Series, key: bytes, prefix: str = "tok_") -> pl.Series:
    """Map each non-null value to ``prefix`` + 12 hex chars of its HMAC-SHA256.

    The same value and key always yield the same token, so tokens are stable
    for as long as the key is reused. Nulls stay null.
    """
    def _token_for(value):
        if value is None:
            return None
        payload = str(value).encode("utf-8")
        hex_digest = hmac.new(key, payload, hashlib.sha256).hexdigest()
        return prefix + hex_digest[:12]

    return series.map_elements(_token_for, return_dtype=pl.Utf8)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def redact(series: pl.Series, replacement: str = "[REDACTED]") -> pl.Series:
    """Return a string Series where every non-null value is ``replacement``; nulls stay null."""
    values = [None if item is None else replacement for item in series]
    return pl.Series(series.name, values, dtype=pl.Utf8)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# --- generalization primitives -----------------------------------------------
|
|
87
|
+
|
|
88
|
+
def numeric_bin(series: pl.Series, bin_width: float) -> pl.Series:
    """Equal-width binning: value → the label "[lo, lo+width)".

    Arithmetic is done with ``Decimal`` so bin edges print exactly
    (no 0.30000000000000004-style artefacts). Nulls and non-finite values
    (NaN / ±inf) become null.

    Raises:
        ValueError: if ``bin_width`` is not a finite number > 0, or if a
            non-null value cannot be converted to a number.
    """
    # Local import: only Decimal and ROUND_FLOOR are imported at module level.
    from decimal import InvalidOperation

    if bin_width <= 0:
        raise ValueError("bin_width must be > 0")
    step = Decimal(str(bin_width))
    if not step.is_finite():
        # NaN slips past the `<= 0` test and would label every row
        # "[NaN, NaN)"; +inf would fail later with decimal.InvalidOperation.
        raise ValueError("bin_width must be a finite number")

    def _fmt_decimal(value: Decimal) -> str:
        if value.is_zero():
            # Collapse negative zero so -0.0 and 0.0 share one bin label.
            return "0"
        normalized = format(value.normalize(), "f")
        if "." in normalized:
            normalized = normalized.rstrip("0").rstrip(".")
        return normalized or "0"

    def _bin(v):
        if v is None:
            return None
        try:
            fv = float(v)
        except (TypeError, ValueError):
            raise ValueError(
                f"numeric_bin cannot convert a value in column '{series.name}' to float; "
                "column must contain numeric data"
            ) from None
        if not math.isfinite(fv):
            return None
        try:
            dec_value = Decimal(str(v))
        except InvalidOperation:
            # float() accepted the value but its str() is not a decimal
            # literal (e.g. bool True → "True"); use the float's repr rather
            # than letting a non-ValueError escape.
            dec_value = Decimal(repr(fv))
        bucket = (dec_value / step).to_integral_value(rounding=ROUND_FLOOR)
        lo = bucket * step
        hi = lo + step
        return f"[{_fmt_decimal(lo)}, {_fmt_decimal(hi)})"

    return series.map_elements(_bin, return_dtype=pl.Utf8)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def date_truncate(series: pl.Series, granularity: str = "month") -> pl.Series:
    """Truncate dates/datetimes to year / month / day resolution.

    Date and Datetime columns are formatted natively by Polars. ``pl.Time``
    columns are rejected. Any other dtype (e.g. strings Polars could not
    auto-parse because of a non-ISO format) gets a best-effort per-value parse
    with dateutil; if that fails for any non-null value we raise rather than
    pass the original value through, which would leak full-resolution dates.

    Raises:
        ValueError: for an unsupported ``granularity``, a ``pl.Time`` column,
            or a non-null value that cannot be parsed as a date. The message
            never contains the offending value.
    """
    if granularity not in ("year", "month", "day"):
        raise ValueError("granularity must be year/month/day")

    if series.dtype == pl.Time:
        raise ValueError(
            f"date_truncate does not support pl.Time columns ('{series.name}'); "
            "use a Date or Datetime column"
        )
    if series.dtype in (pl.Date, pl.Datetime):
        fmt = {"year": "%Y", "month": "%Y-%m", "day": "%F"}[granularity]
        return series.dt.strftime(fmt).alias(series.name)

    from dateutil import parser as _date_parser

    def _t(v):
        if v is None:
            return None
        if isinstance(v, datetime):
            # Checked before `date` because datetime is a subclass of date.
            d = v.date()
        elif isinstance(v, date):
            d = v
        else:
            # Best-effort string parse. We intentionally raise on failure —
            # silently stringifying the value would leak it.
            try:
                d = _date_parser.parse(str(v)).date()
            except (ValueError, TypeError, OverflowError) as e:
                # dateutil's error text embeds the raw input string, so only
                # the exception class name is reported, and `from None` keeps
                # the original message out of the printed traceback chain.
                raise ValueError(
                    f"date_truncate cannot parse a value in column '{series.name}' as a date "
                    f"(row value withheld for privacy): {type(e).__name__}"
                ) from None
        if granularity == "year":
            return f"{d.year:04d}"
        if granularity == "month":
            return f"{d.year:04d}-{d.month:02d}"
        return d.isoformat()

    return series.map_elements(_t, return_dtype=pl.Utf8)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def string_truncate(series: pl.Series, keep: int = 3, pad_char: str = "*") -> pl.Series:
    """Keep the first ``keep`` chars and pad the remainder (e.g. ZIP 12345 → 123**).

    Output length equals input length. At least one character of every
    non-empty value is replaced, even when keep >= len(value); otherwise
    short values such as two-letter state codes would pass through unmasked.
    Nulls and empty strings are returned as-is.

    Raises:
        ValueError: if keep is negative or pad_char is not exactly one character.
    """
    if keep < 0:
        raise ValueError("keep must be >= 0")
    if len(pad_char) != 1:
        raise ValueError("pad_char must be a single character")

    def _truncate(value):
        if value is None:
            return None
        text = str(value)
        if not text:
            return text
        size = len(text)
        # Never keep the whole value: cap at size - 1 visible characters.
        visible = min(keep, max(size - 1, 0))
        return text[:visible].ljust(size, pad_char)

    return series.map_elements(_truncate, return_dtype=pl.Utf8)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
# --- utility -----------------------------------------------------------------
|
|
193
|
+
|
|
194
|
+
def new_session_key() -> bytes:
    """Return a fresh 32-byte random key drawn from the ``secrets`` module."""
    key_length = 32
    return secrets.token_bytes(key_length)
|
|
File without changes
|