sdsa 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. sdsa-1.1.0/PKG-INFO +20 -0
  2. sdsa-1.1.0/pyproject.toml +50 -0
  3. sdsa-1.1.0/setup.cfg +4 -0
  4. sdsa-1.1.0/src/sdsa/__init__.py +1 -0
  5. sdsa-1.1.0/src/sdsa/anonymize/__init__.py +0 -0
  6. sdsa-1.1.0/src/sdsa/anonymize/policy.py +92 -0
  7. sdsa-1.1.0/src/sdsa/anonymize/primitives.py +195 -0
  8. sdsa-1.1.0/src/sdsa/api/__init__.py +0 -0
  9. sdsa-1.1.0/src/sdsa/api/routes.py +352 -0
  10. sdsa-1.1.0/src/sdsa/cli.py +82 -0
  11. sdsa-1.1.0/src/sdsa/core/__init__.py +0 -0
  12. sdsa-1.1.0/src/sdsa/core/config.py +121 -0
  13. sdsa-1.1.0/src/sdsa/core/logging.py +63 -0
  14. sdsa-1.1.0/src/sdsa/core/session.py +182 -0
  15. sdsa-1.1.0/src/sdsa/detect/__init__.py +0 -0
  16. sdsa-1.1.0/src/sdsa/detect/pii.py +191 -0
  17. sdsa-1.1.0/src/sdsa/detect/schema.py +58 -0
  18. sdsa-1.1.0/src/sdsa/dp/__init__.py +0 -0
  19. sdsa-1.1.0/src/sdsa/dp/accountant.py +25 -0
  20. sdsa-1.1.0/src/sdsa/dp/laplace.py +88 -0
  21. sdsa-1.1.0/src/sdsa/frontend/app.js +1118 -0
  22. sdsa-1.1.0/src/sdsa/frontend/index.html +369 -0
  23. sdsa-1.1.0/src/sdsa/frontend/style.css +1153 -0
  24. sdsa-1.1.0/src/sdsa/ingest.py +389 -0
  25. sdsa-1.1.0/src/sdsa/kanon/__init__.py +0 -0
  26. sdsa-1.1.0/src/sdsa/kanon/enforce.py +77 -0
  27. sdsa-1.1.0/src/sdsa/main.py +76 -0
  28. sdsa-1.1.0/src/sdsa/pipeline.py +263 -0
  29. sdsa-1.1.0/src/sdsa/policy_config.py +148 -0
  30. sdsa-1.1.0/src/sdsa/preflight.py +279 -0
  31. sdsa-1.1.0/src/sdsa/report.py +103 -0
  32. sdsa-1.1.0/src/sdsa/validate/__init__.py +0 -0
  33. sdsa-1.1.0/src/sdsa/validate/metrics.py +144 -0
  34. sdsa-1.1.0/src/sdsa.egg-info/PKG-INFO +20 -0
  35. sdsa-1.1.0/src/sdsa.egg-info/SOURCES.txt +50 -0
  36. sdsa-1.1.0/src/sdsa.egg-info/dependency_links.txt +1 -0
  37. sdsa-1.1.0/src/sdsa.egg-info/entry_points.txt +2 -0
  38. sdsa-1.1.0/src/sdsa.egg-info/requires.txt +16 -0
  39. sdsa-1.1.0/src/sdsa.egg-info/top_level.txt +1 -0
  40. sdsa-1.1.0/tests/test_anonymize.py +140 -0
  41. sdsa-1.1.0/tests/test_api.py +522 -0
  42. sdsa-1.1.0/tests/test_cli.py +71 -0
  43. sdsa-1.1.0/tests/test_core.py +144 -0
  44. sdsa-1.1.0/tests/test_deployment.py +92 -0
  45. sdsa-1.1.0/tests/test_detect.py +77 -0
  46. sdsa-1.1.0/tests/test_dp.py +118 -0
  47. sdsa-1.1.0/tests/test_ingest.py +206 -0
  48. sdsa-1.1.0/tests/test_kanon.py +74 -0
  49. sdsa-1.1.0/tests/test_pipeline.py +254 -0
  50. sdsa-1.1.0/tests/test_policy_config.py +50 -0
  51. sdsa-1.1.0/tests/test_preflight.py +136 -0
  52. sdsa-1.1.0/tests/test_validate.py +19 -0
sdsa-1.1.0/PKG-INFO ADDED
@@ -0,0 +1,20 @@
1
+ Metadata-Version: 2.4
2
+ Name: sdsa
3
+ Version: 1.1.0
4
+ Summary: Secure Data Sanitization App backend
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: fastapi>=0.115
7
+ Requires-Dist: uvicorn[standard]>=0.30
8
+ Requires-Dist: python-multipart>=0.0.9
9
+ Requires-Dist: polars>=1.0
10
+ Requires-Dist: pyarrow>=16
11
+ Requires-Dist: opendp>=0.11
12
+ Requires-Dist: phonenumbers>=8.13
13
+ Requires-Dist: chardet>=5.2
14
+ Requires-Dist: pydantic>=2.8
15
+ Requires-Dist: python-dateutil>=2.9
16
+ Provides-Extra: dev
17
+ Requires-Dist: pytest>=8.0; extra == "dev"
18
+ Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
19
+ Requires-Dist: httpx>=0.27; extra == "dev"
20
+ Requires-Dist: ruff>=0.6; extra == "dev"
@@ -0,0 +1,50 @@
1
+ [project]
2
+ name = "sdsa"
3
+ version = "1.1.0"
4
+ description = "Secure Data Sanitization App backend"
5
+ requires-python = ">=3.11"
6
+ dependencies = [
7
+ "fastapi>=0.115",
8
+ "uvicorn[standard]>=0.30",
9
+ "python-multipart>=0.0.9",
10
+ "polars>=1.0",
11
+ "pyarrow>=16",
12
+ "opendp>=0.11",
13
+ "phonenumbers>=8.13",
14
+ "chardet>=5.2",
15
+ "pydantic>=2.8",
16
+ "python-dateutil>=2.9",
17
+ ]
18
+
19
+ [project.scripts]
20
+ sdsa-server = "sdsa.cli:main"
21
+
22
+ [project.optional-dependencies]
23
+ dev = [
24
+ "pytest>=8.0",
25
+ "pytest-asyncio>=0.24",
26
+ "httpx>=0.27",
27
+ "ruff>=0.6",
28
+ ]
29
+
30
+ [build-system]
31
+ requires = ["setuptools>=68"]
32
+ build-backend = "setuptools.build_meta"
33
+
34
+ [tool.setuptools.packages.find]
35
+ where = ["src"]
36
+
37
+ [tool.setuptools.package-data]
38
+ sdsa = ["frontend/*.html", "frontend/*.css", "frontend/*.js"]
39
+
40
+ [tool.pytest.ini_options]
41
+ pythonpath = ["src"]
42
+ testpaths = ["tests"]
43
+ asyncio_mode = "auto"
44
+ filterwarnings = [
45
+ "ignore:Field name \"schema\".*:UserWarning",
46
+ ]
47
+
48
+ [tool.ruff]
49
+ line-length = 100
50
+ target-version = "py311"
sdsa-1.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1 @@
# Package version string; the same value appears as `version` in pyproject.toml.
__version__ = "1.1.0"
File without changes
@@ -0,0 +1,92 @@
1
+ """Column policy model + apply function.
2
+
3
+ A ColumnPolicy describes how one column should be transformed. The pipeline
4
+ iterates over policies and invokes the matching primitive or DP mechanism.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ from typing import Any, Literal
9
+
10
+ import polars as pl
11
+ from pydantic import BaseModel, Field, field_validator
12
+
13
+ from . import primitives as prim
14
+
# Closed set of transformations a ColumnPolicy may request for a column.
Action = Literal[
    "retain",
    "mask",
    "hash",
    "tokenize",
    "redact",
    "numeric_bin",
    "date_truncate",
    "string_truncate",
    "dp_laplace",
    "drop",
]
21
+
22
+
class PolicyApplicationError(ValueError):
    """Raised when a column policy cannot be applied (missing or invalid params)."""
25
+
26
+
class ColumnPolicy(BaseModel):
    # One policy per column: which column, which action, and the
    # action-specific options carried in `params`.
    column: str = Field(min_length=1)
    action: Action
    params: dict[str, Any] = Field(default_factory=dict)
    is_quasi_identifier: bool = False

    # For dp_laplace: caller must supply `epsilon` and column bounds `lower`/`upper`.

    @field_validator("column")
    @classmethod
    def validate_column(cls, value: str) -> str:
        """Reject names containing line breaks or NUL, or longer than 200 chars."""
        forbidden = ("\n", "\r", "\x00")
        if any(ch in value for ch in forbidden):
            raise ValueError("column names must not contain newlines or null bytes")
        if len(value) > 200:
            raise ValueError("column names must not exceed 200 characters")
        return value
43
+
44
+
def apply_policy(df: pl.DataFrame, policy: ColumnPolicy, hmac_key: bytes) -> pl.DataFrame:
    """Apply a single non-DP policy to a DataFrame. DP is applied separately.

    Args:
        df: Frame to transform.
        policy: Target column, action and action-specific ``params``.
        hmac_key: Key passed to the ``hash`` and ``tokenize`` primitives.

    Returns:
        ``df`` itself when the column is absent or the action is ``retain`` or
        ``dp_laplace``; ``df`` without the column for ``drop``; otherwise
        ``df`` with the column replaced by the transformed values.

    Raises:
        PolicyApplicationError: a required param is missing, or building the
            transformed column raised ``KeyError``/``TypeError``/``ValueError``.
    """
    col = policy.column
    if col not in df.columns:
        return df
    s = df[col]
    action = policy.action
    p = policy.params

    try:
        if action == "retain":
            return df
        if action == "drop":
            return df.drop(col)
        if action == "mask":
            # Coerce the counts to int, mirroring how `keep` is handled for
            # string_truncate below. Params are free-form (dict[str, Any]), so
            # a value such as 2.0 or "2" would otherwise reach slicing and
            # comparison code inside the primitive and fail there.
            out = prim.mask(s, keep_prefix=int(p.get("keep_prefix", 0)),
                            keep_suffix=int(p.get("keep_suffix", 0)),
                            mask_char=p.get("mask_char", "*"))
        elif action == "hash":
            out = prim.hmac_hash(s, hmac_key)
        elif action == "tokenize":
            out = prim.tokenize(s, hmac_key, prefix=p.get("prefix", "tok_"))
        elif action == "redact":
            out = prim.redact(s, replacement=p.get("replacement", "[REDACTED]"))
        elif action == "numeric_bin":
            if "bin_width" not in p:
                raise PolicyApplicationError(
                    f"column '{col}' with action 'numeric_bin' requires param 'bin_width'"
                )
            out = prim.numeric_bin(s, bin_width=float(p["bin_width"]))
        elif action == "date_truncate":
            out = prim.date_truncate(s, granularity=p.get("granularity", "month"))
        elif action == "string_truncate":
            out = prim.string_truncate(s, keep=int(p.get("keep", 3)),
                                       pad_char=p.get("pad_char", "*"))
        elif action == "dp_laplace":
            # Applied by the DP pass, not here.
            return df
        else:
            raise PolicyApplicationError(f"unknown action {action}")
    except PolicyApplicationError:
        # Already in the caller-facing form; do not re-wrap.
        raise
    except (KeyError, TypeError, ValueError) as e:
        raise PolicyApplicationError(
            f"invalid params for column '{col}' action '{action}': {e}"
        ) from e

    return df.with_columns(out.alias(col))
@@ -0,0 +1,195 @@
1
+ """Per-column anonymization primitives.
2
+
3
+ Each function takes a Polars Series and returns a transformed Series.
4
+ Row count is preserved except for suppression (done by the k-anonymity step).
5
+ """
6
+ from __future__ import annotations
7
+
8
+ from decimal import Decimal, ROUND_FLOOR
9
+ import hashlib
10
+ import hmac
11
+ import math
12
+ import secrets
13
+ from datetime import date, datetime
14
+
15
+ import polars as pl
16
+
17
+ # --- direct-identifier primitives --------------------------------------------
18
+
def mask(series: pl.Series, keep_prefix: int = 0, keep_suffix: int = 0,
         mask_char: str = "*") -> pl.Series:
    """Mask each value, optionally leaving a visible prefix and/or suffix.

    Every non-empty value gets at least one masked position. When the
    requested prefix + suffix would cover the whole value, both are scaled
    down by the same factor (rounding down) so they cover at most
    ``len(value) - 1`` characters; a short value such as "hi" with
    keep_prefix=5 therefore never passes through unchanged.

    Nulls stay null and empty strings are returned as-is.

    Raises:
        ValueError: if keep_prefix or keep_suffix is negative, or mask_char
            is empty.
    """
    if keep_prefix < 0 or keep_suffix < 0:
        raise ValueError("keep_prefix and keep_suffix must be >= 0")
    if not mask_char:
        raise ValueError("mask_char must be a non-empty string")

    def _mask_one(value):
        if value is None:
            return None
        text = str(value)
        length = len(text)
        if length == 0:
            return text

        head, tail = keep_prefix, keep_suffix
        requested = head + tail
        if requested >= length:
            # length >= 1 here, so requested is positive and the division is
            # safe. Shrink both sides so at least one character is masked.
            ratio = max(length - 1, 0) / requested
            head, tail = int(head * ratio), int(tail * ratio)

        # Final clamp keeps the slice arithmetic within the value.
        head = min(head, length)
        tail = min(tail, max(length - head, 0))

        visible_tail = text[length - tail:] if tail else ""
        hidden = mask_char * (length - head - tail)
        return text[:head] + hidden + visible_tail

    return series.map_elements(_mask_one, return_dtype=pl.Utf8)
59
+
60
+
def hmac_hash(series: pl.Series, key: bytes) -> pl.Series:
    """Replace each value with the first 16 hex chars of its HMAC-SHA256.

    Values are stringified and UTF-8 encoded before hashing; nulls stay null.
    Keying the hash means precomputed (rainbow) tables do not apply.
    """
    def _digest_prefix(value):
        if value is None:
            return None
        payload = str(value).encode("utf-8")
        mac = hmac.new(key, msg=payload, digestmod=hashlib.sha256)
        return mac.hexdigest()[:16]

    return series.map_elements(_digest_prefix, return_dtype=pl.Utf8)
69
+
70
+
def tokenize(series: pl.Series, key: bytes, prefix: str = "tok_") -> pl.Series:
    """Replace each value with ``prefix`` + 12 hex chars of its HMAC-SHA256.

    The same key and value always give the same token, so tokens are stable
    for as long as the key is reused. Nulls stay null.
    """
    def _token_for(value):
        if value is None:
            return None
        raw = hmac.digest(key, str(value).encode("utf-8"), "sha256")
        return f"{prefix}{raw.hex()[:12]}"

    return series.map_elements(_token_for, return_dtype=pl.Utf8)
79
+
80
+
def redact(series: pl.Series, replacement: str = "[REDACTED]") -> pl.Series:
    """Replace every non-null value with ``replacement``; nulls stay null."""
    redacted = [None if value is None else replacement for value in series]
    return pl.Series(series.name, redacted, dtype=pl.Utf8)
84
+
85
+
86
+ # --- generalization primitives -----------------------------------------------
87
+
def numeric_bin(series: pl.Series, bin_width: float) -> pl.Series:
    """Equal-width binning: value → "[lo, lo+width)".

    Bin edges are computed with ``Decimal`` so that widths such as 0.1 give
    exact, readable labels. Nulls and non-finite values (NaN, ±inf) map to
    null.

    Args:
        series: Column to bin; values must be convertible with ``float()``.
        bin_width: Width of every bin; must be finite and > 0.

    Raises:
        ValueError: if ``bin_width`` is not a finite number > 0, or a value
            cannot be converted to float.
    """
    # Function-scope import: only the fallback below needs this name.
    from decimal import InvalidOperation

    if bin_width <= 0:
        raise ValueError("bin_width must be > 0")
    # NaN and +inf both pass the `<= 0` test above. NaN would label every row
    # "[NaN, NaN)" and inf would raise decimal.InvalidOperation per row.
    if not math.isfinite(bin_width):
        raise ValueError("bin_width must be a finite number")
    step = Decimal(str(bin_width))

    def _fmt_decimal(value: Decimal) -> str:
        # Any zero (including the negative zero produced by binning -0.0)
        # is rendered as plain "0".
        if value.is_zero():
            return "0"
        normalized = format(value.normalize(), "f")
        if "." in normalized:
            normalized = normalized.rstrip("0").rstrip(".")
        return normalized

    def _bin(v):
        if v is None:
            return None
        try:
            fv = float(v)
        except (TypeError, ValueError):
            raise ValueError(
                f"numeric_bin cannot convert a value in column '{series.name}' to float; "
                "column must contain numeric data"
            ) from None
        if not math.isfinite(fv):
            return None
        try:
            dec_value = Decimal(str(v))
        except InvalidOperation:
            # float() accepted the value but its text form is not a decimal
            # literal (e.g. bool → "True"); use the float's own repr instead.
            dec_value = Decimal(str(fv))
        bucket = (dec_value / step).to_integral_value(rounding=ROUND_FLOOR)
        lo = bucket * step
        hi = lo + step
        return f"[{_fmt_decimal(lo)}, {_fmt_decimal(hi)})"

    return series.map_elements(_bin, return_dtype=pl.Utf8)
118
+
119
+
def date_truncate(series: pl.Series, granularity: str = "month") -> pl.Series:
    """Truncate dates/datetimes to year / month / day, returned as strings.

    Date and Datetime columns are formatted directly. Time columns are
    rejected. Any other column is handled value by value: date/datetime
    objects are used as-is and everything else is stringified and parsed
    best-effort with dateutil. A non-null value that cannot be parsed raises
    instead of being passed through, because passing it through would leak
    the full-resolution value.

    Raises:
        ValueError: on an unknown granularity, a Time column, or an
            unparseable value.
    """
    if granularity not in ("year", "month", "day"):
        raise ValueError("granularity must be year/month/day")

    dtype = series.dtype
    if dtype == pl.Time:
        raise ValueError(
            f"date_truncate does not support pl.Time columns ('{series.name}'); "
            "use a Date or Datetime column"
        )
    if dtype in (pl.Date, pl.Datetime):
        native_formats = {"year": "%Y", "month": "%Y-%m", "day": "%F"}
        return series.dt.strftime(native_formats[granularity]).alias(series.name)

    from dateutil import parser as _date_parser

    def _to_date(value):
        # datetime is checked first because it is a subclass of date.
        if isinstance(value, datetime):
            return value.date()
        if isinstance(value, date):
            return value
        # Best-effort string parse. Failure is raised on purpose: silently
        # stringifying the value would leak it.
        try:
            return _date_parser.parse(str(value)).date()
        except (ValueError, TypeError, OverflowError) as e:
            raise ValueError(
                f"date_truncate cannot parse a value in column '{series.name}' as a date "
                f"(row value withheld for privacy): {e}"
            ) from e

    def _truncate_one(value):
        if value is None:
            return None
        day = _to_date(value)
        if granularity == "day":
            return day.isoformat()
        year_part = f"{day.year:04d}"
        if granularity == "year":
            return year_part
        return f"{year_part}-{day.month:02d}"

    return series.map_elements(_truncate_one, return_dtype=pl.Utf8)
166
+
167
+
def string_truncate(series: pl.Series, keep: int = 3, pad_char: str = "*") -> pl.Series:
    """Keep the first `keep` chars and pad the rest (e.g., ZIP 12345 → 123**).

    The output has the same length as the input. For any non-empty value at
    least the last character is padded, even when keep >= len(value), so
    short values such as two-letter state codes never pass through intact.
    Nulls stay null and empty strings are returned as-is.

    Raises:
        ValueError: if keep is negative or pad_char is not exactly one
            character.
    """
    if keep < 0:
        raise ValueError("keep must be >= 0")
    if len(pad_char) != 1:
        raise ValueError("pad_char must be a single character")

    def _truncate_one(value):
        if value is None:
            return None
        text = str(value)
        if not text:
            return text
        # Never reveal more than all-but-one character.
        visible = min(keep, len(text) - 1)
        return text[:visible].ljust(len(text), pad_char)

    return series.map_elements(_truncate_one, return_dtype=pl.Utf8)
190
+
191
+
192
+ # --- utility -----------------------------------------------------------------
193
+
def new_session_key() -> bytes:
    """Return a fresh 32-byte random key generated with the `secrets` module."""
    key_length = 32
    return secrets.token_bytes(key_length)
File without changes