PyPI - sdsa - Versions diffs - 1.1.0__py3-none-any.whl - Mend

sdsa 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35)

sdsa/__init__.py +1 -0
sdsa/anonymize/__init__.py +0 -0
sdsa/anonymize/policy.py +92 -0
sdsa/anonymize/primitives.py +195 -0
sdsa/api/__init__.py +0 -0
sdsa/api/routes.py +352 -0
sdsa/cli.py +82 -0
sdsa/core/__init__.py +0 -0
sdsa/core/config.py +121 -0
sdsa/core/logging.py +63 -0
sdsa/core/session.py +182 -0
sdsa/detect/__init__.py +0 -0
sdsa/detect/pii.py +191 -0
sdsa/detect/schema.py +58 -0
sdsa/dp/__init__.py +0 -0
sdsa/dp/accountant.py +25 -0
sdsa/dp/laplace.py +88 -0
sdsa/frontend/app.js +1118 -0
sdsa/frontend/index.html +369 -0
sdsa/frontend/style.css +1153 -0
sdsa/ingest.py +389 -0
sdsa/kanon/__init__.py +0 -0
sdsa/kanon/enforce.py +77 -0
sdsa/main.py +76 -0
sdsa/pipeline.py +263 -0
sdsa/policy_config.py +148 -0
sdsa/preflight.py +279 -0
sdsa/report.py +103 -0
sdsa/validate/__init__.py +0 -0
sdsa/validate/metrics.py +144 -0
sdsa-1.1.0.dist-info/METADATA +20 -0
sdsa-1.1.0.dist-info/RECORD +35 -0
sdsa-1.1.0.dist-info/WHEEL +5 -0
sdsa-1.1.0.dist-info/entry_points.txt +2 -0
sdsa-1.1.0.dist-info/top_level.txt +1 -0

sdsa/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "1.1.0"

sdsa/anonymize/__init__.py ADDED Viewed

File without changes

sdsa/anonymize/policy.py ADDED Viewed

@@ -0,0 +1,92 @@
+"""Column policy model + apply function.
+A ColumnPolicy describes how one column should be transformed. The pipeline
+iterates over policies and invokes the matching primitive or DP mechanism.
+"""
+from __future__ import annotations
+from typing import Any, Literal
+import polars as pl
+from pydantic import BaseModel, Field, field_validator
+from . import primitives as prim
+# Closed set of per-column actions accepted by ColumnPolicy.action.
+# apply_policy handles each of them; "dp_laplace" is a no-op there because
+# the DP pass applies it separately.
+Action = Literal[
+    "retain", "mask", "hash", "tokenize", "redact",
+    "numeric_bin", "date_truncate", "string_truncate",
+    "dp_laplace",
+    "drop",
+]
+class PolicyApplicationError(ValueError):
+    pass
+class ColumnPolicy(BaseModel):
+    # Describes how one column should be transformed.
+    # Name of the target column; must be non-empty (see validate_column).
+    column: str = Field(min_length=1)
+    # Transformation to apply; one of the Action literals.
+    action: Action
+    # Action-specific options, e.g. keep_prefix, bin_width, granularity.
+    params: dict[str, Any] = Field(default_factory=dict)
+    # NOTE(review): presumably consumed by the k-anonymity step; not read in this module.
+    is_quasi_identifier: bool = False
+    # For dp_laplace: caller must supply `epsilon` and column bounds `lower`/`upper`.
+    @field_validator("column")
+    @classmethod
+    def validate_column(cls, value: str) -> str:
+        # Reject names containing newlines or NUL bytes, and names longer
+        # than 200 characters; otherwise return the name unchanged.
+        if "\n" in value or "\r" in value or "\x00" in value:
+            raise ValueError("column names must not contain newlines or null bytes")
+        if len(value) > 200:
+            raise ValueError("column names must not exceed 200 characters")
+        return value
+def apply_policy(df: pl.DataFrame, policy: ColumnPolicy, hmac_key: bytes) -> pl.DataFrame:
+    """Apply a single non-DP policy to a DataFrame. DP is applied separately."""
+    col = policy.column
+    if col not in df.columns:
+        return df
+    s = df[col]
+    action = policy.action
+    p = policy.params
+    try:
+        if action == "retain":
+            return df
+        if action == "drop":
+            return df.drop(col)
+        if action == "mask":
+            out = prim.mask(s, keep_prefix=p.get("keep_prefix", 0),
+                           keep_suffix=p.get("keep_suffix", 0),
+                           mask_char=p.get("mask_char", "*"))
+        elif action == "hash":
+            out = prim.hmac_hash(s, hmac_key)
+        elif action == "tokenize":
+            out = prim.tokenize(s, hmac_key, prefix=p.get("prefix", "tok_"))
+        elif action == "redact":
+            out = prim.redact(s, replacement=p.get("replacement", "[REDACTED]"))
+        elif action == "numeric_bin":
+            if "bin_width" not in p:
+                raise PolicyApplicationError(
+                    f"column '{col}' with action 'numeric_bin' requires param 'bin_width'"
+                )
+            out = prim.numeric_bin(s, bin_width=float(p["bin_width"]))
+        elif action == "date_truncate":
+            out = prim.date_truncate(s, granularity=p.get("granularity", "month"))
+        elif action == "string_truncate":
+            out = prim.string_truncate(s, keep=int(p.get("keep", 3)),
+                                       pad_char=p.get("pad_char", "*"))
+        elif action == "dp_laplace":
+            # Applied by the DP pass, not here.
+            return df
+        else:
+            raise PolicyApplicationError(f"unknown action {action}")
+    except PolicyApplicationError:
+        raise
+    except (KeyError, TypeError, ValueError) as e:
+        raise PolicyApplicationError(
+            f"invalid params for column '{col}' action '{action}': {e}"
+        ) from e
+    return df.with_columns(out.alias(col))

sdsa/anonymize/primitives.py ADDED Viewed

@@ -0,0 +1,195 @@
+"""Per-column anonymization primitives.
+Each function takes a Polars Series and returns a transformed Series.
+Row count is preserved except for suppression (done by the k-anonymity step).
+"""
+from __future__ import annotations
+from decimal import Decimal, ROUND_FLOOR
+import hashlib
+import hmac
+import math
+import secrets
+from datetime import date, datetime
+import polars as pl
+# --- direct-identifier primitives --------------------------------------------
+def mask(series: pl.Series, keep_prefix: int = 0, keep_suffix: int = 0,
+         mask_char: str = "*") -> pl.Series:
+    """Replace characters with mask_char, optionally keeping a prefix/suffix.
+    Guarantees at least one masked character when the input is non-empty.
+    If keep_prefix + keep_suffix >= len(s), both are scaled down
+    proportionally so that at least one character is masked — otherwise
+    a short value like "hi" with keep_prefix=5 would leak unchanged.
+    """
+    if keep_prefix < 0 or keep_suffix < 0:
+        raise ValueError("keep_prefix and keep_suffix must be >= 0")
+    if not mask_char:
+        raise ValueError("mask_char must be a non-empty string")
+    def _mask(v):
+        if v is None:
+            return None
+        s = str(v)
+        n = len(s)
+        if n == 0:
+            return s
+        p = keep_prefix
+        q = keep_suffix
+        # Enforce the privacy invariant: at least one character is masked.
+        # If the caller's prefix+suffix would leave zero masked chars, we
+        # shrink them proportionally (rounding down) so 1 char gets masked.
+        if p + q >= n:
+            # Scale so p + q = n - 1 (at least one char masked).
+            target = max(n - 1, 0)
+            if p + q > 0:
+                scale = target / (p + q)
+                p = int(p * scale)
+                q = int(q * scale)
+            else:
+                p = q = 0
+        p = min(p, n)
+        q = min(q, max(n - p, 0))
+        middle = mask_char * (n - p - q)
+        return s[:p] + middle + (s[n - q:] if q else "")
+    return series.map_elements(_mask, return_dtype=pl.Utf8)
+def hmac_hash(series: pl.Series, key: bytes) -> pl.Series:
+    """HMAC-SHA256, hex-truncated to 16 chars. Keyed → resists rainbow tables."""
+    def _h(v):
+        if v is None:
+            return None
+        digest = hmac.new(key, str(v).encode("utf-8"), hashlib.sha256).hexdigest()
+        return digest[:16]
+    return series.map_elements(_h, return_dtype=pl.Utf8)
+def tokenize(series: pl.Series, key: bytes, prefix: str = "tok_") -> pl.Series:
+    """Deterministic-within-session token. Uses HMAC to prevent rainbow tables."""
+    def _t(v):
+        if v is None:
+            return None
+        digest = hmac.new(key, str(v).encode("utf-8"), hashlib.sha256).hexdigest()
+        return f"{prefix}{digest[:12]}"
+    return series.map_elements(_t, return_dtype=pl.Utf8)
+def redact(series: pl.Series, replacement: str = "[REDACTED]") -> pl.Series:
+    return pl.Series(series.name, [replacement if v is not None else None for v in series],
+                     dtype=pl.Utf8)
+# --- generalization primitives -----------------------------------------------
+def numeric_bin(series: pl.Series, bin_width: float) -> pl.Series:
+    """Equal-width binning: value → [lo, lo+width)."""
+    if bin_width <= 0:
+        raise ValueError("bin_width must be > 0")
+    step = Decimal(str(bin_width))
+    def _fmt_decimal(value: Decimal) -> str:
+        normalized = format(value.normalize(), "f")
+        if "." in normalized:
+            normalized = normalized.rstrip("0").rstrip(".")
+        return normalized or "0"
+    def _bin(v):
+        if v is None:
+            return None
+        try:
+            fv = float(v)
+        except (TypeError, ValueError):
+            raise ValueError(
+                f"numeric_bin cannot convert a value in column '{series.name}' to float; "
+                "column must contain numeric data"
+            ) from None
+        if not math.isfinite(fv):
+            return None
+        dec_value = Decimal(str(v))
+        bucket = (dec_value / step).to_integral_value(rounding=ROUND_FLOOR)
+        lo = bucket * step
+        hi = lo + step
+        return f"[{_fmt_decimal(lo)}, {_fmt_decimal(hi)})"
+    return series.map_elements(_bin, return_dtype=pl.Utf8)
+def date_truncate(series: pl.Series, granularity: str = "month") -> pl.Series:
+    """Truncate dates/datetimes to year / month / day, returned as strings.
+    Date and Datetime columns are formatted directly. pl.Time columns are
+    rejected with ValueError. For any other dtype (e.g. a string column that
+    Polars couldn't auto-parse because of a non-ISO format), we attempt a
+    best-effort parse with dateutil; if that fails for any non-null value
+    we raise rather than silently passing the original value through
+    (which would leak full-resolution dates).
+    Raises ValueError for an unknown granularity, a pl.Time column, or an
+    unparseable value.
+    """
+    if granularity not in ("year", "month", "day"):
+        raise ValueError("granularity must be year/month/day")
+    if series.dtype == pl.Time:
+        raise ValueError(
+            f"date_truncate does not support pl.Time columns ('{series.name}'); "
+            "use a Date or Datetime column"
+        )
+    if series.dtype in (pl.Date, pl.Datetime):
+        # Native temporal column: let Polars format it.
+        fmt = {"year": "%Y", "month": "%Y-%m", "day": "%F"}[granularity]
+        return series.dt.strftime(fmt).alias(series.name)
+    # Imported lazily: only this fallback path needs dateutil.
+    from dateutil import parser as _date_parser
+    def _t(v):
+        if v is None:
+            return None
+        # datetime is checked first because it is a subclass of date.
+        if isinstance(v, datetime):
+            d = v.date()
+        elif isinstance(v, date):
+            d = v
+        else:
+            # Best-effort string parse. We intentionally raise on failure —
+            # silently stringifying the value would leak it.
+            try:
+                d = _date_parser.parse(str(v)).date()
+            except (ValueError, TypeError, OverflowError) as e:
+                raise ValueError(
+                    f"date_truncate cannot parse a value in column '{series.name}' as a date "
+                    f"(row value withheld for privacy): {e}"
+                ) from e
+        if granularity == "year":
+            return f"{d.year:04d}"
+        if granularity == "month":
+            return f"{d.year:04d}-{d.month:02d}"
+        return d.isoformat()
+    return series.map_elements(_t, return_dtype=pl.Utf8)
+def string_truncate(series: pl.Series, keep: int = 3, pad_char: str = "*") -> pl.Series:
+    """Keep first `keep` chars, pad the rest (e.g., ZIP 12345 → 123**).
+    Guarantees at least one character is masked for any non-empty value, even
+    when keep >= len(s) — without this, short values like two-letter state
+    codes pass through completely unmasked.
+    """
+    if keep < 0:
+        raise ValueError("keep must be >= 0")
+    if len(pad_char) != 1:
+        raise ValueError("pad_char must be a single character")
+    def _t(v):
+        if v is None:
+            return None
+        s = str(v)
+        n = len(s)
+        if n == 0:
+            return s
+        effective_keep = min(keep, max(n - 1, 0))
+        return s[:effective_keep] + pad_char * (n - effective_keep)
+    return series.map_elements(_t, return_dtype=pl.Utf8)
+# --- utility -----------------------------------------------------------------
+def new_session_key() -> bytes:
+    return secrets.token_bytes(32)

sdsa/api/__init__.py ADDED Viewed

File without changes

sdsa/api/routes.py ADDED Viewed

@@ -0,0 +1,352 @@
+"""FastAPI routes: upload → process → download → delete."""
+from __future__ import annotations
+import io
+import threading
+from typing import Any
+import polars as pl
+from fastapi import APIRouter, HTTPException, Response, UploadFile
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel, Field
+from ..core.config import get_config
+from ..core.logging import get_logger
+from ..core.session import get_store
+from ..detect.pii import detect_dataframe
+from ..detect.schema import infer_schema
+from ..ingest import ParseError, parse_upload
+from ..policy_config import PolicyConfigError, build_policy_suggestions
+from ..preflight import PreflightRequest, preflight_k_anonymity
+from ..pipeline import PipelineError, ProcessRequest, _derive_deterministic_key, run_pipeline
+from ..report import render_markdown
+from ..anonymize.policy import PolicyApplicationError, apply_policy
+from ..dp.laplace import LaplaceParams, apply_laplace
+# A small fixed cap so preview can never leak more than a handful of rows back
+# to the client and is always cheap to render. Five rows is enough to make
+# transformations legible without scaring users with a wall of data.
+PREVIEW_ROW_LIMIT = 5
+log = get_logger("sdsa.api")
+# All routes in this module are served under the /api prefix.
+router = APIRouter(prefix="/api")
+# Session ids with a /process request currently running. The process route
+# adds an id on entry and discards it in a finally block; a second request
+# for the same id is rejected with 409. Access is guarded by _processing_lock.
+_processing_sessions: set[str] = set()
+_processing_lock = threading.Lock()
+class UploadResponse(BaseModel):
+    # Payload returned by the upload route after a file is parsed.
+    # NOTE(review): presumably silences pydantic's protected-namespace warning
+    # for the `model_config`/`schema_` names; confirm.
+    model_config = {"protected_namespaces": ()}
+    session_id: str
+    # Configured session lifetime, and created_at + that lifetime.
+    session_ttl_seconds: int
+    session_expires_at: float
+    default_k: int
+    # Height and width of the parsed DataFrame.
+    row_count: int
+    column_count: int
+    # Format, encoding and extra metadata reported by parse_upload.
+    format: str
+    encoding: str
+    parse_meta: dict
+    # Serialized under the key "schema"; populated from infer_schema.
+    schema_: list[dict] = Field(..., serialization_alias="schema")
+    # Per-column detection results and the policy suggestions derived from them.
+    pii_suggestions: dict[str, dict]
+    policy_suggestions: dict[str, dict]
+    # Column names and the first PREVIEW_ROW_LIMIT rows, cells stringified.
+    sample_columns: list[str]
+    sample_rows: list[list[str | None]]
+@router.post("/upload", response_model=UploadResponse)
+async def upload(file: UploadFile) -> UploadResponse:
+    cfg = get_config()
+    raw = await file.read(cfg.max_upload_bytes + 1)
+    if len(raw) > cfg.max_upload_bytes:
+        raise HTTPException(413, "file exceeds max upload size")
+    try:
+        result = parse_upload(file.filename or "", raw)
+    except ParseError as e:
+        raise HTTPException(400, str(e))
+    df = result.df
+    sample = df.head(cfg.sample_rows_for_detection)
+    schema = infer_schema(df)
+    pii = {k: asdict_pii(v) for k, v in detect_dataframe(sample).items()}
+    preview_sample = _serialize_sample(df.head(PREVIEW_ROW_LIMIT))
+    try:
+        policy_suggestions = build_policy_suggestions(schema, pii)
+    except PolicyConfigError as e:
+        raise HTTPException(400, str(e))
+    store = get_store()
+    session = store.create()
+    session.df = df
+    session.detection = {
+        "schema": schema,
+        "pii": pii,
+        "policy_suggestions": policy_suggestions,
+    }
+    log.info("upload_complete", extra={
+        "session_id": session.session_id,
+        "rows": df.height,
+        "cols": df.width,
+        "format": result.format,
+        "encoding": result.encoding,
+    })
+    return UploadResponse(
+        session_id=session.session_id,
+        session_ttl_seconds=cfg.session_ttl_seconds,
+        session_expires_at=session.created_at + cfg.session_ttl_seconds,
+        default_k=cfg.default_k,
+        row_count=df.height,
+        column_count=df.width,
+        format=result.format,
+        encoding=result.encoding,
+        parse_meta=result.meta,
+        schema_=schema,
+        pii_suggestions=pii,
+        policy_suggestions=policy_suggestions,
+        sample_columns=df.columns,
+        sample_rows=preview_sample,
+    )
+def _stringify_cell(v: Any) -> str | None:
+    if v is None:
+        return None
+    # Floats with long tails clutter the preview; trim to 6 sig digits.
+    if isinstance(v, float):
+        return f"{v:.6g}"
+    s = str(v)
+    if len(s) > 80:
+        return s[:77] + "…"
+    return s
+def _serialize_sample(df: pl.DataFrame) -> list[list[str | None]]:
+    rows: list[list[str | None]] = []
+    cols = df.columns
+    for row in df.iter_rows():
+        rows.append([_stringify_cell(row[i]) for i in range(len(cols))])
+    return rows
+def asdict_pii(s) -> dict[str, Any]:
+    return {"kind": s.kind, "confidence": round(s.confidence, 3), "reason": s.reason}
+class ProcessResponse(BaseModel):
+    # Returned by the process route once output has been stored in the session.
+    session_id: str
+    # The report produced by run_pipeline for this run.
+    report: dict
+    # Defaults to True; the process route never overrides it.
+    ready_for_download: bool = True
+class PreflightResponse(BaseModel):
+    # Returned by the preflight route.
+    session_id: str
+    # Whatever preflight_k_anonymity returned for the request.
+    preflight: dict
+@router.post("/process/{session_id}", response_model=ProcessResponse)
+async def process(session_id: str, request: ProcessRequest) -> ProcessResponse:
+    # Run the full pipeline on the session's DataFrame and store the CSV
+    # output and report in the session for later download.
+    # Only one run per session at a time: a concurrent request gets 409.
+    with _processing_lock:
+        if session_id in _processing_sessions:
+            raise HTTPException(409, "processing already in progress for this session")
+        _processing_sessions.add(session_id)
+    try:
+        store = get_store()
+        snapshot = store.checkout(session_id)
+        if snapshot is None or snapshot.df is None or snapshot.hmac_key is None:
+            raise HTTPException(404, "session not found or expired")
+        # Fall back to empty detection results when none were recorded.
+        detection = snapshot.detection or {"schema": [], "pii": {}}
+        # Best-effort clear of previous output. If the session was reaped
+        # between checkout and here the snapshot is still valid — proceed
+        # with the data we already have rather than failing spuriously.
+        store.clear_output(session_id)
+        try:
+            result = run_pipeline(
+                original=snapshot.df,
+                request=request,
+                session_id=session_id,
+                hmac_key=snapshot.hmac_key,
+                schema=detection.get("schema", []),
+                pii_suggestions=detection.get("pii", {}),
+            )
+        except (PipelineError, PolicyApplicationError) as e:
+            # Pipeline and policy errors are reported to the caller as 400.
+            raise HTTPException(400, str(e))
+        # Serialize CSV into session bytes buffer.
+        buf = io.BytesIO()
+        result.df.write_csv(buf)
+        # A falsy return from store_output is reported as an expired session.
+        if not store.store_output(session_id, buf.getvalue(), result.report):
+            raise HTTPException(404, "session expired — please re-upload and reprocess")
+        log.info("process_complete", extra={
+            "session_id": session_id,
+            "rows_out": result.df.height,
+            "cols_out": result.df.width,
+        })
+        return ProcessResponse(session_id=session_id, report=result.report)
+    finally:
+        # Always release the in-progress marker, on success or failure.
+        with _processing_lock:
+            _processing_sessions.discard(session_id)
+class PreviewResponse(BaseModel):
+    # Before/after sample returned by the preview route.
+    session_id: str
+    # Column names of the original sample; both row lists follow this order.
+    columns: list[str]
+    # Stringified rows before and after applying the requested policies.
+    original: list[list[str | None]]
+    # Cells of dropped columns are None here.
+    sanitized: list[list[str | None]]
+    # Original columns that are absent after the policies were applied.
+    dropped_columns: list[str]
+@router.post("/preview/{session_id}", response_model=PreviewResponse)
+async def preview(session_id: str, request: ProcessRequest) -> PreviewResponse:
+    """Return a small before/after sample under the given policies.
+    Skips k-anonymity (it would suppress all rows of a tiny sample). DP noise
+    is applied so the user sees realistic post-noise values.
+    """
+    store = get_store()
+    snapshot = store.checkout(session_id)
+    if snapshot is None or snapshot.df is None or snapshot.hmac_key is None:
+        raise HTTPException(404, "session not found or expired")
+    # Only the first PREVIEW_ROW_LIMIT rows are ever transformed or returned.
+    head = snapshot.df.head(PREVIEW_ROW_LIMIT)
+    cols_in = head.columns
+    cfg = get_config()
+    # Apply same deterministic key derivation as pipeline/preflight.
+    hmac_key = snapshot.hmac_key
+    if request.deterministic_key_name:
+        if cfg.deployment_salt_is_ephemeral:
+            raise HTTPException(400, "Deterministic mode requires SDSA_DEPLOYMENT_SALT to be set.")
+        hmac_key = _derive_deterministic_key(request.deterministic_key_name, cfg.deployment_salt)
+    df = head.clone()
+    dp_columns = {p.column for p in request.policies if p.action == "dp_laplace"}
+    if request.deterministic_key_name and dp_columns:
+        raise HTTPException(
+            400,
+            "Deterministic mode cannot be combined with DP columns (ADR-0008)."
+        )
+    try:
+        # First pass: non-DP policies (apply_policy leaves dp_laplace columns untouched).
+        for p in request.policies:
+            df = apply_policy(df, p, hmac_key)
+        # Second pass: Laplace noise for each DP column that is still present.
+        for col in dp_columns:
+            if col not in df.columns:
+                continue
+            params = request.dp_params.get(col) or {}
+            if "epsilon" not in params or "lower" not in params or "upper" not in params:
+                # Preview is best-effort: skip incomplete DP configs rather than
+                # error out — the Process step will surface the real error.
+                continue
+            try:
+                eps = float(params["epsilon"])
+            except (TypeError, ValueError):
+                continue
+            # An out-of-range epsilon is the one DP problem reported here as 400.
+            if not (cfg.epsilon_min <= eps <= cfg.epsilon_max):
+                raise HTTPException(
+                    400,
+                    f"epsilon for '{col}' ({eps:.6g}) outside allowed range "
+                    f"[{cfg.epsilon_min}, {cfg.epsilon_max}]",
+                )
+            try:
+                lp = LaplaceParams(
+                    epsilon=eps,
+                    lower=float(params["lower"]),
+                    upper=float(params["upper"]),
+                )
+            except (TypeError, ValueError):
+                continue
+            # Non-numeric columns are left as they are in the preview.
+            if not df[col].dtype.is_numeric():
+                continue
+            try:
+                df = df.with_columns(apply_laplace(df[col], lp).alias(col))
+            except ValueError:
+                continue
+    except PolicyApplicationError as e:
+        raise HTTPException(400, str(e))
+    dropped = [c for c in cols_in if c not in df.columns]
+    # Build sanitized rows in the original column order so they line up
+    # with `original`.
+    sanitized: list[list[str | None]] = []
+    for i in range(head.height):
+        row: list[str | None] = []
+        for c in cols_in:
+            if c in df.columns:
+                row.append(_stringify_cell(df[c][i]))
+            else:
+                row.append(None)  # dropped — frontend renders a marker
+        sanitized.append(row)
+    return PreviewResponse(
+        session_id=session_id,
+        columns=cols_in,
+        original=_serialize_sample(head),
+        sanitized=sanitized,
+        dropped_columns=dropped,
+    )
+@router.post("/preflight/{session_id}", response_model=PreflightResponse)
+async def preflight(session_id: str, request: PreflightRequest) -> PreflightResponse:
+    store = get_store()
+    snapshot = store.checkout(session_id)
+    if snapshot is None or snapshot.df is None or snapshot.hmac_key is None:
+        raise HTTPException(404, "session not found or expired")
+    try:
+        preview = preflight_k_anonymity(
+            original=snapshot.df,
+            request=request,
+            hmac_key=snapshot.hmac_key,
+        )
+    except PolicyApplicationError as e:
+        raise HTTPException(400, str(e))
+    return PreflightResponse(session_id=session_id, preflight=preview)
+@router.get("/download/{session_id}/data.csv")
+async def download_csv(session_id: str):
+    store = get_store()
+    snapshot = store.checkout(session_id)
+    if snapshot is None:
+        raise HTTPException(404, "session expired — please re-upload and reprocess")
+    if snapshot.output_bytes is None:
+        raise HTTPException(404, "no output for session")
+    headers = {"Content-Disposition": 'attachment; filename="sdsa-export.csv"'}
+    return Response(content=snapshot.output_bytes, media_type="text/csv", headers=headers)
+@router.get("/download/{session_id}/report.json")
+async def download_report_json(session_id: str):
+    store = get_store()
+    snapshot = store.checkout(session_id)
+    if snapshot is None:
+        raise HTTPException(404, "session expired — please re-upload and reprocess")
+    if snapshot.output_report is None:
+        raise HTTPException(404, "no report for session")
+    return JSONResponse(snapshot.output_report)
+@router.get("/download/{session_id}/report.md")
+async def download_report_md(session_id: str):
+    store = get_store()
+    snapshot = store.checkout(session_id)
+    if snapshot is None:
+        raise HTTPException(404, "session expired — please re-upload and reprocess")
+    if snapshot.output_report is None:
+        raise HTTPException(404, "no report for session")
+    md = render_markdown(snapshot.output_report)
+    headers = {"Content-Disposition": 'attachment; filename="sdsa-report.md"'}
+    return Response(content=md, media_type="text/markdown", headers=headers)
+@router.delete("/session/{session_id}")
+async def delete_session(session_id: str):
+    if not get_store().delete(session_id):
+        raise HTTPException(404, "session not found or already deleted")
+    return {"deleted": session_id}