PyPI - drc-scanner - Versions diffs - 0.2.0__py3-none-any.whl - Mend

drc-scanner 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

drc_scanner/__init__.py +7 -0
drc_scanner/cli.py +249 -0
drc_scanner/connectors/__init__.py +50 -0
drc_scanner/connectors/base.py +124 -0
drc_scanner/connectors/bigquery_conn.py +96 -0
drc_scanner/connectors/dialect.py +218 -0
drc_scanner/connectors/dialect_connector.py +170 -0
drc_scanner/connectors/guard.py +86 -0
drc_scanner/connectors/postgres.py +211 -0
drc_scanner/indicative.py +40 -0
drc_scanner/passport/__init__.py +29 -0
drc_scanner/passport/builder.py +138 -0
drc_scanner/passport/models.py +109 -0
drc_scanner/passport/signing.py +87 -0
drc_scanner/pii/__init__.py +4 -0
drc_scanner/pii/detector.py +104 -0
drc_scanner/pii/patterns.py +115 -0
drc_scanner/profiling/__init__.py +4 -0
drc_scanner/profiling/concentration.py +57 -0
drc_scanner/profiling/engine.py +144 -0
drc_scanner/profiling/heuristics.py +63 -0
drc_scanner/profiling/inventory.py +51 -0
drc_scanner/profiling/model.py +31 -0
drc_scanner/profiling/statistics.py +130 -0
drc_scanner/profiling/suggestions.py +92 -0
drc_scanner/profiling/timeseries.py +118 -0
drc_scanner-0.2.0.dist-info/METADATA +93 -0
drc_scanner-0.2.0.dist-info/RECORD +31 -0
drc_scanner-0.2.0.dist-info/WHEEL +5 -0
drc_scanner-0.2.0.dist-info/entry_points.txt +2 -0
drc_scanner-0.2.0.dist-info/top_level.txt +1 -0

drc_scanner/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+"""DRC Scanner — Data Revenue Connecter.
+A read-only database profiling agent that builds a signed Data Passport.
+"""
+# Package version string.
+__version__ = "0.2.0"
+# Alias of __version__; the CLI prints it in its banners and passes it to
+# build_passport as ``agent_version``.
+AGENT_VERSION = __version__

drc_scanner/cli.py ADDED Viewed

@@ -0,0 +1,249 @@
+"""drc-scan — the Data Revenue Connecter command-line interface."""
+from __future__ import annotations
+import hashlib
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urlparse
+# Force UTF-8 output so status glyphs render on Windows consoles (cp1252) and
+# when piped. errors="replace" guarantees we never crash on an exotic terminal.
+# This runs at import time, before the rich Console below is created.
+# Best-effort: a stream object without reconfigure() (AttributeError) or one
+# that rejects the call (ValueError) is left exactly as it was.
+for _stream in (sys.stdout, sys.stderr):
+    try:
+        _stream.reconfigure(encoding="utf-8", errors="replace")  # type: ignore[attr-defined]
+    except (AttributeError, ValueError):
+        pass
+import typer
+from rich.console import Console
+from rich.panel import Panel
+from rich.table import Table
+from drc_scanner import AGENT_VERSION
+from drc_scanner.connectors import connector_for
+from drc_scanner.passport import signing
+from drc_scanner.passport.builder import build_passport, write_passport
+from drc_scanner.profiling.engine import run_full_profile
+# Per-user key file handed to signing.get_or_create_private_key when a
+# Passport is signed.
+_SIGNING_KEY_PATH = Path.home() / ".drc" / "signing_key.b64"
+# Typer application; every @app.command() function below becomes a sub-command.
+app = typer.Typer(
+    add_completion=False,
+    help="Data Revenue Connecter — read-only database profiling. Your data never leaves your machine.",
+)
+# Shared rich console used for all CLI output in this module.
+console = Console()
+# Lower-case database type → SQL template printed verbatim by the `setup`
+# command. Only "postgres" has a template.
+_READONLY_ROLE_SQL = {
+    "postgres": """-- Run as a Postgres superuser. Replace the password and database name.
+CREATE ROLE drc_readonly WITH LOGIN PASSWORD 'change-this-password';
+GRANT CONNECT ON DATABASE your_database TO drc_readonly;
+GRANT USAGE ON SCHEMA public TO drc_readonly;
+GRANT SELECT ON ALL TABLES IN SCHEMA public TO drc_readonly;
+-- Ensure future tables are readable too:
+ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO drc_readonly;
+-- Then connect the scanner with:
+--   drc-scan run --connect postgresql://drc_readonly:change-this-password@HOST:5432/your_database""",
+}
+def _host_fingerprint(conn: str) -> str:
+    """Return a truncated SHA-256 fingerprint of the connection string's host.
+    Only the digest is returned, so the real hostname is never recorded. A
+    connection string with no parseable host is fingerprinted as "unknown".
+    """
+    hostname = urlparse(conn).hostname
+    if not hostname:
+        hostname = "unknown"
+    # The "drc:" prefix is part of the hashed input, not of the output.
+    salted = ("drc:" + hostname).encode("utf-8")
+    return "sha256:" + hashlib.sha256(salted).hexdigest()[:32]
+@app.command()
+def version() -> None:
+    """Print the agent version."""
+    # The docstring above doubles as the command's --help text (Typer).
+    banner = f"DRC Scanner v{AGENT_VERSION}"
+    console.print(banner)
+@app.command()
+def setup(db: str = typer.Option("postgres", help="Database type: postgres")) -> None:
+    """Print the SQL to create a least-privilege, read-only role for the scanner."""
+    # The docstring above doubles as the command's --help text (Typer).
+    # Template lookup is case-insensitive on the database type.
+    template = _READONLY_ROLE_SQL.get(db.lower())
+    if template is not None:
+        console.print(Panel(template, title=f"Read-only role for {db}", border_style="cyan"))
+        return
+    # Unknown database type: report it and exit with a non-zero status.
+    console.print(f"[red]No setup template for '{db}'. Supported: postgres[/red]")
+    raise typer.Exit(code=1)
+def _scan_and_build(connect: str, on_progress=None):
+    """Run one read-only scan and turn the result into a signed Passport.
+    Both `run` and `monitor` go through here so the two commands share a single
+    code path. ``on_progress`` is forwarded to the profiler as its ``on_table``
+    callback. Returns (passport, profile, db_version, key_identifier, pub_b64).
+    """
+    db = connector_for(connect)
+    scan_started = datetime.now(timezone.utc)
+    try:
+        db.connect()
+        server_version = db.server_version()
+        result = run_full_profile(db, on_table=on_progress)
+    finally:
+        # Always release the connection, whether or not profiling succeeded.
+        db.close()
+    scan_completed = datetime.now(timezone.utc)
+    # The signing key is loaded (or created) only after the scan has finished.
+    key = signing.get_or_create_private_key(_SIGNING_KEY_PATH)
+    public_key_b64 = signing.public_b64(key)
+    key_identifier = signing.key_id(public_key_b64)
+    def _sign_hash(content_hash: str) -> tuple[str, str]:
+        """Sign the UTF-8 bytes of ``content_hash``; return (signing.sign result, key id)."""
+        signature = signing.sign(key, content_hash.encode("utf-8"))
+        return signature, key_identifier
+    passport = build_passport(
+        inventory=result.inventory,
+        metrics=result.metrics,
+        suggestions=result.suggestions,
+        pii_findings=result.pii_findings,
+        indicative_range=result.indicative,
+        db_type=db.db_type,
+        db_version=server_version,
+        # Only a hash of the host goes into the Passport.
+        host_fingerprint=_host_fingerprint(connect),
+        started_at=scan_started,
+        completed_at=scan_completed,
+        agent_version=AGENT_VERSION,
+        signer=_sign_hash,
+    )
+    return passport, result, server_version, key_identifier, public_key_b64
+@app.command()
+def run(
+    connect: str = typer.Option(..., "--connect", help="Database connection string"),
+    output: Optional[Path] = typer.Option(
+        None, "--output", help="Passport output path (default: drc_passport_<date>.json)"
+    ),
+) -> None:
+    """Profile the database (read-only) and write a Data Passport."""
+    # The docstring above doubles as the command's --help text (Typer).
+    banner = f"[bold]DRC Scanner v{AGENT_VERSION}[/bold] — Data Revenue Connecter"
+    console.print(banner)
+    console.print("[dim]Read-only mode · No data leaves this machine[/dim]\n")
+    # Fail fast on an unsupported scheme, before any scan work starts.
+    try:
+        connector_for(connect)
+    except ValueError as scheme_error:
+        console.print(f"[red]{scheme_error}[/red]")
+        raise typer.Exit(code=1)
+    def _report(table, i, total):
+        """Per-table progress line; handed to the profiler as its callback."""
+        console.print(f"  [dim]profiling {i}/{total}: {table.fqn}[/dim]")
+    try:
+        passport, profile, db_version, key_identifier, pub_b64 = _scan_and_build(
+            connect, on_progress=_report)
+    except Exception as scan_error:
+        # Top-level boundary: any scan failure becomes a one-line message and exit code 1.
+        console.print(f"  [red]✗ Scan failed: {scan_error}[/red]")
+        raise typer.Exit(code=1)
+    inventory = profile.inventory
+    console.print(f"  [green]✔[/green] Profiled {inventory.tables_scanned} tables, "
+                  f"{inventory.total_rows:,} rows")
+    console.print(f"  [green]✔[/green] PII scan complete — {len(profile.pii_findings)} finding(s)")
+    if output:
+        out_path = output
+    else:
+        # Default file name carries today's UTC date.
+        today = datetime.now(timezone.utc).date().isoformat()
+        out_path = Path(f"drc_passport_{today}.json")
+    write_passport(passport, out_path)
+    console.print(f"  [green]✔[/green] Data Passport written  [bold]{out_path}[/bold]\n")
+    _print_summary(profile, passport, out_path, key_identifier, pub_b64)
+@app.command()
+def monitor(
+    connect: str = typer.Option(..., "--connect", help="Database connection string"),
+    upload_url: str = typer.Option(..., "--upload-url",
+                                   help="Platform base URL, e.g. https://app.datarevenue.io"),
+    token: str = typer.Option(..., "--token", help="Monitor API token (X-API-Key)"),
+    license: str = typer.Option("", "--license", help="Monitor license key"),
+    schedule: str = typer.Option("quarterly", "--schedule",
+                                 help="monthly | quarterly (for the printed scheduling hint)"),
+) -> None:
+    """Re-scan and auto-upload the Data Passport (one run of a Monitor schedule)."""
+    # The docstring above doubles as the command's --help text (Typer), so the
+    # implementation notes live in comments.
+    #
+    # Flow: scan → POST the Passport JSON to <upload_url>/api/v1/passports/upload
+    # with the token in the X-API-Key header → print a scheduling hint.
+    # Exits with code 1 if either the scan or the upload fails.
+    import json as _json
+    import urllib.request
+    console.print(f"[bold]DRC Monitor[/bold] v{AGENT_VERSION} — scheduled re-scan")
+    try:
+        passport, profile, _, _, _ = _scan_and_build(connect)
+    except Exception as exc:
+        console.print(f"[red]✗ Scan failed: {exc}[/red]")
+        raise typer.Exit(code=1)
+    endpoint = upload_url.rstrip("/") + "/api/v1/passports/upload"
+    body = _json.dumps(passport.model_dump(mode="json")).encode("utf-8")
+    req = urllib.request.Request(
+        endpoint, data=body, method="POST",
+        headers={"Content-Type": "application/json", "X-API-Key": token},
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=120) as resp:
+            result = _json.loads(resp.read().decode("utf-8"))
+    except Exception as exc:
+        console.print(f"[red]✗ Upload failed: {exc}[/red]")
+        raise typer.Exit(code=1)
+    # A reply that is valid JSON but not an object must not crash a run whose
+    # upload already succeeded; the id is then shown as None.
+    passport_id = result.get("passport_id") if isinstance(result, dict) else None
+    console.print(f"  [green]✔[/green] Uploaded passport {passport_id} "
+                  f"(records: {profile.metrics.record_count:,})")
+    # The hint follows --schedule. "quarterly" and any unrecognised value
+    # produce the original quarterly text byte-for-byte; "monthly" now gets a
+    # monthly cron line and /mo 1 instead of the quarterly ones.
+    if schedule.strip().lower() == "monthly":
+        cadence, cron_expr, every_n_months = "monthly", "0 2 1 * *", 1
+    else:
+        cadence, cron_expr, every_n_months = "quarterly", "0 2 1 */3 *", 3
+    console.print(Panel(
+        f"Schedule: [bold]{schedule}[/bold]   License: [dim]{license or '(none)'}[/dim]\n\n"
+        f"To automate this, add a {schedule} task that runs this exact command:\n"
+        f"[dim]Linux/macOS (cron, {cadence} @ 2am day 1):[/dim]\n"
+        f"  {cron_expr} drc-scan monitor --connect ... --upload-url {upload_url} --token ...\n"
+        f"[dim]Windows (Task Scheduler):[/dim]\n"
+        f"  schtasks /create /tn DRC-Monitor /sc monthly /mo {every_n_months} /tr \"drc-scan monitor ...\"",
+        title="Re-scan uploaded", border_style="green",
+    ))
+def _print_summary(profile, passport, out_path, key_identifier, pub_b64) -> None:
+    """Render the end-of-scan report on the console.
+    Prints, in order: the eight largest tables, the measured quality scores,
+    a scan summary panel, the indicative range panel and the agent public key.
+    """
+    metrics = profile.metrics
+    # Inventory: the eight largest tables by row count.
+    inventory = Table(title="Inventory Summary", border_style="cyan")
+    inventory.add_column("Table")
+    inventory.add_column("Rows", justify="right")
+    inventory.add_column("Columns", justify="right")
+    largest_first = sorted(profile.inventory.tables, key=lambda t: t.rows, reverse=True)
+    for entry in largest_first[:8]:
+        inventory.add_row(entry.name, f"{entry.rows:,}", str(entry.columns))
+    console.print(inventory)
+    def _as_percent(score):
+        """Whole-number percentage, or an em dash when the score is missing."""
+        if score is None:
+            return "—"
+        return f"{score:.0%}"
+    # Quality metrics, one row per dimension, read from the metrics object in order.
+    quality = Table(title="Measured Quality (DQF)", border_style="cyan")
+    quality.add_column("Dimension")
+    quality.add_column("Score", justify="right")
+    for label, attribute in (
+        ("Completeness", "completeness"),
+        ("Accuracy (floor)", "accuracy_floor"),
+        ("Consistency", "consistency"),
+        ("Timeliness", "timeliness"),
+        ("Uniqueness", "uniqueness"),
+    ):
+        quality.add_row(label, _as_percent(getattr(metrics, attribute)))
+    console.print(quality)
+    category = profile.suggestions.get("dataset_category")
+    if category and category.value:
+        category_label = f"{category.value} ({category.confidence:.0%})"
+    else:
+        category_label = "undetermined"
+    estimate = profile.indicative
+    # NOTE(review): the `or` below also shows "—" for a vintage of 0, unlike the
+    # `is None` test used for the quality scores — confirm that is intended.
+    summary_lines = (
+        f"Records: [bold]{metrics.record_count:,}[/bold]   "
+        f"Attributes: [bold]{metrics.attribute_count}[/bold]   "
+        f"Tables: [bold]{metrics.table_count}[/bold]",
+        f"Suggested category: [bold]{category_label}[/bold]   "
+        f"Vintage: [bold]{metrics.data_vintage_years or '—'}[/bold] yrs",
+        f"PII findings: [bold]{len(profile.pii_findings)}[/bold]   "
+        f"Signed: [green]{passport.integrity.signing_key_id}[/green]",
+    )
+    console.print(Panel("\n".join(summary_lines), title="Scan complete", border_style="green"))
+    range_text = (
+        f"[bold]${estimate.low:,} – ${estimate.high:,}[/bold]\n[dim]{estimate.disclaimer}[/dim]\n\n"
+        f"Upload [bold]{out_path}[/bold] to app.datarevenue.io to auto-fill your "
+        f"valuation questionnaire."
+    )
+    console.print(Panel(range_text, title="Preliminary Indicative Range", border_style="yellow"))
+    console.print(f"[dim]Agent public key ({key_identifier}): {pub_b64}[/dim]")
+# Allow running this module directly as a script; invokes the Typer app.
+if __name__ == "__main__":
+    app()

drc_scanner/connectors/__init__.py ADDED Viewed

@@ -0,0 +1,50 @@
+from urllib.parse import urlparse
+from .base import (
+    AbstractConnector,
+    ColumnMeta,
+    ColumnStats,
+    ForeignKey,
+    TableRef,
+)
+from .guard import ReadOnlyViolation, assert_read_only
+# scheme prefix → (module, class). Lazy import keeps optional drivers optional.
+# Keys are lower-case URL schemes with any "+driver" suffix already removed.
+# The module name is relative to drc_scanner.connectors and is imported by
+# connector_for only when that scheme is actually requested.
+_SCHEME_MAP = {
+    "postgresql": ("postgres", "PostgresConnector"),
+    "postgres": ("postgres", "PostgresConnector"),
+    "mysql": ("dialect_connector", "MySQLConnector"),
+    "mariadb": ("dialect_connector", "MySQLConnector"),
+    "mssql": ("dialect_connector", "SQLServerConnector"),
+    "sqlserver": ("dialect_connector", "SQLServerConnector"),
+    "snowflake": ("dialect_connector", "SnowflakeConnector"),
+    "bigquery": ("bigquery_conn", "BigQueryConnector"),
+}
+def connector_for(conninfo: str) -> AbstractConnector:
+    """Instantiate the connector registered for ``conninfo``'s URL scheme.
+    The scheme is matched case-insensitively and anything from the first "+"
+    onwards (a driver suffix) is ignored. Raises ValueError for a scheme that
+    has no entry in ``_SCHEME_MAP``.
+    """
+    from importlib import import_module
+    scheme, _, _driver = urlparse(conninfo).scheme.lower().partition("+")
+    if scheme not in _SCHEME_MAP:
+        raise ValueError(
+            f"Unsupported database scheme '{scheme}'. Supported: "
+            f"{sorted(_SCHEME_MAP)}"
+        )
+    module_name, class_name = _SCHEME_MAP[scheme]
+    # Import the backend module only now, so unused drivers are never loaded.
+    backend = import_module(f"drc_scanner.connectors.{module_name}")
+    connector_cls = getattr(backend, class_name)
+    return connector_cls(conninfo)
+__all__ = [
+    "AbstractConnector",
+    "ColumnMeta",
+    "ColumnStats",
+    "ForeignKey",
+    "TableRef",
+    "ReadOnlyViolation",
+    "assert_read_only",
+    "connector_for",
+]

drc_scanner/connectors/base.py ADDED Viewed

@@ -0,0 +1,124 @@
+"""Abstract connector interface shared by every database backend.
+A connector is a thin, read-only window onto one database. The profiling engine
+depends only on this interface, so adding a new database (MySQL, Snowflake, ...)
+never touches the profiler.
+Privacy note: ``column_aggregate`` may return MIN/MAX for timestamp and numeric
+columns (needed for data-vintage and plausibility), but the profiling engine
+never writes raw min/max values into the Passport — only derived numbers. Raw
+sample rows fetched via ``sample_rows`` are processed in scanner memory and
+discarded; they never leave the machine.
+"""
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any, Optional
+# System schemas excluded from scans by default across backends.
+# Used as the default value of ``excluded_schemas`` in
+# ``AbstractConnector.list_tables``; a frozenset, so it is safe to share as a
+# default argument.
+DEFAULT_EXCLUDED_SCHEMAS = frozenset({
+    "pg_catalog", "information_schema", "pg_toast",
+    "sys", "mysql", "performance_schema",
+})
+@dataclass(frozen=True)
+class TableRef:
+    """Immutable (schema, name) pair identifying one table."""
+    schema: str
+    name: str
+    @property
+    def fqn(self) -> str:
+        """Display label in ``schema.table`` form; not meant for SQL interpolation."""
+        return "{}.{}".format(self.schema, self.name)
+@dataclass
+class ColumnMeta:
+    """Catalog description of one column, as returned by ``get_columns``."""
+    name: str
+    # Type name as a string; presumably exactly what the backend catalog
+    # reports — confirm per connector.
+    data_type: str
+    nullable: bool
+@dataclass
+class ColumnStats:
+    """Aggregate statistics for one column, as returned by ``column_aggregate``."""
+    # Row count and number of NULLs among those rows.
+    total: int
+    null_count: int
+    # None when the backend did not (or could not) compute a distinct count.
+    distinct_count: Optional[int] = None
+    # min/max are for INTERNAL use only (timestamp vintage, numeric plausibility).
+    # The engine must never persist these raw values into the Passport.
+    min_value: Optional[Any] = None
+    max_value: Optional[Any] = None
+@dataclass
+class ForeignKey:
+    """One declared foreign key, as returned by ``list_foreign_keys``.
+    Presumably ``column`` (on the table being inspected) references
+    ``ref_schema.ref_table.ref_column`` — confirm against a connector that
+    populates it.
+    """
+    column: str
+    ref_schema: str
+    ref_table: str
+    ref_column: str
+class AbstractConnector(ABC):
+    """Minimal read-only interface. Implementations must route every query
+    through ``drc_scanner.connectors.guard.assert_read_only`` and connect with
+    a read-only session."""
+    # Short backend identifier; subclasses override it (e.g. "bigquery").
+    db_type: str = "unknown"
+    # Open the session. Called by __enter__.
+    @abstractmethod
+    def connect(self) -> None: ...
+    # Release the session. Called by __exit__.
+    @abstractmethod
+    def close(self) -> None: ...
+    # Version string of the connected server.
+    @abstractmethod
+    def server_version(self) -> str: ...
+    # Tables available for scanning. ``excluded_schemas`` names schemas to
+    # skip; how strictly it is honoured is up to each implementation.
+    @abstractmethod
+    def list_tables(self, excluded_schemas: frozenset[str] = DEFAULT_EXCLUDED_SCHEMAS
+                    ) -> list[TableRef]: ...
+    # Column metadata for ``table``.
+    @abstractmethod
+    def get_columns(self, table: TableRef) -> list[ColumnMeta]: ...
+    # Number of rows in ``table``.
+    @abstractmethod
+    def count_rows(self, table: TableRef) -> int: ...
+    # ── Profiling extensions (Step 2) ─────────────────────────────────────────
+    @abstractmethod
+    def column_aggregate(self, table: TableRef, column: ColumnMeta) -> ColumnStats:
+        """Pushed-down COUNT/COUNT(DISTINCT)/MIN/MAX. MIN/MAX only for
+        timestamp and numeric columns (privacy)."""
+    @abstractmethod
+    def sample_rows(self, table: TableRef, columns: list[str], n: int) -> list[dict]:
+        """Up to n rows for local-only profiling (PII, accuracy, uniqueness).
+        Never persisted to the Passport."""
+    @abstractmethod
+    def count_since(self, table: TableRef, ts_column: str, days: int) -> int:
+        """Rows whose ts_column falls within the trailing ``days`` window."""
+    @abstractmethod
+    def monthly_counts(self, table: TableRef, ts_column: str) -> list[tuple[str, int]]:
+        """(month_iso, row_count) buckets for growth/cadence analysis."""
+    @abstractmethod
+    def top_group_counts(self, table: TableRef, column: str, n: int) -> list[int]:
+        """Descending row counts of the top-n groups, VALUES DISCARDED — only
+        the counts are returned, never the grouping keys (privacy)."""
+    def list_foreign_keys(self, table: TableRef) -> list[ForeignKey]:
+        """Declared foreign keys; default empty for backends that don't expose them."""
+        return []
+    # Context-manager support: ``with connector: ...`` connects on entry and
+    # closes on exit.
+    def __enter__(self) -> "AbstractConnector":
+        self.connect()
+        return self
+    # Always closes and returns None, so an exception raised inside the
+    # ``with`` body still propagates.
+    def __exit__(self, *exc) -> None:
+        self.close()

drc_scanner/connectors/bigquery_conn.py ADDED Viewed

@@ -0,0 +1,96 @@
+"""BigQuery connector.
+BigQuery is not a DBAPI engine — it runs queries through a client and scopes
+catalog introspection to a dataset's INFORMATION_SCHEMA — so it implements
+AbstractConnector directly while reusing BigQueryDialect for SQL strings.
+Connection string: bigquery://<project>/<dataset>
+Auth uses Application Default Credentials (GOOGLE_APPLICATION_CREDENTIALS).
+NOT yet verified against a live BigQuery project — smoke-test before production.
+"""
+from __future__ import annotations
+from urllib.parse import urlparse
+from .base import DEFAULT_EXCLUDED_SCHEMAS, AbstractConnector, ColumnMeta, ColumnStats, TableRef
+from .dialect import BigQueryDialect
+from .guard import assert_read_only
+class BigQueryConnector(AbstractConnector):
+    """Read-only connector for one BigQuery dataset.
+    SQL text comes from ``BigQueryDialect``; every statement is executed
+    through ``_query``, which refuses to run before ``connect()`` and passes the
+    SQL through ``assert_read_only`` first.
+    """
+    db_type = "bigquery"
+    def __init__(self, conninfo: str):
+        """Parse ``bigquery://<project>/<dataset>``; no client is created yet."""
+        p = urlparse(conninfo)
+        self.project = p.hostname or ""
+        # First path segment is the dataset; anything after a further "/" is ignored.
+        self.dataset = (p.path or "/").lstrip("/").split("/")[0]
+        self.dialect = BigQueryDialect(self.project, self.dataset)
+        self._client = None
+    def connect(self) -> None:
+        """Create the BigQuery client for the configured project."""
+        from google.cloud import bigquery  # lazy: optional dependency
+        self._client = bigquery.Client(project=self.project)
+    def close(self) -> None:
+        """Close the client if one is open; a no-op otherwise."""
+        if self._client is not None:
+            self._client.close()
+            self._client = None
+    def _query(self, sql: str):
+        """Run one guarded statement and return the client's result rows.
+        Raises RuntimeError when called before ``connect()``; ``assert_read_only``
+        is applied to the SQL before it is sent.
+        """
+        if self._client is None:
+            raise RuntimeError("Connector is not connected")
+        assert_read_only(sql)
+        return self._client.query(sql).result()
+    def _rows(self, sql: str) -> list[tuple]:
+        """Result of ``sql`` as a list of plain tuples, one per row."""
+        return [tuple(row.values()) for row in self._query(sql)]
+    def server_version(self) -> str:
+        """Return the constant "bigquery"; no version query is issued."""
+        return "bigquery"
+    def list_tables(self, excluded_schemas: frozenset[str] = DEFAULT_EXCLUDED_SCHEMAS
+                    ) -> list[TableRef]:
+        """Tables returned by the dialect's table-listing query."""
+        # NOTE(review): ``excluded_schemas`` is accepted for interface
+        # compatibility but not applied here — confirm that is intended.
+        rows = self._rows(self.dialect.list_tables_sql())
+        return [TableRef(schema=str(s), name=str(n)) for (s, n) in rows]
+    def get_columns(self, table: TableRef) -> list[ColumnMeta]:
+        """Column metadata for ``table``; nullable is True when the catalog says "YES"."""
+        rows = self._rows(self.dialect.columns_sql(table.schema, table.name))
+        return [ColumnMeta(name=str(c), data_type=str(t),
+                           nullable=str(nul).upper() == "YES")
+                for (c, t, nul) in rows]
+    def count_rows(self, table: TableRef) -> int:
+        """Row count of ``table``; 0 when the query returns no row."""
+        rows = self._rows(self.dialect.count_rows_sql(table.schema, table.name))
+        return int(rows[0][0]) if rows else 0
+    def column_aggregate(self, table: TableRef, column: ColumnMeta) -> ColumnStats:
+        """Counts, distinct count and min/max for one column.
+        If the full aggregate query fails for any reason, falls back to a
+        counts-only query and leaves distinct/min/max as None.
+        """
+        # NOTE(review): do_minmax=True is requested for every column, while the
+        # AbstractConnector contract says MIN/MAX only for timestamp and numeric
+        # columns — confirm whether the dialect filters by type.
+        try:
+            sql = self.dialect.column_aggregate_sql(
+                table.schema, table.name, column.name, do_minmax=True, do_distinct=True)
+            total, non_null, distinct, mn, mx = self._rows(sql)[0]
+        except Exception:
+            # Deliberate best-effort: retry with the cheaper counts-only query.
+            sql = self.dialect.column_counts_sql(table.schema, table.name, column.name)
+            total, non_null = self._rows(sql)[0]
+            distinct = mn = mx = None
+        return ColumnStats(
+            total=int(total), null_count=int(total) - int(non_null),
+            distinct_count=int(distinct) if distinct is not None else None,
+            min_value=mn, max_value=mx,
+        )
+    def sample_rows(self, table: TableRef, columns: list[str], n: int) -> list[dict]:
+        """Up to ``n`` rows of ``columns`` as dicts; [] when no columns are requested.
+        Raises RuntimeError when called before ``connect()``, like every other
+        query method on this class.
+        """
+        if not columns:
+            return []
+        sql = self.dialect.sample_sql(table.schema, table.name, columns, n)
+        return [dict(row.items()) for row in self._query(sql)]
+    def count_since(self, table: TableRef, ts_column: str, days: int) -> int:
+        """Result of the dialect's count-since query; 0 when it returns no row."""
+        rows = self._rows(self.dialect.count_since_sql(table.schema, table.name, ts_column, days))
+        return int(rows[0][0]) if rows else 0
+    def monthly_counts(self, table: TableRef, ts_column: str) -> list[tuple[str, int]]:
+        """(month, row_count) pairs from the dialect's monthly-count query."""
+        rows = self._rows(self.dialect.monthly_counts_sql(table.schema, table.name, ts_column))
+        return [(str(m), int(c)) for (m, c) in rows]
+    def top_group_counts(self, table: TableRef, column: str, n: int) -> list[int]:
+        """Counts from the dialect's top-groups query; each row holds only a count."""
+        rows = self._rows(self.dialect.top_groups_sql(table.schema, table.name, column, n))
+        return [int(c) for (c,) in rows]