drc-scanner 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ """DRC Scanner — Data Revenue Connecter.
2
+
3
+ A read-only database profiling agent that builds a signed Data Passport.
4
+ """
5
+
6
# Package version string.
__version__ = "0.2.0"
# Same value under the name the CLI imports for display and for the Passport's
# agent_version field.
AGENT_VERSION = __version__
drc_scanner/cli.py ADDED
@@ -0,0 +1,249 @@
1
+ """drc-scan — the Data Revenue Connecter command-line interface."""
2
+ from __future__ import annotations
3
+
4
+ import hashlib
5
+ import sys
6
+ from datetime import datetime, timezone
7
+ from pathlib import Path
8
+ from typing import Optional
9
+ from urllib.parse import urlparse
10
+
11
# Force UTF-8 output so status glyphs render on Windows consoles (cp1252) and
# when piped. errors="replace" substitutes characters the terminal cannot
# encode instead of raising.
# This runs at import time, before the third-party imports below.
for _stream in (sys.stdout, sys.stderr):
    try:
        _stream.reconfigure(encoding="utf-8", errors="replace")  # type: ignore[attr-defined]
    except (AttributeError, ValueError):
        # Best effort: a stream without reconfigure(), or one that rejects the
        # call, simply keeps its current encoding.
        pass
18
+
19
+ import typer
20
+ from rich.console import Console
21
+ from rich.panel import Panel
22
+ from rich.table import Table
23
+
24
+ from drc_scanner import AGENT_VERSION
25
+ from drc_scanner.connectors import connector_for
26
+ from drc_scanner.passport import signing
27
+ from drc_scanner.passport.builder import build_passport, write_passport
28
+ from drc_scanner.profiling.engine import run_full_profile
29
+
30
# Path handed to signing.get_or_create_private_key(); lives under ~/.drc.
_SIGNING_KEY_PATH = Path.home() / ".drc" / "signing_key.b64"

# Typer application object; the @app.command() functions in this module are
# registered on it. Shell-completion installation options are disabled.
app = typer.Typer(
    add_completion=False,
    help="Data Revenue Connecter — read-only database profiling. Your data never leaves your machine.",
)
# Single Rich console shared by every command in this module.
console = Console()
37
+
38
# SQL templates printed by the `setup` command, keyed by lowercase database
# type. Only a Postgres template exists.
_READONLY_ROLE_SQL = {
    "postgres": """-- Run as a Postgres superuser. Replace the password and database name.
CREATE ROLE drc_readonly WITH LOGIN PASSWORD 'change-this-password';
GRANT CONNECT ON DATABASE your_database TO drc_readonly;
GRANT USAGE ON SCHEMA public TO drc_readonly;
GRANT SELECT ON ALL TABLES IN SCHEMA public TO drc_readonly;
-- Ensure future tables are readable too:
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO drc_readonly;
-- Then connect the scanner with:
-- drc-scan run --connect postgresql://drc_readonly:change-this-password@HOST:5432/your_database""",
}
49
+
50
+
51
def _host_fingerprint(conn: str) -> str:
    """Return a truncated SHA-256 fingerprint of the connection string's host.

    Only the hostname parsed from ``conn`` is hashed (with a ``drc:`` prefix),
    so the real hostname is never recorded. When no hostname can be parsed the
    placeholder ``unknown`` is hashed instead. The result is ``sha256:``
    followed by the first 32 hex digits of the digest.
    """
    hostname = urlparse(conn).hostname
    if not hostname:
        hostname = "unknown"
    hasher = hashlib.sha256()
    hasher.update(("drc:" + hostname).encode("utf-8"))
    return "sha256:" + hasher.hexdigest()[:32]
56
+
57
+
58
@app.command()
def version() -> None:
    """Show which version of the scanner agent is installed."""
    banner = f"DRC Scanner v{AGENT_VERSION}"
    console.print(banner)
62
+
63
+
64
@app.command()
def setup(db: str = typer.Option("postgres", help="Database type: postgres")) -> None:
    """Print the SQL to create a least-privilege, read-only role for the scanner.

    The template is looked up in ``_READONLY_ROLE_SQL`` by the lowercased
    ``db`` value and printed inside a panel.

    Raises:
        typer.Exit: with code 1 when no template exists for ``db``.
    """
    sql = _READONLY_ROLE_SQL.get(db.lower())
    if sql is None:
        # Derive the list from the template table so the message cannot drift.
        supported = ", ".join(sorted(_READONLY_ROLE_SQL))
        # markup=False: ``db`` is raw user input. A value such as "[/x]" would
        # otherwise be parsed as Rich markup and raise MarkupError instead of
        # showing this message.
        console.print(
            f"No setup template for '{db}'. Supported: {supported}",
            style="red",
            markup=False,
        )
        raise typer.Exit(code=1)
    console.print(Panel(sql, title=f"Read-only role for {db}", border_style="cyan"))
72
+
73
+
74
def _scan_and_build(connect: str, on_progress=None):
    """Run one read-only scan and turn the result into a signed Passport.

    Shared by the `run` and `monitor` commands so they never diverge.

    Args:
        connect: Database connection string; its scheme selects the connector.
        on_progress: Optional callback forwarded to ``run_full_profile`` as
            ``on_table``.

    Returns:
        ``(passport, profile, db_version, key_identifier, pub_b64)``.
    """
    connector = connector_for(connect)

    scan_started = datetime.now(timezone.utc)
    try:
        connector.connect()
        server_ver = connector.server_version()
        profile = run_full_profile(connector, on_table=on_progress)
    finally:
        # Always release the connection, whether or not profiling succeeded.
        connector.close()
    scan_finished = datetime.now(timezone.utc)

    # The signing key is loaded (or created on first use) only after the scan.
    signing_key = signing.get_or_create_private_key(_SIGNING_KEY_PATH)
    public_key_b64 = signing.public_b64(signing_key)
    key_ref = signing.key_id(public_key_b64)

    def _signer(content_hash: str) -> tuple[str, str]:
        signature = signing.sign(signing_key, content_hash.encode("utf-8"))
        return signature, key_ref

    passport = build_passport(
        inventory=profile.inventory,
        metrics=profile.metrics,
        suggestions=profile.suggestions,
        pii_findings=profile.pii_findings,
        indicative_range=profile.indicative,
        db_type=connector.db_type,
        db_version=server_ver,
        host_fingerprint=_host_fingerprint(connect),
        started_at=scan_started,
        completed_at=scan_finished,
        agent_version=AGENT_VERSION,
        signer=_signer,
    )
    return passport, profile, server_ver, key_ref, public_key_b64
106
+
107
+
108
@app.command()
def run(
    connect: str = typer.Option(..., "--connect", help="Database connection string"),
    output: Optional[Path] = typer.Option(
        None, "--output", help="Passport output path (default: drc_passport_<date>.json)"
    ),
) -> None:
    """Profile the database (read-only) and write a Data Passport.

    Args:
        connect: Database connection string; its scheme selects the connector.
        output: Where to write the Passport JSON. Defaults to
            ``drc_passport_<UTC date>.json`` in the current directory.

    Raises:
        typer.Exit: with code 1 when the scheme is unsupported or the scan fails.
    """
    console.print(f"[bold]DRC Scanner v{AGENT_VERSION}[/bold] — Data Revenue Connecter")
    console.print("[dim]Read-only mode · No data leaves this machine[/dim]\n")

    try:
        connector_for(connect)  # validate scheme early
    except ValueError as exc:
        # markup=False: exception text is not ours; never parse it as Rich markup.
        console.print(str(exc), style="red", markup=False)
        raise typer.Exit(code=1)

    def _progress(table, i, total):
        # Table names come from the scanned database and may contain brackets,
        # so they are printed literally rather than as markup.
        console.print(f" profiling {i}/{total}: {table.fqn}", style="dim", markup=False)

    try:
        passport, profile, db_version, key_identifier, pub_b64 = _scan_and_build(
            connect, on_progress=_progress)
    except Exception as exc:
        # Driver errors often contain bracketed tokens; with markup enabled Rich
        # would swallow them or raise MarkupError and hide the real failure.
        console.print(f" ✗ Scan failed: {exc}", style="red", markup=False)
        raise typer.Exit(code=1)

    console.print(f" [green]✔[/green] Profiled {profile.inventory.tables_scanned} tables, "
                  f"{profile.inventory.total_rows:,} rows")
    console.print(f" [green]✔[/green] PII scan complete — {len(profile.pii_findings)} finding(s)")

    out_path = output or Path(
        f"drc_passport_{datetime.now(timezone.utc).date().isoformat()}.json")
    write_passport(passport, out_path)
    console.print(f" [green]✔[/green] Data Passport written [bold]{out_path}[/bold]\n")

    _print_summary(profile, passport, out_path, key_identifier, pub_b64)
145
+
146
+
147
@app.command()
def monitor(
    connect: str = typer.Option(..., "--connect", help="Database connection string"),
    upload_url: str = typer.Option(..., "--upload-url",
                                   help="Platform base URL, e.g. https://app.datarevenue.io"),
    token: str = typer.Option(..., "--token", help="Monitor API token (X-API-Key)"),
    license: str = typer.Option("", "--license", help="Monitor license key"),
    schedule: str = typer.Option("quarterly", "--schedule",
                                 help="monthly | quarterly (for the printed scheduling hint)"),
) -> None:
    """Re-scan and auto-upload the Data Passport (one run of a Monitor schedule).

    Scans the database, POSTs the Passport JSON to
    ``<upload_url>/api/v1/passports/upload`` with the token in ``X-API-Key``,
    then prints a hint for scheduling the same command.

    Args:
        connect: Database connection string.
        upload_url: Platform base URL; must be http:// or https://.
        token: API token sent as the ``X-API-Key`` header.
        license: License key; only echoed in the closing panel.
        schedule: ``monthly`` selects the monthly hint; any other value keeps
            the quarterly hint.

    Raises:
        typer.Exit: with code 1 on an invalid upload URL, a failed scan, or a
            failed upload.
    """
    import json as _json
    import urllib.request

    from rich.markup import escape

    console.print(f"[bold]DRC Monitor[/bold] v{AGENT_VERSION} — scheduled re-scan")

    # urlopen() also understands file:// and ftp://; refuse anything that is not
    # HTTP(S) before spending time on the scan.
    target = urlparse(upload_url)
    if target.scheme.lower() not in ("http", "https") or not target.netloc:
        console.print(
            f"✗ Invalid --upload-url '{upload_url}': expected an http(s):// base URL",
            style="red", markup=False)
        raise typer.Exit(code=1)

    try:
        passport, profile, _, _, _ = _scan_and_build(connect)
    except Exception as exc:
        # markup=False: exception text must not be parsed as Rich markup.
        console.print(f"✗ Scan failed: {exc}", style="red", markup=False)
        raise typer.Exit(code=1)

    endpoint = upload_url.rstrip("/") + "/api/v1/passports/upload"
    body = _json.dumps(passport.model_dump(mode="json")).encode("utf-8")
    req = urllib.request.Request(
        endpoint, data=body, method="POST",
        headers={"Content-Type": "application/json", "X-API-Key": token},
    )
    try:
        with urllib.request.urlopen(req, timeout=120) as resp:
            result = _json.loads(resp.read().decode("utf-8"))
        # A JSON array or scalar would otherwise crash on .get() below with a
        # traceback instead of a clean failure message.
        if not isinstance(result, dict):
            raise ValueError("unexpected response body (expected a JSON object)")
    except Exception as exc:
        console.print(f"✗ Upload failed: {exc}", style="red", markup=False)
        raise typer.Exit(code=1)

    # Make the printed cron/schtasks lines follow --schedule; previously they
    # were always the quarterly ones.
    if schedule.strip().lower() == "monthly":
        cadence, cron_expr, month_step = "monthly", "0 2 1 * *", 1
    else:
        cadence, cron_expr, month_step = "quarterly", "0 2 1 */3 *", 3

    # User-supplied values are escaped so they print literally inside markup.
    shown_schedule = escape(schedule)
    shown_license = escape(license) if license else "(none)"
    shown_url = escape(upload_url)

    console.print(f" [green]✔[/green] Uploaded passport {result.get('passport_id')} "
                  f"(records: {profile.metrics.record_count:,})")
    console.print(Panel(
        f"Schedule: [bold]{shown_schedule}[/bold] License: [dim]{shown_license}[/dim]\n\n"
        f"To automate this, add a {shown_schedule} task that runs this exact command:\n"
        f"[dim]Linux/macOS (cron, {cadence} @ 2am day 1):[/dim]\n"
        f" {cron_expr} drc-scan monitor --connect ... --upload-url {shown_url} --token ...\n"
        f"[dim]Windows (Task Scheduler):[/dim]\n"
        f" schtasks /create /tn DRC-Monitor /sc monthly /mo {month_step} /tr \"drc-scan monitor ...\"",
        title="Re-scan uploaded", border_style="green",
    ))
192
+
193
+
194
def _print_summary(profile, passport, out_path, key_identifier, pub_b64) -> None:
    """Print the post-scan report to the console.

    Output, in order: the eight largest tables by row count, the measured
    quality scores, a scan summary panel, the preliminary indicative range
    panel, and the agent's public key.

    Args:
        profile: Profiling result; ``metrics``, ``inventory``, ``suggestions``,
            ``pii_findings`` and ``indicative`` are read.
        passport: Built Passport; only ``integrity.signing_key_id`` is read.
        out_path: Path the Passport was written to (shown in the upload hint).
        key_identifier: Identifier of the signing key.
        pub_b64: Base64 public key printed on the last line.
    """
    m = profile.metrics

    # Top tables by row count
    tbl = Table(title="Inventory Summary", border_style="cyan")
    tbl.add_column("Table")
    tbl.add_column("Rows", justify="right")
    tbl.add_column("Columns", justify="right")
    for t in sorted(profile.inventory.tables, key=lambda t: t.rows, reverse=True)[:8]:
        tbl.add_row(t.name, f"{t.rows:,}", str(t.columns))
    console.print(tbl)

    # Quality metrics
    def pct(x):
        # None means "not measured"; 0 is a real score and must be shown.
        return f"{x:.0%}" if x is not None else "—"

    q = Table(title="Measured Quality (DQF)", border_style="cyan")
    q.add_column("Dimension")
    q.add_column("Score", justify="right")
    q.add_row("Completeness", pct(m.completeness))
    q.add_row("Accuracy (floor)", pct(m.accuracy_floor))
    q.add_row("Consistency", pct(m.consistency))
    q.add_row("Timeliness", pct(m.timeliness))
    q.add_row("Uniqueness", pct(m.uniqueness))
    console.print(q)

    cat = profile.suggestions.get("dataset_category")
    cat_str = f"{cat.value} ({cat.confidence:.0%})" if cat and cat.value else "undetermined"
    rng = profile.indicative

    # Same rule as pct(): only a missing value becomes a dash. The previous
    # `value or '—'` also hid a vintage of 0 years.
    vintage = m.data_vintage_years
    vintage_str = "—" if vintage is None else vintage

    console.print(Panel(
        f"Records: [bold]{m.record_count:,}[/bold] Attributes: [bold]{m.attribute_count}[/bold] "
        f"Tables: [bold]{m.table_count}[/bold]\n"
        f"Suggested category: [bold]{cat_str}[/bold] "
        f"Vintage: [bold]{vintage_str}[/bold] yrs\n"
        f"PII findings: [bold]{len(profile.pii_findings)}[/bold] "
        f"Signed: [green]{passport.integrity.signing_key_id}[/green]",
        title="Scan complete",
        border_style="green",
    ))

    console.print(Panel(
        f"[bold]${rng.low:,} – ${rng.high:,}[/bold]\n[dim]{rng.disclaimer}[/dim]\n\n"
        f"Upload [bold]{out_path}[/bold] to app.datarevenue.io to auto-fill your "
        f"valuation questionnaire.",
        title="Preliminary Indicative Range",
        border_style="yellow",
    ))

    console.print(
        f"[dim]Agent public key ({key_identifier}): {pub_b64}[/dim]"
    )
246
+
247
+
248
# Invoke the Typer app when this module is executed directly.
if __name__ == "__main__":
    app()
@@ -0,0 +1,50 @@
1
+ from urllib.parse import urlparse
2
+
3
+ from .base import (
4
+ AbstractConnector,
5
+ ColumnMeta,
6
+ ColumnStats,
7
+ ForeignKey,
8
+ TableRef,
9
+ )
10
+ from .guard import ReadOnlyViolation, assert_read_only
11
+
12
# scheme prefix → (module, class). Lazy import keeps optional drivers optional.
# Module names are relative to drc_scanner.connectors; several schemes are
# aliases that resolve to the same connector class.
_SCHEME_MAP = {
    "postgresql": ("postgres", "PostgresConnector"),
    "postgres": ("postgres", "PostgresConnector"),
    "mysql": ("dialect_connector", "MySQLConnector"),
    "mariadb": ("dialect_connector", "MySQLConnector"),
    "mssql": ("dialect_connector", "SQLServerConnector"),
    "sqlserver": ("dialect_connector", "SQLServerConnector"),
    "snowflake": ("dialect_connector", "SnowflakeConnector"),
    "bigquery": ("bigquery_conn", "BigQueryConnector"),
}
23
+
24
+
25
def connector_for(conninfo: str) -> AbstractConnector:
    """Instantiate the connector that matches a connection string's scheme.

    The scheme is lowercased and any ``+driver`` suffix is ignored, so
    ``postgresql+psycopg://...`` resolves like ``postgresql://...``. The
    connector module is imported only at this point.

    Raises:
        ValueError: if the scheme has no entry in ``_SCHEME_MAP``.
    """
    import importlib

    scheme = urlparse(conninfo).scheme.lower().partition("+")[0]
    if scheme not in _SCHEME_MAP:
        supported = sorted(_SCHEME_MAP)
        raise ValueError(
            f"Unsupported database scheme '{scheme}'. Supported: {supported}"
        )

    module_name, class_name = _SCHEME_MAP[scheme]
    connector_module = importlib.import_module(f"drc_scanner.connectors.{module_name}")
    connector_cls = getattr(connector_module, class_name)
    return connector_cls(conninfo)
39
+
40
+
41
# Public names of the connectors package: the types and guard helpers imported
# above, plus the connector_for factory defined in this module.
__all__ = [
    "AbstractConnector",
    "ColumnMeta",
    "ColumnStats",
    "ForeignKey",
    "TableRef",
    "ReadOnlyViolation",
    "assert_read_only",
    "connector_for",
]
@@ -0,0 +1,124 @@
1
+ """Abstract connector interface shared by every database backend.
2
+
3
+ A connector is a thin, read-only window onto one database. The profiling engine
4
+ depends only on this interface, so adding a new database (MySQL, Snowflake, ...)
5
+ never touches the profiler.
6
+
7
+ Privacy note: ``column_aggregate`` may return MIN/MAX for timestamp and numeric
8
+ columns (needed for data-vintage and plausibility), but the profiling engine
9
+ never writes raw min/max values into the Passport — only derived numbers. Raw
10
+ sample rows fetched via ``sample_rows`` are processed in scanner memory and
11
+ discarded; they never leave the machine.
12
+ """
13
+ from __future__ import annotations
14
+
15
+ from abc import ABC, abstractmethod
16
+ from dataclasses import dataclass, field
17
+ from typing import Any, Optional
18
+
19
# System schemas excluded from scans by default across backends.
# Used as the default value of AbstractConnector.list_tables(excluded_schemas=...).
DEFAULT_EXCLUDED_SCHEMAS = frozenset({
    "pg_catalog", "information_schema", "pg_toast",
    "sys", "mysql", "performance_schema",
})
24
+
25
+
26
@dataclass(frozen=True)
class TableRef:
    """Immutable (schema, name) pair identifying one table."""

    schema: str
    name: str

    @property
    def fqn(self) -> str:
        """Return the ``schema.table`` display label.

        The label is not quoted or escaped, so it is for display only and must
        not be interpolated into SQL.
        """
        return "{}.{}".format(self.schema, self.name)
36
+
37
+
38
@dataclass
class ColumnMeta:
    """Catalog metadata for one column."""

    name: str
    data_type: str  # type name as reported by the backend
    nullable: bool
43
+
44
+
45
@dataclass
class ColumnStats:
    """Aggregate statistics for a single column."""

    total: int
    null_count: int
    distinct_count: Optional[int] = None
    # min_value / max_value are INTERNAL-only inputs (timestamp vintage, numeric
    # plausibility). The engine must never persist these raw values into the
    # Passport.
    min_value: Optional[Any] = None
    max_value: Optional[Any] = None
54
+
55
+
56
@dataclass
class ForeignKey:
    """A declared foreign key: a local column and the column it references."""

    column: str      # referencing column in the owning table
    ref_schema: str  # schema of the referenced table
    ref_table: str   # referenced table
    ref_column: str  # referenced column
62
+
63
+
64
class AbstractConnector(ABC):
    """Read-only contract that every database backend implements.

    Implementations must route every query through
    ``drc_scanner.connectors.guard.assert_read_only`` and connect with a
    read-only session. Instances are context managers: entering connects,
    leaving closes.
    """

    db_type: str = "unknown"

    @abstractmethod
    def connect(self) -> None:
        """Open the connection to the backend."""

    @abstractmethod
    def close(self) -> None:
        """Release the connection."""

    @abstractmethod
    def server_version(self) -> str:
        """Return the backend's version string."""

    @abstractmethod
    def list_tables(self, excluded_schemas: frozenset[str] = DEFAULT_EXCLUDED_SCHEMAS
                    ) -> list[TableRef]:
        """Return the tables to scan; ``excluded_schemas`` names schemas to leave out."""

    @abstractmethod
    def get_columns(self, table: TableRef) -> list[ColumnMeta]:
        """Return column metadata for ``table``."""

    @abstractmethod
    def count_rows(self, table: TableRef) -> int:
        """Return the number of rows in ``table``."""

    # ── Profiling extensions (Step 2) ─────────────────────────────────────────

    @abstractmethod
    def column_aggregate(self, table: TableRef, column: ColumnMeta) -> ColumnStats:
        """Return pushed-down COUNT / COUNT(DISTINCT) / MIN / MAX for one column.

        MIN/MAX only for timestamp and numeric columns (privacy).
        """

    @abstractmethod
    def sample_rows(self, table: TableRef, columns: list[str], n: int) -> list[dict]:
        """Return up to ``n`` rows for local-only profiling (PII, accuracy,
        uniqueness). Never persisted to the Passport."""

    @abstractmethod
    def count_since(self, table: TableRef, ts_column: str, days: int) -> int:
        """Return how many rows have ``ts_column`` within the trailing ``days`` window."""

    @abstractmethod
    def monthly_counts(self, table: TableRef, ts_column: str) -> list[tuple[str, int]]:
        """Return ``(month_iso, row_count)`` buckets for growth/cadence analysis."""

    @abstractmethod
    def top_group_counts(self, table: TableRef, column: str, n: int) -> list[int]:
        """Return the descending row counts of the top-``n`` groups.

        VALUES DISCARDED — only the counts are returned, never the grouping
        keys (privacy).
        """

    def list_foreign_keys(self, table: TableRef) -> list[ForeignKey]:
        """Return declared foreign keys; empty by default for backends that
        don't expose them."""
        return []

    def __enter__(self) -> "AbstractConnector":
        self.connect()
        return self

    def __exit__(self, *exc_info) -> None:
        # Returns None, so an exception raised inside the with-block propagates.
        self.close()
@@ -0,0 +1,96 @@
1
+ """BigQuery connector.
2
+
3
+ BigQuery is not a DBAPI engine — it runs queries through a client and scopes
4
+ catalog introspection to a dataset's INFORMATION_SCHEMA — so it implements
5
+ AbstractConnector directly while reusing BigQueryDialect for SQL strings.
6
+
7
+ Connection string: bigquery://<project>/<dataset>
8
+ Auth uses Application Default Credentials (GOOGLE_APPLICATION_CREDENTIALS).
9
+
10
+ NOT yet verified against a live BigQuery project — smoke-test before production.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ from urllib.parse import urlparse
15
+
16
+ from .base import DEFAULT_EXCLUDED_SCHEMAS, AbstractConnector, ColumnMeta, ColumnStats, TableRef
17
+ from .dialect import BigQueryDialect
18
+ from .guard import assert_read_only
19
+
20
+
21
class BigQueryConnector(AbstractConnector):
    """Connector for a single BigQuery dataset.

    Parses ``bigquery://<project>/<dataset>``, builds its SQL through
    ``BigQueryDialect``, and runs every statement through one guarded path
    (connection check, then ``assert_read_only``).
    """

    db_type = "bigquery"

    def __init__(self, conninfo: str):
        """Parse project (URL host) and dataset (first path segment); no I/O."""
        p = urlparse(conninfo)
        self.project = p.hostname or ""
        self.dataset = (p.path or "/").lstrip("/").split("/")[0]
        self.dialect = BigQueryDialect(self.project, self.dataset)
        self._client = None

    def connect(self) -> None:
        """Create the BigQuery client for ``self.project``."""
        from google.cloud import bigquery  # lazy: optional dependency
        self._client = bigquery.Client(project=self.project)

    def close(self) -> None:
        """Close the client if one is open; safe to call when not connected."""
        if self._client is not None:
            self._client.close()
            self._client = None

    def _query(self, sql: str):
        """Run one statement and return the client's result rows.

        Raises:
            RuntimeError: if ``connect()`` has not been called.
        """
        if self._client is None:
            raise RuntimeError("Connector is not connected")
        assert_read_only(sql)
        return self._client.query(sql).result()

    def _rows(self, sql: str) -> list[tuple]:
        """Run ``sql`` and return each result row as a tuple of values."""
        return [tuple(row.values()) for row in self._query(sql)]

    def server_version(self) -> str:
        """Return the fixed label ``"bigquery"``; no version is queried."""
        return "bigquery"

    def list_tables(self, excluded_schemas: frozenset[str] = DEFAULT_EXCLUDED_SCHEMAS
                    ) -> list[TableRef]:
        """Return the tables produced by the dialect's listing query."""
        # NOTE(review): ``excluded_schemas`` is accepted but not applied here —
        # confirm that is intentional for dataset-scoped listing.
        rows = self._rows(self.dialect.list_tables_sql())
        return [TableRef(schema=str(s), name=str(n)) for (s, n) in rows]

    def get_columns(self, table: TableRef) -> list[ColumnMeta]:
        """Return column metadata; a column is nullable when flagged ``YES``."""
        rows = self._rows(self.dialect.columns_sql(table.schema, table.name))
        return [ColumnMeta(name=str(c), data_type=str(t),
                           nullable=str(nul).upper() == "YES")
                for (c, t, nul) in rows]

    def count_rows(self, table: TableRef) -> int:
        """Return the table's row count, or 0 if the query yields no row."""
        rows = self._rows(self.dialect.count_rows_sql(table.schema, table.name))
        return int(rows[0][0]) if rows else 0

    def column_aggregate(self, table: TableRef, column: ColumnMeta) -> ColumnStats:
        """Return counts plus distinct/min/max, falling back to counts only.

        If the full aggregate fails for any reason, a plain total/non-null
        count is run instead and distinct/min/max are left as ``None``.
        """
        # NOTE(review): do_minmax=True is passed for every column type, while
        # the AbstractConnector contract says MIN/MAX only for timestamp and
        # numeric columns (privacy) — confirm the dialect restricts this.
        try:
            sql = self.dialect.column_aggregate_sql(
                table.schema, table.name, column.name, do_minmax=True, do_distinct=True)
            total, non_null, distinct, mn, mx = self._rows(sql)[0]
        except Exception:
            # Deliberate best effort. If the cause was "not connected" or a
            # read-only violation, the fallback query raises it again.
            sql = self.dialect.column_counts_sql(table.schema, table.name, column.name)
            total, non_null = self._rows(sql)[0]
            distinct = mn = mx = None
        return ColumnStats(
            total=int(total), null_count=int(total) - int(non_null),
            distinct_count=int(distinct) if distinct is not None else None,
            min_value=mn, max_value=mx,
        )

    def sample_rows(self, table: TableRef, columns: list[str], n: int) -> list[dict]:
        """Return up to ``n`` rows as dicts; an empty column list yields ``[]``."""
        if not columns:
            return []
        sql = self.dialect.sample_sql(table.schema, table.name, columns, n)
        # Goes through _query() so a missing connection raises the same
        # RuntimeError as every other method, not AttributeError on None.
        return [dict(row.items()) for row in self._query(sql)]

    def count_since(self, table: TableRef, ts_column: str, days: int) -> int:
        """Return the count from the dialect's trailing-window query (0 if no row)."""
        rows = self._rows(self.dialect.count_since_sql(table.schema, table.name, ts_column, days))
        return int(rows[0][0]) if rows else 0

    def monthly_counts(self, table: TableRef, ts_column: str) -> list[tuple[str, int]]:
        """Return ``(month, count)`` pairs from the dialect's monthly query."""
        rows = self._rows(self.dialect.monthly_counts_sql(table.schema, table.name, ts_column))
        return [(str(m), int(c)) for (m, c) in rows]

    def top_group_counts(self, table: TableRef, column: str, n: int) -> list[int]:
        """Return the counts from the dialect's top-groups query, one per row."""
        rows = self._rows(self.dialect.top_groups_sql(table.schema, table.name, column, n))
        return [int(c) for (c,) in rows]