drc-scanner 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. drc_scanner-0.2.0/PKG-INFO +93 -0
  2. drc_scanner-0.2.0/README.md +56 -0
  3. drc_scanner-0.2.0/drc_scanner/__init__.py +7 -0
  4. drc_scanner-0.2.0/drc_scanner/cli.py +249 -0
  5. drc_scanner-0.2.0/drc_scanner/connectors/__init__.py +50 -0
  6. drc_scanner-0.2.0/drc_scanner/connectors/base.py +124 -0
  7. drc_scanner-0.2.0/drc_scanner/connectors/bigquery_conn.py +96 -0
  8. drc_scanner-0.2.0/drc_scanner/connectors/dialect.py +218 -0
  9. drc_scanner-0.2.0/drc_scanner/connectors/dialect_connector.py +170 -0
  10. drc_scanner-0.2.0/drc_scanner/connectors/guard.py +86 -0
  11. drc_scanner-0.2.0/drc_scanner/connectors/postgres.py +211 -0
  12. drc_scanner-0.2.0/drc_scanner/indicative.py +40 -0
  13. drc_scanner-0.2.0/drc_scanner/passport/__init__.py +29 -0
  14. drc_scanner-0.2.0/drc_scanner/passport/builder.py +138 -0
  15. drc_scanner-0.2.0/drc_scanner/passport/models.py +109 -0
  16. drc_scanner-0.2.0/drc_scanner/passport/signing.py +87 -0
  17. drc_scanner-0.2.0/drc_scanner/pii/__init__.py +4 -0
  18. drc_scanner-0.2.0/drc_scanner/pii/detector.py +104 -0
  19. drc_scanner-0.2.0/drc_scanner/pii/patterns.py +115 -0
  20. drc_scanner-0.2.0/drc_scanner/profiling/__init__.py +4 -0
  21. drc_scanner-0.2.0/drc_scanner/profiling/concentration.py +57 -0
  22. drc_scanner-0.2.0/drc_scanner/profiling/engine.py +144 -0
  23. drc_scanner-0.2.0/drc_scanner/profiling/heuristics.py +63 -0
  24. drc_scanner-0.2.0/drc_scanner/profiling/inventory.py +51 -0
  25. drc_scanner-0.2.0/drc_scanner/profiling/model.py +31 -0
  26. drc_scanner-0.2.0/drc_scanner/profiling/statistics.py +130 -0
  27. drc_scanner-0.2.0/drc_scanner/profiling/suggestions.py +92 -0
  28. drc_scanner-0.2.0/drc_scanner/profiling/timeseries.py +118 -0
  29. drc_scanner-0.2.0/drc_scanner.egg-info/PKG-INFO +93 -0
  30. drc_scanner-0.2.0/drc_scanner.egg-info/SOURCES.txt +41 -0
  31. drc_scanner-0.2.0/drc_scanner.egg-info/dependency_links.txt +1 -0
  32. drc_scanner-0.2.0/drc_scanner.egg-info/entry_points.txt +2 -0
  33. drc_scanner-0.2.0/drc_scanner.egg-info/requires.txt +27 -0
  34. drc_scanner-0.2.0/drc_scanner.egg-info/top_level.txt +1 -0
  35. drc_scanner-0.2.0/pyproject.toml +62 -0
  36. drc_scanner-0.2.0/setup.cfg +4 -0
  37. drc_scanner-0.2.0/tests/test_dialects.py +86 -0
  38. drc_scanner-0.2.0/tests/test_guard.py +74 -0
  39. drc_scanner-0.2.0/tests/test_metrics.py +79 -0
  40. drc_scanner-0.2.0/tests/test_passport.py +77 -0
  41. drc_scanner-0.2.0/tests/test_pii.py +77 -0
  42. drc_scanner-0.2.0/tests/test_security.py +110 -0
  43. drc_scanner-0.2.0/tests/test_signing.py +41 -0
@@ -0,0 +1,93 @@
1
+ Metadata-Version: 2.4
2
+ Name: drc-scanner
3
+ Version: 0.2.0
4
+ Summary: Data Revenue Connecter — read-only database profiling agent that builds a signed Data Passport
5
+ Author: Data Revenue Group
6
+ License: Proprietary
7
+ Project-URL: Homepage, https://app.datarevenue.io
8
+ Keywords: data valuation,database profiling,data passport,read-only
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Topic :: Database
13
+ Classifier: Intended Audience :: Information Technology
14
+ Requires-Python: >=3.11
15
+ Description-Content-Type: text/markdown
16
+ Requires-Dist: typer>=0.12.0
17
+ Requires-Dist: rich>=13.7.0
18
+ Requires-Dist: pydantic>=2.7.0
19
+ Requires-Dist: psycopg[binary]>=3.1.0
20
+ Requires-Dist: cryptography>=42.0.0
21
+ Provides-Extra: dev
22
+ Requires-Dist: pytest>=8.2.0; extra == "dev"
23
+ Requires-Dist: ruff>=0.4.0; extra == "dev"
24
+ Provides-Extra: mysql
25
+ Requires-Dist: mysql-connector-python>=8.4.0; extra == "mysql"
26
+ Provides-Extra: sqlserver
27
+ Requires-Dist: pyodbc>=5.1.0; extra == "sqlserver"
28
+ Provides-Extra: snowflake
29
+ Requires-Dist: snowflake-connector-python>=3.10.0; extra == "snowflake"
30
+ Provides-Extra: bigquery
31
+ Requires-Dist: google-cloud-bigquery>=3.20.0; extra == "bigquery"
32
+ Provides-Extra: all-connectors
33
+ Requires-Dist: mysql-connector-python>=8.4.0; extra == "all-connectors"
34
+ Requires-Dist: pyodbc>=5.1.0; extra == "all-connectors"
35
+ Requires-Dist: snowflake-connector-python>=3.10.0; extra == "all-connectors"
36
+ Requires-Dist: google-cloud-bigquery>=3.20.0; extra == "all-connectors"
37
+
38
+ # DRC Scanner — Data Revenue Connecter
39
+
40
+ A **read-only** database profiling agent. It runs inside your own network, profiles your
41
+ database, and produces a small, human-readable, signed **Data Passport** — a JSON file of
42
+ aggregate statistics only. **Your raw data never leaves your machine.** You then upload the
43
+ Passport to app.datarevenue.io to auto-fill your data valuation questionnaire.
44
+
45
+ ## What leaves your network
46
+
47
+ Nothing during the scan. The scanner makes **zero network calls** while profiling. The only
48
+ artifact produced is the Data Passport (`drc_passport_<date>.json`) — column names, row
49
+ counts, and aggregate metrics. Never row values, never sample data. You can inspect the
50
+ entire file before uploading it.
51
+
52
+ ## Install & run
53
+
54
+ ```bash
55
+ # Option 1 — pip
56
+ pip install drc-scanner
57
+ drc-scan run --connect postgresql://readonly@host:5432/proddb
58
+
59
+ # Option 2 — Docker (no Python required)
60
+ docker run -it datarevenue/scanner \
61
+ --connect postgresql://readonly@host.docker.internal:5432/proddb
62
+ ```
63
+
64
+ Output:
65
+
66
+ ```
67
+ drc_passport_2026-06-11.json (the Data Passport)
68
+ + on-screen inventory summary and upload instructions
69
+ ```
70
+
71
+ ## Create a read-only database user first
72
+
73
+ ```bash
74
+ drc-scan setup --db postgres
75
+ ```
76
+
77
+ This prints the exact SQL to create a least-privilege, read-only role so the scanner can
78
+ never modify your data — copy-paste it for your DBA.
79
+
80
+ ## Safety guarantees
81
+
82
+ - **Read-only enforced in code.** Every query passes through a verb allow-list; `INSERT`,
83
+ `UPDATE`, `DELETE`, and all DDL raise a hard error *before* reaching the database.
84
+ - **No raw export.** The Passport builder rejects any field containing row-level values.
85
+ - **Offline scan.** No network egress during profiling — verifiable with a firewall rule.
86
+ - **Signed & hashed.** Each Passport carries a SHA-256 content hash and a signature from
87
+ the agent's local signing key so the platform can detect tampering.
88
+
89
+ ## Status
90
+
91
+ v0.2: Postgres, MySQL/MariaDB, SQL Server, Snowflake and BigQuery connectors, table/column
92
+ inventory, record counts, quality metrics, PII detection, and a signed Passport with
93
+ content hash. The BigQuery connector has not yet been verified against a live project.
@@ -0,0 +1,56 @@
1
+ # DRC Scanner — Data Revenue Connecter
2
+
3
+ A **read-only** database profiling agent. It runs inside your own network, profiles your
4
+ database, and produces a small, human-readable, signed **Data Passport** — a JSON file of
5
+ aggregate statistics only. **Your raw data never leaves your machine.** You then upload the
6
+ Passport to app.datarevenue.io to auto-fill your data valuation questionnaire.
7
+
8
+ ## What leaves your network
9
+
10
+ Nothing during the scan. The scanner makes **zero network calls** while profiling. The only
11
+ artifact produced is the Data Passport (`drc_passport_<date>.json`) — column names, row
12
+ counts, and aggregate metrics. Never row values, never sample data. You can inspect the
13
+ entire file before uploading it.
14
+
15
+ ## Install & run
16
+
17
+ ```bash
18
+ # Option 1 — pip
19
+ pip install drc-scanner
20
+ drc-scan run --connect postgresql://readonly@host:5432/proddb
21
+
22
+ # Option 2 — Docker (no Python required)
23
+ docker run -it datarevenue/scanner \
24
+ --connect postgresql://readonly@host.docker.internal:5432/proddb
25
+ ```
26
+
27
+ Output:
28
+
29
+ ```
30
+ drc_passport_2026-06-11.json (the Data Passport)
31
+ + on-screen inventory summary and upload instructions
32
+ ```
33
+
34
+ ## Create a read-only database user first
35
+
36
+ ```bash
37
+ drc-scan setup --db postgres
38
+ ```
39
+
40
+ This prints the exact SQL to create a least-privilege, read-only role so the scanner can
41
+ never modify your data — copy-paste it for your DBA.
42
+
43
+ ## Safety guarantees
44
+
45
+ - **Read-only enforced in code.** Every query passes through a verb allow-list; `INSERT`,
46
+ `UPDATE`, `DELETE`, and all DDL raise a hard error *before* reaching the database.
47
+ - **No raw export.** The Passport builder rejects any field containing row-level values.
48
+ - **Offline scan.** No network egress during profiling — verifiable with a firewall rule.
49
+ - **Signed & hashed.** Each Passport carries a SHA-256 content hash and a signature from
50
+ the agent's local signing key so the platform can detect tampering.
51
+
52
+ ## Status
53
+
54
+ v0.2: Postgres, MySQL/MariaDB, SQL Server, Snowflake and BigQuery connectors, table/column
55
+ inventory, record counts, quality metrics, PII detection, and a signed Passport with
56
+ content hash. The BigQuery connector has not yet been verified against a live project.
@@ -0,0 +1,7 @@
1
+ """DRC Scanner — Data Revenue Connecter.
2
+
3
+ A read-only database profiling agent that builds a signed Data Passport.
4
+ """
5
+
6
+ __version__ = "0.2.0"
7
+ AGENT_VERSION = __version__
@@ -0,0 +1,249 @@
1
+ """drc-scan — the Data Revenue Connecter command-line interface."""
2
+ from __future__ import annotations
3
+
4
+ import hashlib
5
+ import sys
6
+ from datetime import datetime, timezone
7
+ from pathlib import Path
8
+ from typing import Optional
9
+ from urllib.parse import urlparse
10
+
11
+ # Force UTF-8 output so status glyphs render on Windows consoles (cp1252) and
12
+ # when piped. errors="replace" guarantees we never crash on an exotic terminal.
13
+ for _stream in (sys.stdout, sys.stderr):
14
+ try:
15
+ _stream.reconfigure(encoding="utf-8", errors="replace") # type: ignore[attr-defined]
16
+ except (AttributeError, ValueError):
17
+ pass
18
+
19
+ import typer
20
+ from rich.console import Console
21
+ from rich.panel import Panel
22
+ from rich.table import Table
23
+
24
+ from drc_scanner import AGENT_VERSION
25
+ from drc_scanner.connectors import connector_for
26
+ from drc_scanner.passport import signing
27
+ from drc_scanner.passport.builder import build_passport, write_passport
28
+ from drc_scanner.profiling.engine import run_full_profile
29
+
30
+ _SIGNING_KEY_PATH = Path.home() / ".drc" / "signing_key.b64"
31
+
32
+ app = typer.Typer(
33
+ add_completion=False,
34
+ help="Data Revenue Connecter — read-only database profiling. Your data never leaves your machine.",
35
+ )
36
+ console = Console()
37
+
38
+ _READONLY_ROLE_SQL = {
39
+ "postgres": """-- Run as a Postgres superuser. Replace the password and database name.
40
+ CREATE ROLE drc_readonly WITH LOGIN PASSWORD 'change-this-password';
41
+ GRANT CONNECT ON DATABASE your_database TO drc_readonly;
42
+ GRANT USAGE ON SCHEMA public TO drc_readonly;
43
+ GRANT SELECT ON ALL TABLES IN SCHEMA public TO drc_readonly;
44
+ -- Ensure future tables are readable too:
45
+ ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO drc_readonly;
46
+ -- Then connect the scanner with:
47
+ -- drc-scan run --connect postgresql://drc_readonly:change-this-password@HOST:5432/your_database""",
48
+ }
49
+
50
+
51
def _host_fingerprint(conn: str) -> str:
    """Return a truncated SHA-256 fingerprint of the connection string's host.

    Only the hostname component of ``conn`` is hashed (prefixed with ``drc:``),
    so the real hostname is never recorded. When the string has no hostname
    the literal ``unknown`` is hashed instead.

    Returns:
        ``sha256:`` followed by the first 32 hex digits of the digest.
    """
    hostname = urlparse(conn).hostname
    if not hostname:
        hostname = "unknown"
    hasher = hashlib.sha256()
    hasher.update(("drc:" + hostname).encode("utf-8"))
    return "sha256:" + hasher.hexdigest()[:32]
56
+
57
+
58
+ @app.command()
59
+ def version() -> None:
60
+ """Print the agent version."""
61
+ console.print(f"DRC Scanner v{AGENT_VERSION}")
62
+
63
+
64
@app.command()
def setup(db: str = typer.Option("postgres", help="Database type: postgres")) -> None:
    """Print the SQL to create a least-privilege, read-only role for the scanner."""
    # Template lookup is case-insensitive; only keys of _READONLY_ROLE_SQL exist.
    template = _READONLY_ROLE_SQL.get(db.lower())
    if template is not None:
        console.print(Panel(template, title=f"Read-only role for {db}", border_style="cyan"))
        return
    # Unknown database type: report it and exit with a non-zero status.
    console.print(f"[red]No setup template for '{db}'. Supported: postgres[/red]")
    raise typer.Exit(code=1)
72
+
73
+
74
def _scan_and_build(connect: str, on_progress=None):
    """Run one read-only scan and wrap the result in a signed Passport.

    Shared by the ``run`` and ``monitor`` commands so both follow the same
    connect -> profile -> sign -> build sequence.

    Args:
        connect: Database connection string; its scheme selects the connector.
        on_progress: Optional callback forwarded to ``run_full_profile`` as
            ``on_table``.

    Returns:
        ``(passport, profile, db_version, key_identifier, pub_b64)``.
    """
    db = connector_for(connect)
    scan_started = datetime.now(timezone.utc)
    try:
        db.connect()
        server_version = db.server_version()
        profile = run_full_profile(db, on_table=on_progress)
    finally:
        # Always release the database session, even when connecting or
        # profiling raised.
        db.close()
    scan_completed = datetime.now(timezone.utc)

    # The signing key lives under the user's home directory
    # (_SIGNING_KEY_PATH); the helper loads it or creates it.
    key = signing.get_or_create_private_key(_SIGNING_KEY_PATH)
    public_key_b64 = signing.public_b64(key)
    key_identifier = signing.key_id(public_key_b64)

    def _sign_hash(content_hash: str) -> tuple[str, str]:
        """Sign the Passport content hash; return (signature, key id)."""
        signature = signing.sign(key, content_hash.encode("utf-8"))
        return signature, key_identifier

    passport_fields = dict(
        inventory=profile.inventory,
        metrics=profile.metrics,
        suggestions=profile.suggestions,
        pii_findings=profile.pii_findings,
        indicative_range=profile.indicative,
        db_type=db.db_type,
        db_version=server_version,
        host_fingerprint=_host_fingerprint(connect),
        started_at=scan_started,
        completed_at=scan_completed,
        agent_version=AGENT_VERSION,
        signer=_sign_hash,
    )
    passport = build_passport(**passport_fields)
    return passport, profile, server_version, key_identifier, public_key_b64
106
+
107
+
108
@app.command()
def run(
    connect: str = typer.Option(..., "--connect", help="Database connection string"),
    output: Optional[Path] = typer.Option(
        None, "--output", help="Passport output path (default: drc_passport_<date>.json)"
    ),
) -> None:
    """Profile the database (read-only) and write a Data Passport."""
    # NOTE: the docstring above doubles as the CLI help text, so extra
    # documentation lives in comments.
    #
    # Exception messages, table names and paths are arbitrary text. Rich parses
    # "[...]" inside printed strings as markup, which can silently drop part of
    # a message or raise MarkupError on a stray "[/tag]". Escape every dynamic
    # value before interpolating it into a markup string.
    from rich.markup import escape

    console.print(f"[bold]DRC Scanner v{AGENT_VERSION}[/bold] — Data Revenue Connecter")
    console.print("[dim]Read-only mode · No data leaves this machine[/dim]\n")

    try:
        connector_for(connect)  # validate scheme early
    except ValueError as exc:
        console.print(f"[red]{escape(str(exc))}[/red]")
        raise typer.Exit(code=1) from exc

    def _progress(table, i, total):
        # Per-table progress callback handed to the profiling engine.
        console.print(f" [dim]profiling {i}/{total}: {escape(str(table.fqn))}[/dim]")

    try:
        passport, profile, db_version, key_identifier, pub_b64 = _scan_and_build(
            connect, on_progress=_progress)
    except Exception as exc:
        # Top-level boundary: report any scan failure and exit non-zero.
        console.print(f" [red]✗ Scan failed: {escape(str(exc))}[/red]")
        raise typer.Exit(code=1) from exc

    console.print(f" [green]✔[/green] Profiled {profile.inventory.tables_scanned} tables, "
                  f"{profile.inventory.total_rows:,} rows")
    console.print(f" [green]✔[/green] PII scan complete — {len(profile.pii_findings)} finding(s)")

    # Default file name carries the current UTC date.
    out_path = output or Path(
        f"drc_passport_{datetime.now(timezone.utc).date().isoformat()}.json")
    write_passport(passport, out_path)
    console.print(
        f" [green]✔[/green] Data Passport written [bold]{escape(str(out_path))}[/bold]\n")

    _print_summary(profile, passport, out_path, key_identifier, pub_b64)
145
+
146
+
147
@app.command()
def monitor(
    connect: str = typer.Option(..., "--connect", help="Database connection string"),
    upload_url: str = typer.Option(..., "--upload-url",
                                   help="Platform base URL, e.g. https://app.datarevenue.io"),
    token: str = typer.Option(..., "--token", help="Monitor API token (X-API-Key)"),
    license: str = typer.Option("", "--license", help="Monitor license key"),
    schedule: str = typer.Option("quarterly", "--schedule",
                                 help="monthly | quarterly (for the printed scheduling hint)"),
) -> None:
    """Re-scan and auto-upload the Data Passport (one run of a Monitor schedule)."""
    # NOTE: the docstring above doubles as the CLI help text, so extra
    # documentation lives in comments. `license` shadows the builtin but is
    # part of the command's signature and is kept for compatibility.
    import json as _json
    import urllib.request

    # Dynamic values are escaped before being embedded in Rich markup so that
    # "[...]" sequences in them are printed literally.
    from rich.markup import escape

    console.print(f"[bold]DRC Monitor[/bold] v{AGENT_VERSION} — scheduled re-scan")

    # Fail fast, before the (potentially long) scan: urlopen would otherwise
    # accept non-HTTP schemes such as file://, and the API token must only ever
    # be sent to an HTTP(S) endpoint.
    if urlparse(upload_url).scheme.lower() not in ("http", "https"):
        console.print(
            f"[red]✗ Upload failed: --upload-url must be an http(s) URL, "
            f"got {escape(upload_url)}[/red]")
        raise typer.Exit(code=1)

    try:
        passport, profile, _, _, _ = _scan_and_build(connect)
    except Exception as exc:
        console.print(f"[red]✗ Scan failed: {escape(str(exc))}[/red]")
        raise typer.Exit(code=1) from exc

    endpoint = upload_url.rstrip("/") + "/api/v1/passports/upload"
    body = _json.dumps(passport.model_dump(mode="json")).encode("utf-8")
    req = urllib.request.Request(
        endpoint, data=body, method="POST",
        headers={"Content-Type": "application/json", "X-API-Key": token},
    )
    try:
        with urllib.request.urlopen(req, timeout=120) as resp:
            result = _json.loads(resp.read().decode("utf-8"))
    except Exception as exc:
        # Covers network errors, HTTP errors and a non-JSON response body.
        console.print(f"[red]✗ Upload failed: {escape(str(exc))}[/red]")
        raise typer.Exit(code=1) from exc

    # The response is expected to be a JSON object; tolerate anything else
    # instead of crashing after a successful upload.
    passport_id = result.get("passport_id") if isinstance(result, dict) else None
    console.print(f" [green]✔[/green] Uploaded passport {escape(str(passport_id))} "
                  f"(records: {profile.metrics.record_count:,})")

    # Make the printed scheduling hint match the requested cadence. Any value
    # other than "monthly" keeps the quarterly hint.
    if schedule.strip().lower() == "monthly":
        cadence, cron_expr, month_step = "monthly", "0 2 1 * *", 1
    else:
        cadence, cron_expr, month_step = "quarterly", "0 2 1 */3 *", 3

    shown_schedule = escape(schedule)
    shown_license = escape(license) if license else "(none)"
    shown_url = escape(upload_url)
    console.print(Panel(
        f"Schedule: [bold]{shown_schedule}[/bold] License: [dim]{shown_license}[/dim]\n\n"
        f"To automate this, add a {shown_schedule} task that runs this exact command:\n"
        f"[dim]Linux/macOS (cron, {cadence} @ 2am day 1):[/dim]\n"
        f" {cron_expr} drc-scan monitor --connect ... --upload-url {shown_url} --token ...\n"
        f"[dim]Windows (Task Scheduler):[/dim]\n"
        f" schtasks /create /tn DRC-Monitor /sc monthly /mo {month_step} "
        f"/tr \"drc-scan monitor ...\"",
        title="Re-scan uploaded", border_style="green",
    ))
192
+
193
+
194
def _print_summary(profile, passport, out_path, key_identifier, pub_b64) -> None:
    """Print the post-scan report to the console.

    Renders, in order: the eight largest tables by row count, the measured
    quality scores, a "Scan complete" panel, the preliminary indicative range
    with upload instructions, and the agent's public key.

    Args:
        profile: Profiling result; ``inventory``, ``metrics``, ``suggestions``,
            ``pii_findings`` and ``indicative`` are read from it.
        passport: Built Passport; only ``integrity.signing_key_id`` is read.
        out_path: Path the Passport was written to (display only).
        key_identifier: Identifier of the signing key (display only).
        pub_b64: Base64 public key (display only).
    """
    # Table names and paths are arbitrary text; escape them so "[...]" in a
    # name is printed literally instead of being parsed as Rich markup.
    from rich.markup import escape

    m = profile.metrics

    # Top tables by row count
    tbl = Table(title="Inventory Summary", border_style="cyan")
    tbl.add_column("Table")
    tbl.add_column("Rows", justify="right")
    tbl.add_column("Columns", justify="right")
    largest = sorted(profile.inventory.tables, key=lambda entry: entry.rows, reverse=True)[:8]
    for t in largest:
        tbl.add_row(escape(str(t.name)), f"{t.rows:,}", str(t.columns))
    console.print(tbl)

    # Quality metrics
    def pct(x):
        # A missing score is shown as a dash; 0 is a real score and is kept.
        return f"{x:.0%}" if x is not None else "—"

    q = Table(title="Measured Quality (DQF)", border_style="cyan")
    q.add_column("Dimension")
    q.add_column("Score", justify="right")
    q.add_row("Completeness", pct(m.completeness))
    q.add_row("Accuracy (floor)", pct(m.accuracy_floor))
    q.add_row("Consistency", pct(m.consistency))
    q.add_row("Timeliness", pct(m.timeliness))
    q.add_row("Uniqueness", pct(m.uniqueness))
    console.print(q)

    cat = profile.suggestions.get("dataset_category")
    cat_str = f"{cat.value} ({cat.confidence:.0%})" if cat and cat.value else "undetermined"
    rng = profile.indicative

    # Only a missing vintage becomes a dash; a vintage of 0 years is a real
    # value and must not be hidden (the previous `or` treated 0 as missing).
    vintage = m.data_vintage_years if m.data_vintage_years is not None else "—"

    console.print(Panel(
        f"Records: [bold]{m.record_count:,}[/bold] Attributes: [bold]{m.attribute_count}[/bold] "
        f"Tables: [bold]{m.table_count}[/bold]\n"
        f"Suggested category: [bold]{cat_str}[/bold] "
        f"Vintage: [bold]{vintage}[/bold] yrs\n"
        f"PII findings: [bold]{len(profile.pii_findings)}[/bold] "
        f"Signed: [green]{passport.integrity.signing_key_id}[/green]",
        title="Scan complete",
        border_style="green",
    ))

    console.print(Panel(
        f"[bold]${rng.low:,} – ${rng.high:,}[/bold]\n[dim]{rng.disclaimer}[/dim]\n\n"
        f"Upload [bold]{escape(str(out_path))}[/bold] to app.datarevenue.io to auto-fill your "
        f"valuation questionnaire.",
        title="Preliminary Indicative Range",
        border_style="yellow",
    ))

    console.print(
        f"[dim]Agent public key ({key_identifier}): {pub_b64}[/dim]"
    )
246
+
247
+
248
+ if __name__ == "__main__":
249
+ app()
@@ -0,0 +1,50 @@
1
+ from urllib.parse import urlparse
2
+
3
+ from .base import (
4
+ AbstractConnector,
5
+ ColumnMeta,
6
+ ColumnStats,
7
+ ForeignKey,
8
+ TableRef,
9
+ )
10
+ from .guard import ReadOnlyViolation, assert_read_only
11
+
12
+ # scheme prefix → (module, class). Lazy import keeps optional drivers optional.
13
+ _SCHEME_MAP = {
14
+ "postgresql": ("postgres", "PostgresConnector"),
15
+ "postgres": ("postgres", "PostgresConnector"),
16
+ "mysql": ("dialect_connector", "MySQLConnector"),
17
+ "mariadb": ("dialect_connector", "MySQLConnector"),
18
+ "mssql": ("dialect_connector", "SQLServerConnector"),
19
+ "sqlserver": ("dialect_connector", "SQLServerConnector"),
20
+ "snowflake": ("dialect_connector", "SnowflakeConnector"),
21
+ "bigquery": ("bigquery_conn", "BigQueryConnector"),
22
+ }
23
+
24
+
25
def connector_for(conninfo: str) -> AbstractConnector:
    """Instantiate the connector registered for ``conninfo``'s URL scheme.

    The scheme is lower-cased and anything after a ``+`` (for example a driver
    suffix) is ignored before the lookup in ``_SCHEME_MAP``. The connector
    module is imported only at this point, so optional drivers stay optional.

    Raises:
        ValueError: If the scheme has no entry in ``_SCHEME_MAP``.
    """
    import importlib

    full_scheme = urlparse(conninfo).scheme.lower()
    scheme = full_scheme.partition("+")[0]
    if scheme not in _SCHEME_MAP:
        raise ValueError(
            f"Unsupported database scheme '{scheme}'. Supported: "
            f"{sorted(_SCHEME_MAP)}"
        )
    module_name, class_name = _SCHEME_MAP[scheme]
    module = importlib.import_module("drc_scanner.connectors." + module_name)
    connector_cls = getattr(module, class_name)
    return connector_cls(conninfo)
39
+
40
+
41
+ __all__ = [
42
+ "AbstractConnector",
43
+ "ColumnMeta",
44
+ "ColumnStats",
45
+ "ForeignKey",
46
+ "TableRef",
47
+ "ReadOnlyViolation",
48
+ "assert_read_only",
49
+ "connector_for",
50
+ ]
@@ -0,0 +1,124 @@
1
+ """Abstract connector interface shared by every database backend.
2
+
3
+ A connector is a thin, read-only window onto one database. The profiling engine
4
+ depends only on this interface, so adding a new database (MySQL, Snowflake, ...)
5
+ never touches the profiler.
6
+
7
+ Privacy note: ``column_aggregate`` may return MIN/MAX for timestamp and numeric
8
+ columns (needed for data-vintage and plausibility), but the profiling engine
9
+ never writes raw min/max values into the Passport — only derived numbers. Raw
10
+ sample rows fetched via ``sample_rows`` are processed in scanner memory and
11
+ discarded; they never leave the machine.
12
+ """
13
+ from __future__ import annotations
14
+
15
+ from abc import ABC, abstractmethod
16
+ from dataclasses import dataclass, field
17
+ from typing import Any, Optional
18
+
19
+ # System schemas excluded from scans by default across backends.
20
+ DEFAULT_EXCLUDED_SCHEMAS = frozenset({
21
+ "pg_catalog", "information_schema", "pg_toast",
22
+ "sys", "mysql", "performance_schema",
23
+ })
24
+
25
+
26
@dataclass(frozen=True)
class TableRef:
    """Immutable (schema, table name) pair identifying one table."""
    schema: str
    name: str

    @property
    def fqn(self) -> str:
        """Return the ``schema.table`` display label.

        Intended for display only; it is not quoted and must not be
        interpolated into SQL.
        """
        return "{}.{}".format(self.schema, self.name)
36
+
37
+
38
+ @dataclass
39
+ class ColumnMeta:
40
+ name: str
41
+ data_type: str
42
+ nullable: bool
43
+
44
+
45
+ @dataclass
46
+ class ColumnStats:
47
+ total: int
48
+ null_count: int
49
+ distinct_count: Optional[int] = None
50
+ # min/max are for INTERNAL use only (timestamp vintage, numeric plausibility).
51
+ # The engine must never persist these raw values into the Passport.
52
+ min_value: Optional[Any] = None
53
+ max_value: Optional[Any] = None
54
+
55
+
56
+ @dataclass
57
+ class ForeignKey:
58
+ column: str
59
+ ref_schema: str
60
+ ref_table: str
61
+ ref_column: str
62
+
63
+
64
class AbstractConnector(ABC):
    """Read-only contract that every database backend implements.

    Implementations must route every query through
    ``drc_scanner.connectors.guard.assert_read_only`` and connect with a
    read-only session. Instances also work as context managers: entering
    calls ``connect()`` and leaving calls ``close()``.
    """

    # Backend label; concrete connectors override it.
    db_type: str = "unknown"

    # ── Session lifecycle ─────────────────────────────────────────────────────

    @abstractmethod
    def connect(self) -> None:
        ...

    @abstractmethod
    def close(self) -> None:
        ...

    @abstractmethod
    def server_version(self) -> str:
        ...

    # ── Inventory ─────────────────────────────────────────────────────────────

    @abstractmethod
    def list_tables(
        self, excluded_schemas: frozenset[str] = DEFAULT_EXCLUDED_SCHEMAS
    ) -> list[TableRef]:
        ...

    @abstractmethod
    def get_columns(self, table: TableRef) -> list[ColumnMeta]:
        ...

    @abstractmethod
    def count_rows(self, table: TableRef) -> int:
        ...

    # ── Profiling extensions (Step 2) ─────────────────────────────────────────

    @abstractmethod
    def column_aggregate(self, table: TableRef, column: ColumnMeta) -> ColumnStats:
        """Return pushed-down COUNT / COUNT(DISTINCT) / MIN / MAX for a column.

        For privacy, MIN/MAX are meant only for timestamp and numeric columns.
        """

    @abstractmethod
    def sample_rows(self, table: TableRef, columns: list[str], n: int) -> list[dict]:
        """Return up to ``n`` rows for local-only profiling (PII, accuracy,
        uniqueness). The rows are never persisted to the Passport."""

    @abstractmethod
    def count_since(self, table: TableRef, ts_column: str, days: int) -> int:
        """Count rows whose ``ts_column`` lies in the trailing ``days`` window."""

    @abstractmethod
    def monthly_counts(self, table: TableRef, ts_column: str) -> list[tuple[str, int]]:
        """Return ``(month_iso, row_count)`` buckets for growth/cadence analysis."""

    @abstractmethod
    def top_group_counts(self, table: TableRef, column: str, n: int) -> list[int]:
        """Return the row counts of the top-``n`` groups in descending order.

        Only the counts are returned; the grouping keys are discarded (privacy).
        """

    def list_foreign_keys(self, table: TableRef) -> list[ForeignKey]:
        """Return declared foreign keys; empty by default for backends that
        don't expose them."""
        return list()

    # ── Context-manager support ───────────────────────────────────────────────

    def __enter__(self) -> "AbstractConnector":
        self.connect()
        return self

    def __exit__(self, *exc_info) -> None:
        self.close()
@@ -0,0 +1,96 @@
1
+ """BigQuery connector.
2
+
3
+ BigQuery is not a DBAPI engine — it runs queries through a client and scopes
4
+ catalog introspection to a dataset's INFORMATION_SCHEMA — so it implements
5
+ AbstractConnector directly while reusing BigQueryDialect for SQL strings.
6
+
7
+ Connection string: bigquery://<project>/<dataset>
8
+ Auth uses Application Default Credentials (GOOGLE_APPLICATION_CREDENTIALS).
9
+
10
+ NOT yet verified against a live BigQuery project — smoke-test before production.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ from urllib.parse import urlparse
15
+
16
+ from .base import DEFAULT_EXCLUDED_SCHEMAS, AbstractConnector, ColumnMeta, ColumnStats, TableRef
17
+ from .dialect import BigQueryDialect
18
+ from .guard import assert_read_only
19
+
20
+
21
class BigQueryConnector(AbstractConnector):
    """Read-only connector for one BigQuery dataset.

    The connection string has the form ``bigquery://<project>/<dataset>``.
    SQL text comes from ``BigQueryDialect``; every statement is checked by
    ``assert_read_only`` before it is sent to the client.
    """

    db_type = "bigquery"

    def __init__(self, conninfo: str):
        """Parse project and dataset from ``conninfo``; no connection is opened."""
        p = urlparse(conninfo)
        self.project = p.hostname or ""
        # First path segment is the dataset; anything after it is ignored.
        self.dataset = (p.path or "/").lstrip("/").split("/")[0]
        self.dialect = BigQueryDialect(self.project, self.dataset)
        self._client = None

    def connect(self) -> None:
        """Create the BigQuery client for ``self.project``."""
        from google.cloud import bigquery  # lazy: optional dependency
        self._client = bigquery.Client(project=self.project)

    def close(self) -> None:
        """Close the client if one is open; safe to call repeatedly."""
        if self._client is not None:
            self._client.close()
            self._client = None

    def _execute(self, sql: str):
        """Run one guarded query and return the client's result iterable.

        Single choke point for every statement, so the connection check and
        the read-only guard cannot be bypassed by an individual method.

        Raises:
            RuntimeError: If ``connect()`` has not been called.
        """
        if self._client is None:
            raise RuntimeError("Connector is not connected")
        assert_read_only(sql)
        return self._client.query(sql).result()

    def _rows(self, sql: str) -> list[tuple]:
        """Run ``sql`` and return each result row as a tuple of its values."""
        return [tuple(row.values()) for row in self._execute(sql)]

    def server_version(self) -> str:
        """Return the fixed label ``"bigquery"``."""
        return "bigquery"

    def list_tables(self, excluded_schemas: frozenset[str] = DEFAULT_EXCLUDED_SCHEMAS
                    ) -> list[TableRef]:
        """List the tables returned by the dialect's table-listing query.

        NOTE(review): ``excluded_schemas`` is accepted for interface
        compatibility but not applied here — presumably because the dialect
        already scopes the listing to one dataset; confirm against
        ``BigQueryDialect``.
        """
        rows = self._rows(self.dialect.list_tables_sql())
        return [TableRef(schema=str(s), name=str(n)) for (s, n) in rows]

    def get_columns(self, table: TableRef) -> list[ColumnMeta]:
        """Return column metadata; a column is nullable when flagged ``YES``."""
        rows = self._rows(self.dialect.columns_sql(table.schema, table.name))
        return [ColumnMeta(name=str(c), data_type=str(t),
                           nullable=str(nul).upper() == "YES")
                for (c, t, nul) in rows]

    def count_rows(self, table: TableRef) -> int:
        """Return the table's row count, or 0 when the query yields no row."""
        rows = self._rows(self.dialect.count_rows_sql(table.schema, table.name))
        return int(rows[0][0]) if rows else 0

    def column_aggregate(self, table: TableRef, column: ColumnMeta) -> ColumnStats:
        """Return total/null/distinct/min/max statistics for one column.

        The full aggregate is attempted first. If it fails for any reason, a
        plain count query is used and distinct/min/max are reported as
        ``None``.
        """
        try:
            sql = self.dialect.column_aggregate_sql(
                table.schema, table.name, column.name, do_minmax=True, do_distinct=True)
            total, non_null, distinct, mn, mx = self._rows(sql)[0]
        except Exception:
            # Deliberate best-effort fallback: a failure of the counts query
            # below (e.g. not connected) still propagates to the caller.
            sql = self.dialect.column_counts_sql(table.schema, table.name, column.name)
            total, non_null = self._rows(sql)[0]
            distinct = mn = mx = None
        return ColumnStats(
            total=int(total), null_count=int(total) - int(non_null),
            distinct_count=int(distinct) if distinct is not None else None,
            min_value=mn, max_value=mx,
        )

    def sample_rows(self, table: TableRef, columns: list[str], n: int) -> list[dict]:
        """Return up to ``n`` sampled rows as dicts; empty when no columns given.

        Raises:
            RuntimeError: If columns are given but ``connect()`` has not been
                called (previously an ``AttributeError`` on ``None``).
        """
        if not columns:
            return []
        sql = self.dialect.sample_sql(table.schema, table.name, columns, n)
        return [dict(row.items()) for row in self._execute(sql)]

    def count_since(self, table: TableRef, ts_column: str, days: int) -> int:
        """Count rows in the trailing ``days`` window, or 0 when no row returns."""
        rows = self._rows(self.dialect.count_since_sql(table.schema, table.name, ts_column, days))
        return int(rows[0][0]) if rows else 0

    def monthly_counts(self, table: TableRef, ts_column: str) -> list[tuple[str, int]]:
        """Return ``(month, row_count)`` pairs from the dialect's monthly query."""
        rows = self._rows(self.dialect.monthly_counts_sql(table.schema, table.name, ts_column))
        return [(str(m), int(c)) for (m, c) in rows]

    def top_group_counts(self, table: TableRef, column: str, n: int) -> list[int]:
        """Return the group counts produced by the dialect's top-groups query."""
        rows = self._rows(self.dialect.top_groups_sql(table.schema, table.name, column, n))
        return [int(c) for (c,) in rows]