dbdocs 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dbdocs/__init__.py +0 -0
- dbdocs/__main__.py +3 -0
- dbdocs/cli/__init__.py +0 -0
- dbdocs/cli/main.py +86 -0
- dbdocs/core/__init__.py +0 -0
- dbdocs/core/artifacts.py +82 -0
- dbdocs/core/config.py +117 -0
- dbdocs/core/exceptions.py +24 -0
- dbdocs/core/log.py +58 -0
- dbdocs/extract/__init__.py +0 -0
- dbdocs/extract/_sqlglot_lineage.py +267 -0
- dbdocs/extract/column_lineage.py +181 -0
- dbdocs/extract/erd.py +102 -0
- dbdocs/extract/erd_json.py +80 -0
- dbdocs/extract/graph.py +72 -0
- dbdocs/extract/nodes.py +119 -0
- dbdocs/main.py +6 -0
- dbdocs/site/__init__.py +0 -0
- dbdocs/site/builder.py +132 -0
- dbdocs/site/bundle/assets/app.js +500 -0
- dbdocs/site/bundle/assets/favicon.svg +12 -0
- dbdocs/site/bundle/assets/graph/index.css +1 -0
- dbdocs/site/bundle/assets/graph/index.js +62 -0
- dbdocs/site/bundle/assets/style.css +289 -0
- dbdocs/site/bundle/assets/vendor/marked.min.js +6 -0
- dbdocs/site/bundle/assets/vendor/minisearch.min.js +8 -0
- dbdocs/site/bundle/index.html +48 -0
- dbdocs/site/deploy.py +123 -0
- dbdocs/site/inject.py +32 -0
- dbdocs-0.0.0.dist-info/METADATA +78 -0
- dbdocs-0.0.0.dist-info/RECORD +34 -0
- dbdocs-0.0.0.dist-info/WHEEL +4 -0
- dbdocs-0.0.0.dist-info/entry_points.txt +2 -0
- dbdocs-0.0.0.dist-info/licenses/LICENSE +21 -0
dbdocs/__init__.py
ADDED
|
File without changes
|
dbdocs/__main__.py
ADDED
dbdocs/cli/__init__.py
ADDED
|
File without changes
|
dbdocs/cli/main.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import importlib.metadata
|
|
3
|
+
import socketserver
|
|
4
|
+
from http.server import SimpleHTTPRequestHandler
|
|
5
|
+
|
|
6
|
+
import click
|
|
7
|
+
|
|
8
|
+
from dbdocs.core.config import DbDocsConfig
|
|
9
|
+
from dbdocs.core.exceptions import DbDocsError
|
|
10
|
+
from dbdocs.core.log import logger
|
|
11
|
+
from dbdocs.site import deploy as deploy_module
|
|
12
|
+
from dbdocs.site.builder import ReportBuilder
|
|
13
|
+
|
|
14
|
+
__version__ = importlib.metadata.version("dbdocs")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# dbdocs
|
|
18
|
+
@click.group(
    context_settings={"help_option_names": ["-h", "--help"]},
    invoke_without_command=True,
    no_args_is_help=True,
    epilog="Specify one of these sub-commands and you can find more help from there.",
)
@click.version_option(__version__)
@click.option(
    "-c",
    "--config",
    "config_path",
    default=None,
    help="Path to dbdocs.yml.",
)
@click.pass_context
def dbdocs(ctx, config_path):
    """Alternative dbt docs site: dbt docs + ERD + column-level lineage."""
    # Root command group. The docstring above doubles as the CLI help text, so
    # implementation notes live in comments instead.
    logger.info("Run with dbdocs==%s", __version__)

    # Load the site config once and hand it to every sub-command via ``ctx.obj``
    # (sub-commands receive it through ``click.pass_obj``). Config problems are
    # re-raised as ClickException so the user gets a clean error message.
    try:
        loaded_config = DbDocsConfig.load(config_path)
    except DbDocsError as exc:
        raise click.ClickException(str(exc)) from exc
    ctx.obj = loaded_config
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dbdocs.command(name="generate")
@click.option("-o", "--output-dir", default=None, help="Where to write the site (default: config).")
@click.option(
    "--dialect", default=None, help="SQL dialect for column lineage (default: adapter_type)."
)
@click.pass_obj
def generate(config: DbDocsConfig, output_dir, dialect):
    """Build the self-contained site from dbt artifacts."""
    # A dialect given on the command line overrides whatever the config holds.
    if dialect is not None:
        config.dialect = dialect

    # Build the site; dbdocs failures are surfaced as a click error message.
    try:
        out = ReportBuilder(config).generate(output_dir=output_dir)
    except DbDocsError as exc:
        raise click.ClickException(str(exc)) from exc
    else:
        click.echo(f"Generated site into {out}")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dbdocs.command(name="serve")
@click.option("-p", "--port", default=8000, show_default=True, help="Port to serve on.")
@click.pass_obj
def serve(config: DbDocsConfig, port):
    """Serve the generated site locally (static http server)."""

    # Enable address reuse on a private subclass rather than assigning to
    # ``socketserver.ThreadingTCPServer`` itself: mutating the stdlib class would
    # change the behaviour of every other server created in this process.
    class _ReusableTCPServer(socketserver.ThreadingTCPServer):
        allow_reuse_address = True

    # Static file handler rooted at the configured output directory.
    handler = functools.partial(SimpleHTTPRequestHandler, directory=config.output_path)
    click.echo(f"Serving {config.output_path} at http://127.0.0.1:{port} (Ctrl-C to stop)")

    # Bound to loopback only; blocks until interrupted.
    with _ReusableTCPServer(("127.0.0.1", port), handler) as httpd:
        httpd.serve_forever()
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dbdocs.command(name="deploy")
@click.option("--version", "version", required=True, help="Version label to deploy (e.g. 1.2).")
@click.option("--alias", default=None, help="Moving alias for this version (e.g. latest).")
@click.option(
    "--title", default=None, help="Display title for this version (default: the version)."
)
@click.option(
    "--delete", "delete", is_flag=True, default=False, help="Delete this version instead."
)
@click.option("--push/--no-push", default=False, help="Publish to the gh-pages branch.")
@click.pass_obj
def deploy(config: DbDocsConfig, version, alias, title, delete, push):
    """Generate a versioned build and update the version index (or --delete one)."""
    # Exactly one of the two deploy-module operations runs; both report dbdocs
    # failures as a click error message.
    out = None
    try:
        if delete:
            deploy_module.delete(config, version=version, push=push)
        else:
            out = deploy_module.deploy(
                config, version=version, alias=alias, push=push, title=title
            )
    except DbDocsError as exc:
        raise click.ClickException(str(exc)) from exc

    if delete:
        click.echo(f"Deleted version {version}")
    else:
        click.echo(f"Deployed version {version} into {out}")
|
dbdocs/core/__init__.py
ADDED
|
File without changes
|
dbdocs/core/artifacts.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""Loading dbt artifacts (manifest/catalog) via the dbterd parser.
|
|
2
|
+
|
|
3
|
+
dbterd parses ``manifest.json`` / ``catalog.json`` into ``dbt_artifacts_parser``
|
|
4
|
+
Pydantic models. Two cross-cutting gotchas live here so the rest of dbdocs never
|
|
5
|
+
has to think about them:
|
|
6
|
+
|
|
7
|
+
* **Schema field aliasing.** ``dbt_artifacts_parser`` aliases the ``schema``
|
|
8
|
+
field to ``schema_`` to avoid clobbering Pydantic's ``BaseModel.schema()`` —
|
|
9
|
+
so ``node.schema`` is a *bound method*, not the value. Always read
|
|
10
|
+
``node.schema_``; :func:`db_schema` centralizes that.
|
|
11
|
+
* **Schema-version relaxation.** Passing the detected schema version to
|
|
12
|
+
``read_manifest``/``read_catalog`` makes dbterd apply its relaxation policies,
|
|
13
|
+
keeping parsing robust across dbt versions (including dbt Core 2.0).
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
from dbterd.helpers import file
|
|
21
|
+
|
|
22
|
+
#: Bucket label used when a node/source has no database or schema set.
|
|
23
|
+
UNKNOWN = "_unknown"
|
|
24
|
+
|
|
25
|
+
#: unique_id prefixes surfaced as catalog nodes (tests/macros/etc. excluded).
|
|
26
|
+
NODE_PREFIXES = ("model.", "seed.", "snapshot.")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def artifact_version(target_path: str, artifact: str) -> "int | None":
    """Resolve a dbt artifact's schema version int from its ``dbt_schema_version``.

    Args:
        target_path: Directory holding the dbt artifacts.
        artifact: Artifact base name without extension (e.g. ``"manifest"``).

    Returns:
        The schema version as an int, or ``None`` (auto-detect, strict) if the
        version can't be determined — the file is missing, unreadable, not
        UTF-8, not valid JSON, not a JSON object, or its ``metadata`` entry is
        not an object.
    """
    artifact_path = Path(target_path) / f"{artifact}.json"
    try:
        document = json.loads(artifact_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return None

    # A top-level list/scalar, or ``"metadata": null``, has no ``.get``; treat
    # such a document as "version unknown" instead of raising AttributeError.
    if not isinstance(document, dict):
        return None
    metadata = document.get("metadata", {})
    if not isinstance(metadata, dict):
        return None

    # ``or ""`` keeps the helper's input a string even for an explicit null.
    schema_version = metadata.get("dbt_schema_version", "") or ""
    extracted = file.extract_artifact_version_from_file(schema_version)
    return int(extracted) if extracted else None
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def load_artifacts(target_path: str) -> "tuple[Any, Any]":
    """Parse ``manifest.json`` and ``catalog.json`` from a dbt target dir.

    Each artifact's schema version is detected first and handed to the dbterd
    reader alongside the path.

    Returns:
        The ``(manifest, catalog)`` pair as parsed by dbterd.
    """
    manifest_version = artifact_version(target_path, "manifest")
    manifest = file.read_manifest(path=target_path, version=manifest_version)

    catalog_version = artifact_version(target_path, "catalog")
    catalog = file.read_catalog(path=target_path, version=catalog_version)

    return manifest, catalog
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def adapter_type(target_path: str) -> "str | None":
    """The warehouse adapter (``snowflake``/``bigquery``/…) from manifest metadata.

    Read from the raw JSON rather than the parsed model so it works regardless of
    how the parser exposes ``metadata``. Used as the default sqlglot dialect for
    column-level lineage.

    Args:
        target_path: Directory holding ``manifest.json``.

    Returns:
        The ``metadata.adapter_type`` value, or ``None`` if the manifest is
        missing, unreadable, not UTF-8, not valid JSON, not a JSON object, or
        has no object-valued ``metadata`` entry.
    """
    manifest_path = Path(target_path) / "manifest.json"
    try:
        document = json.loads(manifest_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return None

    # Guard against a top-level list/scalar and ``"metadata": null``; both
    # would otherwise raise AttributeError on ``.get``.
    metadata = document.get("metadata") if isinstance(document, dict) else None
    if not isinstance(metadata, dict):
        return None
    return metadata.get("adapter_type")
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def db_schema(entity: Any) -> "tuple[str, str]":
    """Return the ``(database, schema)`` pair for an entity, never ``None``.

    The schema is read from ``schema_`` (the Pydantic alias — ``schema`` is a
    bound method). A missing or falsy part is replaced by :data:`UNKNOWN`, so
    grouping never produces a ``None`` bucket.
    """
    resolved = []
    for attribute in ("database", "schema_"):
        value = getattr(entity, attribute, None)
        resolved.append(str(value if value else UNKNOWN))
    database, schema = resolved
    return database, schema
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def node_name(unique_id: str) -> str:
    """Return the text after the last ``.`` in ``unique_id`` (the node's short name).

    An id without any dot is returned unchanged.
    """
    _, _, short_name = unique_id.rpartition(".")
    return short_name
|
dbdocs/core/config.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
from dataclasses import asdict, dataclass, field, fields
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import yaml
|
|
5
|
+
|
|
6
|
+
from dbdocs.core.exceptions import DbDocsConfigError
|
|
7
|
+
|
|
8
|
+
DEFAULT_CONFIG_FILENAME = "dbdocs.yml"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class DbDocsConfig:
    """Site configuration for a dbdocs build.

    Loaded from a ``dbdocs.yml`` in the working directory; every field has a
    default so the file is optional. ``version`` is intentionally absent — it is
    a ``deploy`` CLI argument, not site config.

    ``target_dir`` is where the dbt artifacts are read from; ``output_dir`` is
    where the generated self-contained site is written.
    """

    site_name: str = "dbt docs"
    site_url: str = "https://github.com/datnguye/dbt-docs"
    site_author: str = "Dat Nguyen"
    site_description: str = "Alternative dbt documentation site"
    repo_name: str = "datnguye/dbt-docs"
    repo_url: str = "https://github.com/datnguye/dbt-docs"
    project_name: str = "dbt docs"
    #: The footer's Buy-me-a-coffee badge shows by default; set false to hide it.
    show_buy_me_a_coffee: bool = True
    #: Project README rendered on the overview (relative to the working dir). Set
    #: empty to omit the README section. Missing file ⇒ section simply absent.
    readme: str = "README.md"
    target_dir: str = "target"
    #: Where the generated site is written. Nested under the dbt ``target/`` by
    #: default so docs sit alongside the artifacts they're built from.
    output_dir: str = "target/site"
    #: SQL dialect for column-lineage parsing; ``None`` ⇒ derive from the
    #: artifact's ``adapter_type`` (e.g. snowflake, bigquery, postgres).
    dialect: "str | None" = None
    #: Alias the SPA's version switcher treats as the default landing version.
    default_version: str = "latest"
    #: dbterd ERD options (``algo``, ``entity_name_format``, ``select``,
    #: ``resource_type``, …) passed straight to ``DbtErd``. Configured here so the
    #: ERD shape lives in ``dbdocs.yml`` rather than a separate ``.dbterd.yml``.
    dbterd: dict = field(default_factory=dict)

    @classmethod
    def load(cls, path: "str | Path | None" = None) -> "DbDocsConfig":
        """Load config from ``path`` (or ``./dbdocs.yml``); all-defaults if absent.

        Args:
            path: Config file location. ``None`` means ``dbdocs.yml`` in the
                current working directory.

        Returns:
            A config built from the file's keys, or an all-defaults config when
            the file does not exist or is empty.

        Raises:
            DbDocsConfigError: If the file cannot be read or decoded, is not
                valid YAML, is not a mapping, or contains unknown keys.
        """
        config_path = Path(path) if path is not None else Path.cwd() / DEFAULT_CONFIG_FILENAME
        if not config_path.is_file():
            return cls()

        try:
            raw = yaml.safe_load(config_path.read_text(encoding="utf-8"))
        except (OSError, UnicodeDecodeError) as exc:
            # Unreadable / non-UTF-8 file: report it as a config problem so
            # callers that only handle DbDocsError still get a clean message.
            raise DbDocsConfigError(f"Could not read {config_path}: {exc}") from exc
        except yaml.YAMLError as exc:
            raise DbDocsConfigError(f"Could not parse {config_path}: {exc}") from exc

        if raw is None:
            # Empty file (or only comments): same as no file at all.
            return cls()
        if not isinstance(raw, dict):
            raise DbDocsConfigError(
                f"{config_path} must contain a mapping, got {type(raw).__name__}"
            )

        known = {f.name for f in fields(cls)}
        # YAML keys need not be strings (``1: x``); stringify before sorting and
        # joining so the error message itself cannot raise TypeError.
        unknown = sorted(str(key) for key in raw if key not in known)
        if unknown:
            raise DbDocsConfigError(
                f"Unknown keys in {config_path}: {', '.join(unknown)}. "
                f"Allowed keys: {', '.join(sorted(known))}."
            )
        return cls(**raw)

    #: Build-control fields that are not part of the site's display metadata.
    _NON_METADATA_FIELDS = (
        "target_dir",
        "output_dir",
        "dialect",
        "default_version",
        "dbterd",
        "readme",
    )

    def render_context(self) -> dict:
        """The site display metadata injected into the SPA's ``metadata`` block.

        Excludes build-control fields (where artifacts are read, where the site
        is written, the lineage dialect override) that aren't site metadata.
        """
        context = asdict(self)
        for field_name in self._NON_METADATA_FIELDS:
            context.pop(field_name, None)
        return context

    @property
    def target_path(self) -> str:
        """Absolute path to the dbt target/ dir where the artifacts live.

        A relative ``target_dir`` is resolved against the current working
        directory **at access time** — this is intentional and must stay aligned
        with dbterd's ``DbtErd``, which also reads artifacts from ``./target``
        relative to the cwd. An absolute ``target_dir`` is returned unchanged.
        """
        return str(Path.cwd() / self.target_dir)

    @property
    def output_path(self) -> str:
        """Absolute path to the dir the generated site is written into.

        Resolved against the cwd at access time, mirroring ``target_path`` — a
        relative ``output_dir`` follows the working directory, an absolute one is
        returned unchanged.
        """
        return str(Path.cwd() / self.output_dir)
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""dbdocs exception types.
|
|
2
|
+
|
|
3
|
+
Multiple exception classes may share one file (per the project's Python style).
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class DbDocsError(Exception):
    """Base class for all dbdocs errors.

    Catching this type handles every dbdocs-specific failure; the CLI commands
    catch it and re-raise it as a ``click.ClickException``.
    """
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DbDocsConfigError(DbDocsError):
    """Raised when dbdocs.yml is malformed or holds invalid values.

    ``DbDocsConfig.load`` raises it for YAML that fails to parse, a document
    that is not a mapping, and keys that are not known config fields.
    """
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class LineageError(DbDocsError):
    """Raised when column-level lineage can't be parsed for a model.

    Always caught per-model by the extractor so one unparseable model never
    fails the whole ``generate`` — the model is skipped and logged instead.
    """
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class DeployError(DbDocsError):
    """Raised when a versioned deploy step (e.g. the git push) fails.

    As a :class:`DbDocsError` subclass it is reported by the CLI as a regular
    click error message.
    """
|
dbdocs/core/log.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
#: Where the DEBUG-level file log is streamed (relative to the working dir).
|
|
5
|
+
LOG_FILE = Path("logs") / "dbdocs.log"
|
|
6
|
+
#: Plain (non-ANSI) line format for the file — colour codes don't belong in a file.
|
|
7
|
+
FILE_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s (%(filename)s:%(lineno)d)"
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class LogFormatter(logging.Formatter):
    """Console formatter that wraps each log line in an ANSI colour chosen by level.

    ``FORMATS`` maps the five standard levels to a coloured format string.
    Records at any other (custom) level are rendered with the same line layout
    but without colour, so they keep their timestamp and source location.
    """

    grey = "\x1b[38;20m"
    blue = "\x1b[34;20m"
    yellow = "\x1b[33;20m"
    red = "\x1b[31;20m"
    bold_red = "\x1b[31;1m"
    reset = "\x1b[0m"
    # Named ``line_format`` rather than ``format``: the ``format`` method below
    # would silently replace a class attribute of the same name.
    line_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s (%(filename)s:%(lineno)d)"

    FORMATS = {
        logging.DEBUG: blue + line_format + reset,
        logging.INFO: grey + line_format + reset,
        logging.WARNING: yellow + line_format + reset,
        logging.ERROR: red + line_format + reset,
        logging.CRITICAL: bold_red + line_format + reset,
    }

    # Cache of ready-made formatters keyed by format string, so one is not
    # rebuilt for every record. Keying by the string (not the level) keeps
    # subclasses that override ``FORMATS`` correct.
    _formatters: dict = {}

    def format(self, record):
        """Format ``record`` with the colour registered for its level.

        Args:
            record: The ``logging.LogRecord`` to render.

        Returns:
            The formatted line; coloured for the standard levels, plain for
            any level missing from ``FORMATS``.
        """
        log_fmt = self.FORMATS.get(record.levelno, self.line_format)
        formatter = self._formatters.get(log_fmt)
        if formatter is None:
            formatter = self._formatters[log_fmt] = logging.Formatter(log_fmt)
        return formatter.format(record)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Named "dbdocs" (not "dbterd") so our handler/level config doesn't collide with
|
|
34
|
+
# the dbterd library's own logger of the same name.
|
|
35
|
+
logger = logging.getLogger("dbdocs")
|
|
36
|
+
logger.setLevel(logging.DEBUG)
|
|
37
|
+
# Emit only through our own handlers. Without this, records also propagate to the
|
|
38
|
+
# root logger — which dbterd configures via basicConfig — producing duplicate,
|
|
39
|
+
# differently-formatted "INFO:dbdocs:…" lines.
|
|
40
|
+
logger.propagate = False
|
|
41
|
+
|
|
42
|
+
if len(logger.handlers) == 0: # pragma: no cover - import-time handler guard
|
|
43
|
+
ch = logging.StreamHandler()
|
|
44
|
+
ch.setLevel(logging.DEBUG)
|
|
45
|
+
ch.setFormatter(LogFormatter())
|
|
46
|
+
logger.addHandler(ch)
|
|
47
|
+
|
|
48
|
+
# Stream everything (DEBUG and up) to logs/dbdocs.log too. Best-effort: if the
|
|
49
|
+
# logs dir can't be created/written (read-only fs), the console handler still
|
|
50
|
+
# works and we don't crash on import.
|
|
51
|
+
try:
|
|
52
|
+
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
|
|
53
|
+
fh = logging.FileHandler(LOG_FILE, encoding="utf-8")
|
|
54
|
+
fh.setLevel(logging.DEBUG)
|
|
55
|
+
fh.setFormatter(logging.Formatter(FILE_FORMAT))
|
|
56
|
+
logger.addHandler(fh)
|
|
57
|
+
except OSError:
|
|
58
|
+
pass
|
|
File without changes
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
"""Column-level lineage engine: trace a SELECT's output column to its sources.
|
|
2
|
+
|
|
3
|
+
A self-contained lineage builder over sqlglot's optimizer, with case-insensitive
|
|
4
|
+
column resolution and cycle-safe recursion so it copes with dbt-compiled SQL
|
|
5
|
+
(uppercased warehouse identifiers, recursive CTEs) without relying on sqlglot's
|
|
6
|
+
internal, version-unstable lineage API.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
import typing as t
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
|
|
15
|
+
from sqlglot import Schema, exp, maybe_parse
|
|
16
|
+
from sqlglot.errors import SqlglotError
|
|
17
|
+
from sqlglot.optimizer import (
|
|
18
|
+
Scope,
|
|
19
|
+
build_scope,
|
|
20
|
+
find_all_in_scope,
|
|
21
|
+
normalize_identifiers,
|
|
22
|
+
qualify,
|
|
23
|
+
)
|
|
24
|
+
from sqlglot.optimizer.scope import ScopeType
|
|
25
|
+
|
|
26
|
+
if t.TYPE_CHECKING:
|
|
27
|
+
from sqlglot.dialects.dialect import DialectType
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger("sqlglot")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class Node:
    """One step in a column's lineage graph.

    ``downstream`` holds the nodes this one was derived from; ``source_name``
    and ``reference_node_name`` default to the empty string.
    """

    name: str
    expression: exp.Expression
    source: exp.Expression
    downstream: list[Node] = field(default_factory=list)
    source_name: str = ""
    reference_node_name: str = ""

    def walk(self) -> t.Iterator[Node]:
        """Yield this node first, then each ``downstream`` subtree in list order."""
        yield self
        for child in self.downstream:
            yield from child.walk()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def lineage(
    column: str | exp.Column,
    sql: str | exp.Expression,
    schema: dict | Schema | None = None,
    sources: t.Mapping[str, str | exp.Query] | None = None,
    dialect: DialectType = None,
    scope: Scope | None = None,
    trim_selects: bool = True,
    **kwargs,
) -> Node:
    """Build the lineage graph for one output column of a SQL query.

    Args:
        column: The output column to trace.
        sql: The query, as text or an already-parsed expression.
        schema: Schema passed to the qualifier.
        sources: Mapping of table name to query; matching tables in ``sql`` are
            expanded into these queries before qualification.
        dialect: Dialect used for parsing, normalization and qualification.
        scope: A pre-built scope; when given, qualification is skipped.
        trim_selects: Forwarded to :func:`to_node`.
        **kwargs: Extra qualifier options; they override the defaults set here.

    Returns:
        The root lineage node for ``column``.

    Raises:
        SqlglotError: If no scope can be built from the query, or the column is
            not among the query's selects (matched exactly, then
            case-insensitively).
    """
    expression = maybe_parse(sql, dialect=dialect)
    column = normalize_identifiers.normalize_identifiers(column, dialect=dialect).name

    if sources:
        parsed_sources = {}
        for source_key, source_sql in sources.items():
            parsed_sources[source_key] = t.cast(
                exp.Query, maybe_parse(source_sql, dialect=dialect)
            )
        expression = exp.expand(expression, parsed_sources, dialect=dialect)

    if not scope:
        # Defaults first, caller overrides last.
        qualify_options = {
            "validate_qualify_columns": False,
            "identify": False,
            "allow_partial_qualification": True,
        }
        qualify_options.update(kwargs)
        expression = qualify.qualify(
            expression,
            dialect=dialect,
            schema=schema,
            **qualify_options,
        )
        scope = build_scope(expression)

    if not scope:
        raise SqlglotError("Cannot build lineage, sql must be SELECT")

    exact_names = {select.alias_or_name for select in scope.expression.selects}
    # Case-insensitive resolution: dbt/warehouse casing rarely matches exactly.
    if column not in exact_names:
        by_lowercase = {name.lower(): name for name in exact_names}
        resolved = by_lowercase.get(column.lower())
        if resolved is None:
            raise SqlglotError(f"Cannot find column '{column}' in query.")
        column = resolved

    return to_node(column, scope, dialect, trim_selects=trim_selects)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def to_node(
    column: str | int,
    scope: Scope,
    dialect: DialectType,
    scope_name: str | None = None,
    upstream: Node | None = None,
    source_name: str | None = None,
    reference_node_name: str | None = None,
    trim_selects: bool = True,
    visited: set | None = None,
) -> Node | None:
    """Recursively build the lineage node for ``column`` within ``scope``.

    Args:
        column: Select name to trace, or a positional index into the scope's
            selects (the index form is used for set-operation branches).
        scope: The scope whose expression is inspected.
        dialect: Dialect used when rendering SQL for log messages.
        scope_name: When set, the node is named ``"<scope_name>.<column>"``.
        upstream: Node to attach the new node to (appended to its
            ``downstream`` list).
        source_name: Source label recorded on the node.
        reference_node_name: Reference label recorded on the node.
        trim_selects: When true and the scope is a plain ``SELECT``, the node's
            ``source`` is a reduced select holding only the traced expression
            plus the original FROM / WHERE / GROUP clauses.
        visited: ``(column, id(scope))`` pairs already expanded in this
            traversal; shared across the recursion.

    Returns:
        The node for this column; the (possibly newly created) ``upstream``
        node for set operations; or ``None`` if this column/scope pair was
        already visited.

    Raises:
        ValueError: If a named column cannot be located in a set operation's
            selects.
    """
    if visited is None:
        visited = set()

    key = (column, id(scope))
    if key in visited:
        # Already visited this column-scope: stop, or recursive CTEs loop forever.
        return None
    visited.add(key)

    # Pick the select that produces the column: by position for an int, by
    # alias/name otherwise. With no name match, fall back to a star (if the
    # scope selects ``*``) or to the scope expression itself.
    select = (
        scope.expression.selects[column]
        if isinstance(column, int)
        else next(
            (select for select in scope.expression.selects if select.alias_or_name == column),
            exp.Star() if scope.expression.is_star else scope.expression,
        )
    )

    if isinstance(scope.expression, exp.Subquery):
        # Delegate to the first subquery scope only (the loop returns on its
        # first iteration); with no subquery scopes, execution falls through.
        for source in scope.subquery_scopes:
            return to_node(
                column,
                scope=source,
                dialect=dialect,
                upstream=upstream,
                source_name=source_name,
                reference_node_name=reference_node_name,
                trim_selects=trim_selects,
                visited=visited,
            )
    if isinstance(scope.expression, exp.SetOperation):
        # Set operations get a single node named after the operation type
        # (upper-cased class name); each branch is attached beneath it.
        name = type(scope.expression).__name__.upper()
        upstream = upstream or Node(name=name, source=scope.expression, expression=select)

        # Branches are matched by position, so resolve a name to its index
        # (a star select also counts as a match).
        index = (
            column
            if isinstance(column, int)
            else next(
                (
                    i
                    for i, select in enumerate(scope.expression.selects)
                    if select.alias_or_name == column or select.is_star
                ),
                -1,
            )
        )

        if index == -1:
            raise ValueError(f"Could not find {column} in {scope.expression}")

        for s in scope.union_scopes:
            to_node(
                index,
                scope=s,
                dialect=dialect,
                upstream=upstream,
                source_name=source_name,
                reference_node_name=reference_node_name,
                trim_selects=trim_selects,
                visited=visited,
            )
        return upstream

    if trim_selects and isinstance(scope.expression, exp.Select):
        # Build a reduced SELECT holding just the traced expression, reusing
        # the original FROM / WHERE / GROUP args.
        source = exp.Select()
        source.set("expressions", [select])
        source.set("from", scope.expression.args.get("from"))
        source.set("where", scope.expression.args.get("where"))
        source.set("group", scope.expression.args.get("group"))
    else:
        source = scope.expression

    # Create the node for this step and attach it to the previous one.
    node = Node(
        name=f"{scope_name}.{column}" if scope_name else str(column),
        source=source,
        expression=select,
        source_name=source_name or "",
        reference_node_name=reference_node_name or "",
    )

    if upstream:
        upstream.downstream.append(node)

    # Index this scope's subquery scopes by the identity of their expression so
    # subqueries found inside the select can be mapped back to a scope.
    subquery_scopes = {
        id(subquery_scope.expression): subquery_scope for subquery_scope in scope.subquery_scopes
    }

    for subquery in find_all_in_scope(select, exp.UNWRAPPED_QUERIES):
        subquery_scope = subquery_scopes.get(id(subquery))
        if not subquery_scope:
            logger.warning("Unknown subquery scope: %s", subquery.sql(dialect=dialect))
            continue

        # Every named select of the subquery feeds this node.
        for name in subquery.named_selects:
            to_node(
                name,
                scope=subquery_scope,
                dialect=dialect,
                upstream=node,
                trim_selects=trim_selects,
                visited=visited,
            )

    if select.is_star:
        # A star select depends on every source in scope.
        # NOTE(review): this loop rebinds ``source``, which the UDTF check
        # below then reads — confirm that is intended.
        for source in scope.sources.values():
            if isinstance(source, Scope):
                source = source.expression
            node.downstream.append(
                Node(name=select.sql(comments=False), source=source, expression=source)
            )

    # All columns referenced by the traced select expression.
    source_columns = set(find_all_in_scope(select, exp.Column))

    if isinstance(source, exp.UDTF):
        # For a UDTF source, columns used inside the UDTF count as inputs too.
        source_columns |= set(source.find_all(exp.Column))
        derived_tables = [
            source.expression.parent
            for source in scope.sources.values()
            if isinstance(source, Scope) and source.is_derived_table
        ]
    else:
        derived_tables = scope.derived_tables

    # Derived tables whose first comment starts with "source: " carry a source
    # label: the second whitespace-separated token of that comment.
    # NOTE(review): a comment of exactly "source: " would make ``split()[1]``
    # raise IndexError — confirm such comments cannot occur.
    source_names = {
        dt.alias: dt.comments[0].split()[1]
        for dt in derived_tables
        if dt.comments and dt.comments[0].startswith("source: ")
    }

    for c in source_columns:
        table = c.table
        source = scope.sources.get(table)

        if isinstance(source, Scope):
            # The column comes from a nested scope (derived table or CTE):
            # recurse into it using the unaliased column name.
            reference_node_name = None
            if source.scope_type == ScopeType.DERIVED_TABLE and table not in source_names:
                reference_node_name = table
            elif source.scope_type == ScopeType.CTE:
                selected_node, _ = scope.selected_sources.get(table, (None, None))
                reference_node_name = selected_node.name if selected_node else None

            to_node(
                c.name,
                scope=source,
                dialect=dialect,
                scope_name=table,
                upstream=node,
                source_name=source_names.get(table) or source_name,
                reference_node_name=reference_node_name,
                trim_selects=trim_selects,
                visited=visited,
            )
        else:
            # Not a scope: record a leaf node. An unresolved source is
            # represented by a placeholder expression.
            source = source or exp.Placeholder()
            node.downstream.append(
                Node(name=c.sql(comments=False), source=source, expression=source)
            )

    return node
|