PyPI - dbdocs - Versions diffs - 0.0.0__py3-none-any.whl - Mend

dbdocs 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

dbdocs/__init__.py +0 -0
dbdocs/__main__.py +3 -0
dbdocs/cli/__init__.py +0 -0
dbdocs/cli/main.py +86 -0
dbdocs/core/__init__.py +0 -0
dbdocs/core/artifacts.py +82 -0
dbdocs/core/config.py +117 -0
dbdocs/core/exceptions.py +24 -0
dbdocs/core/log.py +58 -0
dbdocs/extract/__init__.py +0 -0
dbdocs/extract/_sqlglot_lineage.py +267 -0
dbdocs/extract/column_lineage.py +181 -0
dbdocs/extract/erd.py +102 -0
dbdocs/extract/erd_json.py +80 -0
dbdocs/extract/graph.py +72 -0
dbdocs/extract/nodes.py +119 -0
dbdocs/main.py +6 -0
dbdocs/site/__init__.py +0 -0
dbdocs/site/builder.py +132 -0
dbdocs/site/bundle/assets/app.js +500 -0
dbdocs/site/bundle/assets/favicon.svg +12 -0
dbdocs/site/bundle/assets/graph/index.css +1 -0
dbdocs/site/bundle/assets/graph/index.js +62 -0
dbdocs/site/bundle/assets/style.css +289 -0
dbdocs/site/bundle/assets/vendor/marked.min.js +6 -0
dbdocs/site/bundle/assets/vendor/minisearch.min.js +8 -0
dbdocs/site/bundle/index.html +48 -0
dbdocs/site/deploy.py +123 -0
dbdocs/site/inject.py +32 -0
dbdocs-0.0.0.dist-info/METADATA +78 -0
dbdocs-0.0.0.dist-info/RECORD +34 -0
dbdocs-0.0.0.dist-info/WHEEL +4 -0
dbdocs-0.0.0.dist-info/entry_points.txt +2 -0
dbdocs-0.0.0.dist-info/licenses/LICENSE +21 -0

dbdocs/extract/column_lineage.py ADDED Viewed

@@ -0,0 +1,181 @@
+"""Column-level lineage: trace each model column back to its source columns.
+For every model we parse its **compiled** SQL with sqlglot, qualify it against a
+schema built from the dbt catalog (so ``SELECT *`` and unqualified columns
+resolve), then walk each output column's lineage tree to its leaf table columns
+and map those tables back to dbt unique_ids. The heavy lifting lives in the
+vendored :mod:`dbdocs.extract._sqlglot_lineage` (a self-contained lineage
+builder over sqlglot's optimizer).
+Design: **fail-soft per model.** A single model with SQL sqlglot can't parse
+must never sink the whole ``generate`` — it's caught, logged, and skipped, and
+the run reports how many were skipped.
+"""
+from typing import Any
+from sqlglot import exp
+from sqlglot.errors import SqlglotError
+from dbdocs.core.artifacts import db_schema, node_name
+from dbdocs.core.exceptions import LineageError
+from dbdocs.core.log import logger
+from dbdocs.extract._sqlglot_lineage import Node, lineage
+#: dbt adapter_type → sqlglot dialect overrides, for when the names differ.
+#: Looked up by ``_to_dialect``; an adapter type that is not listed here is used
+#: as the dialect name unchanged.
+_DIALECT_ALIASES = {
+    "databricks": "spark",
+}
+def _to_dialect(adapter_type: "str | None") -> "str | None":
+    if not adapter_type:
+        return None
+    return _DIALECT_ALIASES.get(adapter_type, adapter_type)
+class ColumnLineageExtractor:
+    """Build the ``columnLineage`` map for a dbt project's models.
+    The output maps a fully-qualified output column to the upstream columns it is
+    derived from::
+        {"model.shop.customers.customer_id": [{"node": "model.shop.stg_customers",
+                                               "column": "customer_id"}, ...]}
+    """
+    def __init__(self, manifest: Any, catalog: Any, dialect: "str | None" = None) -> None:
+        self.manifest = manifest
+        self.catalog = catalog
+        self.dialect = _to_dialect(dialect)
+        self.schema = self._schema_from_catalog()
+        # Map a lower-cased ``db.schema.table`` relation back to its unique_id.
+        self._relation_to_node = self._relation_index()
+        self.skipped = 0
+    def extract(self) -> dict:
+        """Return the ``columnLineage`` map across all models (fail-soft)."""
+        result: dict = {}
+        for unique_id, model in (getattr(self.manifest, "nodes", {}) or {}).items():
+            if not str(unique_id).startswith("model."):
+                continue
+            compiled = getattr(model, "compiled_code", "") or ""
+            if not compiled.strip():
+                continue
+            try:
+                self._extract_model(unique_id, compiled, result)
+            except (SqlglotError, LineageError, KeyError, ValueError, RecursionError) as exc:
+                self.skipped += 1
+                logger.warning("Column lineage skipped for %s: %s", node_name(unique_id), exc)
+        if self.skipped:
+            logger.info("Column lineage: skipped %s model(s) that failed to parse.", self.skipped)
+        return result
+    def _extract_model(self, unique_id: str, compiled: str, result: dict) -> None:
+        node = self.manifest.nodes[unique_id]
+        output_columns = [c for c in (getattr(node, "columns", {}) or {})]
+        # Fall back to the catalog's column list when the manifest has none.
+        if not output_columns:
+            catalog_node = (getattr(self.catalog, "nodes", {}) or {}).get(unique_id)
+            output_columns = (
+                list(getattr(catalog_node, "columns", {}) or {}) if catalog_node else []
+            )
+        for column in output_columns:
+            try:
+                root = lineage(column, compiled, schema=self.schema, dialect=self.dialect)
+            except SqlglotError:
+                # One unresolvable column shouldn't drop the rest of the model.
+                continue
+            upstream = self._leaf_columns(root)
+            if upstream:
+                result[f"{unique_id}.{column}"] = upstream
+    def _leaf_columns(self, root: Node) -> list:
+        """Collect distinct upstream ``{node, column}`` leaves of a lineage tree.
+        A leaf is a node whose source is a real ``Table`` (not a CTE/subquery
+        scope) that we can map back to a dbt node. The root itself is skipped.
+        """
+        seen = set()
+        upstream = []
+        for node in root.walk():
+            if node is root:
+                continue
+            source = node.source
+            if not isinstance(source, exp.Table):
+                continue
+            mapped = self._map_table(source)
+            if mapped is None:
+                continue
+            column = node_name(node.name)
+            key = (mapped, column)
+            if key in seen:
+                continue
+            seen.add(key)
+            upstream.append({"node": mapped, "column": column})
+        return upstream
+    def _map_table(self, table: exp.Table) -> "str | None":
+        catalog = table.catalog
+        db = table.db
+        name = table.name
+        candidates = [
+            f"{catalog}.{db}.{name}",
+            f"{db}.{name}",
+            name,
+        ]
+        for candidate in candidates:
+            mapped = self._relation_to_node.get(candidate.lower().strip("."))
+            if mapped:
+                return mapped
+        return None
+    def _relation_index(self) -> dict:
+        """Map ``db.schema.table`` (and shorter forms) → dbt unique_id."""
+        index: dict = {}
+        for unique_id, entity in self._all_entities():
+            database, schema = db_schema(entity)
+            table = getattr(entity, "alias", None) or getattr(entity, "name", None)
+            if not table:
+                continue
+            full = f"{database}.{schema}.{table}".lower()
+            index[full] = unique_id
+            index.setdefault(f"{schema}.{table}".lower(), unique_id)
+            index.setdefault(str(table).lower(), unique_id)
+            relation = getattr(entity, "relation_name", None)
+            if relation:
+                index.setdefault(str(relation).replace('"', "").lower(), unique_id)
+        return index
+    def _schema_from_catalog(self) -> dict:
+        """Build sqlglot's nested ``{db: {schema: {table: {col: type}}}}`` schema."""
+        schema: dict = {}
+        for _, entity, columns in self._catalog_entities():
+            database, db_schema_name = db_schema(entity)
+            table = getattr(entity, "alias", None) or getattr(entity, "name", None)
+            if not table:
+                continue
+            col_types = {name: (col.type or "UNKNOWN") for name, col in columns.items()}
+            schema.setdefault(database, {}).setdefault(db_schema_name, {})[table] = col_types
+        return schema
+    def _all_entities(self):
+        yield from (getattr(self.manifest, "nodes", {}) or {}).items()
+        yield from (getattr(self.manifest, "sources", {}) or {}).items()
+    def _catalog_entities(self):
+        """Yield ``(unique_id, manifest_entity, catalog_columns)`` for schema build.
+        Pairs the catalog's column list (types) with the manifest entity (the
+        authoritative database/schema, read via ``schema_``).
+        """
+        manifest_nodes = getattr(self.manifest, "nodes", {}) or {}
+        manifest_sources = getattr(self.manifest, "sources", {}) or {}
+        for unique_id, catalog_node in (getattr(self.catalog, "nodes", {}) or {}).items():
+            entity = manifest_nodes.get(unique_id)
+            if entity is not None:
+                yield unique_id, entity, getattr(catalog_node, "columns", {}) or {}
+        for unique_id, catalog_source in (getattr(self.catalog, "sources", {}) or {}).items():
+            entity = manifest_sources.get(unique_id)
+            if entity is not None:
+                yield unique_id, entity, getattr(catalog_source, "columns", {}) or {}

dbdocs/extract/erd.py ADDED Viewed

@@ -0,0 +1,102 @@
+"""Structured ERD data via dbterd's ``json`` target.
+dbterd's built-in targets emit diagram text; the SPA renders its ERD with React
+Flow, which needs structured node/edge data. We register a ``json`` target
+(:mod:`dbdocs.extract.erd_json`) and parse its ``{tables, relationships}`` output
+into the SPA's ``{nodes, edges}`` — entities with columns (PK/FK flags) and
+foreign-key edges between them, all keyed by dbt unique_id.
+"""
+import json
+from dbterd.api import DbtErd
+# Importing the module registers the "json" target with dbterd's PluginRegistry.
+from dbdocs.extract import erd_json  # noqa: F401
+def build_erd(dbterd_options: "dict | None" = None, artifacts_dir: "str | None" = None) -> DbtErd:
+    """Build the ERD generator (json target) from dbdocs' ``dbterd`` options.
+    ``dbterd_options`` is the ``dbterd:`` block of ``dbdocs.yml`` (``algo``,
+    ``entity_name_format``, ``resource_type``, ``select``, …) passed straight to
+    ``DbtErd``. We force ``target="json"`` — the SPA needs structured data — but
+    let everything else come from the project's config so the ERD matches what
+    the team configured. (Config lives in ``dbdocs.yml``, not a separate
+    ``.dbterd.yml``.)
+    ``artifacts_dir`` is the dbt target dir (``config.target_dir``). dbterd reads
+    the manifest/catalog directly from this dir; without it dbterd would default
+    to ``./target`` and ignore the configured ``target_dir``. An explicit
+    ``artifacts_dir`` in ``dbterd_options`` still wins.
+    """
+    dbterd_kwargs = {k: v for k, v in (dbterd_options or {}).items() if k != "target"}
+    if artifacts_dir is not None:
+        dbterd_kwargs.setdefault("artifacts_dir", artifacts_dir)
+    return DbtErd(target="json", **dbterd_kwargs)
+def build_erd_data(erd: DbtErd) -> dict:
+    """Parse the json target into ``{"nodes": [...], "edges": [...]}``.
+    Nodes are entities (with columns, ``is_primary_key``/``is_foreign_key`` flags
+    and the resolved dbt unique_id); edges are foreign-key relationships between
+    them. dbterd's relationships reference tables by *name*, so we map those back
+    to unique_ids via each table's ``node_name``.
+    """
+    payload = json.loads(erd.get_erd())
+    tables = payload.get("tables", [])
+    relationships = payload.get("relationships", [])
+    # table name (as dbterd refers to it in relationships) → dbt unique_id.
+    name_to_id = {t["name"]: (t.get("node_name") or t["name"]) for t in tables}
+    edges, fk_columns = _build_edges(relationships, name_to_id)
+    nodes = [_build_node(t, fk_columns.get(t.get("node_name") or t["name"], set())) for t in tables]
+    return {"nodes": nodes, "edges": edges}
+def _build_edges(relationships: list, name_to_id: dict) -> "tuple[list, dict]":
+    """Map relationships → edges and collect each node's FK column names."""
+    edges = []
+    fk_columns: dict = {}
+    for index, rel in enumerate(relationships):
+        parent_name, child_name = rel["table_map"]
+        parent_cols, child_cols = rel["column_map"]
+        source = name_to_id.get(parent_name, parent_name)
+        target = name_to_id.get(child_name, child_name)
+        # The child side holds the foreign key columns.
+        fk_columns.setdefault(target, set()).update(child_cols)
+        edges.append(
+            {
+                "id": rel.get("name") or f"e{index}",
+                "source": source,
+                "target": target,
+                "from_columns": list(parent_cols),
+                "to_columns": list(child_cols),
+                "label": rel.get("relationship_label"),
+                "type": rel.get("type", ""),
+            }
+        )
+    return edges, fk_columns
+def _build_node(table: dict, fk_cols: set) -> dict:
+    node_id = table.get("node_name") or table["name"]
+    return {
+        "id": node_id,
+        "label": table["name"],
+        "database": table.get("database") or "",
+        "schema": table.get("schema") or "",
+        "resource_type": table.get("resource_type") or "model",
+        "columns": [
+            {
+                "name": c["name"],
+                "type": c.get("data_type") or "",
+                "description": c.get("description") or "",
+                "is_primary_key": bool(c.get("is_primary_key")),
+                "is_foreign_key": c["name"] in fk_cols,
+            }
+            for c in table.get("columns", [])
+        ],
+    }

dbdocs/extract/erd_json.py ADDED Viewed

@@ -0,0 +1,80 @@
+"""A dbterd ``json`` target adapter: structured tables + relationships.
+dbterd's built-in targets emit diagram *text* (Mermaid, DBML, …). The dbdocs SPA
+renders its ERD with React Flow, which needs structured node/edge data, not a
+diagram string. Registering this adapter makes ``DbtErd(target="json").get_erd()``
+return a JSON document of ``{tables, relationships}`` that ``erd.build_erd_data``
+turns into the SPA's ``{nodes, edges}``.
+The ``Table``/``Column``/``Ref`` → dict serializers are pure and tested in
+isolation; the adapter is the thin dbterd-contract shell over them.
+"""
+import json
+from typing import Any
+from dbterd.core.adapters.target import BaseTargetAdapter
+from dbterd.core.models import Column, Ref, Table
+from dbterd.core.registry.decorators import register_target
+def column_to_dict(column: Column) -> dict:
+    """A dbterd ``Column`` as a plain dict (name, type, description, PK flag)."""
+    return {
+        "name": column.name,
+        "data_type": column.data_type,
+        "description": column.description,
+        "is_primary_key": bool(getattr(column, "is_primary_key", False)),
+    }
+def table_to_dict(table: Table) -> dict:
+    """A dbterd ``Table`` as a plain dict, keyed for ERD rendering."""
+    return {
+        "name": table.name,
+        "database": table.database,
+        "schema": table.schema,
+        "resource_type": table.resource_type,
+        "node_name": table.node_name,
+        "raw_sql": table.raw_sql,
+        "description": table.description,
+        "label": table.label,
+        "columns": [column_to_dict(c) for c in (table.columns or [])],
+    }
+def relationship_to_dict(ref: Ref) -> dict:
+    """A dbterd ``Ref`` as a plain dict: endpoints + the joined columns."""
+    parent, child = ref.table_map
+    parent_cols, child_cols = ref.column_map
+    return {
+        "name": ref.name,
+        "type": ref.type,
+        "table_map": [parent, child],
+        "column_map": [list(parent_cols), list(child_cols)],
+        "relationship_label": getattr(ref, "relationship_label", None),
+    }
+@register_target("json", description="Structured JSON of tables and relationships")
+class JsonAdapter(BaseTargetAdapter):
+    """Emit dbterd tables + relationships as one structured JSON document."""
+    file_extension = ".json"
+    default_filename = "output.json"
+    def build_erd(self, tables: list, relationships: list, **kwargs: Any) -> str:
+        payload = {
+            "tables": [table_to_dict(t) for t in tables],
+            "relationships": [relationship_to_dict(r) for r in relationships],
+        }
+        return json.dumps(payload)
+    def format_table(self, table: Table, **kwargs: Any) -> str:
+        return json.dumps(table_to_dict(table))
+    def format_relationship(self, relationship: Ref, **kwargs: Any) -> str:
+        return json.dumps(relationship_to_dict(relationship))
+    def get_rel_symbol(self, relationship_type: str) -> str:
+        return ""

dbdocs/extract/graph.py ADDED Viewed

@@ -0,0 +1,72 @@
+"""Node-level lineage (the DAG) from a dbt manifest.
+``LineageGraph`` turns the manifest's ``parent_map`` (falling back to per-node
+``depends_on.nodes``) into directed parent→child edges plus adjacency maps,
+restricted to the nodes the SPA actually surfaces (models/seeds/snapshots/
+sources) so test/macro dependencies don't dangle. The result feeds the
+interactive DAG view.
+"""
+from typing import Any
+from dbdocs.core.artifacts import NODE_PREFIXES
+class LineageGraph:
+    """The project lineage as ``{edges, parents, children}`` over surfaced nodes."""
+    def __init__(self, manifest: Any, node_ids: "set | None" = None) -> None:
+        self.manifest = manifest
+        #: Restrict edges to these ids. Defaults to models/seeds/snapshots +
+        #: sources derived from the manifest, matching ``nodes.build_nodes``.
+        self.node_ids = node_ids if node_ids is not None else self._default_node_ids()
+    def _default_node_ids(self) -> set:
+        ids = {
+            uid
+            for uid in (getattr(self.manifest, "nodes", {}) or {})
+            if str(uid).startswith(NODE_PREFIXES)
+        }
+        ids.update(getattr(self.manifest, "sources", {}) or {})
+        return ids
+    def build(self) -> dict:
+        """Return ``{"edges": [...], "parents": {...}, "children": {...}}``."""
+        edges = self._edges()
+        parents: dict = {n: [] for n in self.node_ids}
+        children: dict = {n: [] for n in self.node_ids}
+        for edge in edges:
+            parents[edge["target"]].append(edge["source"])
+            children[edge["source"]].append(edge["target"])
+        return {"edges": edges, "parents": parents, "children": children}
+    def _edges(self) -> list:
+        seen = set()
+        edges = []
+        for child, raw_parents in self._parent_pairs():
+            if child not in self.node_ids:
+                continue
+            for parent in raw_parents:
+                if parent not in self.node_ids:
+                    continue
+                key = (parent, child)
+                if key in seen:
+                    continue
+                seen.add(key)
+                edges.append({"source": parent, "target": child})
+        return edges
+    def _parent_pairs(self):
+        parent_map = getattr(self.manifest, "parent_map", None)
+        if parent_map:
+            yield from parent_map.items()
+            return
+        for unique_id in self.node_ids:
+            entity = self._lookup(unique_id)
+            depends_on = getattr(entity, "depends_on", None)
+            yield unique_id, list(getattr(depends_on, "nodes", []) or [])
+    def _lookup(self, unique_id: str) -> Any:
+        if str(unique_id).startswith("source."):
+            return (getattr(self.manifest, "sources", {}) or {}).get(unique_id)
+        return (getattr(self.manifest, "nodes", {}) or {}).get(unique_id)

dbdocs/extract/nodes.py ADDED Viewed

@@ -0,0 +1,119 @@
+"""Extract the SPA's ``nodes`` and ``tree`` data from dbt artifacts.
+``build_nodes`` flattens every model/source/seed/snapshot into a display record
+(columns merged from manifest descriptions + catalog types, transformation code,
+resolved macros). ``build_tree`` groups those into the ``database → schema``
+navigation tree. Pure functions — no I/O, no dbterd calls beyond reading the
+already-parsed objects — so they're trivially testable with lightweight fakes.
+"""
+from typing import Any
+from dbdocs.core.artifacts import NODE_PREFIXES, db_schema, node_name
+def _columns(model: Any, catalog_node: Any) -> list:
+    """Merge manifest column metadata (description/tags) with catalog types.
+    Iterates the catalog's columns (the warehouse truth for which columns exist
+    and their types) and layers on the manifest description/tags when present.
+    Newlines in descriptions become ``<br>`` so they survive HTML rendering.
+    """
+    manifest_columns = getattr(model, "columns", {}) or {}
+    catalog_columns = getattr(catalog_node, "columns", {}) or {} if catalog_node else {}
+    columns = []
+    for name in catalog_columns:
+        manifest_column = manifest_columns.get(name)
+        description = getattr(manifest_column, "description", "") or "" if manifest_column else ""
+        columns.append(
+            {
+                "name": name,
+                "type": catalog_columns[name].type,
+                "tags": (getattr(manifest_column, "tags", []) or []) if manifest_column else [],
+                "description": description.replace("\n", "<br>"),
+            }
+        )
+    return columns
+def macros_used(manifest: Any, node: Any) -> list:
+    """The macros a node depends on, resolved to ``{name, package, sql}`` dicts.
+    ``depends_on.macros`` holds macro unique_ids; each is looked up in
+    ``manifest.macros``. Project macros come first (what a reader most wants),
+    then everything else, each group name-sorted.
+    """
+    macros = getattr(manifest, "macros", {}) or {}
+    depends_on = getattr(node, "depends_on", None)
+    macro_ids = list(getattr(depends_on, "macros", []) or [])
+    resolved = []
+    for macro_id in macro_ids:
+        macro = macros.get(macro_id)
+        if macro is None:
+            continue
+        resolved.append(
+            {
+                "name": getattr(macro, "name", node_name(macro_id)),
+                "package": getattr(macro, "package_name", "") or "",
+                "sql": getattr(macro, "macro_sql", "") or "",
+            }
+        )
+    project_pkg = getattr(node, "package_name", "") or ""
+    resolved.sort(key=lambda m: (m["package"] != project_pkg, m["package"], m["name"]))
+    return resolved
+def _node_record(unique_id: str, entity: Any, catalog_node: Any, resource_type: str, manifest: Any):
+    database, schema = db_schema(entity)
+    return {
+        "id": unique_id,
+        "name": getattr(entity, "name", node_name(unique_id)),
+        "label": node_name(unique_id),
+        "resource_type": resource_type,
+        "database": database,
+        "schema": schema,
+        "package": getattr(entity, "package_name", "") or "",
+        "description": getattr(entity, "description", "") or "",
+        "tags": list(getattr(entity, "tags", []) or []),
+        "relation_name": getattr(entity, "relation_name", "") or "",
+        "columns": _columns(entity, catalog_node),
+        "language": getattr(entity, "language", "") or "",
+        "raw_code": getattr(entity, "raw_code", "") or "",
+        "compiled_code": getattr(entity, "compiled_code", "") or "",
+        "macros": macros_used(manifest, entity),
+    }
+def build_nodes(manifest: Any, catalog: Any) -> dict:
+    """Return the ``nodes`` dict keyed by unique_id (models + sources)."""
+    catalog_nodes = getattr(catalog, "nodes", {}) or {}
+    catalog_sources = getattr(catalog, "sources", {}) or {}
+    nodes: dict = {}
+    for unique_id, entity in (getattr(manifest, "nodes", {}) or {}).items():
+        if not str(unique_id).startswith(NODE_PREFIXES):
+            continue
+        resource_type = str(unique_id).split(".")[0]
+        nodes[unique_id] = _node_record(
+            unique_id, entity, catalog_nodes.get(unique_id), resource_type, manifest
+        )
+    for unique_id, entity in (getattr(manifest, "sources", {}) or {}).items():
+        nodes[unique_id] = _node_record(
+            unique_id, entity, catalog_sources.get(unique_id), "source", manifest
+        )
+    return nodes
+def build_tree(nodes: dict) -> dict:
+    """Group node ids into an ordered ``{database: {schema: [ids]}}`` nav tree."""
+    by_database: dict = {}
+    for unique_id, record in nodes.items():
+        database = record["database"]
+        schema = record["schema"]
+        by_database.setdefault(database, {}).setdefault(schema, []).append(unique_id)
+    return {
+        database: {
+            schema: sorted(by_database[database][schema], key=lambda i: nodes[i]["label"])
+            for schema in sorted(by_database[database])
+        }
+        for database in sorted(by_database)
+    }

dbdocs/main.py ADDED Viewed

@@ -0,0 +1,6 @@
+from dbdocs.cli import main as cli
+def main():
+    """dbdocs entrypoint: run the command-line interface via ``cli.dbdocs()``."""
+    cli.dbdocs()

dbdocs/site/__init__.py ADDED Viewed

File without changes