perspective-cli 0.1.0 (perspective_cli-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- perspective/__init__.py +1 -0
- perspective/config.py +240 -0
- perspective/exceptions.py +15 -0
- perspective/ingest/dbt.py +150 -0
- perspective/ingest/ingest.py +164 -0
- perspective/ingest/postgres.py +388 -0
- perspective/ingest/sources/bi/powerbi/extract.py +184 -0
- perspective/ingest/sources/bi/powerbi/models.py +137 -0
- perspective/ingest/sources/bi/powerbi/pipeline.py +29 -0
- perspective/ingest/sources/bi/powerbi/transform.py +478 -0
- perspective/ingest/sources/bi/qlik_sense/extract.py +297 -0
- perspective/ingest/sources/bi/qlik_sense/models.py +22 -0
- perspective/ingest/sources/bi/qlik_sense/pipeline.py +19 -0
- perspective/ingest/sources/bi/qlik_sense/transform.py +76 -0
- perspective/ingest/sources/database/sap/extract.py +253 -0
- perspective/ingest/sources/database/sap/pipeline.py +23 -0
- perspective/ingest/sources/database/sap/transform.py +85 -0
- perspective/main.py +74 -0
- perspective/models/configs.py +422 -0
- perspective/models/dashboards.py +44 -0
- perspective/models/databases.py +26 -0
- perspective/utils/__init__.py +3 -0
- perspective/utils/options.py +77 -0
- perspective/utils/utils.py +274 -0
- perspective_cli-0.1.0.dist-info/METADATA +49 -0
- perspective_cli-0.1.0.dist-info/RECORD +29 -0
- perspective_cli-0.1.0.dist-info/WHEEL +5 -0
- perspective_cli-0.1.0.dist-info/entry_points.txt +2 -0
- perspective_cli-0.1.0.dist-info/top_level.txt +1 -0
perspective/ingest/postgres.py
@@ -0,0 +1,388 @@
+"""Utility functions for working with PostgreSQL metadata."""
+
+import os
+import re
+from typing import Any
+from urllib.parse import urljoin
+
+import psycopg
+from requests.models import Response
+from typer import Exit, Option, Typer
+
+from perspective.utils import console, run_command, send_request
+from perspective.utils.options import DryRun, PerspectiveURL
+
+
+app = Typer(no_args_is_help=True, pretty_exceptions_show_locals=False)
+
+
+def create_conn(
+    username: str, password: str, host: str, port: str, database: str
+) -> psycopg.Connection:
+    """Create and return a new database connection using the given credentials.
+
+    Args:
+        username (str): Username for the database.
+        password (str): Password for the database.
+        host (str): Host address of the database.
+        port (str): Port number for the database connection.
+        database (str): Name of the database to connect to.
+
+    Returns:
+        psycopg.Connection: A new database connection.
+    """
+    return psycopg.connect(
+        user=username, password=password, host=host, port=port, dbname=database
+    )
+
+
+def generate_pg_dump_content(
+    username: str,
+    database: str,
+    password: str,
+    host: str = "localhost",
+    port: str = "5432",
+) -> str:
+    """Generate a string dump of the PostgreSQL database schema, tables, etc.
+
+    Args:
+        username (str): The username for the database.
+        database (str): The name of the database.
+        password (str): The password for the database.
+        host (str, optional): The host of the database. Defaults to "localhost".
+        port (str, optional): The port of the database. Defaults to "5432".
+
+    Returns:
+        str: The result of the pg_dump command as a string.
+    """
+    os.environ["PGPASSWORD"] = password
+    command = f"pg_dump -h {host} -p {port} -U {username} -d {database} --column-inserts --rows-per-insert=1 --no-password"
+    return run_command(command, capture_output=True)
+
+
+def get_pg_dump_tables_info(content: str) -> list[dict]:  # noqa: PLR0914
+    """Extract table information from a PostgreSQL database dump content.
+
+    Args:
+        content (str): PostgreSQL database dump content as a string.
+
+    Returns:
+        list[dict]: A list of dictionaries, each containing information about a table
+            in the database.
+    """
+    # Find table creation statements
+    table_creations = re.findall(
+        r"CREATE TABLE (.*?)\.(.*?) \((.*?)\);", content, re.DOTALL
+    )
+
+    # Find row counts and table sizes
+    row_counts = re.findall(
+        r"-- Name: (.*?); Type: TABLE;.*?-- Total rows: (\d+)", content
+    )
+    table_sizes = re.findall(
+        r"-- Name: (.*?); Type: TABLE;.*?-- Size: (.*?)\n", content
+    )
+
+    # Find table owners
+    table_owners = re.findall(r"ALTER TABLE (.*?)\.(.*?) OWNER TO (.*?);", content)
+
+    # Find column defaults
+    column_defaults = re.findall(
+        r"ALTER TABLE ONLY (.*?)\.(.*?) ALTER COLUMN (.*?) SET DEFAULT (.*?);", content
+    )
+
+    # Find primary keys
+    primary_keys = re.findall(
+        r"ALTER TABLE ONLY (.*?)\.(.*?)\s+ADD CONSTRAINT (.*?) PRIMARY KEY \((.*?)\);",
+        content,
+    )
+
+    # Find foreign keys
+    foreign_keys = re.findall(
+        r"ALTER TABLE ONLY (.*?)\.(.*?)\s+ADD CONSTRAINT (.*?) FOREIGN KEY \((.*?)\) REFERENCES (.*?)\.(.*?)\(id\);",
+        content,
+    )
+
+    # Initialize the list of tables
+    tables_list = []
+
+    # Process each table
+    for table_schema, table_name, table_definition in table_creations:
+        # Extract column names and types
+        columns = re.findall(r"(\w+)\s+([\w\s()]+)(?:,|\))", table_definition)
+
+        # Store column names and types in a dictionary
+        columns_dict = {
+            column_name: column_type.strip() for column_name, column_type in columns
+        }
+
+        # Find row count for the table
+        row_count = next((count for name, count in row_counts if name == table_name), 0)
+
+        # Find table size (in MB) for the table
+        table_size = next(
+            (size for name, size in table_sizes if name == table_name), "Unknown"
+        )
+
+        # Find table owner
+        table_owner = next(
+            (owner for schema, name, owner in table_owners if name == table_name),
+            "Unknown",
+        )
+
+        # Find column defaults
+        column_default_dict = {
+            column_name: default_value
+            for schema, table, column_name, default_value in column_defaults
+            if table == table_name
+        }
+
+        # Find primary key
+        primary_key = next(
+            (key for schema, name, _, key in primary_keys if name == table_name), None
+        )
+
+        # Find foreign keys
+        foreign_keys_list = [
+            {"constraint_name": name, "column": column, "ref_table": ref_table}
+            for schema, table, name, column, ref_schema, ref_table in foreign_keys
+            if table == table_name
+        ]
+
+        # Add the table details to the tables list
+        tables_list.append({
+            "table_name": table_name,
+            "table_schema": table_schema,
+            "columns": columns_dict,
+            "row_count": int(row_count),
+            "table_size": table_size,
+            "owner": table_owner,
+            "column_defaults": column_default_dict,
+            "primary_key": primary_key,
+            "foreign_keys": foreign_keys_list,
+        })
+
+    return tables_list
+
+
+def get_pg_dump_views_info(content: str) -> list[dict]:
+    """Extract view information from a PostgreSQL database dump content.
+
+    Args:
+        content (str): PostgreSQL database dump content as a string.
+
+    Returns:
+        list[dict]: A list of dictionaries, each containing information about a view in
+            the database.
+    """
+    # Find view creation statements
+    view_creations = re.findall(
+        r"CREATE VIEW (.*?)\.(.*?) AS(.*?);", content, re.DOTALL
+    )
+
+    # Initialize the list for storing view information
+    views_list = []
+
+    # Process each view
+    for view_schema, view_name, view_definition in view_creations:
+        # Remove newline characters and extra spaces
+        view_definition_clean = re.sub(r"\s+", " ", view_definition)
+
+        # Extract column names and aliases
+        columns = re.findall(r"(\w+)\s+AS\s+(\w+)", view_definition_clean)
+
+        # Add the view details to the list
+        views_list.append({
+            "view_name": view_name,
+            "view_schema": view_schema,
+            "columns": dict(columns),
+        })
+
+    return views_list
+
+
+def get_tables_size_info(
+    username: str,
+    database: str,
+    password: str,
+    host: str = "localhost",
+    port: str = "5432",
+) -> dict:
+    """Retrieve size information for all tables in the specified PostgreSQL database.
+
+    Args:
+        username (str): Database username.
+        database (str): Database name.
+        password (str): Database password.
+        host (str, optional): Database host. Defaults to "localhost".
+        port (str, optional): Database port. Defaults to "5432".
+
+    Returns:
+        dict: A dictionary where keys are (schema, table name) tuples and values are
+            the tables' respective sizes.
+    """
+    conn = create_conn(username, password, host, port, database)
+    cur = conn.cursor()
+    cur.execute(
+        """
+        SELECT table_schema,
+               table_name,
+               pg_size_pretty(pg_total_relation_size(('"' || table_schema || '"."' || table_name || '"'))) AS table_size
+        FROM information_schema.tables
+        ORDER BY pg_total_relation_size(('"' || table_schema || '"."' || table_name || '"')) DESC;
+        """
+    )
+    rows = cur.fetchall()
+
+    table_size_dict = {}
+
+    for table_schema, table_name, table_size in rows:
+        table_size_dict[table_schema, table_name] = table_size
+    return table_size_dict
+
+
+def get_tables_row_counts(
+    username: str,
+    database: str,
+    password: str,
+    host: str = "localhost",
+    port: str = "5432",
+) -> dict:
+    """Retrieve row counts for all tables in a specified PostgreSQL database.
+
+    Args:
+        username (str): Database username.
+        database (str): Database name.
+        password (str): Database password.
+        host (str, optional): Database host. Defaults to "localhost".
+        port (str, optional): Database port. Defaults to "5432".
+
+    Returns:
+        dict: A dictionary where keys are (schema, table name) tuples and values are
+            the number of rows in each table.
+    """
+    conn = create_conn(username, password, host, port, database)
+    cur = conn.cursor()
+    cur.execute(
+        """
+        SELECT schemaname, relname, n_live_tup
+        FROM pg_stat_user_tables
+        ORDER BY n_live_tup DESC;
+        """
+    )
+    rows = cur.fetchall()
+    row_count_dict = {}
+
+    for table_schema, table_name, row_count in rows:
+        row_count_dict[table_schema, table_name] = row_count
+    return row_count_dict
+
+
+def get_db_metadata(
+    username: str,
+    database: str,
+    password: str,
+    host: str = "localhost",
+    port: str = "5432",
+) -> dict[str, list[dict]]:
+    """Gather metadata about the specified PostgreSQL database, tables, and views.
+
+    Args:
+        username (str): Database username.
+        database (str): Database name.
+        password (str): Database password.
+        host (str, optional): Database host. Defaults to "localhost".
+        port (str, optional): Database port. Defaults to "5432".
+
+    Returns:
+        dict[str, list[dict]]: A dictionary containing metadata about tables and views
+            in the database.
+    """
+    dump_content: str = generate_pg_dump_content(
+        username=username, database=database, host=host, port=port, password=password
+    )
+    # Transform SQL dump to dictionary
+    tables_info: list[dict] = get_pg_dump_tables_info(dump_content)
+
+    # Get row count information
+    row_count_dict: dict = get_tables_row_counts(
+        username=username, database=database, host=host, port=port, password=password
+    )
+
+    # Get table size information
+    table_size_dict: dict = get_tables_size_info(
+        username=username, database=database, host=host, port=port, password=password
+    )
+
+    for table in tables_info:
+        key = (table["table_schema"], table["table_name"])
+        table["row_count"] = row_count_dict.get(key, 0)
+        table["table_size"] = table_size_dict.get(key, 0)
+
+    views_info: list[dict] = get_pg_dump_views_info(dump_content)
+
+    return {"tables": tables_info, "views": views_info}
+
+
+@app.callback(invoke_without_command=True)
+def ingest(
+    perspective_url: str = PerspectiveURL,
+    username: str = Option(
+        ...,
+        "--username",
+        "-n",
+        envvar="LUMA_POSTGRES_USERNAME",
+        help="The username for the PostgreSQL database.",
+        prompt="PostgreSQL username",
+    ),
+    database: str = Option(
+        ...,
+        "--database",
+        "-d",
+        envvar="LUMA_POSTGRES_DATABASE",
+        help="The name of the PostgreSQL database.",
+        prompt="PostgreSQL database",
+    ),
+    host: str = Option(
+        "localhost",
+        "--host",
+        "-h",
+        envvar="LUMA_POSTGRES_HOST",
+        help="The host address of the PostgreSQL database.",
+    ),
+    port: str = Option(
+        "5432",
+        "--port",
+        "-p",
+        envvar="LUMA_POSTGRES_PORT",
+        help="The port number for the PostgreSQL database.",
+    ),
+    password: str = Option(
+        ...,
+        "--password",
+        "-P",
+        envvar="LUMA_POSTGRES_PASSWORD",
+        help="The password for the PostgreSQL database.",
+        prompt="PostgreSQL password",
+        hide_input=True,
+    ),
+    dry_run: bool = DryRun,
+) -> Response:
+    """Ingest metadata from a PostgreSQL database into a Luma ingestion endpoint."""
+    # Retrieve database metadata.
+    db_metadata: dict[str, list[dict[str, Any]]] = get_db_metadata(
+        username=username, database=database, host=host, port=port, password=password
+    )
+
+    # In dry run mode, print the database metadata and exit.
+    if dry_run:
+        console.print(db_metadata)
+        raise Exit(0)
+
+    # Send ingestion request.
+    return send_request(
+        url=urljoin(perspective_url, "postgres/"),
+        method="POST",
+        payload=db_metadata,
+        verify=False,
+    )
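For reference, everything get_pg_dump_tables_info knows about a table comes from regex matches over the raw dump text. Below is a minimal, self-contained sketch of that approach on a hypothetical dump fragment (the sample SQL and printed output are illustrative, not taken from the package):

import re

# Hypothetical pg_dump fragment (illustrative only).
dump = """
CREATE TABLE public.users (
    id integer NOT NULL,
    email text,
    created_at timestamp
);
"""

# The same patterns get_pg_dump_tables_info applies to the full dump.
for schema, name, definition in re.findall(
    r"CREATE TABLE (.*?)\.(.*?) \((.*?)\);", dump, re.DOTALL
):
    columns = re.findall(r"(\w+)\s+([\w\s()]+)(?:,|\))", definition)
    print(schema, name, {col: typ.strip() for col, typ in columns})
# Prints: public users {'id': 'integer NOT NULL', 'email': 'text'}
# (the column pattern only captures columns followed by "," or ")").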
perspective/ingest/sources/bi/powerbi/extract.py
@@ -0,0 +1,184 @@
+"""Download lineage information from the PowerBI API."""
+
+from collections.abc import Generator
+import logging
+from pathlib import Path
+from time import sleep
+from typing import Any
+
+from azure.identity import ClientSecretCredential
+import dlt
+from dlt.extract.resource import DltResource
+from dlt.sources.helpers.rest_client import RESTClient
+from dlt.sources.helpers.rest_client.auth import OAuth2AuthBase
+from dlt.sources.helpers.rest_client.paginators import SinglePagePaginator
+
+from perspective.exceptions import ExtractionError
+
+
+Workspace = dict[str, Any]
+WorkspaceDataflows = dict[str, Any]
+DataflowDetails = dict[str, Any]
+
+
+logger = logging.getLogger("dlt")
+
+
+class PowerBIOauthClientCredentials(OAuth2AuthBase):
+    def __init__(self, tenant_id: str, client_id: str, client_secret: str):
+        """PowerBI OAuth2 client credentials authentication.
+
+        Args:
+            tenant_id (str): The Azure tenant ID.
+            client_id (str): The client ID of the service principal app.
+            client_secret (str): The client secret of the service principal app.
+        """
+        super().__init__()
+        self.access_token = self._get_token(tenant_id, client_id, client_secret)
+
+    @staticmethod
+    def _get_token(tenant_id: str, client_id: str, client_secret: str) -> str:
+        scope = "https://analysis.windows.net/powerbi/api/.default"
+        client_secret_credential_class = ClientSecretCredential(
+            tenant_id=tenant_id, client_id=client_id, client_secret=client_secret
+        )
+        return client_secret_credential_class.get_token(scope).token
+
+
+@dlt.source(name="powerbi")
+def powerbi(  # noqa: C901
+    tenant_id: str = dlt.secrets.value,
+    client_id: str = dlt.secrets.value,
+    client_secret: str = dlt.secrets.value,
+) -> list[DltResource]:
+    """The PowerBI metadata source.
+
+    Args:
+        tenant_id (str, optional): The Azure tenant ID. Defaults to dlt.secrets.value.
+        client_id (str, optional): The client ID of the service principal app. Defaults
+            to dlt.secrets.value.
+        client_secret (str, optional): The client secret of the service principal app.
+            Defaults to dlt.secrets.value.
+
+    Returns:
+        list[DltResource]: The workspace lineage and dataflow details resources.
+    """
+    client = RESTClient(
+        base_url="https://api.powerbi.com/v1.0/myorg",
+        auth=PowerBIOauthClientCredentials(
+            tenant_id=tenant_id,
+            client_id=client_id,
+            client_secret=client_secret,
+        ),
+        paginator=SinglePagePaginator(),
+    )
+
+    @dlt.resource(primary_key="id", write_disposition="replace")
+    def workspaces() -> Generator[list[dict[str, Any]], None, None]:
+        endpoint = "groups"
+        yield client.get(endpoint).json()["value"]
+
+    # We need to add the type hint for our custom column here as this is required for
+    # dbt-osmosis to correctly generate the bronze properties file, and consequently,
+    # for dbt-ibis to work.
+    @dlt.transformer(
+        data_from=workspaces, columns={"description": {"data_type": "text"}}
+    )
+    def workspaces_lineage(
+        workspaces: list[Workspace],
+    ) -> Generator[list[Workspace], None, None]:
+        workspace_ids = [workspace["id"] for workspace in workspaces]
+        request_lineage_endpoint = "admin/workspaces/getInfo"
+        params = {
+            "lineage": True,
+            "datasourceDetails": True,
+            "datasetSchema": True,
+            "datasetExpressions": True,
+            "getArtifactUsers": True,
+        }
+        body = {"workspaces": workspace_ids}
+
+        # Request a workspace lineage scan and await scan completion.
+        response = client.post(
+            request_lineage_endpoint, params=params, json=body
+        ).json()
+        if response.get("error"):
+            msg = f"Error requesting workspace lineage scan: {response['error']}"
+            raise ExtractionError(msg)
+
+        scan_id = response["id"]
+        scan_status = None
+        logger.info("Waiting for scan to complete...")
+        while scan_status != "Succeeded":
+            scan_status_endpoint = f"admin/workspaces/scanStatus/{scan_id}"
+            scan_status = client.get(scan_status_endpoint).json()["status"]
+            sleep(0.2)
+
+        # Get the scan result.
+        scan_result_endpoint = f"admin/workspaces/scanResult/{scan_id}"
+        response = client.get(scan_result_endpoint)
+        lineage = response.json()
+
+        # Add the "description" column if it doesn't exist.
+        # This is required as the schema returned by the PowerBI REST API is dynamic,
+        # which can break everything downstream. For example, if a workspace
+        # description is not set, instead of returning "description": "", the PBI
+        # REST API simply omits the "description" key.
+        for i, workspace in enumerate(lineage["workspaces"]):
+            if "description" not in workspace:
+                lineage["workspaces"][i]["description"] = ""
+
+        yield lineage
+
+    @dlt.transformer(data_from=workspaces)
+    def workspace_dataflows(
+        workspaces: list[Workspace],
+    ) -> Generator[WorkspaceDataflows, None, None]:
+        for workspace in workspaces:
+            workspace_id = workspace["id"]
+
+            logger.info(f"Extracting dataflows for workspace '{workspace_id}'...")
+
+            endpoint = f"groups/{workspace_id}/dataflows"
+            dataflows = client.get(endpoint).json()["value"]
+
+            logger.info(f"Extracted {len(dataflows)} dataflows.")
+
+            yield {
+                "workspace_id": workspace_id,
+                "dataflows": dataflows,
+            }
+
+    @dlt.transformer(data_from=workspace_dataflows)
+    def dataflows_details(
+        workspace_dataflows: WorkspaceDataflows,
+    ) -> Generator[list[DataflowDetails], None, None]:
+        dataflows_details = []
+        workspace_id = workspace_dataflows["workspace_id"]
+
+        for dataflow in workspace_dataflows["dataflows"]:
+            dataflow_id = dataflow["objectId"]
+            endpoint = f"groups/{workspace_id}/dataflows/{dataflow_id}"
+            dataflow_details = client.get(endpoint).json()
+            dataflows_details.append({"id": dataflow_id, **dataflow_details})
+
+        yield dataflows_details
+
+    return [workspaces_lineage, dataflows_details]
+
+
+if __name__ == "__main__":
+    import json
+
+    source = powerbi()
+    lineage = source.workspaces_lineage
+    dataflows = source.dataflows_details
+
+    with Path("powerbi_workspace_info.json").open("w", encoding="utf-8") as f:
+        json.dump(list(lineage), f, indent=4)
+
+    with Path("powerbi_dataflows_info.json").open("w", encoding="utf-8") as f:
+        json.dump(list(dataflows), f, indent=4)
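The __main__ block above only materializes the resources to JSON files; in normal use the source would be loaded through a dlt pipeline (the package ships perspective/ingest/sources/bi/powerbi/pipeline.py for this, not shown here). A minimal sketch of running such a source with dlt, where the pipeline name, destination, and dataset name are assumptions for illustration rather than the package's actual configuration:

import dlt

from perspective.ingest.sources.bi.powerbi.extract import powerbi

# Hypothetical wiring; the package's pipeline.py may configure this differently.
pipeline = dlt.pipeline(
    pipeline_name="powerbi_metadata",  # assumed name
    destination="duckdb",  # assumed destination; any dlt destination would do
    dataset_name="powerbi",  # assumed dataset name
)

# tenant_id, client_id, and client_secret resolve from dlt's secrets configuration.
load_info = pipeline.run(powerbi())
print(load_info)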
perspective/ingest/sources/bi/powerbi/models.py
@@ -0,0 +1,137 @@
+"""Pydantic models for Power BI metadata ingestion."""
+
+from enum import Enum
+from typing import Any
+
+from pydantic import BaseModel, EmailStr, Field
+
+
+class ColumnType(str, Enum):
+    CalculatedTableColumn = "CalculatedTableColumn"
+    Calculated = "Calculated"
+    Data = "Data"
+
+
+class TableColumn(BaseModel):
+    name: str
+    dataType: str
+    isHidden: bool
+    columnType: ColumnType
+
+
+class TableMeasure(BaseModel):
+    name: str
+    expression: str
+    description: str | None = None
+    isHidden: bool
+
+
+class TableSource(BaseModel):
+    expression: str
+
+
+class SourceExpression(BaseModel):
+    name: str
+    expression: str
+
+
+class Table(BaseModel):
+    name: str
+    isHidden: bool
+    columns: list[TableColumn] = []
+    measures: list[TableMeasure] = []
+    source: list[TableSource] | None = None
+
+
+class Dataset(BaseModel):
+    id: str
+    name: str
+    tables: list[Table] = []
+    expressions: list[SourceExpression] | None = None
+    description: str | None = None
+    configuredBy: EmailStr | None = None
+    configuredById: str | None = None
+    directQueryRefreshSchedule: dict[str, Any] | None = None
+    createdDate: str
+    users: list[dict] | None = None
+
+
+class Dataflow(BaseModel):
+    objectId: str
+    name: str
+    description: str | None = None
+    configuredBy: str
+    users: list[str] = []
+
+
+class WorkspaceDataflows(BaseModel):
+    workspace_id: str
+    dataflows: list[Dataflow]
+
+
+class PBIMashup(BaseModel):
+    queriesMetadata: dict[str, dict[str, Any]]
+    document: str
+    connectionOverrides: list[dict[str, Any]] | None = None
+
+
+class DataflowDetails(BaseModel):
+    id: str
+    name: str
+    description: str
+    version: str
+    modifiedTime: str
+    pbi_mashup: PBIMashup = Field(alias="pbi:mashup")
+    entities: list[dict[str, Any]] = Field(default_factory=list)
+
+
+class DashboardTile(BaseModel):
+    id: str
+    title: str
+    reportId: str
+    datasetId: str
+
+
+class Dashboard(BaseModel):
+    id: str
+    displayName: str
+    tiles: list[DashboardTile]
+    users: list[dict]
+    tags: list[str]
+
+
+class ReportUser(BaseModel):
+    reportUserAccessRight: str
+    emailAddress: EmailStr | None
+    displayName: str
+    identifier: EmailStr | str
+    graphId: str
+    principalType: str
+    userType: str | None
+
+
+class Report(BaseModel):
+    id: str
+    appId: str | None = None
+    name: str
+    datasetId: str
+    description: str | None = None
+    createdDateTime: str | None = None
+    modifiedDateTime: str | None = None
+    users: list[ReportUser] | None = None
+
+
+class Workspace(BaseModel):
+    id: str
+    name: str
+    type: str = "Workspace"
+    state: str
+    reports: list[Report] = []
+    datasets: list[Dataset] = []
+    dashboards: list[Dashboard] = []
+    users: list[dict]
+
+
+class WorkspaceInfo(BaseModel):
+    workspaces: list[Workspace]
+    datasourceInstances: list[dict] | None = None