acryl-datahub 1.0.0.2rc4-py3-none-any.whl → 1.0.0.3-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.

Potentially problematic release.



Files changed (159)
  1. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/METADATA +2566 -2514
  2. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/RECORD +159 -149
  3. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  6. datahub/api/entities/datacontract/datacontract.py +35 -3
  7. datahub/api/entities/datajob/dataflow.py +3 -3
  8. datahub/api/entities/datajob/datajob.py +7 -4
  9. datahub/api/entities/dataset/dataset.py +9 -11
  10. datahub/api/entities/forms/forms.py +34 -34
  11. datahub/api/graphql/assertion.py +1 -1
  12. datahub/api/graphql/operation.py +4 -4
  13. datahub/cli/check_cli.py +3 -2
  14. datahub/cli/config_utils.py +2 -2
  15. datahub/cli/delete_cli.py +6 -5
  16. datahub/cli/docker_cli.py +2 -2
  17. datahub/cli/exists_cli.py +2 -1
  18. datahub/cli/get_cli.py +2 -1
  19. datahub/cli/iceberg_cli.py +6 -5
  20. datahub/cli/ingest_cli.py +9 -6
  21. datahub/cli/migrate.py +4 -3
  22. datahub/cli/migration_utils.py +4 -3
  23. datahub/cli/put_cli.py +3 -2
  24. datahub/cli/specific/assertions_cli.py +2 -1
  25. datahub/cli/specific/datacontract_cli.py +3 -2
  26. datahub/cli/specific/dataproduct_cli.py +10 -9
  27. datahub/cli/specific/dataset_cli.py +4 -3
  28. datahub/cli/specific/forms_cli.py +2 -1
  29. datahub/cli/specific/group_cli.py +2 -1
  30. datahub/cli/specific/structuredproperties_cli.py +4 -3
  31. datahub/cli/specific/user_cli.py +2 -1
  32. datahub/cli/state_cli.py +2 -1
  33. datahub/cli/timeline_cli.py +2 -1
  34. datahub/configuration/common.py +5 -0
  35. datahub/configuration/source_common.py +1 -1
  36. datahub/emitter/mcp.py +20 -5
  37. datahub/emitter/request_helper.py +116 -3
  38. datahub/emitter/rest_emitter.py +163 -93
  39. datahub/entrypoints.py +2 -1
  40. datahub/errors.py +4 -0
  41. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
  42. datahub/ingestion/api/source.py +2 -5
  43. datahub/ingestion/api/source_helpers.py +1 -0
  44. datahub/ingestion/glossary/classification_mixin.py +4 -2
  45. datahub/ingestion/graph/client.py +33 -8
  46. datahub/ingestion/graph/config.py +14 -0
  47. datahub/ingestion/graph/filters.py +1 -1
  48. datahub/ingestion/graph/links.py +53 -0
  49. datahub/ingestion/run/pipeline.py +9 -6
  50. datahub/ingestion/run/pipeline_config.py +1 -1
  51. datahub/ingestion/sink/datahub_rest.py +5 -6
  52. datahub/ingestion/source/apply/datahub_apply.py +2 -1
  53. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  54. datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
  55. datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
  56. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
  57. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
  58. datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
  59. datahub/ingestion/source/common/subtypes.py +3 -0
  60. datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
  61. datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
  62. datahub/ingestion/source/dbt/dbt_common.py +10 -2
  63. datahub/ingestion/source/dbt/dbt_core.py +82 -42
  64. datahub/ingestion/source/dynamodb/dynamodb.py +7 -4
  65. datahub/ingestion/source/feast.py +4 -4
  66. datahub/ingestion/source/fivetran/config.py +1 -1
  67. datahub/ingestion/source/fivetran/fivetran_log_api.py +7 -3
  68. datahub/ingestion/source/fivetran/fivetran_query.py +16 -16
  69. datahub/ingestion/source/ge_data_profiler.py +27 -1
  70. datahub/ingestion/source/hex/api.py +1 -20
  71. datahub/ingestion/source/hex/query_fetcher.py +4 -1
  72. datahub/ingestion/source/iceberg/iceberg.py +20 -4
  73. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  74. datahub/ingestion/source/ldap.py +1 -1
  75. datahub/ingestion/source/looker/looker_common.py +17 -2
  76. datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
  77. datahub/ingestion/source/looker/looker_source.py +34 -5
  78. datahub/ingestion/source/looker/lookml_source.py +7 -1
  79. datahub/ingestion/source/metadata/lineage.py +2 -1
  80. datahub/ingestion/source/mlflow.py +19 -6
  81. datahub/ingestion/source/mode.py +74 -28
  82. datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
  83. datahub/ingestion/source/powerbi/config.py +13 -1
  84. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  85. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  86. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
  87. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  88. datahub/ingestion/source/redshift/usage.py +10 -9
  89. datahub/ingestion/source/sigma/config.py +74 -6
  90. datahub/ingestion/source/sigma/sigma.py +16 -1
  91. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  92. datahub/ingestion/source/slack/slack.py +4 -52
  93. datahub/ingestion/source/snowflake/snowflake_config.py +2 -12
  94. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -18
  95. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  96. datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
  97. datahub/ingestion/source/snowflake/snowflake_query.py +9 -63
  98. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  99. datahub/ingestion/source/sql/athena.py +2 -1
  100. datahub/ingestion/source/sql/clickhouse.py +5 -1
  101. datahub/ingestion/source/sql/druid.py +7 -2
  102. datahub/ingestion/source/sql/hive.py +7 -2
  103. datahub/ingestion/source/sql/hive_metastore.py +5 -5
  104. datahub/ingestion/source/sql/mssql/source.py +1 -1
  105. datahub/ingestion/source/sql/oracle.py +6 -2
  106. datahub/ingestion/source/sql/sql_config.py +1 -34
  107. datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
  108. datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
  109. datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
  110. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  111. datahub/ingestion/source/tableau/tableau.py +31 -6
  112. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  113. datahub/ingestion/source/unity/config.py +2 -1
  114. datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
  115. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
  116. datahub/ingestion/source/vertexai/vertexai.py +316 -4
  117. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +23 -2
  118. datahub/integrations/assertion/common.py +3 -2
  119. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +538 -493
  120. datahub/metadata/_urns/urn_defs.py +1819 -1763
  121. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  122. datahub/metadata/schema.avsc +17296 -16883
  123. datahub/metadata/schema_classes.py +3 -3
  124. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  125. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  126. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  127. datahub/metadata/schemas/FormInfo.avsc +5 -0
  128. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  129. datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  130. datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
  131. datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  132. datahub/metadata/schemas/QueryProperties.avsc +4 -2
  133. datahub/metadata/schemas/SystemMetadata.avsc +86 -0
  134. datahub/metadata/schemas/__init__.py +3 -3
  135. datahub/sdk/_all_entities.py +4 -0
  136. datahub/sdk/_shared.py +142 -4
  137. datahub/sdk/_utils.py +4 -0
  138. datahub/sdk/dataset.py +2 -2
  139. datahub/sdk/entity_client.py +8 -0
  140. datahub/sdk/lineage_client.py +235 -0
  141. datahub/sdk/main_client.py +6 -3
  142. datahub/sdk/mlmodel.py +301 -0
  143. datahub/sdk/mlmodelgroup.py +233 -0
  144. datahub/secret/datahub_secret_store.py +2 -1
  145. datahub/specific/dataset.py +12 -0
  146. datahub/sql_parsing/fingerprint_utils.py +6 -0
  147. datahub/sql_parsing/sql_parsing_aggregator.py +48 -34
  148. datahub/sql_parsing/sqlglot_utils.py +18 -14
  149. datahub/telemetry/telemetry.py +2 -2
  150. datahub/testing/check_imports.py +1 -1
  151. datahub/testing/mcp_diff.py +15 -2
  152. datahub/upgrade/upgrade.py +10 -12
  153. datahub/utilities/logging_manager.py +8 -1
  154. datahub/utilities/server_config_util.py +350 -10
  155. datahub/utilities/sqlalchemy_query_combiner.py +4 -5
  156. datahub/utilities/urn_encoder.py +1 -1
  157. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/entry_points.txt +0 -0
  158. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/licenses/LICENSE +0 -0
  159. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi/config.py

@@ -192,6 +192,11 @@ class SupportedDataPlatform(Enum):
         datahub_data_platform_name="mysql",
     )
 
+    ODBC = DataPlatformPair(
+        powerbi_data_platform_name="Odbc",
+        datahub_data_platform_name="odbc",
+    )
+
 
 @dataclass
 class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
@@ -341,6 +346,13 @@ class PowerBiDashboardSourceConfig(
         "For Google BigQuery the datasource's server is google bigquery project name. "
         "For Databricks Unity Catalog the datasource's server is workspace FQDN.",
     )
+    # ODBC DSN to platform mapping
+    dsn_to_platform_name: Dict[str, str] = pydantic.Field(
+        default={},
+        description="A mapping of ODBC DSN to DataHub data platform name. "
+        "For example with an ODBC connection string 'DSN=database' where the database type "
+        "is 'PostgreSQL' you would configure the mapping as 'database: postgres'.",
+    )
     # deprecated warning
     _dataset_type_mapping = pydantic_field_deprecated(
         "dataset_type_mapping",
@@ -501,7 +513,7 @@ class PowerBiDashboardSourceConfig(
     include_workspace_name_in_dataset_urn: bool = pydantic.Field(
         default=False,
         description="It is recommended to set this to true, as it helps prevent the overwriting of datasets."
-        "Read section #11560 at https://datahubproject.io/docs/how/updating-datahub/ before enabling this option."
+        "Read section #11560 at https://docs.datahub.com/docs/how/updating-datahub/ before enabling this option."
        "To maintain backward compatibility, this is set to False.",
    )
 
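The new dsn_to_platform_name option is a plain string-to-string mapping keyed by DSN. A minimal sketch of such a mapping, following the example given in the field description (the DSN names and platforms below are made up for illustration):

# Illustrative only: map the DSN that appears in an ODBC connect string
# (e.g. 'DSN=warehouse') to the DataHub platform name of the database behind it.
dsn_to_platform_name = {
    "warehouse": "postgres",    # 'DSN=warehouse' points at a PostgreSQL database
    "finance_dw": "snowflake",  # 'DSN=finance_dw' points at Snowflake
}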
datahub/ingestion/source/powerbi/m_query/data_classes.py

@@ -75,3 +75,4 @@ class FunctionName(Enum):
     AMAZON_REDSHIFT_DATA_ACCESS = "AmazonRedshift.Database"
     DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs"
     MYSQL_DATA_ACCESS = "MySQL.Database"
+    ODBC_DATA_ACCESS = "Odbc.DataSource"
datahub/ingestion/source/powerbi/m_query/odbc.py (new file)

@@ -0,0 +1,185 @@
+import re
+from typing import Optional, Tuple, Union
+
+server_patterns = [
+    r"Server=([^:]+)[:][0-9]+/.*",
+    r"SERVER=\{([^}]*)\}",
+    r"SERVER=([^;]*)",
+    r"HOST=\{([^}]*)\}",
+    r"HOST=([^;]*)",
+    r"DATA SOURCE=\{([^}]*)\}",
+    r"DATA SOURCE=([^;]*)",
+    r"DSN=\{([^}]*)\}",
+    r"DSN=([^;]*)",
+    r"Server=([^;]*)",
+    r"S3OutputLocation=([^;]*)",
+    r"HTTPPath=([^;]*)",
+    r"Host=([^;]*)",
+]
+
+dsn_patterns = [
+    r"DSN\s*=\s*\"([^\"]+)\"",
+    r"DSN\s*=\s*\'([^\']+)\'",
+    r"DSN\s*=\s*([^;]+)",
+]
+
+platform_patterns = {
+    "mysql": r"mysql",
+    "postgres": r"post(gre(s|sql)?|gres)",
+    "mssql": r"(sql\s*server|mssql|sqlncli)",
+    "oracle": r"oracle",
+    "db2": r"db2",
+    "sqlite": r"sqlite",
+    "access": r"(access|\.mdb|\.accdb)",
+    "excel": r"(excel|\.xls)",
+    "firebird": r"firebird",
+    "informix": r"informix",
+    "sybase": r"sybase",
+    "teradata": r"teradata",
+    "hadoop": r"(hadoop|hive)",
+    "snowflake": r"snowflake",
+    "redshift": r"redshift",
+    "bigquery": r"bigquery",
+    "athena": r"(athena|aws\s*athena)",
+    "databricks": r"(databricks|spark)",
+}
+
+powerbi_platform_names = {
+    "mysql": "MySQL",
+    "postgres": "PostgreSQL",
+    "mssql": "SQL Server",
+    "oracle": "Oracle",
+    "db2": "IBM DB2",
+    "sqlite": "SQLite",
+    "access": "Microsoft Access",
+    "excel": "Microsoft Excel",
+    "firebird": "Firebird",
+    "informix": "IBM Informix",
+    "sybase": "SAP Sybase",
+    "teradata": "Teradata",
+    "hadoop": "Hadoop",
+    "snowflake": "Snowflake",
+    "redshift": "Amazon Redshift",
+    "bigquery": "Google BigQuery",
+    "athena": "Amazon Athena",
+    "databricks": "Databricks",
+}
+
+
+def extract_driver(connection_string: str) -> Union[str, None]:
+    """
+    Parse an ODBC connection string and extract the driver name.
+    Handles whitespace in driver names and various connection string formats.
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        str: The extracted driver name, or None if not found
+    """
+    # Match DRIVER={driver name} pattern
+    driver_match = re.search(r"DRIVER=\{([^}]*)}", connection_string, re.IGNORECASE)
+
+    if driver_match:
+        return driver_match.group(1).strip()
+
+    # Alternative pattern for DRIVER=driver
+    driver_match = re.search(r"DRIVER=([^;]*)", connection_string, re.IGNORECASE)
+
+    if driver_match:
+        return driver_match.group(1).strip()
+
+    return None
+
+
+def extract_dsn(connection_string: str) -> Union[str, None]:
+    """
+    Extract the DSN value from an ODBC connection string.
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        str or None: The extracted DSN value, or None if not found
+    """
+    for pattern in dsn_patterns:
+        match = re.search(pattern, connection_string, re.IGNORECASE)
+        if match:
+            return match.group(1).strip()
+
+    return None
+
+
+def extract_server(connection_string: str) -> Union[str, None]:
+    """
+    Parse an ODBC connection string and extract the server name.
+    Handles various parameter names for server (SERVER, Host, Data Source, etc.)
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        str: The extracted server name, or None if not found
+    """
+    for pattern in server_patterns:
+        server_match = re.search(pattern, connection_string, re.IGNORECASE)
+        if server_match:
+            return server_match.group(1).strip()
+
+    # Special case for Athena: extract from AwsRegion if no server found
+    region_match = re.search(r"AwsRegion=([^;]*)", connection_string, re.IGNORECASE)
+    if region_match:
+        return f"aws-athena-{region_match.group(1).strip()}"
+
+    # Special case for Databricks: try to extract hostname from JDBC URL
+    jdbc_match = re.search(r"jdbc:spark://([^:;/]+)", connection_string, re.IGNORECASE)
+    if jdbc_match:
+        return jdbc_match.group(1).strip()
+
+    return None
+
+
+def extract_platform(connection_string: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Extract the database platform name from the ODBC driver name.
+    Returns the lowercase platform name.
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        tuple: A tuple containing the normalized platform name and the corresponding
+            Power BI platform name, or None if not recognized.
+    """
+    driver_name = extract_driver(connection_string)
+    if not driver_name:
+        return None, None
+
+    driver_lower = driver_name.lower()
+
+    for platform, pattern in platform_patterns.items():
+        if re.search(pattern, driver_lower):
+            return platform, powerbi_platform_names.get(platform)
+
+    return None, None
+
+
+def normalize_platform_name(platform: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Normalizes the platform name by matching it with predefined patterns and maps it to
+    a corresponding Power BI platform name.
+
+    Args:
+        platform (str): The platform name to normalize
+
+    Returns:
+        tuple: A tuple containing the normalized platform name and the corresponding
+            Power BI platform name, or None if not recognized.
+    """
+    platform_lower = platform.lower()
+
+    for platform, pattern in platform_patterns.items():
+        if re.search(pattern, platform_lower):
+            return platform, powerbi_platform_names.get(platform)
+
+    return None, None
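The module above is pure regex parsing with no external dependencies, so its behavior is easy to check interactively. A small usage sketch; the connection strings are made-up examples:

from datahub.ingestion.source.powerbi.m_query.odbc import (
    extract_driver,
    extract_dsn,
    extract_platform,
    extract_server,
)

conn = "DRIVER={PostgreSQL Unicode};SERVER=pg.internal;PORT=5432;DATABASE=sales"
print(extract_driver(conn))    # PostgreSQL Unicode
print(extract_server(conn))    # pg.internal
print(extract_platform(conn))  # ('postgres', 'PostgreSQL')

# DSN-only strings carry no driver, so the platform cannot be inferred directly
# and must come from the dsn_to_platform_name mapping shown earlier.
print(extract_platform("DSN=warehouse;UID=ingest"))  # (None, None)
print(extract_dsn("DSN=warehouse;UID=ingest"))       # warehouse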
datahub/ingestion/source/powerbi/m_query/pattern_handler.py

@@ -29,6 +29,12 @@ from datahub.ingestion.source.powerbi.m_query.data_classes import (
     Lineage,
     ReferencedTable,
 )
+from datahub.ingestion.source.powerbi.m_query.odbc import (
+    extract_dsn,
+    extract_platform,
+    extract_server,
+    normalize_platform_name,
+)
 from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table
 from datahub.metadata.schema_classes import SchemaFieldDataTypeClass
 from datahub.sql_parsing.sqlglot_lineage import (
@@ -155,6 +161,7 @@ class AbstractLineage(ABC):
                 tree_function.token_values(arg_list)
             ),
         )
+        logger.debug(f"DB Details: {arguments}")
 
         if len(arguments) < 2:
             logger.debug(f"Expected minimum 2 arguments, but got {len(arguments)}")
@@ -940,6 +947,147 @@ class NativeQueryLineage(AbstractLineage):
         )
 
 
+class OdbcLineage(AbstractLineage):
+    def create_lineage(
+        self, data_access_func_detail: DataAccessFunctionDetail
+    ) -> Lineage:
+        logger.debug(
+            f"Processing {self.get_platform_pair().powerbi_data_platform_name} "
+            f"data-access function detail {data_access_func_detail}"
+        )
+
+        connect_string, _ = self.get_db_detail_from_argument(
+            data_access_func_detail.arg_list
+        )
+
+        if not connect_string:
+            self.reporter.warning(
+                title="Can not extract ODBC connect string",
+                message="Can not extract ODBC connect string from data access function. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, data-access-func-detail={data_access_func_detail}",
+            )
+            return Lineage.empty()
+
+        logger.debug(f"ODBC connect string: {connect_string}")
+        data_platform, powerbi_platform = extract_platform(connect_string)
+        server_name = extract_server(connect_string)
+
+        if not data_platform:
+            dsn = extract_dsn(connect_string)
+            if dsn:
+                logger.debug(f"Extracted DSN: {dsn}")
+                server_name = dsn
+            if dsn and self.config.dsn_to_platform_name:
+                logger.debug(f"Attempting to map DSN {dsn} to platform")
+                name = self.config.dsn_to_platform_name.get(dsn)
+                if name:
+                    logger.debug(f"Found DSN {dsn} mapped to platform {name}")
+                    data_platform, powerbi_platform = normalize_platform_name(name)
+
+        if not data_platform or not powerbi_platform:
+            self.reporter.warning(
+                title="Can not determine ODBC platform",
+                message="Can not determine platform from ODBC connect string. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, connect-string={connect_string}",
+            )
+            return Lineage.empty()
+
+        platform_pair: DataPlatformPair = self.create_platform_pair(
+            data_platform, powerbi_platform
+        )
+
+        if not server_name and self.config.server_to_platform_instance:
+            self.reporter.warning(
+                title="Can not determine ODBC server name",
+                message="Can not determine server name with server_to_platform_instance mapping. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}",
+            )
+            return Lineage.empty()
+        elif not server_name:
+            server_name = "unknown"
+
+        database_name = None
+        schema_name = None
+        table_name = None
+        qualified_table_name = None
+
+        temp_accessor: Optional[IdentifierAccessor] = (
+            data_access_func_detail.identifier_accessor
+        )
+
+        while temp_accessor:
+            logger.debug(
+                f"identifier = {temp_accessor.identifier} items = {temp_accessor.items}"
+            )
+            if temp_accessor.items.get("Kind") == "Database":
+                database_name = temp_accessor.items["Name"]
+
+            if temp_accessor.items.get("Kind") == "Schema":
+                schema_name = temp_accessor.items["Name"]
+
+            if temp_accessor.items.get("Kind") == "Table":
+                table_name = temp_accessor.items["Name"]
+
+            if temp_accessor.next is not None:
+                temp_accessor = temp_accessor.next
+            else:
+                break
+
+        if (
+            database_name is not None
+            and schema_name is not None
+            and table_name is not None
+        ):
+            qualified_table_name = f"{database_name}.{schema_name}.{table_name}"
+        elif database_name is not None and table_name is not None:
+            qualified_table_name = f"{database_name}.{table_name}"
+
+        if not qualified_table_name:
+            self.reporter.warning(
+                title="Can not determine qualified table name",
+                message="Can not determine qualified table name for ODBC data source. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, data-platform={data_platform}",
+            )
+            logger.warning(
+                f"Can not determine qualified table name for ODBC data source {data_platform} "
+                f"table {self.table.full_name}."
+            )
+            return Lineage.empty()
+
+        logger.debug(
+            f"ODBC Platform {data_platform} found qualified table name {qualified_table_name}"
+        )
+
+        urn = make_urn(
+            config=self.config,
+            platform_instance_resolver=self.platform_instance_resolver,
+            data_platform_pair=platform_pair,
+            server=server_name,
+            qualified_table_name=qualified_table_name,
+        )
+
+        column_lineage = self.create_table_column_lineage(urn)
+
+        return Lineage(
+            upstreams=[
+                DataPlatformTable(
+                    data_platform_pair=platform_pair,
+                    urn=urn,
+                )
+            ],
+            column_lineage=column_lineage,
+        )
+
+    @staticmethod
+    def create_platform_pair(
+        data_platform: str, powerbi_platform: str
+    ) -> DataPlatformPair:
+        return DataPlatformPair(data_platform, powerbi_platform)
+
+    def get_platform_pair(self) -> DataPlatformPair:
+        return SupportedDataPlatform.ODBC.value
+
+
 class SupportedPattern(Enum):
     DATABRICKS_QUERY = (
         DatabricksLineage,
@@ -991,6 +1139,11 @@ class SupportedPattern(Enum):
         FunctionName.NATIVE_QUERY,
     )
 
+    ODBC = (
+        OdbcLineage,
+        FunctionName.ODBC_DATA_ACCESS,
+    )
+
     def handler(self) -> Type[AbstractLineage]:
         return self.value[0]
 
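Taken together with odbc.py, the platform resolution order in OdbcLineage is: try the DRIVER embedded in the connect string first, then fall back to the DSN via the new dsn_to_platform_name config. A condensed, standalone sketch of that decision; the imported helpers are the real ones added above, while resolve_platform itself is a hypothetical wrapper for illustration:

from typing import Dict, Optional, Tuple

from datahub.ingestion.source.powerbi.m_query.odbc import (
    extract_dsn,
    extract_platform,
    normalize_platform_name,
)


def resolve_platform(
    connect_string: str, dsn_to_platform_name: Dict[str, str]
) -> Tuple[Optional[str], Optional[str]]:
    # 1) Try to recognize the platform from the ODBC driver name.
    data_platform, powerbi_platform = extract_platform(connect_string)
    if not data_platform:
        # 2) No driver match: look the DSN up in the user-supplied mapping.
        dsn = extract_dsn(connect_string)
        name = dsn_to_platform_name.get(dsn) if dsn else None
        if name:
            data_platform, powerbi_platform = normalize_platform_name(name)
    return data_platform, powerbi_platform


# resolve_platform("DSN=warehouse", {"warehouse": "snowflake"})
# -> ('snowflake', 'Snowflake')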
datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py

@@ -63,10 +63,10 @@ class SessionWithTimeout(requests.Session):
         super().__init__(*args, **kwargs)
         self.timeout = timeout
 
-    def request(self, method, url, **kwargs):
+    def request(self, method, url, *args, **kwargs):
         # Set the default timeout if none is provided
         kwargs.setdefault("timeout", self.timeout)
-        return super().request(method, url, **kwargs)
+        return super().request(method, url, *args, **kwargs)
 
 
 class DataResolverBase(ABC):
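The data_resolver.py change is small but meaningful: the old override dropped positional arguments, so a caller passing params or data positionally would silently lose them. A self-contained sketch of the corrected pattern; the class name here is illustrative, not the project's:

import requests


class SessionWithDefaultTimeout(requests.Session):
    """A requests.Session that applies a default timeout to every request."""

    def __init__(self, timeout: float = 30.0):
        super().__init__()
        self._default_timeout = timeout

    def request(self, method, url, *args, **kwargs):
        # Apply the default only when the caller did not specify a timeout,
        # and forward *args so positional parameters are not dropped.
        kwargs.setdefault("timeout", self._default_timeout)
        return super().request(method, url, *args, **kwargs)


session = SessionWithDefaultTimeout(timeout=10.0)
# session.get("https://example.com")  # inherits the 10s default timeout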
datahub/ingestion/source/redshift/usage.py

@@ -182,15 +182,16 @@ class RedshiftUsageExtractor:
         self.report.num_operational_stats_filtered = 0
 
         if self.config.include_operational_stats:
-            with self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS):
-                with PerfTimer() as timer:
-                    # Generate operation aspect workunits
-                    yield from self._gen_operation_aspect_workunits(
-                        self.connection, all_tables
-                    )
-                    self.report.operational_metadata_extraction_sec[
-                        self.config.database
-                    ] = timer.elapsed_seconds(digits=2)
+            with self.report.new_stage(
+                USAGE_EXTRACTION_OPERATIONAL_STATS
+            ), PerfTimer() as timer:
+                # Generate operation aspect workunits
+                yield from self._gen_operation_aspect_workunits(
+                    self.connection, all_tables
+                )
+                self.report.operational_metadata_extraction_sec[
+                    self.config.database
+                ] = timer.elapsed_seconds(digits=2)
 
         # Generate aggregate events
         with self.report.new_stage(USAGE_EXTRACTION_USAGE_AGGREGATION):
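The Redshift change is purely a flattening of two nested with blocks into one statement; runtime behavior is unchanged. A self-contained illustration of the equivalence, using a stand-in context manager:

from contextlib import contextmanager


@contextmanager
def stage(name: str):
    print(f"enter {name}")
    try:
        yield
    finally:
        print(f"exit {name}")


# Nested form (before): contexts are entered outer-to-inner, exited inner-to-outer.
with stage("operational_stats"):
    with stage("perf_timer"):
        pass

# Combined form (after): identical enter/exit order, one less indentation level.
with stage("operational_stats"), stage("perf_timer"):
    pass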
datahub/ingestion/source/sigma/config.py

@@ -1,8 +1,9 @@
 import logging
 from dataclasses import dataclass, field
-from typing import Dict, Optional
+from typing import Dict, List, Optional
 
 import pydantic
+from pydantic import BaseModel, Field
 
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import (
@@ -53,15 +54,82 @@ class Constant:
     DEFAULT_API_URL = "https://aws-api.sigmacomputing.com/v2"
 
 
+class WorkspaceCounts(BaseModel):
+    workbooks_count: int = 0
+    datasets_count: int = 0
+    elements_count: int = 0
+    pages_count: int = 0
+
+    def is_empty(self) -> bool:
+        return (
+            self.workbooks_count == 0
+            and self.datasets_count == 0
+            and self.elements_count == 0
+            and self.pages_count == 0
+        )
+
+    def as_obj(self) -> dict:
+        return {
+            "workbooks_count": self.workbooks_count,
+            "datasets_count": self.datasets_count,
+            "elements_count": self.elements_count,
+            "pages_count": self.pages_count,
+        }
+
+
+class SigmaWorkspaceEntityFilterReport(EntityFilterReport):
+    type: str = "workspace"
+
+    workspace_counts: Dict[str, WorkspaceCounts] = Field(
+        default_factory=dict,
+        description="Counts of workbooks, datasets, elements and pages in each workspace.",
+    )
+
+    def increment_workbooks_count(self, workspace_id: str) -> None:
+        if workspace_id not in self.workspace_counts:
+            self.workspace_counts[workspace_id] = WorkspaceCounts()
+        self.workspace_counts[workspace_id].workbooks_count += 1
+
+    def increment_datasets_count(self, workspace_id: str) -> None:
+        if workspace_id not in self.workspace_counts:
+            self.workspace_counts[workspace_id] = WorkspaceCounts()
+        self.workspace_counts[workspace_id].datasets_count += 1
+
+    def increment_elements_count(self, workspace_id: str) -> None:
+        if workspace_id not in self.workspace_counts:
+            self.workspace_counts[workspace_id] = WorkspaceCounts()
+        self.workspace_counts[workspace_id].elements_count += 1
+
+    def increment_pages_count(self, workspace_id: str) -> None:
+        if workspace_id not in self.workspace_counts:
+            self.workspace_counts[workspace_id] = WorkspaceCounts()
+        self.workspace_counts[workspace_id].pages_count += 1
+
+    def as_obj(self) -> dict:
+        return {
+            "filtered": self.dropped_entities.as_obj(),
+            "processed": self.processed_entities.as_obj(),
+            "workspace_counts": {
+                key: item.as_obj() for key, item in self.workspace_counts.items()
+            },
+        }
+
+
 @dataclass
 class SigmaSourceReport(StaleEntityRemovalSourceReport):
-    workspaces: EntityFilterReport = EntityFilterReport.field(type="workspace")
-    number_of_workspaces: Optional[int] = None
+    workspaces: SigmaWorkspaceEntityFilterReport = field(
+        default_factory=SigmaWorkspaceEntityFilterReport
+    )
     non_accessible_workspaces_count: int = 0
-    shared_entities_count: int = 0
-    number_of_datasets: int = 0
-    number_of_workbooks: int = 0
+
+    datasets: EntityFilterReport = EntityFilterReport.field(type="dataset")
+    datasets_without_workspace: int = 0
+
+    workbooks: EntityFilterReport = EntityFilterReport.field(type="workbook")
+    workbooks_without_workspace: int = 0
+
     number_of_files_metadata: Dict[str, int] = field(default_factory=dict)
+    empty_workspaces: List[str] = field(default_factory=list)
 
 
 class PlatformDetail(PlatformInstanceConfigMixin, EnvConfigMixin):
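The new WorkspaceCounts model is a plain pydantic container, so the per-workspace accounting is easy to exercise on its own. A small sketch using the class as added above (import path per this diff):

from datahub.ingestion.source.sigma.config import WorkspaceCounts

counts = WorkspaceCounts()   # all four counters default to 0
assert counts.is_empty()

# The report's increment_* helpers do this per workspace id.
counts.workbooks_count += 1
assert not counts.is_empty()
print(counts.as_obj())
# {'workbooks_count': 1, 'datasets_count': 0, 'elements_count': 0, 'pages_count': 0}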
datahub/ingestion/source/sigma/sigma.py

@@ -35,6 +35,7 @@ from datahub.ingestion.source.sigma.config import (
     PlatformDetail,
     SigmaSourceConfig,
     SigmaSourceReport,
+    WorkspaceCounts,
 )
 from datahub.ingestion.source.sigma.data_classes import (
     Element,
@@ -163,7 +164,6 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
     def _get_allowed_workspaces(self) -> List[Workspace]:
         all_workspaces = self.sigma_api.workspaces.values()
         logger.info(f"Number of workspaces = {len(all_workspaces)}")
-        self.reporter.number_of_workspaces = len(all_workspaces)
 
         allowed_workspaces = []
         for workspace in all_workspaces:
@@ -285,6 +285,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
             yield self._gen_dataset_properties(dataset_urn, dataset)
 
         if dataset.workspaceId:
+            self.reporter.workspaces.increment_datasets_count(dataset.workspaceId)
             yield from add_entity_to_container(
                 container_key=self._gen_workspace_key(dataset.workspaceId),
                 entity_type="dataset",
@@ -468,6 +469,8 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         ).as_workunit()
 
         if workbook.workspaceId:
+            self.reporter.workspaces.increment_elements_count(workbook.workspaceId)
+
             yield self._gen_entity_browsepath_aspect(
                 entity_urn=chart_urn,
                 parent_entity_urn=builder.make_container_urn(
@@ -525,6 +528,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         all_input_fields: List[InputFieldClass] = []
 
         if workbook.workspaceId:
+            self.reporter.workspaces.increment_pages_count(workbook.workspaceId)
             yield self._gen_entity_browsepath_aspect(
                 entity_urn=dashboard_urn,
                 parent_entity_urn=builder.make_container_urn(
@@ -614,6 +618,8 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
 
         paths = workbook.path.split("/")[1:]
         if workbook.workspaceId:
+            self.reporter.workspaces.increment_workbooks_count(workbook.workspaceId)
+
             yield self._gen_entity_browsepath_aspect(
                 entity_urn=dashboard_urn,
                 parent_entity_urn=builder.make_container_urn(
@@ -667,6 +673,15 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
                 f"{workspace.name} ({workspace.workspaceId})"
             )
             yield from self._gen_workspace_workunit(workspace)
+            if self.reporter.workspaces.workspace_counts.get(
+                workspace.workspaceId, WorkspaceCounts()
+            ).is_empty():
+                logger.warning(
+                    f"Workspace {workspace.name} ({workspace.workspaceId}) is empty. If this is not expected, add the user associated with the Client ID/Secret to each workspace with missing metadata"
+                )
+                self.reporter.empty_workspaces.append(
+                    f"{workspace.name} ({workspace.workspaceId})"
+                )
         yield from self._gen_sigma_dataset_upstream_lineage_workunit()
 
     def get_report(self) -> SourceReport: