acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub has been flagged as possibly problematic.
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2236 -2240
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
- datahub/__init__.py +1 -1
- datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
- datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
- datahub/configuration/common.py +2 -5
- datahub/configuration/source_common.py +13 -0
- datahub/emitter/mce_builder.py +20 -4
- datahub/emitter/mcp_builder.py +2 -7
- datahub/emitter/mcp_patch_builder.py +37 -13
- datahub/emitter/rest_emitter.py +25 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
- datahub/ingestion/api/closeable.py +3 -3
- datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
- datahub/ingestion/api/report.py +4 -1
- datahub/ingestion/api/sink.py +4 -3
- datahub/ingestion/api/source.py +4 -0
- datahub/ingestion/api/source_helpers.py +2 -6
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/graph/client.py +6 -3
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
- datahub/ingestion/source/aws/aws_common.py +231 -27
- datahub/ingestion/source/aws/glue.py +12 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
- datahub/ingestion/source/datahub/config.py +22 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
- datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
- datahub/ingestion/source/gc/datahub_gc.py +21 -5
- datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
- datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
- datahub/ingestion/source/iceberg/iceberg.py +27 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
- datahub/ingestion/source/kafka_connect/__init__.py +0 -0
- datahub/ingestion/source/kafka_connect/common.py +202 -0
- datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
- datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
- datahub/ingestion/source/looker/looker_common.py +63 -2
- datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
- datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
- datahub/ingestion/source/looker/looker_source.py +31 -4
- datahub/ingestion/source/looker/looker_usage.py +23 -17
- datahub/ingestion/source/mlflow.py +30 -5
- datahub/ingestion/source/mode.py +40 -27
- datahub/ingestion/source/powerbi/config.py +1 -14
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
- datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
- datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
- datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
- datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
- datahub/ingestion/source/sql/hive.py +621 -8
- datahub/ingestion/source/sql/hive_metastore.py +7 -0
- datahub/ingestion/source/sql/mssql/job_models.py +30 -1
- datahub/ingestion/source/sql/mssql/source.py +15 -1
- datahub/ingestion/source/sql/sql_common.py +41 -102
- datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
- datahub/ingestion/source/sql/sql_report.py +2 -0
- datahub/ingestion/source/state/checkpoint.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +122 -45
- datahub/ingestion/source/tableau/tableau_common.py +18 -0
- datahub/ingestion/source/tableau/tableau_constant.py +3 -1
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/proxy.py +8 -27
- datahub/ingestion/source/usage/usage_common.py +15 -1
- datahub/ingestion/source_report/ingestion_stage.py +3 -0
- datahub/metadata/_schema_classes.py +256 -3
- datahub/metadata/_urns/urn_defs.py +168 -168
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
- datahub/metadata/schema.avsc +252 -33
- datahub/metadata/schemas/DataJobKey.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
- datahub/metadata/schemas/MLModelProperties.avsc +62 -2
- datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
- datahub/specific/aspect_helpers/__init__.py +0 -0
- datahub/specific/aspect_helpers/custom_properties.py +79 -0
- datahub/specific/aspect_helpers/ownership.py +67 -0
- datahub/specific/aspect_helpers/structured_properties.py +72 -0
- datahub/specific/aspect_helpers/tags.py +42 -0
- datahub/specific/aspect_helpers/terms.py +43 -0
- datahub/specific/chart.py +28 -184
- datahub/specific/dashboard.py +31 -196
- datahub/specific/datajob.py +34 -189
- datahub/specific/dataproduct.py +24 -86
- datahub/specific/dataset.py +48 -133
- datahub/specific/form.py +12 -32
- datahub/specific/structured_property.py +9 -9
- datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
- datahub/sql_parsing/sqlglot_lineage.py +15 -5
- datahub/sql_parsing/tool_meta_extractor.py +119 -5
- datahub/utilities/time.py +8 -3
- datahub/utilities/urns/_urn_base.py +5 -7
- datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
- datahub/specific/custom_properties.py +0 -37
- datahub/specific/ownership.py +0 -48
- datahub/specific/structured_properties.py +0 -53
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py

@@ -146,12 +146,55 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
             aspect_value=source_info_aspect,
         )
 
+    @staticmethod
+    def _convert_sets_to_lists(obj: Any) -> Any:
+        """
+        Recursively converts all sets to lists in a Python object.
+        Works with nested dictionaries, lists, and sets.
+
+        Args:
+            obj: Any Python object that might contain sets
+
+        Returns:
+            The object with all sets converted to lists
+        """
+        if isinstance(obj, dict):
+            return {
+                key: DatahubIngestionRunSummaryProvider._convert_sets_to_lists(value)
+                for key, value in obj.items()
+            }
+        elif isinstance(obj, list):
+            return [
+                DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
+                for element in obj
+            ]
+        elif isinstance(obj, set):
+            return [
+                DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
+                for element in obj
+            ]
+        elif isinstance(obj, tuple):
+            return tuple(
+                DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
+                for element in obj
+            )
+        else:
+            return obj
+
     def _get_recipe_to_report(self, ctx: PipelineContext) -> str:
         assert ctx.pipeline_config
         if not self.report_recipe or not ctx.pipeline_config.get_raw_dict():
             return ""
         else:
-            return json.dumps(redact_raw_config(ctx.pipeline_config.get_raw_dict()))
+            redacted_recipe = redact_raw_config(ctx.pipeline_config.get_raw_dict())
+            # This is required otherwise json dumps will fail
+            # with a TypeError: Object of type set is not JSON serializable
+            converted_recipe = (
+                DatahubIngestionRunSummaryProvider._convert_sets_to_lists(
+                    redacted_recipe
+                )
+            )
+            return json.dumps(converted_recipe)
 
     def _emit_aspect(self, entity_urn: Urn, aspect_value: _Aspect) -> None:
         self.sink.write_record_async(
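The rationale for _convert_sets_to_lists is easiest to see in a standalone sketch (a minimal reimplementation, not the packaged code): json.dumps raises a TypeError on set values, which can occur in a redacted recipe dict.

    import json

    def convert_sets_to_lists(obj):
        # Recursively replace sets with lists so the object becomes JSON-serializable.
        if isinstance(obj, dict):
            return {k: convert_sets_to_lists(v) for k, v in obj.items()}
        if isinstance(obj, (list, set)):
            return [convert_sets_to_lists(v) for v in obj]
        if isinstance(obj, tuple):
            return tuple(convert_sets_to_lists(v) for v in obj)
        return obj

    recipe = {"source": {"type": "snowflake", "schemas": {"PUBLIC", "ANALYTICS"}}}
    # json.dumps(recipe) would raise:
    #   TypeError: Object of type set is not JSON serializable
    print(json.dumps(convert_sets_to_lists(recipe)))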
datahub/ingestion/source/aws/aws_common.py

@@ -1,7 +1,12 @@
+import logging
+import os
 from datetime import datetime, timedelta, timezone
-from
+from enum import Enum
+from http import HTTPStatus
+from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
 
 import boto3
+import requests
 from boto3.session import Session
 from botocore.config import DEFAULT_TIMEOUT, Config
 from botocore.utils import fix_s3_host

@@ -14,6 +19,8 @@ from datahub.configuration.common import (
 )
 from datahub.configuration.source_common import EnvConfigMixin
 
+logger = logging.getLogger(__name__)
+
 if TYPE_CHECKING:
     from mypy_boto3_dynamodb import DynamoDBClient
     from mypy_boto3_glue import GlueClient

@@ -22,6 +29,26 @@ if TYPE_CHECKING:
     from mypy_boto3_sts import STSClient
 
 
+class AwsEnvironment(Enum):
+    EC2 = "EC2"
+    ECS = "ECS"
+    EKS = "EKS"
+    LAMBDA = "LAMBDA"
+    APP_RUNNER = "APP_RUNNER"
+    BEANSTALK = "ELASTIC_BEANSTALK"
+    CLOUD_FORMATION = "CLOUD_FORMATION"
+    UNKNOWN = "UNKNOWN"
+
+
+class AwsServicePrincipal(Enum):
+    LAMBDA = "lambda.amazonaws.com"
+    EKS = "eks.amazonaws.com"
+    APP_RUNNER = "apprunner.amazonaws.com"
+    ECS = "ecs.amazonaws.com"
+    ELASTIC_BEANSTALK = "elasticbeanstalk.amazonaws.com"
+    EC2 = "ec2.amazonaws.com"
+
+
 class AwsAssumeRoleConfig(PermissiveConfigModel):
     # Using the PermissiveConfigModel to allow the user to pass additional arguments.
 

@@ -34,6 +61,163 @@ class AwsAssumeRoleConfig(PermissiveConfigModel):
     )
 
 
+def get_instance_metadata_token() -> Optional[str]:
+    """Get IMDSv2 token"""
+    try:
+        response = requests.put(
+            "http://169.254.169.254/latest/api/token",
+            headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"},
+            timeout=1,
+        )
+        if response.status_code == HTTPStatus.OK:
+            return response.text
+    except requests.exceptions.RequestException:
+        logger.debug("Failed to get IMDSv2 token")
+    return None
+
+
+def is_running_on_ec2() -> bool:
+    """Check if code is running on EC2 using IMDSv2"""
+    token = get_instance_metadata_token()
+    if not token:
+        return False
+
+    try:
+        response = requests.get(
+            "http://169.254.169.254/latest/meta-data/instance-id",
+            headers={"X-aws-ec2-metadata-token": token},
+            timeout=1,
+        )
+        return response.status_code == HTTPStatus.OK
+    except requests.exceptions.RequestException:
+        return False
+
+
+def detect_aws_environment() -> AwsEnvironment:
+    """
+    Detect the AWS environment we're running in.
+    Order matters as some environments may have multiple indicators.
+    """
+    # Check Lambda first as it's most specific
+    if os.getenv("AWS_LAMBDA_FUNCTION_NAME"):
+        if os.getenv("AWS_EXECUTION_ENV", "").startswith("CloudFormation"):
+            return AwsEnvironment.CLOUD_FORMATION
+        return AwsEnvironment.LAMBDA
+
+    # Check EKS (IRSA)
+    if os.getenv("AWS_WEB_IDENTITY_TOKEN_FILE") and os.getenv("AWS_ROLE_ARN"):
+        return AwsEnvironment.EKS
+
+    # Check App Runner
+    if os.getenv("AWS_APP_RUNNER_SERVICE_ID"):
+        return AwsEnvironment.APP_RUNNER
+
+    # Check ECS
+    if os.getenv("ECS_CONTAINER_METADATA_URI_V4") or os.getenv(
+        "ECS_CONTAINER_METADATA_URI"
+    ):
+        return AwsEnvironment.ECS
+
+    # Check Elastic Beanstalk
+    if os.getenv("ELASTIC_BEANSTALK_ENVIRONMENT_NAME"):
+        return AwsEnvironment.BEANSTALK
+
+    if is_running_on_ec2():
+        return AwsEnvironment.EC2
+
+    return AwsEnvironment.UNKNOWN
+
+
+def get_instance_role_arn() -> Optional[str]:
+    """Get role ARN from EC2 instance metadata using IMDSv2"""
+    token = get_instance_metadata_token()
+    if not token:
+        return None
+
+    try:
+        response = requests.get(
+            "http://169.254.169.254/latest/meta-data/iam/security-credentials/",
+            headers={"X-aws-ec2-metadata-token": token},
+            timeout=1,
+        )
+        if response.status_code == 200:
+            role_name = response.text.strip()
+            if role_name:
+                sts = boto3.client("sts")
+                identity = sts.get_caller_identity()
+                return identity.get("Arn")
+    except Exception as e:
+        logger.debug(f"Failed to get instance role ARN: {e}")
+    return None
+
+
+def get_lambda_role_arn() -> Optional[str]:
+    """Get the Lambda function's role ARN"""
+    try:
+        function_name = os.getenv("AWS_LAMBDA_FUNCTION_NAME")
+        if not function_name:
+            return None
+
+        lambda_client = boto3.client("lambda")
+        function_config = lambda_client.get_function_configuration(
+            FunctionName=function_name
+        )
+        return function_config.get("Role")
+    except Exception as e:
+        logger.debug(f"Failed to get Lambda role ARN: {e}")
+        return None
+
+
+def get_current_identity() -> Tuple[Optional[str], Optional[str]]:
+    """
+    Get the current role ARN and source type based on the runtime environment.
+    Returns (role_arn, credential_source)
+    """
+    env = detect_aws_environment()
+
+    if env == AwsEnvironment.LAMBDA:
+        role_arn = get_lambda_role_arn()
+        return role_arn, AwsServicePrincipal.LAMBDA.value
+
+    elif env == AwsEnvironment.EKS:
+        role_arn = os.getenv("AWS_ROLE_ARN")
+        return role_arn, AwsServicePrincipal.EKS.value
+
+    elif env == AwsEnvironment.APP_RUNNER:
+        try:
+            sts = boto3.client("sts")
+            identity = sts.get_caller_identity()
+            return identity.get("Arn"), AwsServicePrincipal.APP_RUNNER.value
+        except Exception as e:
+            logger.debug(f"Failed to get App Runner role: {e}")
+
+    elif env == AwsEnvironment.ECS:
+        try:
+            metadata_uri = os.getenv("ECS_CONTAINER_METADATA_URI_V4") or os.getenv(
+                "ECS_CONTAINER_METADATA_URI"
+            )
+            if metadata_uri:
+                response = requests.get(f"{metadata_uri}/task", timeout=1)
+                if response.status_code == HTTPStatus.OK:
+                    task_metadata = response.json()
+                    if "TaskARN" in task_metadata:
+                        return (
+                            task_metadata.get("TaskARN"),
+                            AwsServicePrincipal.ECS.value,
+                        )
+        except Exception as e:
+            logger.debug(f"Failed to get ECS task role: {e}")
+
+    elif env == AwsEnvironment.BEANSTALK:
+        # Beanstalk uses EC2 instance metadata
+        return get_instance_role_arn(), AwsServicePrincipal.ELASTIC_BEANSTALK.value
+
+    elif env == AwsEnvironment.EC2:
+        return get_instance_role_arn(), AwsServicePrincipal.EC2.value
+
+    return None, None
+
+
 def assume_role(
     role: AwsAssumeRoleConfig,
     aws_region: Optional[str],
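A usage sketch for the new detection helpers (hedged: the import path matches the module above, but the environment variables and ARN are illustrative fakes; on a real EKS pod IRSA sets them, not the caller):

    import os

    from datahub.ingestion.source.aws.aws_common import (
        AwsEnvironment,
        detect_aws_environment,
        get_current_identity,
    )

    # Simulate an EKS pod that uses IAM Roles for Service Accounts (IRSA).
    os.environ["AWS_WEB_IDENTITY_TOKEN_FILE"] = "/var/run/secrets/token"  # fake path
    os.environ["AWS_ROLE_ARN"] = "arn:aws:iam::123456789012:role/irsa-role"  # fake ARN

    assert detect_aws_environment() == AwsEnvironment.EKS
    role_arn, source = get_current_identity()
    print(role_arn, source)  # arn:aws:iam::123456789012:role/irsa-role eks.amazonaws.com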
@@ -95,7 +279,7 @@ class AwsConnectionConfig(ConfigModel):
     )
     aws_profile: Optional[str] = Field(
         default=None,
-        description="
+        description="The [named profile](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-profiles.html) to use from AWS credentials. Falls back to default profile if not specified and no access keys provided. Profiles are configured in ~/.aws/credentials or ~/.aws/config.",
     )
     aws_region: Optional[str] = Field(None, description="AWS region code.")
 

@@ -145,6 +329,7 @@ class AwsConnectionConfig(ConfigModel):
 
     def get_session(self) -> Session:
         if self.aws_access_key_id and self.aws_secret_access_key:
+            # Explicit credentials take precedence
             session = Session(
                 aws_access_key_id=self.aws_access_key_id,
                 aws_secret_access_key=self.aws_secret_access_key,

@@ -152,38 +337,57 @@ class AwsConnectionConfig(ConfigModel):
                 region_name=self.aws_region,
             )
         elif self.aws_profile:
+            # Named profile is second priority
             session = Session(
                 region_name=self.aws_region, profile_name=self.aws_profile
             )
         else:
-            # Use boto3's credential autodetection
+            # Use boto3's credential autodetection
             session = Session(region_name=self.aws_region)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        target_roles = self._normalized_aws_roles()
+        if target_roles:
+            current_role_arn, credential_source = get_current_identity()
+
+            # Only assume role if:
+            # 1. We're not in a known AWS environment with a role, or
+            # 2. We need to assume a different role than our current one
+            should_assume_role = current_role_arn is None or any(
+                role.RoleArn != current_role_arn for role in target_roles
+            )
+
+            if should_assume_role:
+                env = detect_aws_environment()
+                logger.debug(f"Assuming role(s) from {env.value} environment")
+
+                current_credentials = session.get_credentials()
+                if current_credentials is None:
+                    raise ValueError("No credentials available for role assumption")
+
+                credentials = {
+                    "AccessKeyId": current_credentials.access_key,
+                    "SecretAccessKey": current_credentials.secret_key,
+                    "SessionToken": current_credentials.token,
+                }
+
+                for role in target_roles:
+                    if self._should_refresh_credentials():
+                        credentials = assume_role(
+                            role=role,
+                            aws_region=self.aws_region,
+                            credentials=credentials,
+                        )
+                        if isinstance(credentials["Expiration"], datetime):
+                            self._credentials_expiration = credentials["Expiration"]
+
+                session = Session(
+                    aws_access_key_id=credentials["AccessKeyId"],
+                    aws_secret_access_key=credentials["SecretAccessKey"],
+                    aws_session_token=credentials["SessionToken"],
+                    region_name=self.aws_region,
                 )
-
-
-
-            session = Session(
-                aws_access_key_id=credentials["AccessKeyId"],
-                aws_secret_access_key=credentials["SecretAccessKey"],
-                aws_session_token=credentials["SessionToken"],
-                region_name=self.aws_region,
-            )
+            else:
+                logger.debug(f"Using existing role from {credential_source}")
 
         return session
 
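A hedged sketch of what the reworked get_session enables (the aws_role field follows the DataHub AWS connection config, but the ARNs are made up, and this is illustrative rather than a tested recipe): roles are assumed in order starting from the ambient credentials, and a redundant sts:AssumeRole is skipped when the ambient identity already matches the configured role.

    from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig

    config = AwsConnectionConfig(
        aws_region="us-east-1",
        # Role chaining: each role is assumed in order, starting from the
        # ambient credentials (instance profile, IRSA, named profile, etc.).
        aws_role=[
            "arn:aws:iam::123456789012:role/intermediate-role",
            "arn:aws:iam::123456789012:role/datahub-ingestion-role",
        ],
    )
    session = config.get_session()
    print(session.client("sts").get_caller_identity()["Arn"])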
datahub/ingestion/source/aws/glue.py

@@ -52,6 +52,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.report import EntityFilterReport
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws import s3_util

@@ -115,7 +116,6 @@ from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_col
 
 logger = logging.getLogger(__name__)
 
-
 DEFAULT_PLATFORM = "glue"
 VALID_PLATFORMS = [DEFAULT_PLATFORM, "athena"]
 

@@ -220,6 +220,7 @@ class GlueSourceConfig(
 class GlueSourceReport(StaleEntityRemovalSourceReport):
     tables_scanned = 0
     filtered: List[str] = dataclass_field(default_factory=list)
+    databases: EntityFilterReport = EntityFilterReport.field(type="database")
 
     num_job_script_location_missing: int = 0
     num_job_script_location_invalid: int = 0

@@ -668,6 +669,7 @@ class GlueSource(StatefulIngestionSourceBase):
         return MetadataWorkUnit(id=f'{job_name}-{node["Id"]}', mce=mce)
 
     def get_all_databases(self) -> Iterable[Mapping[str, Any]]:
+        logger.debug("Getting all databases")
         # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetDatabases.html
         paginator = self.glue_client.get_paginator("get_databases")
 

@@ -684,10 +686,18 @@ class GlueSource(StatefulIngestionSourceBase):
             pattern += "[?!TargetDatabase]"
 
         for database in paginator_response.search(pattern):
-            if self.source_config.database_pattern.allowed(database["Name"]):
+            if (not self.source_config.database_pattern.allowed(database["Name"])) or (
+                self.source_config.catalog_id
+                and database.get("CatalogId")
+                and database.get("CatalogId") != self.source_config.catalog_id
+            ):
+                self.report.databases.dropped(database["Name"])
+            else:
+                self.report.databases.processed(database["Name"])
                 yield database
 
     def get_tables_from_database(self, database: Mapping[str, Any]) -> Iterable[Dict]:
+        logger.debug(f"Getting tables from database {database['Name']}")
         # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetTables.html
         paginator = self.glue_client.get_paginator("get_tables")
         database_name = database["Name"]
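The new databases field uses DataHub's EntityFilterReport pattern; a minimal sketch of how such a report accumulates processed/dropped names (the report class here is illustrative, not the Glue source's own):

    from dataclasses import dataclass

    from datahub.ingestion.api.report import EntityFilterReport, Report

    @dataclass
    class MyReport(Report):
        databases: EntityFilterReport = EntityFilterReport.field(type="database")

    report = MyReport()
    report.databases.processed("sales_db")   # kept: passed the database_pattern
    report.databases.dropped("scratch_db")   # filtered: pattern or catalog_id mismatch
    print(report.as_string())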
datahub/ingestion/source/bigquery_v2/bigquery.py

@@ -206,9 +206,7 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
 
     def _init_schema_resolver(self) -> SchemaResolver:
         schema_resolution_required = (
-            self.config.use_queries_v2
-            or self.config.lineage_parse_view_ddl
-            or self.config.lineage_use_sql_parser
+            self.config.use_queries_v2 or self.config.lineage_use_sql_parser
         )
         schema_ingestion_enabled = (
             self.config.include_schema_metadata

@@ -255,18 +253,16 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
         for project in projects:
             yield from self.bq_schema_extractor.get_project_workunits(project)
 
-
-
-
-
-
-
-
-
-            self.bq_schema_extractor.snapshot_refs_by_project,
-            self.bq_schema_extractor.snapshots_by_ref,
-        )
+        self.report.set_ingestion_stage("*", "View and Snapshot Lineage")
+        yield from self.lineage_extractor.get_lineage_workunits_for_views_and_snapshots(
+            [p.id for p in projects],
+            self.bq_schema_extractor.view_refs_by_project,
+            self.bq_schema_extractor.view_definitions,
+            self.bq_schema_extractor.snapshot_refs_by_project,
+            self.bq_schema_extractor.snapshots_by_ref,
+        )
 
+        if self.config.use_queries_v2:
             # if both usage and lineage are disabled then skip queries extractor piece
             if (
                 not self.config.include_usage_statistics

@@ -306,10 +302,6 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
         if self.config.include_table_lineage:
             yield from self.lineage_extractor.get_lineage_workunits(
                 [p.id for p in projects],
-                self.bq_schema_extractor.view_refs_by_project,
-                self.bq_schema_extractor.view_definitions,
-                self.bq_schema_extractor.snapshot_refs_by_project,
-                self.bq_schema_extractor.snapshots_by_ref,
                 self.bq_schema_extractor.table_refs,
             )
 
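After this refactor, audit-log lineage no longer threads view/snapshot state through get_lineage_workunits; a hedged sketch of the resulting call shape (the wrapper function and its source/project_ids parameters are hypothetical, while the attribute names come from the diff above):

    from typing import Iterable, List

    from datahub.ingestion.api.workunit import MetadataWorkUnit

    def emit_table_lineage(source, project_ids: List[str]) -> Iterable[MetadataWorkUnit]:
        # Views and snapshots are now handled separately via the SQL aggregator,
        # so only project ids and the collected table refs are needed here.
        yield from source.lineage_extractor.get_lineage_workunits(
            project_ids,
            source.bq_schema_extractor.table_refs,
        )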
datahub/ingestion/source/bigquery_v2/bigquery_config.py

@@ -463,10 +463,6 @@ class BigQueryV2Config(
         default=True,
         description="Use sql parser to resolve view/table lineage.",
     )
-    lineage_parse_view_ddl: bool = Field(
-        default=True,
-        description="Sql parse view ddl to get lineage.",
-    )
 
     lineage_sql_parser_use_raw_names: bool = Field(
         default=False,

@@ -572,11 +568,9 @@ class BigQueryV2Config(
         "See [this](https://cloud.google.com/bigquery/docs/information-schema-jobs#scope_and_syntax) for details.",
     )
 
-
-
-
-
-    include_view_column_lineage: bool = Field(default=True, hidden_from_docs=True)
+    _include_view_lineage = pydantic_removed_field("include_view_lineage")
+    _include_view_column_lineage = pydantic_removed_field("include_view_column_lineage")
+    _lineage_parse_view_ddl = pydantic_removed_field("lineage_parse_view_ddl")
 
     @root_validator(pre=True)
     def set_include_schema_metadata(cls, values: Dict) -> Dict:
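DataHub's pydantic_removed_field helper lets old recipes that still set a retired option load with a warning instead of failing validation. A minimal sketch of the pattern, assuming pydantic v1 (a generic reimplementation, not DataHub's exact helper):

    import warnings

    from pydantic import BaseModel, root_validator

    def removed_field(field_name: str):
        def _strip(cls, values: dict) -> dict:
            # Pre-validator: drop the retired key so parsing still succeeds.
            if field_name in values:
                warnings.warn(f"Config field {field_name} was removed; ignoring it.")
                values.pop(field_name)
            return values
        return root_validator(pre=True, allow_reuse=True)(_strip)

    class MyConfig(BaseModel):
        use_queries_v2: bool = True
        _lineage_parse_view_ddl = removed_field("lineage_parse_view_ddl")

    # An old recipe still parses; the removed key is dropped with a warning.
    cfg = MyConfig.parse_obj({"lineage_parse_view_ddl": False})
    print(cfg.use_queries_v2)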
datahub/ingestion/source/bigquery_v2/bigquery_schema.py

@@ -1,7 +1,7 @@
 import logging
 from collections import defaultdict
 from dataclasses import dataclass, field
-from datetime import datetime, timezone
+from datetime import datetime
 from functools import lru_cache
 from typing import Any, Dict, FrozenSet, Iterable, Iterator, List, Optional
 

@@ -15,6 +15,7 @@ from google.cloud.bigquery.table import (
     TimePartitioningType,
 )
 
+from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier
 from datahub.ingestion.source.bigquery_v2.bigquery_helper import parse_labels

@@ -393,13 +394,7 @@ class BigQuerySchemaApi:
             name=table.table_name,
             created=table.created,
             table_type=table.table_type,
-            last_altered=(
-                datetime.fromtimestamp(
-                    table.get("last_altered") / 1000, tz=timezone.utc
-                )
-                if table.get("last_altered") is not None
-                else None
-            ),
+            last_altered=parse_ts_millis(table.get("last_altered")),
             size_in_bytes=table.get("bytes"),
             rows_count=table.get("row_count"),
             comment=table.comment,

@@ -460,11 +455,7 @@ class BigQuerySchemaApi:
         return BigqueryView(
             name=view.table_name,
             created=view.created,
-            last_altered=(
-                datetime.fromtimestamp(view.get("last_altered") / 1000, tz=timezone.utc)
-                if view.get("last_altered") is not None
-                else None
-            ),
+            last_altered=(parse_ts_millis(view.get("last_altered"))),
             comment=view.comment,
             view_definition=view.view_definition,
             materialized=view.table_type == BigqueryTableType.MATERIALIZED_VIEW,

@@ -705,13 +696,7 @@ class BigQuerySchemaApi:
         return BigqueryTableSnapshot(
             name=snapshot.table_name,
             created=snapshot.created,
-            last_altered=(
-                datetime.fromtimestamp(
-                    snapshot.get("last_altered") / 1000, tz=timezone.utc
-                )
-                if snapshot.get("last_altered") is not None
-                else None
-            ),
+            last_altered=parse_ts_millis(snapshot.get("last_altered")),
             comment=snapshot.comment,
             ddl=snapshot.ddl,
             snapshot_time=snapshot.snapshot_time,
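parse_ts_millis centralizes the timestamp conversion the three removed blocks repeated inline; an equivalent minimal implementation (a sketch mirroring the removed code, not necessarily the exact helper in datahub.emitter.mce_builder):

    from datetime import datetime, timezone
    from typing import Optional

    def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
        # Millisecond epoch timestamp -> timezone-aware UTC datetime; None passes through.
        if ts is None:
            return None
        return datetime.fromtimestamp(ts / 1000, tz=timezone.utc)

    assert parse_ts_millis(None) is None
    print(parse_ts_millis(1700000000000))  # 2023-11-14 22:13:20+00:00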
datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py

@@ -653,14 +653,11 @@ class BigQuerySchemaGenerator:
             self.report.report_dropped(table_identifier.raw_table_name())
             return
 
-
-
-
-        )
-        self.
-        if self.config.lineage_parse_view_ddl and view.view_definition:
-            self.view_refs_by_project[project_id].add(table_ref)
-            self.view_definitions[table_ref] = view.view_definition
+        table_ref = str(BigQueryTableRef(table_identifier).get_sanitized_table_ref())
+        self.table_refs.add(table_ref)
+        if view.view_definition:
+            self.view_refs_by_project[project_id].add(table_ref)
+            self.view_definitions[table_ref] = view.view_definition
 
         view.column_count = len(columns)
         if not view.column_count:

@@ -701,14 +698,11 @@ class BigQuerySchemaGenerator:
                 f"Snapshot doesn't have any column or unable to get columns for snapshot: {table_identifier}"
             )
 
-
-
-
-        )
-        self.
-        if snapshot.base_table_identifier:
-            self.snapshot_refs_by_project[project_id].add(table_ref)
-            self.snapshots_by_ref[table_ref] = snapshot
+        table_ref = str(BigQueryTableRef(table_identifier).get_sanitized_table_ref())
+        self.table_refs.add(table_ref)
+        if snapshot.base_table_identifier:
+            self.snapshot_refs_by_project[project_id].add(table_ref)
+            self.snapshots_by_ref[table_ref] = snapshot
 
         yield from self.gen_snapshot_dataset_workunits(
             table=snapshot,

@@ -1148,7 +1142,7 @@ class BigQuerySchemaGenerator:
             foreignKeys=foreign_keys if foreign_keys else None,
         )
 
-        if self.config.
+        if self.config.lineage_use_sql_parser:
             self.sql_parser_schema_resolver.add_schema_metadata(
                 dataset_urn, schema_metadata
             )
datahub/ingestion/source/bigquery_v2/lineage.py

@@ -291,16 +291,15 @@ class BigqueryLineageExtractor:
         snapshots_by_ref: FileBackedDict[BigqueryTableSnapshot],
     ) -> Iterable[MetadataWorkUnit]:
         for project in projects:
-
-
-
-            self.
-
-
-
-
-
-            )
+            for view in view_refs_by_project[project]:
+                self.datasets_skip_audit_log_lineage.add(view)
+                self.aggregator.add_view_definition(
+                    view_urn=self.identifiers.gen_dataset_urn_from_raw_ref(
+                        BigQueryTableRef.from_string_name(view)
+                    ),
+                    view_definition=view_definitions[view],
+                    default_db=project,
+                )
 
             for snapshot_ref in snapshot_refs_by_project[project]:
                 snapshot = snapshots_by_ref[snapshot_ref]

@@ -322,23 +321,11 @@ class BigqueryLineageExtractor:
     def get_lineage_workunits(
         self,
         projects: List[str],
-        view_refs_by_project: Dict[str, Set[str]],
-        view_definitions: FileBackedDict[str],
-        snapshot_refs_by_project: Dict[str, Set[str]],
-        snapshots_by_ref: FileBackedDict[BigqueryTableSnapshot],
         table_refs: Set[str],
     ) -> Iterable[MetadataWorkUnit]:
         if not self._should_ingest_lineage():
             return
 
-        yield from self.get_lineage_workunits_for_views_and_snapshots(
-            projects,
-            view_refs_by_project,
-            view_definitions,
-            snapshot_refs_by_project,
-            snapshots_by_ref,
-        )
-
         if self.config.use_exported_bigquery_audit_metadata:
             projects = ["*"]  # project_id not used when using exported metadata
 
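View lineage now flows through the SQL parsing aggregator rather than through get_lineage_workunits; a hedged sketch of the registration call (the constructor arguments are illustrative and may not match the aggregator's full signature, and the URN and SQL are made up):

    from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator

    # Register a view definition; the aggregator parses the SQL to derive
    # upstream lineage, replacing the audit-log path for views.
    aggregator = SqlParsingAggregator(
        platform="bigquery",
        generate_lineage=True,
        generate_usage_statistics=False,
        generate_operations=False,
    )
    aggregator.add_view_definition(
        view_urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,my-project.dataset.my_view,PROD)",
        view_definition="SELECT id, name FROM `my-project.dataset.base_table`",
        default_db="my-project",
    )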