acryl-datahub 0.14.1.13rc8__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (139)
  1. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2506 -2456
  2. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +136 -131
  3. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +0 -1
  35. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +0 -21
  36. datahub/ingestion/source/bigquery_v2/profiler.py +0 -6
  37. datahub/ingestion/source/common/subtypes.py +2 -0
  38. datahub/ingestion/source/csv_enricher.py +1 -1
  39. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  40. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  41. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  42. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  43. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  44. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  45. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  46. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  47. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  48. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  49. datahub/ingestion/source/elastic_search.py +1 -1
  50. datahub/ingestion/source/feast.py +97 -6
  51. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  52. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  53. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  54. datahub/ingestion/source/ge_data_profiler.py +46 -9
  55. datahub/ingestion/source/ge_profiling_config.py +5 -0
  56. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  57. datahub/ingestion/source/kafka/kafka.py +39 -19
  58. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  59. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  60. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  61. datahub/ingestion/source/looker/view_upstream.py +65 -30
  62. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  63. datahub/ingestion/source/mode.py +0 -23
  64. datahub/ingestion/source/neo4j/__init__.py +0 -0
  65. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  66. datahub/ingestion/source/powerbi/__init__.py +0 -1
  67. datahub/ingestion/source/powerbi/config.py +3 -3
  68. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  69. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  70. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  71. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  72. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  73. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  74. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  75. datahub/ingestion/source/preset.py +1 -0
  76. datahub/ingestion/source/pulsar.py +21 -2
  77. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  78. datahub/ingestion/source/redash.py +13 -63
  79. datahub/ingestion/source/redshift/config.py +1 -0
  80. datahub/ingestion/source/redshift/redshift.py +3 -0
  81. datahub/ingestion/source/s3/source.py +2 -3
  82. datahub/ingestion/source/sigma/data_classes.py +1 -0
  83. datahub/ingestion/source/sigma/sigma.py +101 -43
  84. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  85. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  86. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  87. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  88. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  89. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  90. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  91. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  92. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  93. datahub/ingestion/source/sql/athena.py +46 -22
  94. datahub/ingestion/source/sql/mssql/source.py +18 -6
  95. datahub/ingestion/source/sql/sql_common.py +34 -21
  96. datahub/ingestion/source/sql/sql_report.py +1 -0
  97. datahub/ingestion/source/sql/sql_types.py +85 -8
  98. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  99. datahub/ingestion/source/superset.py +215 -65
  100. datahub/ingestion/source/tableau/tableau.py +237 -76
  101. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  102. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  103. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  104. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  105. datahub/ingestion/source/unity/proxy_types.py +1 -0
  106. datahub/ingestion/source/unity/source.py +4 -0
  107. datahub/ingestion/source/unity/usage.py +20 -11
  108. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  109. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  110. datahub/integrations/assertion/common.py +1 -1
  111. datahub/lite/duckdb_lite.py +12 -17
  112. datahub/metadata/_schema_classes.py +512 -392
  113. datahub/metadata/_urns/urn_defs.py +1355 -1355
  114. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  115. datahub/metadata/schema.avsc +17222 -17499
  116. datahub/metadata/schemas/FormInfo.avsc +4 -0
  117. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  118. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  119. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  120. datahub/specific/chart.py +0 -39
  121. datahub/specific/dashboard.py +0 -39
  122. datahub/specific/datajob.py +7 -57
  123. datahub/sql_parsing/schema_resolver.py +23 -0
  124. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  125. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  126. datahub/sql_parsing/sqlglot_utils.py +8 -2
  127. datahub/telemetry/telemetry.py +23 -9
  128. datahub/testing/compare_metadata_json.py +1 -1
  129. datahub/testing/doctest.py +12 -0
  130. datahub/utilities/file_backed_collections.py +35 -2
  131. datahub/utilities/partition_executor.py +1 -1
  132. datahub/utilities/urn_encoder.py +2 -1
  133. datahub/utilities/urns/_urn_base.py +1 -1
  134. datahub/utilities/urns/structured_properties_urn.py +1 -1
  135. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  136. datahub/utilities/sql_parser.py +0 -94
  137. datahub/utilities/sql_parser_base.py +0 -21
  138. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  139. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/kafka/kafka.py

@@ -141,6 +141,10 @@ class KafkaSourceConfig(
         default=False,
         description="Disables the utilization of the TopicRecordNameStrategy for Schema Registry subjects. For more information, visit: https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#handling-differences-between-preregistered-and-client-derived-schemas:~:text=io.confluent.kafka.serializers.subject.TopicRecordNameStrategy",
     )
+    ingest_schemas_as_entities: bool = pydantic.Field(
+        default=False,
+        description="Enables ingesting schemas from schema registry as separate entities, in addition to the topics",
+    )
 
 
 def get_kafka_consumer(
@@ -148,7 +152,7 @@ def get_kafka_consumer(
 ) -> confluent_kafka.Consumer:
     consumer = confluent_kafka.Consumer(
         {
-            "group.id": "test",
+            "group.id": "datahub-kafka-ingestion",
             "bootstrap.servers": connection.bootstrap,
             **connection.consumer_config,
         }
@@ -164,6 +168,25 @@ def get_kafka_consumer(
     return consumer
 
 
+def get_kafka_admin_client(
+    connection: KafkaConsumerConnectionConfig,
+) -> AdminClient:
+    client = AdminClient(
+        {
+            "group.id": "datahub-kafka-ingestion",
+            "bootstrap.servers": connection.bootstrap,
+            **connection.consumer_config,
+        }
+    )
+    if CallableConsumerConfig.is_callable_config(connection.consumer_config):
+        # As per documentation, we need to explicitly call the poll method to make sure OAuth callback gets executed
+        # https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#kafka-client-configuration
+        logger.debug("Initiating polling for kafka admin client")
+        client.poll(timeout=30)
+        logger.debug("Initiated polling for kafka admin client")
+    return client
+
+
 @dataclass
 class KafkaSourceReport(StaleEntityRemovalSourceReport):
     topics_scanned: int = 0
@@ -278,13 +301,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
     def init_kafka_admin_client(self) -> None:
         try:
             # TODO: Do we require separate config than existing consumer_config ?
-            self.admin_client = AdminClient(
-                {
-                    "group.id": "test",
-                    "bootstrap.servers": self.source_config.connection.bootstrap,
-                    **self.source_config.connection.consumer_config,
-                }
-            )
+            self.admin_client = get_kafka_admin_client(self.source_config.connection)
         except Exception as e:
             logger.debug(e, exc_info=e)
             self.report.report_warning(
@@ -330,17 +347,20 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
             else:
                 self.report.report_dropped(topic)
 
-        # Get all subjects from schema registry and ingest them as SCHEMA DatasetSubTypes
-        for subject in self.schema_registry_client.get_subjects():
-            try:
-                yield from self._extract_record(
-                    subject, True, topic_detail=None, extra_topic_config=None
-                )
-            except Exception as e:
-                logger.warning(f"Failed to extract subject {subject}", exc_info=True)
-                self.report.report_warning(
-                    "subject", f"Exception while extracting topic {subject}: {e}"
-                )
+        if self.source_config.ingest_schemas_as_entities:
+            # Get all subjects from schema registry and ingest them as SCHEMA DatasetSubTypes
+            for subject in self.schema_registry_client.get_subjects():
+                try:
+                    yield from self._extract_record(
+                        subject, True, topic_detail=None, extra_topic_config=None
+                    )
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to extract subject {subject}", exc_info=True
+                    )
+                    self.report.report_warning(
+                        "subject", f"Exception while extracting topic {subject}: {e}"
+                    )
 
     def _extract_record(
         self,
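
For illustration (not part of the diff): a minimal, hypothetical recipe sketch showing how the new ingest_schemas_as_entities option could be enabled on the kafka source. It uses the standard Pipeline.create entry point; the bootstrap server and sink settings are placeholders.

    # Hypothetical usage sketch for the new KafkaSourceConfig.ingest_schemas_as_entities flag.
    from datahub.ingestion.run.pipeline import Pipeline

    pipeline = Pipeline.create(
        {
            "source": {
                "type": "kafka",
                "config": {
                    "connection": {"bootstrap": "localhost:9092"},
                    # New in 0.15.0: also emit schema registry subjects as entities.
                    "ingest_schemas_as_entities": True,
                },
            },
            # Placeholder sink; point this at your own DataHub instance.
            "sink": {
                "type": "datahub-rest",
                "config": {"server": "http://localhost:8080"},
            },
        }
    )
    pipeline.run()
    pipeline.raise_from_status()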

datahub/ingestion/source/kafka/kafka_connect.py

@@ -282,10 +282,6 @@ class ConfluentJDBCSourceConnector:
         query: str
         transforms: list
 
-    def report_warning(self, key: str, reason: str) -> None:
-        logger.warning(f"{key}: {reason}")
-        self.report.report_warning(key, reason)
-
     def get_parser(
         self,
         connector_manifest: ConnectorManifest,
@@ -355,9 +351,9 @@ class ConfluentJDBCSourceConnector:
                 source_table = f"{table_name_tuple[-2]}.{source_table}"
             else:
                 include_source_dataset = False
-                self.report_warning(
-                    self.connector_manifest.name,
-                    f"could not find schema for table {source_table}",
+                self.report.warning(
+                    "Could not find schema for table"
+                    f"{self.connector_manifest.name} : {source_table}",
                 )
             dataset_name: str = get_dataset_name(database_name, source_table)
             lineage = KafkaConnectLineage(
@@ -457,9 +453,9 @@ class ConfluentJDBCSourceConnector:
                     target_platform=KAFKA,
                 )
                 lineages.append(lineage)
-            self.report_warning(
+            self.report.warning(
+                "Could not find input dataset, the connector has query configuration set",
                 self.connector_manifest.name,
-                "could not find input dataset, the connector has query configuration set",
             )
             self.connector_manifest.lineages = lineages
             return
@@ -535,24 +531,24 @@ class ConfluentJDBCSourceConnector:
                         include_source_dataset=False,
                     )
                 )
-            self.report_warning(
-                self.connector_manifest.name,
-                f"could not find input dataset, for connector topics {topic_names}",
+            self.report.warning(
+                "Could not find input dataset for connector topics",
+                f"{self.connector_manifest.name} : {topic_names}",
             )
             self.connector_manifest.lineages = lineages
             return
         else:
             include_source_dataset = True
         if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
-            self.report_warning(
-                self.connector_manifest.name,
-                f"could not find input dataset, connector has unknown transform - {transforms[0]['type']}",
+            self.report.warning(
+                "Could not find input dataset, connector has unknown transform",
+                f"{self.connector_manifest.name} : {transforms[0]['type']}",
             )
             include_source_dataset = False
         if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
-            self.report_warning(
+            self.report.warning(
+                "Could not find input dataset, connector has one or more unknown transforms",
                 self.connector_manifest.name,
-                "could not find input dataset, connector has one or more unknown transforms",
             )
             include_source_dataset = False
         lineages = self.default_get_lineages(
@@ -753,8 +749,10 @@ class DebeziumSourceConnector:
                 lineages.append(lineage)
             self.connector_manifest.lineages = lineages
         except Exception as e:
-            self.report.report_warning(
-                self.connector_manifest.name, f"Error resolving lineage: {e}"
+            self.report.warning(
+                "Error resolving lineage for connector",
+                self.connector_manifest.name,
+                exc=e,
             )
 
         return
@@ -783,10 +781,6 @@ class BigQuerySinkConnector:
         defaultDataset: Optional[str] = None
         version: str = "v1"
 
-    def report_warning(self, key: str, reason: str) -> None:
-        logger.warning(f"{key}: {reason}")
-        self.report.report_warning(key, reason)
-
     def get_parser(
         self,
         connector_manifest: ConnectorManifest,
@@ -917,9 +911,9 @@ class BigQuerySinkConnector:
             transformed_topic = self.apply_transformations(topic, transforms)
             dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser)
             if dataset_table is None:
-                self.report_warning(
-                    self.connector_manifest.name,
-                    f"could not find target dataset for topic {transformed_topic}, please check your connector configuration",
+                self.report.warning(
+                    "Could not find target dataset for topic, please check your connector configuration"
+                    f"{self.connector_manifest.name} : {transformed_topic} ",
                 )
                 continue
             target_dataset = f"{project}.{dataset_table}"
@@ -954,10 +948,6 @@ class SnowflakeSinkConnector:
         schema_name: str
         topics_to_tables: Dict[str, str]
 
-    def report_warning(self, key: str, reason: str) -> None:
-        logger.warning(f"{key}: {reason}")
-        self.report.report_warning(key, reason)
-
     def get_table_name_from_topic_name(self, topic_name: str) -> str:
         """
         This function converts the topic name to a valid Snowflake table name using some rules.
@@ -1105,8 +1095,10 @@ class ConfluentS3SinkConnector:
                 )
             self.connector_manifest.lineages = lineages
         except Exception as e:
-            self.report.report_warning(
-                self.connector_manifest.name, f"Error resolving lineage: {e}"
+            self.report.warning(
+                "Error resolving lineage for connector",
+                self.connector_manifest.name,
+                exc=e,
             )
 
         return
@@ -1155,7 +1147,7 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             )
             self.session.auth = (self.config.username, self.config.password)
 
-        test_response = self.session.get(f"{self.config.connect_uri}")
+        test_response = self.session.get(f"{self.config.connect_uri}/connectors")
         test_response.raise_for_status()
         logger.info(f"Connection to {self.config.connect_uri} is ok")
         if not jpype.isJVMStarted():
@@ -1178,13 +1170,16 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
 
         payload = connector_response.json()
 
-        for c in payload:
-            connector_url = f"{self.config.connect_uri}/connectors/{c}"
-            connector_response = self.session.get(connector_url)
-            manifest = connector_response.json()
-            connector_manifest = ConnectorManifest(**manifest)
-            if not self.config.connector_patterns.allowed(connector_manifest.name):
-                self.report.report_dropped(connector_manifest.name)
+        for connector_name in payload:
+            connector_url = f"{self.config.connect_uri}/connectors/{connector_name}"
+            connector_manifest = self._get_connector_manifest(
+                connector_name, connector_url
+            )
+            if (
+                connector_manifest is None
+                or not self.config.connector_patterns.allowed(connector_manifest.name)
+            ):
+                self.report.report_dropped(connector_name)
                 continue
 
             if self.config.provided_configs:
@@ -1195,19 +1190,11 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             connector_manifest.lineages = list()
            connector_manifest.url = connector_url
 
-            topics = self.session.get(
-                f"{self.config.connect_uri}/connectors/{c}/topics",
-            ).json()
-
-            connector_manifest.topic_names = topics[c]["topics"]
+            connector_manifest.topic_names = self._get_connector_topics(connector_name)
 
             # Populate Source Connector metadata
             if connector_manifest.type == SOURCE:
-                tasks = self.session.get(
-                    f"{self.config.connect_uri}/connectors/{c}/tasks",
-                ).json()
-
-                connector_manifest.tasks = tasks
+                connector_manifest.tasks = self._get_connector_tasks(connector_name)
 
                 # JDBC source connector lineages
                 if connector_manifest.config.get(CONNECTOR_CLASS).__eq__(
@@ -1246,7 +1233,7 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
                     )
                     continue
 
-                for topic in topics:
+                for topic in connector_manifest.topic_names:
                     lineage = KafkaConnectLineage(
                         source_dataset=target_connector.source_dataset,
                         source_platform=target_connector.source_platform,
@@ -1286,6 +1273,49 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
 
         return connectors_manifest
 
+    def _get_connector_manifest(
+        self, connector_name: str, connector_url: str
+    ) -> Optional[ConnectorManifest]:
+        try:
+            connector_response = self.session.get(connector_url)
+            connector_response.raise_for_status()
+        except Exception as e:
+            self.report.warning(
+                "Failed to get connector details", connector_name, exc=e
+            )
+            return None
+        manifest = connector_response.json()
+        connector_manifest = ConnectorManifest(**manifest)
+        return connector_manifest
+
+    def _get_connector_tasks(self, connector_name: str) -> dict:
+        try:
+            response = self.session.get(
+                f"{self.config.connect_uri}/connectors/{connector_name}/tasks",
+            )
+            response.raise_for_status()
+        except Exception as e:
+            self.report.warning(
+                "Error getting connector tasks", context=connector_name, exc=e
+            )
+            return {}
+
+        return response.json()
+
+    def _get_connector_topics(self, connector_name: str) -> List[str]:
+        try:
+            response = self.session.get(
+                f"{self.config.connect_uri}/connectors/{connector_name}/topics",
+            )
+            response.raise_for_status()
+        except Exception as e:
+            self.report.warning(
+                "Error getting connector topics", context=connector_name, exc=e
+            )
+            return []
+
+        return response.json()[connector_name]["topics"]
+
     def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit:
         connector_name = connector.name
         connector_type = connector.type
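
For illustration (not part of the diff): the kafka-connect hunks above replace the per-connector report_warning(key, reason) helpers with the structured SourceReport.warning(message, context, exc=...) call. A hypothetical, self-contained sketch of that convention (the connector name and the simulated failure are placeholders):

    # Hypothetical sketch of the structured warning convention used in the hunks above.
    from datahub.ingestion.api.source import SourceReport

    report = SourceReport()
    try:
        raise ConnectionError("Connect REST endpoint unreachable")  # simulated failure
    except Exception as err:
        # Message first, then a context string identifying the offending object,
        # plus the exception so the report keeps the stack trace.
        report.warning(
            "Failed to get connector details",
            context="my-jdbc-connector",
            exc=err,
        )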

datahub/ingestion/source/looker/looker_liquid_tag.py

@@ -4,6 +4,7 @@ from typing import ClassVar, Optional, TextIO
 from liquid import Environment
 from liquid.ast import Node
 from liquid.context import Context
+from liquid.filter import string_filter
 from liquid.parse import expect, get_parser
 from liquid.stream import TokenStream
 from liquid.tag import Tag
@@ -81,12 +82,18 @@ class ConditionTag(Tag):
 custom_tags = [ConditionTag]
 
 
+@string_filter
+def sql_quote_filter(variable: str) -> str:
+    return f"'{variable}'"
+
+
 @lru_cache(maxsize=1)
 def _create_env() -> Environment:
-    env: Environment = Environment()
+    env: Environment = Environment(strict_filters=False)
     # register tag. One time activity
     for custom_tag in custom_tags:
         env.add_tag(custom_tag)
+    env.add_filter("sql_quote", sql_quote_filter)
     return env
 
 
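
For illustration (not part of the diff): the new sql_quote filter simply wraps a rendered value in single quotes. A hypothetical standalone snippet with python-liquid shows the effect (the template string and value are made up):

    # Hypothetical standalone demo of a sql_quote-style liquid filter,
    # mirroring the filter registered in _create_env() above.
    from liquid import Environment
    from liquid.filter import string_filter


    @string_filter
    def sql_quote_filter(variable: str) -> str:
        return f"'{variable}'"


    env = Environment(strict_filters=False)
    env.add_filter("sql_quote", sql_quote_filter)

    template = env.from_string(
        "SELECT * FROM orders WHERE order_date >= {{ start_date | sql_quote }}"
    )
    print(template.render(start_date="2024-01-01"))
    # -> SELECT * FROM orders WHERE order_date >= '2024-01-01'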

datahub/ingestion/source/looker/lookml_concept_context.py

@@ -88,8 +88,7 @@ class LookerFieldContext:
         for upstream_field_match in re.finditer(r"\${TABLE}\.[\"]*([\.\w]+)", sql):
             matched_field = upstream_field_match.group(1)
             # Remove quotes from field names
-            matched_field = matched_field.replace('"', "").replace("`", "").lower()
-            column_names.append(matched_field)
+            column_names.append(matched_field.replace('"', "").replace("`", "").lower())
 
         return column_names
 

datahub/ingestion/source/looker/view_upstream.py

@@ -25,11 +25,13 @@ from datahub.ingestion.source.looker.lookml_config import (
     LookMLSourceReport,
 )
 from datahub.ingestion.source.looker.urn_functions import get_qualified_table_name
+from datahub.sql_parsing.schema_resolver import match_columns_to_schema
 from datahub.sql_parsing.sqlglot_lineage import (
     ColumnLineageInfo,
     ColumnRef,
     SqlParsingResult,
     Urn,
+    create_and_cache_schema_resolver,
     create_lineage_sql_parsed_result,
 )
 
@@ -200,7 +202,7 @@ def _generate_fully_qualified_name(
 class AbstractViewUpstream(ABC):
     """
     Implementation of this interface extracts the view upstream as per the way the view is bound to datasets.
-    For detail explanation please refer lookml_concept_context.LookerViewContext documentation.
+    For detail explanation, please refer lookml_concept_context.LookerViewContext documentation.
     """
 
     view_context: LookerViewContext
@@ -236,6 +238,47 @@ class AbstractViewUpstream(ABC):
     def create_fields(self) -> List[ViewField]:
         return []  # it is for the special case
 
+    def create_upstream_column_refs(
+        self, upstream_urn: str, downstream_looker_columns: List[str]
+    ) -> List[ColumnRef]:
+        """
+        - **`upstream_urn`**: The URN of the upstream dataset.
+
+        - **`expected_columns`**: These are the columns identified by the Looker connector as belonging to the `upstream_urn` dataset. However, there is potential for human error in specifying the columns of the upstream dataset. For example, a user might declare a column in lowercase, while on the actual platform, it may exist in uppercase, or vice versa.
+
+        - This function ensures consistency in column-level lineage by consulting GMS before creating the final `ColumnRef` instance, avoiding discrepancies.
+        """
+        schema_resolver = create_and_cache_schema_resolver(
+            platform=self.view_context.view_connection.platform,
+            platform_instance=self.view_context.view_connection.platform_instance,
+            env=self.view_context.view_connection.platform_env or self.config.env,
+            graph=self.ctx.graph,
+        )
+
+        urn, schema_info = schema_resolver.resolve_urn(urn=upstream_urn)
+
+        if schema_info:
+            actual_columns = match_columns_to_schema(
+                schema_info, downstream_looker_columns
+            )
+        else:
+            logger.info(
+                f"schema_info not found for dataset {urn} in GMS. Using expected_columns to form ColumnRef"
+            )
+            actual_columns = [column.lower() for column in downstream_looker_columns]
+
+        upstream_column_refs: List[ColumnRef] = []
+
+        for column in actual_columns:
+            upstream_column_refs.append(
+                ColumnRef(
+                    column=column,
+                    table=upstream_urn,
+                )
+            )
+
+        return upstream_column_refs
+
 
 class SqlBasedDerivedViewUpstream(AbstractViewUpstream, ABC):
     """
@@ -372,15 +415,12 @@ class SqlBasedDerivedViewUpstream(AbstractViewUpstream, ABC):
         # in-case of "select * from look_ml_view.SQL_TABLE_NAME" or extra field are defined in the looker view which is
         # referring to upstream table
         if self._get_upstream_dataset_urn() and not upstreams_column_refs:
-            upstreams_column_refs = [
-                ColumnRef(
-                    table=self._get_upstream_dataset_urn()[
-                        0
-                    ],  # 0th index has table of from clause
-                    column=column,
-                )
-                for column in field_context.column_name_in_sql_attribute()
-            ]
+            upstreams_column_refs = self.create_upstream_column_refs(
+                upstream_urn=self._get_upstream_dataset_urn()[
+                    0
+                ],  # 0th index has table of from clause,
+                downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+            )
 
         # fix any derived view reference present in urn
         upstreams_column_refs = resolve_derived_view_urn_of_col_ref(
@@ -487,18 +527,18 @@ class NativeDerivedViewUpstream(AbstractViewUpstream):
             return upstream_column_refs
 
         explore_urn: str = self._get_upstream_dataset_urn()[0]
+        expected_columns: List[str] = []
 
         for column in field_context.column_name_in_sql_attribute():
            if column in self._get_explore_column_mapping():
                 explore_column: Dict = self._get_explore_column_mapping()[column]
-                upstream_column_refs.append(
-                    ColumnRef(
-                        column=explore_column.get("field", explore_column[NAME]),
-                        table=explore_urn,
-                    )
+                expected_columns.append(
+                    explore_column.get("field", explore_column[NAME])
                 )
 
-        return upstream_column_refs
+        return self.create_upstream_column_refs(
+            upstream_urn=explore_urn, downstream_looker_columns=expected_columns
+        )
 
     def get_upstream_dataset_urn(self) -> List[Urn]:
         return self._get_upstream_dataset_urn()
@@ -548,14 +588,10 @@ class RegularViewUpstream(AbstractViewUpstream):
     def get_upstream_column_ref(
         self, field_context: LookerFieldContext
     ) -> List[ColumnRef]:
-        upstream_column_ref: List[ColumnRef] = []
-
-        for column_name in field_context.column_name_in_sql_attribute():
-            upstream_column_ref.append(
-                ColumnRef(table=self._get_upstream_dataset_urn(), column=column_name)
-            )
-
-        return upstream_column_ref
+        return self.create_upstream_column_refs(
+            upstream_urn=self._get_upstream_dataset_urn(),
+            downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+        )
 
     def get_upstream_dataset_urn(self) -> List[Urn]:
         return [self._get_upstream_dataset_urn()]
@@ -609,15 +645,14 @@ class DotSqlTableNameViewUpstream(AbstractViewUpstream):
         self, field_context: LookerFieldContext
     ) -> List[ColumnRef]:
         upstream_column_ref: List[ColumnRef] = []
+
         if not self._get_upstream_dataset_urn():
             return upstream_column_ref
 
-        for column_name in field_context.column_name_in_sql_attribute():
-            upstream_column_ref.append(
-                ColumnRef(table=self._get_upstream_dataset_urn()[0], column=column_name)
-            )
-
-        return upstream_column_ref
+        return self.create_upstream_column_refs(
+            upstream_urn=self._get_upstream_dataset_urn()[0],
+            downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+        )
 
     def get_upstream_dataset_urn(self) -> List[Urn]:
         return self._get_upstream_dataset_urn()
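
For illustration (not part of the diff): the create_upstream_column_refs docstring above describes matching Looker-declared column names against the schema fetched from GMS so that column-level lineage uses the platform's spelling. A toy, hypothetical helper (not the real match_columns_to_schema) showing that case-insensitive matching idea:

    # Toy sketch of case-insensitive column matching; NOT the real
    # match_columns_to_schema implementation from datahub.sql_parsing.schema_resolver.
    from typing import Dict, List


    def match_columns_case_insensitively(
        schema_columns: Dict[str, str], looker_columns: List[str]
    ) -> List[str]:
        by_lower = {name.lower(): name for name in schema_columns}
        # Prefer the spelling from the resolved schema; fall back to lowercase otherwise.
        return [by_lower.get(col.lower(), col.lower()) for col in looker_columns]


    schema = {"Order_ID": "NUMBER", "CUSTOMER_NAME": "VARCHAR"}  # made-up schema
    print(match_columns_case_insensitively(schema, ["order_id", "Customer_Name"]))
    # -> ['Order_ID', 'CUSTOMER_NAME']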

datahub/ingestion/source/metadata/business_glossary.py

@@ -45,6 +45,9 @@ class Owners(ConfigModel):
     groups: Optional[List[str]] = None
 
 
+OwnersMultipleTypes = Union[List[Owners], Owners]
+
+
 class KnowledgeCard(ConfigModel):
     url: Optional[str] = None
     label: Optional[str] = None
@@ -57,7 +60,7 @@ class GlossaryTermConfig(ConfigModel):
     term_source: Optional[str] = None
     source_ref: Optional[str] = None
     source_url: Optional[str] = None
-    owners: Optional[Owners] = None
+    owners: Optional[OwnersMultipleTypes] = None
     inherits: Optional[List[str]] = None
     contains: Optional[List[str]] = None
     values: Optional[List[str]] = None
@@ -74,7 +77,7 @@ class GlossaryNodeConfig(ConfigModel):
     id: Optional[str] = None
     name: str
     description: str
-    owners: Optional[Owners] = None
+    owners: Optional[OwnersMultipleTypes] = None
     terms: Optional[List["GlossaryTermConfig"]] = None
     nodes: Optional[List["GlossaryNodeConfig"]] = None
     knowledge_links: Optional[List[KnowledgeCard]] = None
@@ -88,7 +91,7 @@ class DefaultConfig(ConfigModel):
     """Holds defaults for populating fields in glossary terms"""
 
     source: Optional[str] = None
-    owners: Owners
+    owners: OwnersMultipleTypes
     url: Optional[str] = None
     source_type: str = "INTERNAL"
 
@@ -153,30 +156,44 @@ def make_glossary_term_urn(
     return "urn:li:glossaryTerm:" + create_id(path, default_id, enable_auto_id)
 
 
-def get_owners(owners: Owners) -> models.OwnershipClass:
-    ownership_type, ownership_type_urn = validate_ownership_type(owners.type)
+def get_owners_multiple_types(owners: OwnersMultipleTypes) -> models.OwnershipClass:
+    """Allows owner types to be a list and maintains backward compatibility"""
+    if isinstance(owners, Owners):
+        return models.OwnershipClass(owners=list(get_owners(owners)))
+
+    owners_meta: List[models.OwnerClass] = []
+    for owner in owners:
+        owners_meta.extend(get_owners(owner))
+
+    return models.OwnershipClass(owners=owners_meta)
+
+
+def get_owners(owners: Owners) -> Iterable[models.OwnerClass]:
+    actual_type = owners.type or models.OwnershipTypeClass.DEVELOPER
+
+    if actual_type.startswith("urn:li:ownershipType:"):
+        ownership_type: str = "CUSTOM"
+        ownership_type_urn: Optional[str] = actual_type
+    else:
+        ownership_type, ownership_type_urn = validate_ownership_type(actual_type)
+
     if owners.typeUrn is not None:
         ownership_type_urn = owners.typeUrn
-    owners_meta: List[models.OwnerClass] = []
+
     if owners.users is not None:
-        owners_meta = owners_meta + [
-            models.OwnerClass(
+        for o in owners.users:
+            yield models.OwnerClass(
                 owner=make_user_urn(o),
                 type=ownership_type,
                 typeUrn=ownership_type_urn,
             )
-            for o in owners.users
-        ]
     if owners.groups is not None:
-        owners_meta = owners_meta + [
-            models.OwnerClass(
+        for o in owners.groups:
+            yield models.OwnerClass(
                 owner=make_group_urn(o),
                 type=ownership_type,
                 typeUrn=ownership_type_urn,
             )
-            for o in owners.groups
-        ]
-    return models.OwnershipClass(owners=owners_meta)
 
 
 def get_mces(
@@ -185,7 +202,7 @@ def get_mces(
     ingestion_config: BusinessGlossarySourceConfig,
     ctx: PipelineContext,
 ) -> Iterable[Union[MetadataChangeProposalWrapper, models.MetadataChangeEventClass]]:
-    root_owners = get_owners(glossary.owners)
+    root_owners = get_owners_multiple_types(glossary.owners)
 
     if glossary.nodes:
         for node in glossary.nodes:
@@ -270,7 +287,7 @@ def get_mces_from_node(
     node_owners = parentOwners
     if glossaryNode.owners is not None:
         assert glossaryNode.owners is not None
-        node_owners = get_owners(glossaryNode.owners)
+        node_owners = get_owners_multiple_types(glossaryNode.owners)
 
     node_snapshot = models.GlossaryNodeSnapshotClass(
         urn=node_urn,
@@ -426,7 +443,7 @@ def get_mces_from_term(
     ownership: models.OwnershipClass = parentOwnership
     if glossaryTerm.owners is not None:
         assert glossaryTerm.owners is not None
-        ownership = get_owners(glossaryTerm.owners)
+        ownership = get_owners_multiple_types(glossaryTerm.owners)
     aspects.append(ownership)
 
     if glossaryTerm.domain is not None:
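
For illustration (not part of the diff): with OwnersMultipleTypes, the owners field of a glossary, node, or term may now be either a single Owners block (old behaviour) or a list of Owners blocks, each carrying its own ownership type. A hypothetical sketch using the new helper directly (the user and group names are placeholders):

    # Hypothetical sketch of the new owners-as-list behaviour.
    from datahub.ingestion.source.metadata.business_glossary import (
        Owners,
        get_owners_multiple_types,
    )

    ownership = get_owners_multiple_types(
        [
            Owners(users=["alice"], type="TECHNICAL_OWNER"),
            Owners(groups=["data-governance"], type="BUSINESS_OWNER"),
        ]
    )
    print([owner.owner for owner in ownership.owners])
    # -> ['urn:li:corpuser:alice', 'urn:li:corpGroup:data-governance']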