acryl-datahub 0.14.1.13rc8__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2506 -2456
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +136 -131
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
- datahub/cli/cli_utils.py +2 -0
- datahub/cli/delete_cli.py +103 -24
- datahub/cli/ingest_cli.py +110 -0
- datahub/cli/put_cli.py +1 -1
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +2 -1
- datahub/configuration/common.py +3 -3
- datahub/configuration/git.py +7 -1
- datahub/configuration/kafka_consumer_config.py +31 -1
- datahub/emitter/mcp_patch_builder.py +43 -0
- datahub/emitter/rest_emitter.py +17 -4
- datahub/ingestion/api/incremental_properties_helper.py +69 -0
- datahub/ingestion/api/source.py +6 -1
- datahub/ingestion/api/source_helpers.py +4 -2
- datahub/ingestion/graph/client.py +2 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
- datahub/ingestion/run/pipeline.py +6 -5
- datahub/ingestion/run/pipeline_config.py +6 -0
- datahub/ingestion/sink/datahub_rest.py +15 -4
- datahub/ingestion/source/abs/source.py +4 -0
- datahub/ingestion/source/aws/aws_common.py +13 -1
- datahub/ingestion/source/aws/sagemaker.py +8 -0
- datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
- datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
- datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
- datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +0 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +0 -21
- datahub/ingestion/source/bigquery_v2/profiler.py +0 -6
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/csv_enricher.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/dbt/dbt_common.py +7 -61
- datahub/ingestion/source/dremio/dremio_api.py +204 -86
- datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
- datahub/ingestion/source/dremio/dremio_config.py +5 -0
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
- datahub/ingestion/source/dremio/dremio_entities.py +4 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
- datahub/ingestion/source/dremio/dremio_source.py +7 -2
- datahub/ingestion/source/elastic_search.py +1 -1
- datahub/ingestion/source/feast.py +97 -6
- datahub/ingestion/source/gc/datahub_gc.py +46 -35
- datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
- datahub/ingestion/source/ge_data_profiler.py +46 -9
- datahub/ingestion/source/ge_profiling_config.py +5 -0
- datahub/ingestion/source/iceberg/iceberg.py +12 -5
- datahub/ingestion/source/kafka/kafka.py +39 -19
- datahub/ingestion/source/kafka/kafka_connect.py +81 -51
- datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
- datahub/ingestion/source/looker/view_upstream.py +65 -30
- datahub/ingestion/source/metadata/business_glossary.py +35 -18
- datahub/ingestion/source/mode.py +0 -23
- datahub/ingestion/source/neo4j/__init__.py +0 -0
- datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
- datahub/ingestion/source/powerbi/__init__.py +0 -1
- datahub/ingestion/source/powerbi/config.py +3 -3
- datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
- datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
- datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
- datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
- datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
- datahub/ingestion/source/powerbi/powerbi.py +12 -6
- datahub/ingestion/source/preset.py +1 -0
- datahub/ingestion/source/pulsar.py +21 -2
- datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
- datahub/ingestion/source/redash.py +13 -63
- datahub/ingestion/source/redshift/config.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +3 -0
- datahub/ingestion/source/s3/source.py +2 -3
- datahub/ingestion/source/sigma/data_classes.py +1 -0
- datahub/ingestion/source/sigma/sigma.py +101 -43
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
- datahub/ingestion/source/sql/athena.py +46 -22
- datahub/ingestion/source/sql/mssql/source.py +18 -6
- datahub/ingestion/source/sql/sql_common.py +34 -21
- datahub/ingestion/source/sql/sql_report.py +1 -0
- datahub/ingestion/source/sql/sql_types.py +85 -8
- datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
- datahub/ingestion/source/superset.py +215 -65
- datahub/ingestion/source/tableau/tableau.py +237 -76
- datahub/ingestion/source/tableau/tableau_common.py +12 -6
- datahub/ingestion/source/tableau/tableau_constant.py +2 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
- datahub/ingestion/source/tableau/tableau_validation.py +48 -0
- datahub/ingestion/source/unity/proxy_types.py +1 -0
- datahub/ingestion/source/unity/source.py +4 -0
- datahub/ingestion/source/unity/usage.py +20 -11
- datahub/ingestion/transformer/add_dataset_tags.py +1 -1
- datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
- datahub/integrations/assertion/common.py +1 -1
- datahub/lite/duckdb_lite.py +12 -17
- datahub/metadata/_schema_classes.py +512 -392
- datahub/metadata/_urns/urn_defs.py +1355 -1355
- datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
- datahub/metadata/schema.avsc +17222 -17499
- datahub/metadata/schemas/FormInfo.avsc +4 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
- datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
- datahub/specific/chart.py +0 -39
- datahub/specific/dashboard.py +0 -39
- datahub/specific/datajob.py +7 -57
- datahub/sql_parsing/schema_resolver.py +23 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
- datahub/sql_parsing/sqlglot_lineage.py +55 -14
- datahub/sql_parsing/sqlglot_utils.py +8 -2
- datahub/telemetry/telemetry.py +23 -9
- datahub/testing/compare_metadata_json.py +1 -1
- datahub/testing/doctest.py +12 -0
- datahub/utilities/file_backed_collections.py +35 -2
- datahub/utilities/partition_executor.py +1 -1
- datahub/utilities/urn_encoder.py +2 -1
- datahub/utilities/urns/_urn_base.py +1 -1
- datahub/utilities/urns/structured_properties_urn.py +1 -1
- datahub/utilities/sql_lineage_parser_impl.py +0 -160
- datahub/utilities/sql_parser.py +0 -94
- datahub/utilities/sql_parser_base.py +0 -21
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/mode.py
CHANGED
@@ -18,7 +18,6 @@ from pydantic import Field, validator
 from requests.adapters import HTTPAdapter, Retry
 from requests.exceptions import ConnectionError
 from requests.models import HTTPBasicAuth, HTTPError
-from sqllineage.runner import LineageRunner
 from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential

 import datahub.emitter.mce_builder as builder
@@ -820,28 +819,6 @@ class ModeSource(StatefulIngestionSourceBase):
         )
         return None

-    @lru_cache(maxsize=None)
-    def _get_source_from_query(self, raw_query: str) -> set:
-        query = self._replace_definitions(raw_query)
-        parser = LineageRunner(query)
-        source_paths = set()
-        try:
-            for table in parser.source_tables:
-                sources = str(table).split(".")
-                source_schema, source_table = sources[-2], sources[-1]
-                if source_schema == "<default>":
-                    source_schema = str(self.config.default_schema)
-
-                source_paths.add(f"{source_schema}.{source_table}")
-        except Exception as e:
-            self.report.report_failure(
-                title="Failed to Extract Lineage From Query",
-                message="Unable to retrieve lineage from Mode query.",
-                context=f"Query: {raw_query}, Error: {str(e)}",
-            )
-
-        return source_paths
-
     def _get_datasource_urn(
         self,
         platform: str,
datahub/ingestion/source/neo4j/__init__.py
File without changes

datahub/ingestion/source/neo4j/neo4j_source.py
ADDED
@@ -0,0 +1,331 @@
+import logging
+import time
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, List, Optional, Type, Union
+
+import pandas as pd
+from neo4j import GraphDatabase
+from pydantic.fields import Field
+
+from datahub.configuration.source_common import EnvConfigMixin
+from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+    SupportStatus,
+    config_class,
+    platform_name,
+    support_status,
+)
+from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaFieldDataType
+from datahub.metadata.schema_classes import (
+    AuditStampClass,
+    BooleanTypeClass,
+    DatasetPropertiesClass,
+    DateTypeClass,
+    NullTypeClass,
+    NumberTypeClass,
+    OtherSchemaClass,
+    SchemaFieldClass,
+    SchemaMetadataClass,
+    StringTypeClass,
+    SubTypesClass,
+    UnionTypeClass,
+)
+
+log = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+_type_mapping: Dict[Union[Type, str], Type] = {
+    "list": UnionTypeClass,
+    "boolean": BooleanTypeClass,
+    "integer": NumberTypeClass,
+    "local_date_time": DateTypeClass,
+    "float": NumberTypeClass,
+    "string": StringTypeClass,
+    "date": DateTypeClass,
+    "node": StringTypeClass,
+    "relationship": StringTypeClass,
+}
+
+
+class Neo4jConfig(EnvConfigMixin):
+    username: str = Field(description="Neo4j Username")
+    password: str = Field(description="Neo4j Password")
+    uri: str = Field(description="The URI for the Neo4j server")
+    env: str = Field(description="Neo4j env")
+
+
+@dataclass
+class Neo4jSourceReport(SourceReport):
+    obj_failures: int = 0
+    obj_created: int = 0
+
+
+@platform_name("Neo4j", id="neo4j")
+@config_class(Neo4jConfig)
+@support_status(SupportStatus.CERTIFIED)
+class Neo4jSource(Source):
+    NODE = "node"
+    RELATIONSHIP = "relationship"
+    PLATFORM = "neo4j"
+
+    def __init__(self, ctx: PipelineContext, config: Neo4jConfig):
+        self.ctx = ctx
+        self.config = config
+        self.report = Neo4jSourceReport()
+
+    @classmethod
+    def create(cls, config_dict, ctx):
+        config = Neo4jConfig.parse_obj(config_dict)
+        return cls(ctx, config)
+
+    def get_field_type(self, attribute_type: Union[type, str]) -> SchemaFieldDataType:
+        type_class: type = _type_mapping.get(attribute_type, NullTypeClass)
+        return SchemaFieldDataType(type=type_class())
+
+    def get_schema_field_class(
+        self, col_name: str, col_type: str, **kwargs: Any
+    ) -> SchemaFieldClass:
+        if kwargs["obj_type"] == self.NODE and col_type == self.RELATIONSHIP:
+            col_type = self.NODE
+        else:
+            col_type = col_type
+        return SchemaFieldClass(
+            fieldPath=col_name,
+            type=self.get_field_type(col_type),
+            nativeDataType=col_type,
+            description=col_type.upper()
+            if col_type in (self.NODE, self.RELATIONSHIP)
+            else col_type,
+            lastModified=AuditStampClass(
+                time=round(time.time() * 1000), actor="urn:li:corpuser:ingestion"
+            ),
+        )
+
+    def add_properties(
+        self,
+        dataset: str,
+        description: Optional[str] = None,
+        custom_properties: Optional[Dict[str, str]] = None,
+    ) -> MetadataChangeProposalWrapper:
+        dataset_properties = DatasetPropertiesClass(
+            description=description,
+            customProperties=custom_properties,
+        )
+        return MetadataChangeProposalWrapper(
+            entityUrn=make_dataset_urn(
+                platform=self.PLATFORM, name=dataset, env=self.config.env
+            ),
+            aspect=dataset_properties,
+        )
+
+    def generate_neo4j_object(
+        self, dataset: str, columns: list, obj_type: Optional[str] = None
+    ) -> MetadataChangeProposalWrapper:
+        try:
+            fields = [
+                self.get_schema_field_class(key, value.lower(), obj_type=obj_type)
+                for d in columns
+                for key, value in d.items()
+            ]
+            mcp = MetadataChangeProposalWrapper(
+                entityUrn=make_dataset_urn(
+                    platform=self.PLATFORM, name=dataset, env=self.config.env
+                ),
+                aspect=SchemaMetadataClass(
+                    schemaName=dataset,
+                    platform=make_data_platform_urn(self.PLATFORM),
+                    version=0,
+                    hash="",
+                    platformSchema=OtherSchemaClass(rawSchema=""),
+                    lastModified=AuditStampClass(
+                        time=round(time.time() * 1000),
+                        actor="urn:li:corpuser:ingestion",
+                    ),
+                    fields=fields,
+                ),
+            )
+            self.report.obj_created += 1
+        except Exception as e:
+            log.error(e)
+            self.report.obj_failures += 1
+        return mcp
+
+    def get_neo4j_metadata(self, query: str) -> pd.DataFrame:
+        driver = GraphDatabase.driver(
+            self.config.uri, auth=(self.config.username, self.config.password)
+        )
+        """
+        This process retrieves the metadata for Neo4j objects using an APOC query, which returns a dictionary
+        with two columns: key and value. The key represents the Neo4j object, while the value contains the
+        corresponding metadata.
+
+        When data is returned from Neo4j, much of the relationship metadata is stored with the relevant node's
+        metadata. Consequently, the objects are organized into two separate dataframes: one for nodes and one for
+        relationships.
+
+        In the node dataframe, several fields are extracted and added as new columns. Similarly, in the relationship
+        dataframe, certain fields are parsed out, while others require metadata from the nodes dataframe.
+
+        Once the data is parsed and these two dataframes are created, we combine a subset of their columns into a
+        single dataframe, which will be used to create the DataHub objects.
+
+        See the docs for examples of metadata: metadata-ingestion/docs/sources/neo4j/neo4j.md
+        """
+        try:
+            log.info(f"{query}")
+            with driver.session() as session:
+                result = session.run(query)
+                data = [record for record in result]
+                log.info("Closing Neo4j driver")
+                driver.close()
+
+                node_df = self.process_nodes(data)
+                rel_df = self.process_relationships(data, node_df)
+
+                union_cols = ["key", "obj_type", "property_data_types", "description"]
+                df = pd.concat([node_df[union_cols], rel_df[union_cols]])
+        except Exception as e:
+            self.report.failure(
+                message="Failed to get neo4j metadata",
+                exc=e,
+            )
+
+        return df
+
+    def process_nodes(self, data: list) -> pd.DataFrame:
+        nodes = [record for record in data if record["value"]["type"] == self.NODE]
+        node_df = pd.DataFrame(
+            nodes,
+            columns=["key", "value"],
+        )
+        node_df["obj_type"] = node_df["value"].apply(
+            lambda record: self.get_obj_type(record)
+        )
+        node_df["relationships"] = node_df["value"].apply(
+            lambda record: self.get_relationships(record)
+        )
+        node_df["properties"] = node_df["value"].apply(
+            lambda record: self.get_properties(record)
+        )
+        node_df["property_data_types"] = node_df["properties"].apply(
+            lambda record: self.get_property_data_types(record)
+        )
+        node_df["description"] = node_df.apply(
+            lambda record: self.get_node_description(record, node_df), axis=1
+        )
+        return node_df
+
+    def process_relationships(self, data: list, node_df: pd.DataFrame) -> pd.DataFrame:
+        rels = [
+            record for record in data if record["value"]["type"] == self.RELATIONSHIP
+        ]
+        rel_df = pd.DataFrame(rels, columns=["key", "value"])
+        rel_df["obj_type"] = rel_df["value"].apply(
+            lambda record: self.get_obj_type(record)
+        )
+        rel_df["properties"] = rel_df["value"].apply(
+            lambda record: self.get_properties(record)
+        )
+        rel_df["property_data_types"] = rel_df["properties"].apply(
+            lambda record: self.get_property_data_types(record)
+        )
+        rel_df["description"] = rel_df.apply(
+            lambda record: self.get_rel_descriptions(record, node_df), axis=1
+        )
+        return rel_df
+
+    def get_obj_type(self, record: dict) -> str:
+        return record["type"]
+
+    def get_rel_descriptions(self, record: dict, df: pd.DataFrame) -> str:
+        descriptions = []
+        for _, row in df.iterrows():
+            relationships = row.get("relationships", {})
+            for relationship, props in relationships.items():
+                if record["key"] == relationship:
+                    if props["direction"] == "in":
+                        for prop in props["labels"]:
+                            descriptions.append(
+                                f"({row['key']})-[{record['key']}]->({prop})"
+                            )
+        return "\n".join(descriptions)
+
+    def get_node_description(self, record: dict, df: pd.DataFrame) -> str:
+        descriptions = []
+        for _, row in df.iterrows():
+            if record["key"] == row["key"]:
+                for relationship, props in row["relationships"].items():
+                    direction = props["direction"]
+                    for node in set(props["labels"]):
+                        if direction == "in":
+                            descriptions.append(
+                                f"({row['key']})<-[{relationship}]-({node})"
+                            )
+                        elif direction == "out":
+                            descriptions.append(
+                                f"({row['key']})-[{relationship}]->({node})"
+                            )
+
+        return "\n".join(descriptions)
+
+    def get_property_data_types(self, record: dict) -> List[dict]:
+        return [{k: v["type"]} for k, v in record.items()]
+
+    def get_properties(self, record: dict) -> str:
+        return record["properties"]
+
+    def get_relationships(self, record: dict) -> dict:
+        return record.get("relationships", None)
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        df = self.get_neo4j_metadata(
+            "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"
+        )
+        for index, row in df.iterrows():
+            try:
+                yield MetadataWorkUnit(
+                    id=row["key"],
+                    mcp=self.generate_neo4j_object(
+                        columns=row["property_data_types"],
+                        dataset=row["key"],
+                    ),
+                    is_primary_source=True,
+                )
+
+                yield MetadataWorkUnit(
+                    id=row["key"],
+                    mcp=MetadataChangeProposalWrapper(
+                        entityUrn=make_dataset_urn(
+                            platform=self.PLATFORM,
+                            name=row["key"],
+                            env=self.config.env,
+                        ),
+                        aspect=SubTypesClass(
+                            typeNames=[
+                                DatasetSubTypes.NEO4J_NODE
+                                if row["obj_type"] == self.NODE
+                                else DatasetSubTypes.NEO4J_RELATIONSHIP
+                            ]
+                        ),
+                    ),
+                )
+
+                yield MetadataWorkUnit(
+                    id=row["key"],
+                    mcp=self.add_properties(
+                        dataset=row["key"],
+                        custom_properties=None,
+                        description=row["description"],
+                    ),
+                )
+
+            except Exception as e:
+                raise e
+
+    def get_report(self):
+        return self.report
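The new source's get_node_description and get_rel_descriptions helpers flatten apoc.meta.schema() relationship metadata into Cypher-style pattern strings that become each dataset's description. A standalone sketch of that flattening, using a hand-built record shaped the way the code above expects (the labels and relationship names are illustrative, not real APOC output):

# Sketch only: mirrors the description-building logic of the new Neo4j source.
sample_node = {
    "key": "Person",
    "relationships": {
        "ACTED_IN": {"direction": "out", "labels": ["Movie"]},
        "FOLLOWS": {"direction": "in", "labels": ["Person"]},
    },
}


def describe(node: dict) -> str:
    descriptions = []
    for relationship, props in node["relationships"].items():
        for other in sorted(set(props["labels"])):
            if props["direction"] == "out":
                descriptions.append(f"({node['key']})-[{relationship}]->({other})")
            elif props["direction"] == "in":
                descriptions.append(f"({node['key']})<-[{relationship}]-({other})")
    return "\n".join(descriptions)


print(describe(sample_node))
# (Person)-[ACTED_IN]->(Movie)
# (Person)<-[FOLLOWS]-(Person)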
datahub/ingestion/source/powerbi/__init__.py
CHANGED
@@ -1 +0,0 @@
-from datahub.ingestion.source.powerbi.powerbi import PowerBiDashboardSource
datahub/ingestion/source/powerbi/config.py
CHANGED
@@ -173,7 +173,7 @@ class SupportedDataPlatform(Enum):
         datahub_data_platform_name="redshift",
     )

-
+    DATABRICKS_SQL = DataPlatformPair(
         powerbi_data_platform_name="Databricks", datahub_data_platform_name="databricks"
     )

@@ -313,8 +313,8 @@ class PowerBiDashboardSourceConfig(
         " Note: This field works in conjunction with 'workspace_type_filter' and both must be considered when filtering workspaces.",
     )

-    # Dataset type mapping PowerBI support many type of data-sources. Here user
-    # DataSource
+    # Dataset type mapping PowerBI support many type of data-sources. Here user needs to define what type of PowerBI
+    # DataSource needs to be mapped to corresponding DataHub Platform DataSource. For example, PowerBI `Snowflake` is
     # mapped to DataHub `snowflake` PowerBI `PostgreSQL` is mapped to DataHub `postgres` and so on.
     dataset_type_mapping: Union[
         Dict[str, str], Dict[str, PlatformDetail]
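The restored comment above describes dataset_type_mapping, which maps PowerBI data-source kinds to DataHub platform names. A minimal sketch of such a mapping, following the examples given in the comment (entries are illustrative):

# Illustrative dataset_type_mapping: PowerBI data-source kind -> DataHub platform name.
dataset_type_mapping = {
    "Snowflake": "snowflake",
    "PostgreSQL": "postgres",
}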
datahub/ingestion/source/powerbi/m_query/data_classes.py
CHANGED
@@ -1,25 +1,18 @@
 import os
-from abc import ABC
 from dataclasses import dataclass
-from
+from enum import Enum
+from typing import Any, Dict, List, Optional

 from lark import Tree

-
-
-
-class AbstractIdentifierAccessor(ABC):  # To pass lint
-    pass
+from datahub.ingestion.source.powerbi.config import DataPlatformPair
+from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo

-
-# @dataclass
-# class ItemSelector:
-#     items: Dict[str, Any]
-#     next: Optional[AbstractIdentifierAccessor]
+TRACE_POWERBI_MQUERY_PARSER = os.getenv("DATAHUB_TRACE_POWERBI_MQUERY_PARSER", False)


 @dataclass
-class IdentifierAccessor
+class IdentifierAccessor:
     """
     statement
     public_order_date = Source{[Schema="public",Item="order_date"]}[Data]
@@ -30,13 +23,13 @@ class IdentifierAccessor(AbstractIdentifierAccessor):

     "[Schema="public",Item="order_date"]" is "items" in ItemSelector. Data of items varies as per DataSource

-    "public_order_date" is in "next" of ItemSelector. The "next" will be None if this identifier is leaf i.e
+    "public_order_date" is in "next" of ItemSelector. The "next" will be None if this identifier is leaf i.e., table

     """

     identifier: str
     items: Dict[str, Any]
-    next: Optional[
+    next: Optional["IdentifierAccessor"]


 @dataclass
@@ -53,3 +46,31 @@ class ReferencedTable:
     database: str
     schema: str
     table: str
+
+
+@dataclass
+class DataPlatformTable:
+    data_platform_pair: DataPlatformPair
+    urn: str
+
+
+@dataclass
+class Lineage:
+    upstreams: List[DataPlatformTable]
+    column_lineage: List[ColumnLineageInfo]
+
+    @staticmethod
+    def empty() -> "Lineage":
+        return Lineage(upstreams=[], column_lineage=[])
+
+
+class FunctionName(Enum):
+    NATIVE_QUERY = "Value.NativeQuery"
+    POSTGRESQL_DATA_ACCESS = "PostgreSQL.Database"
+    ORACLE_DATA_ACCESS = "Oracle.Database"
+    SNOWFLAKE_DATA_ACCESS = "Snowflake.Databases"
+    MSSQL_DATA_ACCESS = "Sql.Database"
+    DATABRICK_DATA_ACCESS = "Databricks.Catalogs"
+    GOOGLE_BIGQUERY_DATA_ACCESS = "GoogleBigQuery.Database"
+    AMAZON_REDSHIFT_DATA_ACCESS = "AmazonRedshift.Database"
+    DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs"
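The newly added Lineage and DataPlatformTable data classes give the M-Query resolver a single return type carrying both upstream tables and column lineage. A minimal usage sketch based on the definitions above (the urn value is a made-up example):

from datahub.ingestion.source.powerbi.config import DataPlatformPair
from datahub.ingestion.source.powerbi.m_query.data_classes import (
    DataPlatformTable,
    Lineage,
)

# No resolvable upstreams: the resolver can return an empty container.
no_lineage = Lineage.empty()

# One upstream table wrapped in the new container; the urn below is illustrative.
lineage = Lineage(
    upstreams=[
        DataPlatformTable(
            data_platform_pair=DataPlatformPair(
                powerbi_data_platform_name="Snowflake",
                datahub_data_platform_name="snowflake",
            ),
            urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)",
        )
    ],
    column_lineage=[],
)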
datahub/ingestion/source/powerbi/m_query/parser.py
CHANGED
@@ -7,6 +7,7 @@ from typing import Dict, List
 import lark
 from lark import Lark, Tree

+import datahub.ingestion.source.powerbi.m_query.data_classes
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.source.powerbi.config import (
     PowerBiDashboardSourceConfig,
@@ -65,7 +66,7 @@ def get_upstream_tables(
     ctx: PipelineContext,
     config: PowerBiDashboardSourceConfig,
     parameters: Dict[str, str] = {},
-) -> List[
+) -> List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage]:
     if table.expression is None:
         logger.debug(f"There is no M-Query expression in table {table.full_name}")
         return []
@@ -127,12 +128,14 @@ def get_upstream_tables(
     reporter.m_query_parse_successes += 1

     try:
-        lineage: List[
+        lineage: List[
+            datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
+        ] = resolver.MQueryResolver(
             table=table,
             parse_tree=parse_tree,
             reporter=reporter,
             parameters=parameters,
-        ).
+        ).resolve_to_lineage(
             ctx=ctx,
             config=config,
             platform_instance_resolver=platform_instance_resolver,