acryl-datahub 1.0.0rc6__py3-none-any.whl → 1.0.0rc8__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub may be problematic.

Files changed (74)
  1. {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/METADATA +2490 -2490
  2. {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/RECORD +74 -74
  3. {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/cli/docker_cli.py +1 -1
  6. datahub/cli/iceberg_cli.py +1 -1
  7. datahub/cli/lite_cli.py +4 -2
  8. datahub/cli/specific/dataproduct_cli.py +1 -1
  9. datahub/configuration/git.py +1 -3
  10. datahub/configuration/kafka.py +1 -1
  11. datahub/ingestion/fs/s3_fs.py +2 -2
  12. datahub/ingestion/glossary/classification_mixin.py +1 -1
  13. datahub/ingestion/graph/client.py +16 -7
  14. datahub/ingestion/graph/entity_versioning.py +3 -3
  15. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  16. datahub/ingestion/source/abs/config.py +2 -4
  17. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  18. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +1 -1
  19. datahub/ingestion/source/cassandra/cassandra_api.py +2 -1
  20. datahub/ingestion/source/csv_enricher.py +3 -3
  21. datahub/ingestion/source/dbt/dbt_common.py +1 -1
  22. datahub/ingestion/source/dremio/dremio_api.py +3 -3
  23. datahub/ingestion/source/dremio/dremio_aspects.py +2 -1
  24. datahub/ingestion/source/file.py +5 -2
  25. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  26. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  27. datahub/ingestion/source/ge_data_profiler.py +11 -14
  28. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  29. datahub/ingestion/source/iceberg/iceberg_common.py +31 -20
  30. datahub/ingestion/source/identity/okta.py +1 -3
  31. datahub/ingestion/source/kafka/kafka.py +1 -1
  32. datahub/ingestion/source/kafka_connect/source_connectors.py +4 -7
  33. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  34. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  35. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  36. datahub/ingestion/source/looker/lookml_source.py +3 -2
  37. datahub/ingestion/source/metabase.py +54 -32
  38. datahub/ingestion/source/metadata/lineage.py +2 -2
  39. datahub/ingestion/source/mode.py +1 -1
  40. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  41. datahub/ingestion/source/nifi.py +6 -3
  42. datahub/ingestion/source/openapi_parser.py +2 -2
  43. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  44. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  45. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  46. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  47. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  48. datahub/ingestion/source/pulsar.py +2 -2
  49. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  50. datahub/ingestion/source/redash.py +2 -1
  51. datahub/ingestion/source/s3/config.py +2 -4
  52. datahub/ingestion/source/s3/source.py +20 -41
  53. datahub/ingestion/source/salesforce.py +1 -1
  54. datahub/ingestion/source/schema_inference/object.py +1 -1
  55. datahub/ingestion/source/sigma/sigma.py +1 -1
  56. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  57. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  58. datahub/ingestion/source/sql/athena.py +2 -2
  59. datahub/ingestion/source/sql/druid.py +1 -5
  60. datahub/ingestion/source/sql/sql_common.py +2 -2
  61. datahub/ingestion/source/sql/sql_types.py +2 -2
  62. datahub/ingestion/source/sql/teradata.py +4 -2
  63. datahub/ingestion/source/sql/trino.py +2 -2
  64. datahub/ingestion/source/superset.py +65 -37
  65. datahub/ingestion/source/tableau/tableau.py +3 -6
  66. datahub/ingestion/source/tableau/tableau_common.py +2 -1
  67. datahub/lite/duckdb_lite.py +5 -10
  68. datahub/lite/lite_local.py +1 -1
  69. datahub/lite/lite_util.py +4 -3
  70. datahub/sdk/dataset.py +3 -3
  71. datahub/utilities/memory_footprint.py +3 -2
  72. {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/LICENSE +0 -0
  73. {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/entry_points.txt +0 -0
  74. {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/superset.py CHANGED
@@ -36,9 +36,6 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.sql.sql_types import resolve_sql_type
-from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
-    get_platform_from_sqlalchemy_uri,
-)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -71,7 +68,12 @@ from datahub.metadata.schema_classes import (
     ChartInfoClass,
     ChartTypeClass,
     DashboardInfoClass,
+    DatasetLineageTypeClass,
     DatasetPropertiesClass,
+    GlobalTagsClass,
+    TagAssociationClass,
+    UpstreamClass,
+    UpstreamLineageClass,
 )
 from datahub.utilities import config_clean
 from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -287,26 +289,6 @@ class SupersetSource(StatefulIngestionSourceBase):
 
             current_page += 1
 
-    @lru_cache(maxsize=None)
-    def get_platform_from_database_id(self, database_id):
-        database_response = self.session.get(
-            f"{self.config.connect_uri}/api/v1/database/{database_id}"
-        ).json()
-        sqlalchemy_uri = database_response.get("result", {}).get("sqlalchemy_uri")
-        if sqlalchemy_uri is None:
-            platform_name = database_response.get("result", {}).get(
-                "backend", "external"
-            )
-        else:
-            platform_name = get_platform_from_sqlalchemy_uri(sqlalchemy_uri)
-        if platform_name == "awsathena":
-            return "athena"
-        if platform_name == "clickhousedb":
-            return "clickhouse"
-        if platform_name == "postgresql":
-            return "postgres"
-        return platform_name
-
     @lru_cache(maxsize=None)
     def get_dataset_info(self, dataset_id: int) -> dict:
         dataset_response = self.session.get(
@@ -323,8 +305,6 @@ class SupersetSource(StatefulIngestionSourceBase):
         schema_name = dataset_response.get("result", {}).get("schema")
         table_name = dataset_response.get("result", {}).get("table_name")
         database_id = dataset_response.get("result", {}).get("database", {}).get("id")
-        platform = self.get_platform_from_database_id(database_id)
-
         database_name = (
             dataset_response.get("result", {}).get("database", {}).get("database_name")
         )
@@ -333,21 +313,24 @@ class SupersetSource(StatefulIngestionSourceBase):
         # Druid do not have a database concept and has a limited schema concept, but they are nonetheless reported
         # from superset. There is only one database per platform instance, and one schema named druid, so it would be
         # redundant to systemically store them both in the URN.
-        if platform in platform_without_databases:
+        if platform_instance in platform_without_databases:
             database_name = None
 
-        if platform == "druid" and schema_name == "druid":
+        if platform_instance == "druid" and schema_name == "druid":
             # Follow DataHub's druid source convention.
             schema_name = None
 
-        if database_id and table_name:
+        # If the information about the datasource is already contained in the dataset response,
+        # can just return the urn directly
+        if table_name and database_id:
             return make_dataset_urn(
-                platform=platform,
+                platform=platform_instance,
                 name=".".join(
                     name for name in [database_name, schema_name, table_name] if name
                 ),
                 env=self.config.env,
             )
+
         raise ValueError("Could not construct dataset URN")
 
     def construct_dashboard_from_api_data(
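For reference, make_dataset_urn (from datahub.emitter.mce_builder) produces the URN that the code above returns; a minimal sketch with hypothetical database/schema/table values, not taken from this release:

from datahub.emitter.mce_builder import make_dataset_urn

# Hypothetical values for illustration; the source assembles them from the
# Superset dataset API response.
urn = make_dataset_urn(
    platform="postgres",
    name=".".join(n for n in ["analytics_db", "public", "orders"] if n),
    env="PROD",
)
# -> urn:li:dataset:(urn:li:dataPlatform:postgres,analytics_db.public.orders,PROD)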
@@ -469,10 +452,16 @@ class SupersetSource(StatefulIngestionSourceBase):
         chart_url = f"{self.config.display_uri}{chart_data.get('url', '')}"
 
         datasource_id = chart_data.get("datasource_id")
-        dataset_response = self.get_dataset_info(datasource_id)
-        datasource_urn = self.get_datasource_urn_from_id(
-            dataset_response, self.platform
-        )
+        if not datasource_id:
+            logger.debug(
+                f"chart {chart_data['id']} has no datasource_id, skipping fetching dataset info"
+            )
+            datasource_urn = None
+        else:
+            dataset_response = self.get_dataset_info(datasource_id)
+            datasource_urn = self.get_datasource_urn_from_id(
+                dataset_response, self.platform
+            )
 
         params = json.loads(chart_data.get("params", "{}"))
         metrics = [
@@ -588,25 +577,61 @@ class SupersetSource(StatefulIngestionSourceBase):
     ) -> DatasetSnapshot:
         dataset_response = self.get_dataset_info(dataset_data.get("id"))
         dataset = SupersetDataset(**dataset_response["result"])
+
         datasource_urn = self.get_datasource_urn_from_id(
             dataset_response, self.platform
         )
+        dataset_url = f"{self.config.display_uri}{dataset_response.get('result', {}).get('url', '')}"
 
-        dataset_url = f"{self.config.display_uri}{dataset.explore_url or ''}"
+        upstream_warehouse_platform = (
+            dataset_response.get("result", {}).get("database", {}).get("backend")
+        )
+
+        # Preset has a way of naming their platforms differently than
+        # how datahub names them, so map the platform name to the correct naming
+        warehouse_naming = {
+            "awsathena": "athena",
+            "clickhousedb": "clickhouse",
+            "postgresql": "postgres",
+        }
+
+        if upstream_warehouse_platform in warehouse_naming:
+            upstream_warehouse_platform = warehouse_naming[upstream_warehouse_platform]
+
+        # TODO: Categorize physical vs virtual upstream dataset
+        # mark all upstream dataset as physical for now, in the future we would ideally like
+        # to differentiate physical vs virtual upstream datasets
+        tag_urn = f"urn:li:tag:{self.platform}:physical"
+        upstream_dataset = self.get_datasource_urn_from_id(
+            dataset_response, upstream_warehouse_platform
+        )
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(
+                    type=DatasetLineageTypeClass.TRANSFORMED,
+                    dataset=upstream_dataset,
+                    properties={"externalUrl": dataset_url},
+                )
+            ]
+        )
 
         dataset_info = DatasetPropertiesClass(
             name=dataset.table_name,
             description="",
-            lastModified=TimeStamp(time=dataset.modified_ts)
-            if dataset.modified_ts
-            else None,
+            lastModified=(
+                TimeStamp(time=dataset.modified_ts) if dataset.modified_ts else None
+            ),
             externalUrl=dataset_url,
         )
+        global_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
+
         aspects_items: List[Any] = []
         aspects_items.extend(
             [
                 self.gen_schema_metadata(dataset_response),
                 dataset_info,
+                upstream_lineage,
+                global_tags,
             ]
         )
 
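The lineage and tag aspects added above use the standard constructors from datahub.metadata.schema_classes; a minimal standalone sketch with a hypothetical upstream URN and tag, shown only to illustrate the shape of the objects:

from datahub.metadata.schema_classes import (
    DatasetLineageTypeClass,
    GlobalTagsClass,
    TagAssociationClass,
    UpstreamClass,
    UpstreamLineageClass,
)

# Hypothetical upstream warehouse table; the source resolves the real one from
# the "backend" reported by Superset, remapped via warehouse_naming above.
upstream_urn = "urn:li:dataset:(urn:li:dataPlatform:postgres,analytics_db.public.orders,PROD)"

upstream_lineage = UpstreamLineageClass(
    upstreams=[
        UpstreamClass(
            type=DatasetLineageTypeClass.TRANSFORMED,
            dataset=upstream_urn,
        )
    ]
)
global_tags = GlobalTagsClass(
    tags=[TagAssociationClass(tag="urn:li:tag:superset:physical")]
)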
@@ -614,6 +639,9 @@ class SupersetSource(StatefulIngestionSourceBase):
             urn=datasource_urn,
             aspects=aspects_items,
         )
+
+        logger.info(f"Constructed dataset {datasource_urn}")
+
         return dataset_snapshot
 
     def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
datahub/ingestion/source/tableau/tableau.py CHANGED
@@ -1562,8 +1562,9 @@ class TableauSiteSource:
         query: str,
         connection_type: str,
         page_size: int,
-        query_filter: dict = {},
+        query_filter: Optional[dict] = None,
     ) -> Iterable[dict]:
+        query_filter = query_filter or {}
         query_filter = optimize_query_filter(query_filter)
 
         # Calls the get_connection_object_page function to get the objects,
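This Optional-plus-fallback pattern recurs throughout the release (tableau_common, duckdb_lite, lite_local, lite_util, and memory_footprint below); it avoids Python's shared mutable default arguments. A minimal standalone sketch with hypothetical helper functions:

from typing import List, Optional

def append_shared(item: int, acc: List[int] = []) -> List[int]:
    # The [] default is created once at function definition time, so every
    # call without an explicit acc mutates the same list.
    acc.append(item)
    return acc

def append_fresh(item: int, acc: Optional[List[int]] = None) -> List[int]:
    acc = acc or []  # new list per call when none is supplied
    acc.append(item)
    return acc

print(append_shared(1), append_shared(2))  # [1, 2] [1, 2] -- same object twice
print(append_fresh(1), append_fresh(2))    # [1] [2]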
@@ -1910,11 +1911,7 @@ class TableauSiteSource:
                 if upstream_col.get(c.TABLE)
                 else None
             )
-            if (
-                name
-                and upstream_table_id
-                and upstream_table_id in table_id_to_urn.keys()
-            ):
+            if name and upstream_table_id and upstream_table_id in table_id_to_urn:
                 parent_dataset_urn = table_id_to_urn[upstream_table_id]
                 if (
                     self.is_snowflake_urn(parent_dataset_urn)
datahub/ingestion/source/tableau/tableau_common.py CHANGED
@@ -514,7 +514,8 @@ FIELD_TYPE_MAPPING = {
 }
 
 
-def get_tags_from_params(params: List[str] = []) -> GlobalTagsClass:
+def get_tags_from_params(params: Optional[List[str]] = None) -> GlobalTagsClass:
+    params = params or []
     tags = [
         TagAssociationClass(tag=builder.make_tag_urn(tag.upper()))
         for tag in params
datahub/lite/duckdb_lite.py CHANGED
@@ -284,9 +284,10 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
         self,
         query: str,
         flavor: SearchFlavor,
-        aspects: List[str] = [],
+        aspects: Optional[List[str]] = None,
         snippet: bool = True,
     ) -> Iterable[Searchable]:
+        aspects = aspects or []
         if flavor == SearchFlavor.FREE_TEXT:
             base_query = f"SELECT distinct(urn), 'urn', NULL from metadata_aspect_v2 where urn ILIKE '%{query}%' UNION SELECT urn, aspect_name, metadata from metadata_aspect_v2 where metadata->>'$.name' ILIKE '%{query}%'"
             for r in self.duckdb_client.execute(base_query).fetchall():
@@ -759,15 +760,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
                 entity_id=[str(data_platform_urn), data_platform_instance],
             )
             self._create_edges_from_data_platform_instance(data_platform_instance_urn)
-        elif isinstance(aspect, ChartInfoClass):
-            urn = Urn.from_string(entity_urn)
-            self.add_edge(
-                entity_urn,
-                "name",
-                aspect.title + f" ({urn.get_entity_id()[-1]})",
-                remove_existing=True,
-            )
-        elif isinstance(aspect, DashboardInfoClass):
+        elif isinstance(aspect, ChartInfoClass) or isinstance(
+            aspect, DashboardInfoClass
+        ):
             urn = Urn.from_string(entity_urn)
             self.add_edge(
                 entity_urn,
datahub/lite/lite_local.py CHANGED
@@ -90,7 +90,7 @@ class DataHubLiteLocal(Generic[LiteConfig], Closeable, metaclass=ABCMeta):
         self,
         query: str,
         flavor: SearchFlavor,
-        aspects: List[str] = [],
+        aspects: Optional[List[str]] = None,
         snippet: bool = True,
     ) -> Iterable[Searchable]:
         pass
datahub/lite/lite_util.py CHANGED
@@ -70,9 +70,10 @@ class DataHubLiteWrapper(DataHubLiteLocal):
         self,
         query: str,
         flavor: SearchFlavor,
-        aspects: List[str] = [],
+        aspects: Optional[List[str]] = None,
         snippet: bool = True,
     ) -> Iterable[Searchable]:
+        aspects = aspects or []
         yield from self.lite.search(query, flavor, aspects, snippet)
 
     def ls(self, path: str) -> List[Browseable]:
@@ -96,10 +97,10 @@ def get_datahub_lite(config_dict: dict, read_only: bool = False) -> "DataHubLite
     lite_type = lite_local_config.type
     try:
         lite_class = lite_registry.get(lite_type)
-    except KeyError:
+    except KeyError as e:
         raise Exception(
             f"Failed to find a registered lite implementation for {lite_type}. Valid values are {[k for k in lite_registry.mapping.keys()]}"
-        )
+        ) from e
 
     lite_specific_config = lite_class.get_config_class().parse_obj(
         lite_local_config.config
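The `from e` added above chains the original KeyError as the new exception's __cause__, so the failed lookup stays visible in the traceback; a minimal sketch with a hypothetical registry:

registry = {"duckdb": object()}

def get_lite_impl(name: str):
    try:
        return registry[name]
    except KeyError as e:
        # "from e" records the KeyError as the direct cause instead of the
        # implicit "During handling of the above exception" context.
        raise Exception(f"No lite implementation registered for {name!r}") from e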
datahub/sdk/dataset.py CHANGED
@@ -72,9 +72,9 @@ UpstreamLineageInputType: TypeAlias = Union[
 def _parse_upstream_input(
     upstream_input: UpstreamInputType,
 ) -> Union[models.UpstreamClass, models.FineGrainedLineageClass]:
-    if isinstance(upstream_input, models.UpstreamClass):
-        return upstream_input
-    elif isinstance(upstream_input, models.FineGrainedLineageClass):
+    if isinstance(upstream_input, models.UpstreamClass) or isinstance(
+        upstream_input, models.FineGrainedLineageClass
+    ):
         return upstream_input
     elif isinstance(upstream_input, (str, DatasetUrn)):
         return models.UpstreamClass(
datahub/utilities/memory_footprint.py CHANGED
@@ -1,10 +1,10 @@
 from collections import deque
 from itertools import chain
 from sys import getsizeof
-from typing import Any, Iterator
+from typing import Any, Iterator, Optional
 
 
-def total_size(o: Any, handlers: Any = {}) -> int:
+def total_size(o: Any, handlers: Optional[Any] = None) -> int:
     """Returns the approximate memory footprint an object and all of its contents.
     Automatically finds the contents of the following builtin containers and
     their subclasses: tuple, list, deque, dict, set and frozenset.
@@ -14,6 +14,7 @@ def total_size(o: Any, handlers: Any = {}) -> int:
 
     Based on https://github.com/ActiveState/recipe-577504-compute-mem-footprint/blob/master/recipe.py
     """
+    handlers = handlers or {}
 
     def dict_handler(d: dict) -> Iterator[Any]:
         return chain.from_iterable(d.items())
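For reference, total_size follows builtin containers on its own and, per the recipe it is based on, accepts a handlers mapping from a type to a function that yields an object's contents; a small usage sketch (the Node class and its handler are hypothetical):

from datahub.utilities.memory_footprint import total_size

class Node:
    def __init__(self, children):
        self.children = children

tree = Node([Node([]), Node([])])

print(total_size({"a": [1, 2, 3]}))  # builtin containers handled automatically
print(total_size(tree, handlers={Node: lambda n: n.children}))  # follow children too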