acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.0.0.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Note: this version of acryl-datahub has been flagged as a potentially problematic release.
Files changed (106)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/METADATA +2391 -2392
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/RECORD +105 -88
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/entry_points.txt +2 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +1 -28
  7. datahub/cli/specific/dataset_cli.py +26 -10
  8. datahub/emitter/mce_builder.py +1 -3
  9. datahub/emitter/mcp_builder.py +8 -0
  10. datahub/emitter/request_helper.py +19 -14
  11. datahub/emitter/response_helper.py +25 -18
  12. datahub/emitter/rest_emitter.py +23 -7
  13. datahub/errors.py +8 -0
  14. datahub/ingestion/api/source.py +7 -2
  15. datahub/ingestion/api/source_helpers.py +14 -2
  16. datahub/ingestion/extractor/schema_util.py +1 -0
  17. datahub/ingestion/graph/client.py +26 -20
  18. datahub/ingestion/graph/filters.py +62 -17
  19. datahub/ingestion/sink/datahub_rest.py +2 -2
  20. datahub/ingestion/source/cassandra/cassandra.py +1 -10
  21. datahub/ingestion/source/common/data_platforms.py +23 -0
  22. datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
  23. datahub/ingestion/source/common/subtypes.py +17 -1
  24. datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
  25. datahub/ingestion/source/dbt/dbt_common.py +6 -4
  26. datahub/ingestion/source/dbt/dbt_core.py +4 -6
  27. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  28. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  29. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  30. datahub/ingestion/source/dremio/dremio_source.py +96 -117
  31. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  32. datahub/ingestion/source/ge_data_profiler.py +11 -1
  33. datahub/ingestion/source/hex/__init__.py +0 -0
  34. datahub/ingestion/source/hex/api.py +394 -0
  35. datahub/ingestion/source/hex/constants.py +3 -0
  36. datahub/ingestion/source/hex/hex.py +167 -0
  37. datahub/ingestion/source/hex/mapper.py +372 -0
  38. datahub/ingestion/source/hex/model.py +68 -0
  39. datahub/ingestion/source/iceberg/iceberg.py +193 -140
  40. datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
  41. datahub/ingestion/source/mlflow.py +217 -8
  42. datahub/ingestion/source/mode.py +11 -1
  43. datahub/ingestion/source/openapi.py +69 -34
  44. datahub/ingestion/source/powerbi/config.py +31 -4
  45. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  46. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
  47. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  48. datahub/ingestion/source/powerbi/powerbi.py +41 -24
  49. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
  50. datahub/ingestion/source/redshift/lineage_v2.py +9 -1
  51. datahub/ingestion/source/redshift/query.py +1 -1
  52. datahub/ingestion/source/s3/source.py +11 -0
  53. datahub/ingestion/source/sigma/config.py +3 -4
  54. datahub/ingestion/source/sigma/sigma.py +10 -6
  55. datahub/ingestion/source/slack/slack.py +399 -82
  56. datahub/ingestion/source/snowflake/constants.py +1 -0
  57. datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
  58. datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
  59. datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
  60. datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
  61. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
  62. datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
  63. datahub/ingestion/source/sql/mssql/job_models.py +15 -1
  64. datahub/ingestion/source/sql/mssql/source.py +8 -4
  65. datahub/ingestion/source/sql/oracle.py +51 -4
  66. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  67. datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
  68. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
  69. datahub/ingestion/source/superset.py +291 -35
  70. datahub/ingestion/source/usage/usage_common.py +0 -65
  71. datahub/ingestion/source/vertexai/__init__.py +0 -0
  72. datahub/ingestion/source/vertexai/vertexai.py +1055 -0
  73. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  74. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
  75. datahub/metadata/_schema_classes.py +472 -1
  76. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  77. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  78. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  79. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  80. datahub/metadata/schema.avsc +313 -2
  81. datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
  82. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  83. datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
  84. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  85. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  86. datahub/metadata/schemas/Deprecation.avsc +2 -0
  87. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  88. datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
  89. datahub/metadata/schemas/QueryProperties.avsc +20 -0
  90. datahub/metadata/schemas/Siblings.avsc +2 -0
  91. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  92. datahub/sdk/__init__.py +1 -0
  93. datahub/sdk/dataset.py +122 -0
  94. datahub/sdk/entity.py +99 -3
  95. datahub/sdk/entity_client.py +27 -3
  96. datahub/sdk/main_client.py +24 -1
  97. datahub/sdk/search_client.py +81 -8
  98. datahub/sdk/search_filters.py +94 -37
  99. datahub/sql_parsing/split_statements.py +17 -3
  100. datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
  101. datahub/sql_parsing/tool_meta_extractor.py +27 -2
  102. datahub/testing/mcp_diff.py +1 -18
  103. datahub/utilities/threaded_iterator_executor.py +16 -3
  104. datahub/ingestion/source/vertexai.py +0 -697
  105. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info/licenses}/LICENSE +0 -0
  106. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/top_level.txt +0 -0
--- a/datahub/ingestion/source/mlflow.py
+++ b/datahub/ingestion/source/mlflow.py
@@ -1,9 +1,11 @@
+import json
+import os
 import time
 from dataclasses import dataclass
-from typing import Any, Callable, Iterable, List, Optional, TypeVar, Union
+from typing import Any, Callable, Iterable, List, Optional, Tuple, TypeVar, Union

 from mlflow import MlflowClient
-from mlflow.entities import Experiment, Run
+from mlflow.entities import Dataset as MlflowDataset, Experiment, Run
 from mlflow.entities.model_registry import ModelVersion, RegisteredModel
 from mlflow.store.entities import PagedList
 from pydantic.fields import Field
@@ -29,6 +31,7 @@ from datahub.ingestion.api.source import (
     SourceReport,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.data_platforms import KNOWN_VALID_PLATFORM_NAMES
 from datahub.ingestion.source.common.subtypes import MLAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
@@ -42,6 +45,7 @@ from datahub.metadata.schema_classes import (
     AuditStampClass,
     ContainerClass,
     DataPlatformInstanceClass,
+    DataProcessInstanceInputClass,
     DataProcessInstanceOutputClass,
     DataProcessInstancePropertiesClass,
     DataProcessInstanceRunEventClass,
@@ -60,16 +64,15 @@ from datahub.metadata.schema_classes import (
     TagAssociationClass,
     TagPropertiesClass,
     TimeStampClass,
+    UpstreamClass,
+    UpstreamLineageClass,
     VersionPropertiesClass,
     VersionTagClass,
     _Aspect,
 )
-from datahub.metadata.urns import (
-    DataPlatformUrn,
-    MlModelUrn,
-    VersionSetUrn,
-)
+from datahub.metadata.urns import DataPlatformUrn, DatasetUrn, MlModelUrn, VersionSetUrn
 from datahub.sdk.container import Container
+from datahub.sdk.dataset import Dataset

 T = TypeVar("T")

@@ -105,6 +108,20 @@ class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
             " If neither is set, external URLs are not generated."
         ),
     )
+    materialize_dataset_inputs: Optional[bool] = Field(
+        default=False,
+        description="Whether to materialize dataset inputs for each run",
+    )
+    source_mapping_to_platform: Optional[dict] = Field(
+        default=None, description="Mapping of source type to datahub platform"
+    )
+
+    username: Optional[str] = Field(
+        default=None, description="Username for MLflow authentication"
+    )
+    password: Optional[str] = Field(
+        default=None, description="Password for MLflow authentication"
+    )


 @dataclass
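The new MLflowConfig fields above can be exercised directly; below is a minimal sketch that parses them the same way MLflowSource.create does via MLflowConfig.parse_obj. The tracking URI, credentials, and source-type mapping are hypothetical placeholders, not values from this release.

# Minimal sketch of the new MLflowConfig options; all values are placeholders.
from datahub.ingestion.source.mlflow import MLflowConfig

config = MLflowConfig.parse_obj(
    {
        "tracking_uri": "http://localhost:5000",
        "username": "mlflow-user",      # exported as MLFLOW_TRACKING_USERNAME
        "password": "mlflow-password",  # exported as MLFLOW_TRACKING_PASSWORD
        "materialize_dataset_inputs": True,
        # Point MLflow dataset source types without a built-in mapping at a DataHub platform.
        "source_mapping_to_platform": {"pyspark": "databricks", "http": "s3"},
    }
)
print(config.materialize_dataset_inputs, config.source_mapping_to_platform)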
@@ -152,7 +169,17 @@ class MLflowSource(StatefulIngestionSourceBase):
         self.ctx = ctx
         self.config = config
         self.report = StaleEntityRemovalSourceReport()
-        self.client = MlflowClient(
+        self.client = self._configure_client()
+
+    def _configure_client(self) -> MlflowClient:
+        if bool(self.config.username) != bool(self.config.password):
+            raise ValueError("Both username and password must be set together")
+
+        if self.config.username and self.config.password:
+            os.environ["MLFLOW_TRACKING_USERNAME"] = self.config.username
+            os.environ["MLFLOW_TRACKING_PASSWORD"] = self.config.password
+
+        return MlflowClient(
             tracking_uri=self.config.tracking_uri,
             registry_uri=self.config.registry_uri,
         )
@@ -213,6 +240,7 @@ class MLflowSource(StatefulIngestionSourceBase):
         if runs:
             for run in runs:
                 yield from self._get_run_workunits(experiment, run)
+                yield from self._get_dataset_input_workunits(run)

     def _get_experiment_custom_properties(self, experiment):
         experiment_custom_props = getattr(experiment, "tags", {}) or {}
@@ -262,6 +290,183 @@ class MLflowSource(StatefulIngestionSourceBase):
             type="SKIPPED", nativeResultType=self.platform
         )

+    def _get_dataset_schema(
+        self, dataset: MlflowDataset
+    ) -> Optional[List[Tuple[str, str]]]:
+        try:
+            schema_dict = json.loads(dataset.schema)
+        except json.JSONDecodeError:
+            self.report.warning(
+                title="Failed to load dataset schema",
+                message="Schema metadata will be missing due to a JSON parsing error.",
+                context=f"Dataset: {dataset.name}, Schema: {dataset.schema}",
+            )
+            return None
+
+        if "mlflow_colspec" in schema_dict:
+            try:
+                return [
+                    (field["name"], field["type"])
+                    for field in schema_dict["mlflow_colspec"]
+                ]
+            except (KeyError, TypeError):
+                return None
+        # If the schema is not formatted, return None
+        return None
+
+    def _get_external_dataset_urn(self, platform: str, dataset_name: str) -> str:
+        """
+        Get the URN for an external dataset.
+        Args:
+            platform: The platform of the external dataset (e.g., 's3', 'bigquery')
+            dataset: The MLflow dataset
+        Returns:
+            str: The URN of the external dataset
+        """
+        return str(DatasetUrn(platform=platform, name=dataset_name))
+
+    def _get_dataset_input_workunits(self, run: Run) -> Iterable[MetadataWorkUnit]:
+        """
+        Generate workunits for dataset inputs in a run.
+
+        For each dataset input:
+        1. If source type is 'local' or 'code':
+            - Create a local dataset reference
+        2. Otherwise:
+            - If materialization is enabled:
+                - Create a hosted dataset and a dataset reference with upstream
+            - If materialization is not enabled:
+                - Create a dataset reference and add upstream if dataset exists
+        3. Add all dataset references as upstreams for the run
+        """
+        run_urn = DataProcessInstance(
+            id=run.info.run_id,
+            orchestrator=self.platform,
+        ).urn
+
+        dataset_reference_urns = []
+
+        for dataset_input in run.inputs.dataset_inputs:
+            dataset = dataset_input.dataset
+            source_type = dataset.source_type
+            dataset_tags = {k[1]: v[1] for k, v in dataset_input.tags}
+
+            # Prepare dataset properties
+            custom_properties = dataset_tags
+            formatted_schema = self._get_dataset_schema(dataset)
+            if formatted_schema is None:
+                custom_properties["schema"] = dataset.schema
+
+            # Handle local/code datasets
+            if source_type in ("local", "code"):
+                local_dataset = Dataset(
+                    platform=self.platform,
+                    name=dataset.name,
+                    schema=formatted_schema,
+                    custom_properties=custom_properties,
+                )
+                yield from local_dataset.as_workunits()
+                dataset_reference_urns.append(local_dataset.urn)
+                continue
+
+            # Handle hosted datasets
+            formatted_platform = self._get_dataset_platform_from_source_type(
+                source_type
+            )
+
+            # Validate platform if materialization is enabled
+            if self.config.materialize_dataset_inputs:
+                if not formatted_platform:
+                    self.report.failure(
+                        title="Unable to materialize dataset inputs",
+                        message=f"No mapping dataPlatform found for dataset input source type '{source_type}'",
+                        context=f"please add `materialize_dataset_inputs.source_mapping_to_platform` in config "
+                        f"(e.g. '{source_type}': 'snowflake')",
+                    )
+                    continue
+                # Create hosted dataset
+                hosted_dataset = Dataset(
+                    platform=formatted_platform,
+                    name=dataset.name,
+                    schema=formatted_schema,
+                    custom_properties=dataset_tags,
+                )
+                yield from hosted_dataset.as_workunits()

+            # Create dataset reference with upstream
+            hosted_dataset_reference = Dataset(
+                platform=self.platform,
+                name=dataset.name,
+                schema=formatted_schema,
+                custom_properties=dataset_tags,
+                upstreams=UpstreamLineageClass(
+                    upstreams=[
+                        UpstreamClass(
+                            self._get_external_dataset_urn(
+                                formatted_platform, dataset.name
+                            ),
+                            type="COPY",
+                        )
+                    ]
+                )
+                if formatted_platform
+                else None,
+            )
+            dataset_reference_urns.append(hosted_dataset_reference.urn)
+            yield from hosted_dataset_reference.as_workunits()
+
+        # Add dataset references as upstreams for the run
+        if dataset_reference_urns:
+            input_edges = [
+                EdgeClass(destinationUrn=str(dataset_ref_urn))
+                for dataset_ref_urn in dataset_reference_urns
+            ]
+            yield MetadataChangeProposalWrapper(
+                entityUrn=str(run_urn),
+                aspect=DataProcessInstanceInputClass(inputs=[], inputEdges=input_edges),
+            ).as_workunit()
+
+    def _get_dataset_platform_from_source_type(self, source_type: str) -> Optional[str]:
+        """
+        Map MLflow source type to DataHub platform.
+
+        Priority:
+        1. User-provided mapping in config
+        2. Internal mapping
+        3. Direct platform match from list of supported platforms
+        """
+        source_type = source_type.lower()
+
+        # User-provided mapping
+        platform = self._get_platform_from_user_mapping(source_type)
+        if platform:
+            return platform
+
+        # Internal mapping
+        if source_type == "gs":
+            return "gcs"
+
+        # Check direct platform match
+        if self._is_valid_platform(source_type):
+            return source_type
+
+        return None
+
+    def _get_platform_from_user_mapping(self, source_type: str) -> Optional[str]:
+        """
+        Get platform from user-provided mapping in config.
+        Returns None if mapping is invalid or platform is not supported.
+        """
+        source_mapping = self.config.source_mapping_to_platform
+        if not source_mapping:
+            return None
+
+        platform = source_mapping.get(source_type)
+        if not platform:
+            return None
+
+        return platform
+
     def _get_run_workunits(
         self, experiment: Experiment, run: Run
     ) -> Iterable[MetadataWorkUnit]:
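For context on _get_dataset_schema above: MLflow stores a dataset's schema as a JSON string that may carry an "mlflow_colspec" list of {name, type} entries, and the source converts that into (name, type) tuples. A standalone sketch of the same parsing, with a made-up schema payload:

# Standalone sketch of the "mlflow_colspec" parsing done by _get_dataset_schema;
# the schema payload below is a made-up example.
import json
from typing import List, Optional, Tuple


def parse_colspec(schema_json: str) -> Optional[List[Tuple[str, str]]]:
    try:
        schema_dict = json.loads(schema_json)
    except json.JSONDecodeError:
        return None  # the source then keeps the raw string as a custom property
    if "mlflow_colspec" in schema_dict:
        try:
            return [(f["name"], f["type"]) for f in schema_dict["mlflow_colspec"]]
        except (KeyError, TypeError):
            return None
    return None


example = json.dumps(
    {
        "mlflow_colspec": [
            {"type": "double", "name": "sepal_length"},
            {"type": "string", "name": "species"},
        ]
    }
)
assert parse_colspec(example) == [("sepal_length", "double"), ("species", "string")]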
@@ -659,6 +864,10 @@ class MLflowSource(StatefulIngestionSourceBase):
         )
         return wu

+    def _is_valid_platform(self, platform: Optional[str]) -> bool:
+        """Check if platform is registered as a source plugin"""
+        return platform in KNOWN_VALID_PLATFORM_NAMES
+
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "MLflowSource":
         config = MLflowConfig.parse_obj(config_dict)
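A hedged end-to-end sketch of running the updated MLflow source through a standard ingestion pipeline; the server addresses are placeholders, and it assumes the source is registered under the `mlflow` plugin name:

# Sketch only: wiring the MLflow source into a pipeline run with placeholder endpoints.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "mlflow",
            "config": {
                "tracking_uri": "http://localhost:5000",
                "materialize_dataset_inputs": False,
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()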
--- a/datahub/ingestion/source/mode.py
+++ b/datahub/ingestion/source/mode.py
@@ -33,6 +33,7 @@ from datahub.emitter.mcp_builder import (
     add_dataset_to_container,
     gen_containers,
 )
+from datahub.emitter.request_helper import make_curl_command
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -339,7 +340,8 @@ class ModeSource(StatefulIngestionSourceBase):

         # Test the connection
         try:
-            self._get_request_json(f"{self.config.connect_uri}/api/verify")
+            key_info = self._get_request_json(f"{self.config.connect_uri}/api/verify")
+            logger.debug(f"Auth info: {key_info}")
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Connect",
@@ -1485,12 +1487,17 @@

         @r.wraps
         def get_request():
+            curl_command = make_curl_command(self.session, "GET", url, "")
+            logger.debug(f"Issuing request; curl equivalent: {curl_command}")
+
             try:
                 response = self.session.get(
                     url, timeout=self.config.api_options.timeout
                 )
                 if response.status_code == 204:  # No content, don't parse json
                     return {}
+
+                response.raise_for_status()
                 return response.json()
             except HTTPError as http_error:
                 error_response = http_error.response
@@ -1501,6 +1508,9 @@
                         time.sleep(float(sleep_time))
                     raise HTTPError429 from None

+                logger.debug(
+                    f"Error response ({error_response.status_code}): {error_response.text}"
+                )
                 raise http_error

         return get_request()
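The mode.py changes above reuse make_curl_command from datahub.emitter.request_helper to log a copy-pasteable curl equivalent of each request before it is issued. A rough sketch of that debugging pattern outside the source, with a hypothetical URL and token:

# Sketch of the debug-logging pattern added to ModeSource; URL and token are placeholders.
import logging

import requests

from datahub.emitter.request_helper import make_curl_command

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

session = requests.Session()
session.headers.update({"Authorization": "Basic <token>"})

url = "https://app.mode.com/api/verify"
# Same call shape as in the diff: (session, method, url, payload).
curl_command = make_curl_command(session, "GET", url, "")
logger.debug(f"Issuing request; curl equivalent: {curl_command}")

response = session.get(url, timeout=40)
response.raise_for_status()  # mirrors the new raise_for_status() call in get_request()
print(response.json())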
--- a/datahub/ingestion/source/openapi.py
+++ b/datahub/ingestion/source/openapi.py
@@ -2,13 +2,14 @@ import logging
 import time
 import warnings
 from abc import ABC
-from typing import Dict, Iterable, Optional, Tuple
+from typing import Dict, Iterable, List, Optional, Tuple

 from pydantic import validator
 from pydantic.fields import Field

 from datahub.configuration.common import ConfigModel
 from datahub.emitter.mce_builder import make_tag_urn
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -20,6 +21,7 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.openapi_parser import (
     clean_url,
     compose_url_attr,
@@ -32,14 +34,13 @@ from datahub.ingestion.source.openapi_parser import (
     set_metadata,
     try_guessing,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
-from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
     AuditStampClass,
     DatasetPropertiesClass,
     GlobalTagsClass,
     InstitutionalMemoryClass,
     InstitutionalMemoryMetadataClass,
+    SubTypesClass,
     TagAssociationClass,
 )

@@ -222,8 +223,9 @@

     def init_dataset(
         self, endpoint_k: str, endpoint_dets: dict
-    ) -> Tuple[DatasetSnapshot, str]:
+    ) -> Tuple[str, str, List[MetadataWorkUnit]]:
         config = self.config
+        workunits = []

         dataset_name = endpoint_k[1:].replace("/", ".")

@@ -233,22 +235,27 @@
         else:
             dataset_name = "root"

-        dataset_snapshot = DatasetSnapshot(
-            urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{config.name}.{dataset_name},PROD)",
-            aspects=[],
-        )
+        dataset_urn = f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{config.name}.{dataset_name},PROD)"

-        # adding description
-        dataset_properties = DatasetPropertiesClass(
+        # Create dataset properties aspect
+        properties = DatasetPropertiesClass(
             description=endpoint_dets["description"], customProperties={}
         )
-        dataset_snapshot.aspects.append(dataset_properties)
+        wu = MetadataWorkUnit(
+            id=dataset_name,
+            mcp=MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=properties),
+        )
+        workunits.append(wu)

-        # adding tags
+        # Create tags aspect
         tags_str = [make_tag_urn(t) for t in endpoint_dets["tags"]]
         tags_tac = [TagAssociationClass(t) for t in tags_str]
         gtc = GlobalTagsClass(tags_tac)
-        dataset_snapshot.aspects.append(gtc)
+        wu = MetadataWorkUnit(
+            id=f"{dataset_name}-tags",
+            mcp=MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=gtc),
+        )
+        workunits.append(wu)

         # the link will appear in the "documentation"
         link_url = clean_url(config.url + self.url_basepath + endpoint_k)
@@ -260,17 +267,25 @@
             url=link_url, description=link_description, createStamp=creation
         )
         inst_memory = InstitutionalMemoryClass([link_metadata])
-        dataset_snapshot.aspects.append(inst_memory)
+        wu = MetadataWorkUnit(
+            id=f"{dataset_name}-docs",
+            mcp=MetadataChangeProposalWrapper(
+                entityUrn=dataset_urn, aspect=inst_memory
+            ),
+        )
+        workunits.append(wu)

-        return dataset_snapshot, dataset_name
+        # Create subtype aspect
+        sub_types = SubTypesClass(typeNames=[DatasetSubTypes.API_ENDPOINT])
+        wu = MetadataWorkUnit(
+            id=f"{dataset_name}-subtype",
+            mcp=MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=sub_types),
+        )
+        workunits.append(wu)

-    def build_wu(
-        self, dataset_snapshot: DatasetSnapshot, dataset_name: str
-    ) -> ApiWorkUnit:
-        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
-        return ApiWorkUnit(id=dataset_name, mce=mce)
+        return dataset_name, dataset_urn, workunits

-    def get_workunits_internal(self) -> Iterable[ApiWorkUnit]:
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         config = self.config

         sw_dict = self.config.get_swagger()
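The refactor above replaces the per-endpoint DatasetSnapshot/MetadataChangeEvent with one MetadataWorkUnit per aspect, each wrapping a MetadataChangeProposalWrapper. A minimal sketch of that per-aspect pattern outside the source (platform, dataset name, and description are made up):

# Minimal sketch of the MCP-per-aspect pattern the openapi source now emits;
# the URN and description below are made up.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.metadata.schema_classes import DatasetPropertiesClass

dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:OpenApi,test_api.users,PROD)"

properties = DatasetPropertiesClass(
    description="Users endpoint of a demo API", customProperties={}
)
wu = MetadataWorkUnit(
    id="users",
    mcp=MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=properties),
)
print(wu.id)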
@@ -294,16 +309,24 @@
             if endpoint_k in config.ignore_endpoints:
                 continue

-            dataset_snapshot, dataset_name = self.init_dataset(
+            # Initialize dataset and get common aspects
+            dataset_name, dataset_urn, workunits = self.init_dataset(
                 endpoint_k, endpoint_dets
             )
+            for wu in workunits:
+                yield wu

-            # adding dataset fields
+            # Handle schema metadata if available
             if "data" in endpoint_dets.keys():
                 # we are lucky! data is defined in the swagger for this endpoint
                 schema_metadata = set_metadata(dataset_name, endpoint_dets["data"])
-                dataset_snapshot.aspects.append(schema_metadata)
-                yield self.build_wu(dataset_snapshot, dataset_name)
+                wu = MetadataWorkUnit(
+                    id=f"{dataset_name}-schema",
+                    mcp=MetadataChangeProposalWrapper(
+                        entityUrn=dataset_urn, aspect=schema_metadata
+                    ),
+                )
+                yield wu
             elif endpoint_dets["method"] != "get":
                 self.report.report_warning(
                     title="Failed to Extract Endpoint Metadata",
@@ -338,9 +361,13 @@
                         context=f"Endpoint Type: {endpoint_k}, Name: {dataset_name}",
                     )
                     schema_metadata = set_metadata(dataset_name, fields2add)
-                    dataset_snapshot.aspects.append(schema_metadata)
-
-                    yield self.build_wu(dataset_snapshot, dataset_name)
+                    wu = MetadataWorkUnit(
+                        id=f"{dataset_name}-schema",
+                        mcp=MetadataChangeProposalWrapper(
+                            entityUrn=dataset_urn, aspect=schema_metadata
+                        ),
+                    )
+                    yield wu
                 else:
                     self.report_bad_responses(response.status_code, type=endpoint_k)
             else:
@@ -369,9 +396,13 @@
                         context=f"Endpoint Type: {endpoint_k}, Name: {dataset_name}",
                     )
                     schema_metadata = set_metadata(dataset_name, fields2add)
-                    dataset_snapshot.aspects.append(schema_metadata)
-
-                    yield self.build_wu(dataset_snapshot, dataset_name)
+                    wu = MetadataWorkUnit(
+                        id=f"{dataset_name}-schema",
+                        mcp=MetadataChangeProposalWrapper(
+                            entityUrn=dataset_urn, aspect=schema_metadata
+                        ),
+                    )
+                    yield wu
                 else:
                     self.report_bad_responses(response.status_code, type=endpoint_k)
             else:
@@ -400,9 +431,13 @@
                         context=f"Endpoint Type: {endpoint_k}, Name: {dataset_name}",
                     )
                     schema_metadata = set_metadata(dataset_name, fields2add)
-                    dataset_snapshot.aspects.append(schema_metadata)
-
-                    yield self.build_wu(dataset_snapshot, dataset_name)
+                    wu = MetadataWorkUnit(
+                        id=f"{dataset_name}-schema",
+                        mcp=MetadataChangeProposalWrapper(
+                            entityUrn=dataset_urn, aspect=schema_metadata
+                        ),
+                    )
+                    yield wu
                 else:
                     self.report_bad_responses(response.status_code, type=endpoint_k)

--- a/datahub/ingestion/source/powerbi/config.py
+++ b/datahub/ingestion/source/powerbi/config.py
@@ -11,6 +11,9 @@ import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
+from datahub.ingestion.api.incremental_lineage_helper import (
+    IncrementalLineageConfigMixin,
+)
 from datahub.ingestion.source.common.subtypes import BIAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
@@ -19,6 +22,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.utilities.global_warning_util import add_global_warning
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer

@@ -183,6 +187,11 @@ class SupportedDataPlatform(Enum):
         datahub_data_platform_name="databricks",
     )

+    MYSQL = DataPlatformPair(
+        powerbi_data_platform_name="MySQL",
+        datahub_data_platform_name="mysql",
+    )
+

 @dataclass
 class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
@@ -275,7 +284,7 @@


 class PowerBiDashboardSourceConfig(
-    StatefulIngestionConfigBase, DatasetSourceConfigMixin
+    StatefulIngestionConfigBase, DatasetSourceConfigMixin, IncrementalLineageConfigMixin
 ):
     platform_name: str = pydantic.Field(
         default=Constant.PLATFORM_NAME, hidden_from_docs=True
@@ -297,7 +306,15 @@
     # PowerBi workspace identifier
     workspace_id_pattern: AllowDenyPattern = pydantic.Field(
         default=AllowDenyPattern.allow_all(),
-        description="Regex patterns to filter PowerBI workspaces in ingestion."
+        description="Regex patterns to filter PowerBI workspaces in ingestion by ID."
+        " By default all IDs are allowed unless they are filtered by name using 'workspace_name_pattern'."
+        " Note: This field works in conjunction with 'workspace_type_filter' and both must be considered when filtering workspaces.",
+    )
+    # PowerBi workspace name
+    workspace_name_pattern: AllowDenyPattern = pydantic.Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns to filter PowerBI workspaces in ingestion by name."
+        " By default all names are allowed unless they are filtered by ID using 'workspace_id_pattern'."
         " Note: This field works in conjunction with 'workspace_type_filter' and both must be considered when filtering workspaces.",
     )

@@ -373,8 +390,9 @@
     )
     # Enable/Disable extracting dataset schema
     extract_dataset_schema: bool = pydantic.Field(
-        default=False,
-        description="Whether to ingest PBI Dataset Table columns and measures",
+        default=True,
+        description="Whether to ingest PBI Dataset Table columns and measures."
+        " Note: this setting must be `true` for schema extraction and column lineage to be enabled.",
     )
     # Enable/Disable extracting lineage information of PowerBI Dataset
     extract_lineage: bool = pydantic.Field(
@@ -510,6 +528,7 @@
             "native_query_parsing",
             "enable_advance_lineage_sql_construct",
             "extract_lineage",
+            "extract_dataset_schema",
         ]

         if (
@@ -575,3 +594,11 @@
         )

         return values
+
+    @root_validator(skip_on_failure=True)
+    def validate_extract_dataset_schema(cls, values: Dict) -> Dict:
+        if values.get("extract_dataset_schema") is False:
+            add_global_warning(
+                "Please use `extract_dataset_schema: true`, otherwise dataset schema extraction will be skipped."
+            )
+        return values
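The new workspace_name_pattern field above is a standard AllowDenyPattern, so workspace names are filtered with the usual allow/deny regex semantics. A quick sketch with made-up workspace names:

# Sketch of how an AllowDenyPattern such as workspace_name_pattern filters names;
# the workspace names are made up.
from datahub.configuration.common import AllowDenyPattern

workspace_name_pattern = AllowDenyPattern(
    allow=["^Finance.*"],
    deny=[".*Sandbox$"],
)

for name in ["Finance Reporting", "Finance Sandbox", "Marketing"]:
    print(name, workspace_name_pattern.allowed(name))
# Finance Reporting True, Finance Sandbox False, Marketing False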
--- a/datahub/ingestion/source/powerbi/m_query/data_classes.py
+++ b/datahub/ingestion/source/powerbi/m_query/data_classes.py
@@ -74,3 +74,4 @@ class FunctionName(Enum):
     GOOGLE_BIGQUERY_DATA_ACCESS = "GoogleBigQuery.Database"
     AMAZON_REDSHIFT_DATA_ACCESS = "AmazonRedshift.Database"
     DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs"
+    MYSQL_DATA_ACCESS = "MySQL.Database"