acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2236 -2240
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
- datahub/__init__.py +1 -1
- datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
- datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
- datahub/configuration/common.py +2 -5
- datahub/configuration/source_common.py +13 -0
- datahub/emitter/mce_builder.py +20 -4
- datahub/emitter/mcp_builder.py +2 -7
- datahub/emitter/mcp_patch_builder.py +37 -13
- datahub/emitter/rest_emitter.py +25 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
- datahub/ingestion/api/closeable.py +3 -3
- datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
- datahub/ingestion/api/report.py +4 -1
- datahub/ingestion/api/sink.py +4 -3
- datahub/ingestion/api/source.py +4 -0
- datahub/ingestion/api/source_helpers.py +2 -6
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/graph/client.py +6 -3
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
- datahub/ingestion/source/aws/aws_common.py +231 -27
- datahub/ingestion/source/aws/glue.py +12 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
- datahub/ingestion/source/datahub/config.py +22 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
- datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
- datahub/ingestion/source/gc/datahub_gc.py +21 -5
- datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
- datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
- datahub/ingestion/source/iceberg/iceberg.py +27 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
- datahub/ingestion/source/kafka_connect/__init__.py +0 -0
- datahub/ingestion/source/kafka_connect/common.py +202 -0
- datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
- datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
- datahub/ingestion/source/looker/looker_common.py +63 -2
- datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
- datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
- datahub/ingestion/source/looker/looker_source.py +31 -4
- datahub/ingestion/source/looker/looker_usage.py +23 -17
- datahub/ingestion/source/mlflow.py +30 -5
- datahub/ingestion/source/mode.py +40 -27
- datahub/ingestion/source/powerbi/config.py +1 -14
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
- datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
- datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
- datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
- datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
- datahub/ingestion/source/sql/hive.py +621 -8
- datahub/ingestion/source/sql/hive_metastore.py +7 -0
- datahub/ingestion/source/sql/mssql/job_models.py +30 -1
- datahub/ingestion/source/sql/mssql/source.py +15 -1
- datahub/ingestion/source/sql/sql_common.py +41 -102
- datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
- datahub/ingestion/source/sql/sql_report.py +2 -0
- datahub/ingestion/source/state/checkpoint.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +122 -45
- datahub/ingestion/source/tableau/tableau_common.py +18 -0
- datahub/ingestion/source/tableau/tableau_constant.py +3 -1
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/proxy.py +8 -27
- datahub/ingestion/source/usage/usage_common.py +15 -1
- datahub/ingestion/source_report/ingestion_stage.py +3 -0
- datahub/metadata/_schema_classes.py +256 -3
- datahub/metadata/_urns/urn_defs.py +168 -168
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
- datahub/metadata/schema.avsc +252 -33
- datahub/metadata/schemas/DataJobKey.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
- datahub/metadata/schemas/MLModelProperties.avsc +62 -2
- datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
- datahub/specific/aspect_helpers/__init__.py +0 -0
- datahub/specific/aspect_helpers/custom_properties.py +79 -0
- datahub/specific/aspect_helpers/ownership.py +67 -0
- datahub/specific/aspect_helpers/structured_properties.py +72 -0
- datahub/specific/aspect_helpers/tags.py +42 -0
- datahub/specific/aspect_helpers/terms.py +43 -0
- datahub/specific/chart.py +28 -184
- datahub/specific/dashboard.py +31 -196
- datahub/specific/datajob.py +34 -189
- datahub/specific/dataproduct.py +24 -86
- datahub/specific/dataset.py +48 -133
- datahub/specific/form.py +12 -32
- datahub/specific/structured_property.py +9 -9
- datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
- datahub/sql_parsing/sqlglot_lineage.py +15 -5
- datahub/sql_parsing/tool_meta_extractor.py +119 -5
- datahub/utilities/time.py +8 -3
- datahub/utilities/urns/_urn_base.py +5 -7
- datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
- datahub/specific/custom_properties.py +0 -37
- datahub/specific/ownership.py +0 -48
- datahub/specific/structured_properties.py +0 -53
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/looker/looker_dataclasses.py CHANGED

@@ -186,16 +186,16 @@ class LookerModel:
                        f"traversal_path={traversal_path}, included_files = {included_files}, seen_so_far: {seen_so_far}"
                    )
                    if "*" not in inc and not included_files:
-                        reporter.
+                        reporter.warning(
                            title="Error Resolving Include",
-                            message=
-                            context=f"
+                            message="Cannot resolve included file",
+                            context=f"Include: {inc}, path: {path}, traversal_path: {traversal_path}",
                        )
                    elif not included_files:
-                        reporter.
+                        reporter.warning(
                            title="Error Resolving Include",
-                            message=
-                            context=f"
+                            message="Did not find anything matching the wildcard include",
+                            context=f"Include: {inc}, path: {path}, traversal_path: {traversal_path}",
                        )
                    # only load files that we haven't seen so far
                    included_files = [x for x in included_files if x not in seen_so_far]
@@ -231,9 +231,7 @@ class LookerModel:
                            source_config,
                            reporter,
                            seen_so_far,
-                            traversal_path=traversal_path
-                            + "."
-                            + pathlib.Path(included_file).stem,
+                            traversal_path=f"{traversal_path} -> {pathlib.Path(included_file).stem}",
                        )
                    )
                except Exception as e:
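For readers tracing the change above: the new formatting turns the dotted include trail into an arrow-separated breadcrumb. A standalone sketch, with hypothetical values, of the before/after behavior:

```python
import pathlib

# Hypothetical values for illustration only.
traversal_path = "model_a -> view_b"
included_file = "imports/view_c.lkml"

old_style = traversal_path + "." + pathlib.Path(included_file).stem
new_style = f"{traversal_path} -> {pathlib.Path(included_file).stem}"

print(old_style)  # model_a -> view_b.view_c
print(new_style)  # model_a -> view_b -> view_c
```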
datahub/ingestion/source/looker/looker_lib_wrapper.py CHANGED

@@ -68,6 +68,7 @@ class LookerAPIStats(BaseModel):
     get_look_calls: int = 0
     search_looks_calls: int = 0
     search_dashboards_calls: int = 0
+    all_user_calls: int = 0


 class LookerAPI:
@@ -135,7 +136,7 @@ class LookerAPI:

         return permissions

-    @lru_cache(maxsize=
+    @lru_cache(maxsize=5000)
     def get_user(self, id_: str, user_fields: str) -> Optional[User]:
         self.client_stats.user_calls += 1
         try:
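The decorator change above bounds the per-user memoization. A reminder of how `functools.lru_cache` behaves when applied to a method: the cache key includes `self` along with the arguments, and the cache holds a reference to `self` for the life of the process. A minimal sketch (a toy stand-in, not the real LookerAPI class):

```python
from functools import lru_cache


class UserClientSketch:
    """Toy stand-in showing the bounded method-cache pattern."""

    @lru_cache(maxsize=5000)  # at most 5000 distinct (self, id_, user_fields) entries
    def get_user(self, id_: str, user_fields: str) -> dict:
        print(f"cache miss for user {id_}")
        return {"id": id_, "fields": user_fields}


client = UserClientSketch()
client.get_user("1", "id,email")  # miss: prints and computes
client.get_user("1", "id,email")  # hit: served from the cache
print(client.get_user.cache_info())  # CacheInfo(hits=1, misses=1, ...)
```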
@@ -154,6 +155,17 @@ class LookerAPI:
             # User not found
             return None

+    def all_users(self, user_fields: str) -> Sequence[User]:
+        self.client_stats.all_user_calls += 1
+        try:
+            return self.client.all_users(
+                fields=cast(str, user_fields),
+                transport_options=self.transport_options,
+            )
+        except SDKError as e:
+            logger.warning(f"Failure was {e}")
+            return []
+
     def execute_query(self, write_query: WriteQuery) -> List[Dict]:
         logger.debug(f"Executing query {write_query}")
         self.client_stats.query_calls += 1
datahub/ingestion/source/looker/looker_source.py CHANGED

@@ -68,6 +68,7 @@ from datahub.ingestion.source.looker.looker_common import (
     ViewField,
     ViewFieldType,
     gen_model_key,
+    get_urn_looker_element_id,
 )
 from datahub.ingestion.source.looker.looker_config import LookerDashboardSourceConfig
 from datahub.ingestion.source.looker.looker_lib_wrapper import LookerAPI
@@ -145,7 +146,9 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         self.source_config: LookerDashboardSourceConfig = config
         self.reporter: LookerDashboardSourceReport = LookerDashboardSourceReport()
         self.looker_api: LookerAPI = LookerAPI(self.source_config)
-        self.user_registry: LookerUserRegistry = LookerUserRegistry(
+        self.user_registry: LookerUserRegistry = LookerUserRegistry(
+            self.looker_api, self.reporter
+        )
         self.explore_registry: LookerExploreRegistry = LookerExploreRegistry(
             self.looker_api, self.reporter, self.source_config
         )
@@ -163,6 +166,9 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         # Required, as we do not ingest all folders but only those that have dashboards/looks
         self.processed_folders: List[str] = []

+        # Keep track of ingested chart urns, to omit usage for non-ingested entities
+        self.chart_urns: Set[str] = set()
+
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
         test_report = TestConnectionReport()
@@ -640,6 +646,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         chart_urn = self._make_chart_urn(
             element_id=dashboard_element.get_urn_element_id()
         )
+        self.chart_urns.add(chart_urn)
         chart_snapshot = ChartSnapshot(
             urn=chart_urn,
             aspects=[Status(removed=False)],
@@ -1378,7 +1385,9 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         yield from self._emit_folder_as_container(folder)

     def extract_usage_stat(
-        self,
+        self,
+        looker_dashboards: List[looker_usage.LookerDashboardForUsage],
+        ingested_chart_urns: Set[str],
     ) -> List[MetadataChangeProposalWrapper]:
         looks: List[looker_usage.LookerChartForUsage] = []
         # filter out look from all dashboard
@@ -1389,6 +1398,15 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

         # dedup looks
         looks = list({str(look.id): look for look in looks}.values())
+        filtered_looks = []
+        for look in looks:
+            if not look.id:
+                continue
+            chart_urn = self._make_chart_urn(get_urn_looker_element_id(look.id))
+            if chart_urn in ingested_chart_urns:
+                filtered_looks.append(look)
+            else:
+                self.reporter.charts_skipped_for_usage.add(look.id)

         # Keep stat generators to generate entity stat aspect later
         stat_generator_config: looker_usage.StatGeneratorConfig = (
@@ -1412,7 +1430,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
             stat_generator_config,
             self.reporter,
             self._make_chart_urn,
-
+            filtered_looks,
         )

         mcps: List[MetadataChangeProposalWrapper] = []
@@ -1667,11 +1685,20 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         if self.source_config.extract_usage_history:
             self.reporter.report_stage_start("usage_extraction")
             usage_mcps: List[MetadataChangeProposalWrapper] = self.extract_usage_stat(
-                looker_dashboards_for_usage
+                looker_dashboards_for_usage, self.chart_urns
             )
             for usage_mcp in usage_mcps:
                 yield usage_mcp.as_workunit()
             self.reporter.report_stage_end("usage_extraction")

+        # Dump looker user resource mappings.
+        logger.info("Ingesting looker user resource mapping workunits")
+        self.reporter.report_stage_start("user_resource_extraction")
+        yield from auto_workunit(
+            self.user_registry.to_platform_resource(
+                self.source_config.platform_instance
+            )
+        )
+
     def get_report(self) -> SourceReport:
         return self.reporter
datahub/ingestion/source/looker/looker_usage.py CHANGED

@@ -42,6 +42,7 @@ from datahub.metadata.schema_classes import (
     TimeWindowSizeClass,
     _Aspect as AspectAbstract,
 )
+from datahub.utilities.lossy_collections import LossySet

 logger = logging.getLogger(__name__)

@@ -170,7 +171,7 @@ class BaseStatGenerator(ABC):
         self.config = config
         self.looker_models = looker_models
         # Later it will help to find out for what are the looker entities from query result
-        self.
+        self.id_to_model: Dict[str, ModelForUsage] = {
             self.get_id(looker_object): looker_object for looker_object in looker_models
         }
         self.post_filter = len(self.looker_models) > 100
@@ -225,6 +226,10 @@ class BaseStatGenerator(ABC):
     def get_id_from_row(self, row: dict) -> str:
         pass

+    @abstractmethod
+    def report_skip_set(self) -> LossySet[str]:
+        pass
+
     def create_mcp(
         self, model: ModelForUsage, aspect: Aspect
     ) -> MetadataChangeProposalWrapper:
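`report_skip_set()` hands back a `LossySet`, DataHub's bounded set that keeps a sample of entries instead of growing without limit, so skipped-id reporting stays cheap on large instances. A minimal usage sketch, assuming the `acryl-datahub` package is installed:

```python
from datahub.utilities.lossy_collections import LossySet

skipped: LossySet[str] = LossySet()
for i in range(10_000):
    skipped.add(f"chart-{i}")  # retains a bounded sample, not all 10k ids

# The repr shows the sampled elements and notes that the set is sampled.
print(skipped)
```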
@@ -258,20 +263,11 @@ class BaseStatGenerator(ABC):

         return entity_stat_aspect

-    def _process_absolute_aspect(self) -> List[Tuple[ModelForUsage, AspectAbstract]]:
-        aspects: List[Tuple[ModelForUsage, AspectAbstract]] = []
-        for looker_object in self.looker_models:
-            aspects.append(
-                (looker_object, self.to_entity_absolute_stat_aspect(looker_object))
-            )
-
-        return aspects
-
     def _fill_user_stat_aspect(
         self,
         entity_usage_stat: Dict[Tuple[str, str], Aspect],
         user_wise_rows: List[Dict],
-    ) -> Iterable[Tuple[
+    ) -> Iterable[Tuple[str, Aspect]]:
         logger.debug("Entering fill user stat aspect")

         # We first resolve all the users using a threadpool to warm up the cache
@@ -300,7 +296,7 @@ class BaseStatGenerator(ABC):

         for row in user_wise_rows:
             # Confirm looker object was given for stat generation
-            looker_object = self.
+            looker_object = self.id_to_model.get(self.get_id_from_row(row))
             if looker_object is None:
                 logger.warning(
                     "Looker object with id({}) was not register with stat generator".format(
@@ -338,7 +334,7 @@ class BaseStatGenerator(ABC):
         logger.debug("Starting to yield answers for user-wise counts")

         for (id, _), aspect in entity_usage_stat.items():
-            yield
+            yield id, aspect

     def _execute_query(self, query: LookerQuery, query_name: str) -> List[Dict]:
         rows = []
@@ -357,7 +353,7 @@ class BaseStatGenerator(ABC):
             )
             if self.post_filter:
                 logger.debug("post filtering")
-                rows = [r for r in rows if self.get_id_from_row(r) in self.
+                rows = [r for r in rows if self.get_id_from_row(r) in self.id_to_model]
                 logger.debug("Filtered down to %d rows", len(rows))
         except Exception as e:
             logger.warning(f"Failed to execute {query_name} query: {e}")
@@ -378,7 +374,8 @@ class BaseStatGenerator(ABC):
             return

         # yield absolute stat for looker entities
-        for looker_object
+        for looker_object in self.looker_models:
+            aspect = self.to_entity_absolute_stat_aspect(looker_object)
             yield self.create_mcp(looker_object, aspect)

         # Execute query and process the raw json which contains stat information
@@ -399,10 +396,13 @@ class BaseStatGenerator(ABC):
         )
         user_wise_rows = self._execute_query(user_wise_query_with_filters, "user_query")
         # yield absolute stat for entity
-        for
+        for object_id, aspect in self._fill_user_stat_aspect(
             entity_usage_stat, user_wise_rows
         ):
-
+            if object_id in self.id_to_model:
+                yield self.create_mcp(self.id_to_model[object_id], aspect)
+            else:
+                self.report_skip_set().add(object_id)


 class DashboardStatGenerator(BaseStatGenerator):
@@ -425,6 +425,9 @@ class DashboardStatGenerator(BaseStatGenerator):
     def get_stats_generator_name(self) -> str:
         return "DashboardStats"

+    def report_skip_set(self) -> LossySet[str]:
+        return self.report.dashboards_skipped_for_usage
+
     def get_filter(self) -> Dict[ViewField, str]:
         return {
             HistoryViewField.HISTORY_DASHBOARD_ID: ",".join(
@@ -541,6 +544,9 @@ class LookStatGenerator(BaseStatGenerator):
     def get_stats_generator_name(self) -> str:
         return "ChartStats"

+    def report_skip_set(self) -> LossySet[str]:
+        return self.report.charts_skipped_for_usage
+
     def get_filter(self) -> Dict[ViewField, str]:
         return {
             LookViewField.LOOK_ID: ",".join(
datahub/ingestion/source/mlflow.py CHANGED

@@ -38,16 +38,30 @@ T = TypeVar("T")
 class MLflowConfig(EnvConfigMixin):
     tracking_uri: Optional[str] = Field(
         default=None,
-        description=
+        description=(
+            "Tracking server URI. If not set, an MLflow default tracking_uri is used"
+            " (local `mlruns/` directory or `MLFLOW_TRACKING_URI` environment variable)"
+        ),
     )
     registry_uri: Optional[str] = Field(
         default=None,
-        description=
+        description=(
+            "Registry server URI. If not set, an MLflow default registry_uri is used"
+            " (value of tracking_uri or `MLFLOW_REGISTRY_URI` environment variable)"
+        ),
     )
     model_name_separator: str = Field(
         default="_",
         description="A string which separates model name from its version (e.g. model_1 or model-1)",
     )
+    base_external_url: Optional[str] = Field(
+        default=None,
+        description=(
+            "Base URL to use when constructing external URLs to MLflow."
+            " If not set, tracking_uri is used if it's an HTTP URL."
+            " If neither is set, external URLs are not generated."
+        ),
+    )


 @dataclass
@@ -279,12 +293,23 @@ class MLflowSource(Source):
         )
         return urn

-    def
+    def _get_base_external_url_from_tracking_uri(self) -> Optional[str]:
+        if isinstance(
+            self.client.tracking_uri, str
+        ) and self.client.tracking_uri.startswith("http"):
+            return self.client.tracking_uri
+        else:
+            return None
+
+    def _make_external_url(self, model_version: ModelVersion) -> Optional[str]:
         """
         Generate URL for a Model Version to MLflow UI.
         """
-        base_uri =
-
+        base_uri = (
+            self.config.base_external_url
+            or self._get_base_external_url_from_tracking_uri()
+        )
+        if base_uri:
             return f"{base_uri.rstrip('/')}/#/models/{model_version.name}/versions/{model_version.version}"
         else:
             return None
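The two methods above establish a fallback order for external links: the explicit `base_external_url` config wins, then the tracking URI when it is an HTTP URL, else no link is emitted. A standalone sketch of the same logic with hypothetical inputs:

```python
from typing import Optional


def resolve_base_url(base_external_url: Optional[str], tracking_uri: Optional[str]) -> Optional[str]:
    # Explicit config wins; otherwise fall back to an HTTP tracking URI.
    if base_external_url:
        return base_external_url
    if isinstance(tracking_uri, str) and tracking_uri.startswith("http"):
        return tracking_uri
    return None


def model_version_url(base: Optional[str], name: str, version: str) -> Optional[str]:
    if base is None:
        return None  # nothing configured or derivable: emit no external link
    return f"{base.rstrip('/')}/#/models/{name}/versions/{version}"


base = resolve_base_url(None, "http://mlflow.internal:5000")  # hypothetical host
print(model_version_url(base, "my_model", "3"))
# http://mlflow.internal:5000/#/models/my_model/versions/3
```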
datahub/ingestion/source/mode.py CHANGED
@@ -5,6 +5,7 @@ import time
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from functools import lru_cache
+from json import JSONDecodeError
 from typing import Dict, Iterable, List, Optional, Set, Tuple, Union

 import dateutil.parser as dp
@@ -98,6 +99,7 @@ from datahub.metadata.schema_classes import (
     TagPropertiesClass,
     UpstreamClass,
     UpstreamLineageClass,
+    ViewPropertiesClass,
 )
 from datahub.metadata.urns import QueryUrn
 from datahub.sql_parsing.sqlglot_lineage import (
@@ -192,6 +194,9 @@ class HTTPError429(HTTPError):
     pass


+ModeRequestError = (HTTPError, JSONDecodeError)
+
+
 @dataclass
 class ModeSourceReport(StaleEntityRemovalSourceReport):
     filtered_spaces: LossyList[str] = dataclasses.field(default_factory=LossyList)
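`ModeRequestError` is a plain tuple of exception classes, which Python's `except` clause accepts directly; that is what lets the handlers below catch both transport failures and bodies that fail to parse as JSON with one name. A minimal sketch (using requests' `HTTPError`, as this module does):

```python
import json
from requests.exceptions import HTTPError

ModeRequestError = (HTTPError, json.JSONDecodeError)

try:
    json.loads("")  # an empty response body raises JSONDecodeError
except ModeRequestError as e:
    print(f"request failed: {e!r}")
```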
@@ -327,11 +332,11 @@ class ModeSource(StatefulIngestionSourceBase):
         # Test the connection
         try:
             self._get_request_json(f"{self.config.connect_uri}/api/verify")
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Connect",
                 message="Unable to verify connection to mode.",
-                context=f"Error: {str(
+                context=f"Error: {str(e)}",
             )

         self.workspace_uri = f"{self.config.connect_uri}/api/{self.config.workspace}"
@@ -520,11 +525,11 @@ class ModeSource(StatefulIngestionSourceBase):
                 if self.config.owner_username_instead_of_email
                 else user_json.get("email")
             )
-        except
+        except ModeRequestError as e:
             self.report.report_warning(
                 title="Failed to retrieve Mode creator",
                 message=f"Unable to retrieve user for {href}",
-                context=f"Reason: {str(
+                context=f"Reason: {str(e)}",
             )
         return user

@@ -570,11 +575,11 @@ class ModeSource(StatefulIngestionSourceBase):
                     logging.debug(f"Skipping space {space_name} due to space pattern")
                     continue
                 space_info[s.get("token", "")] = s.get("name", "")
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Spaces",
                 message="Unable to retrieve spaces / collections for workspace.",
-                context=f"Workspace: {self.workspace_uri}, Error: {str(
+                context=f"Workspace: {self.workspace_uri}, Error: {str(e)}",
             )

         return space_info
@@ -720,11 +725,11 @@ class ModeSource(StatefulIngestionSourceBase):
         try:
             ds_json = self._get_request_json(f"{self.workspace_uri}/data_sources")
             data_sources = ds_json.get("_embedded", {}).get("data_sources", [])
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to retrieve Data Sources",
                 message="Unable to retrieve data sources from Mode.",
-                context=f"Error: {str(
+                context=f"Error: {str(e)}",
             )

         return data_sources
@@ -811,11 +816,11 @@ class ModeSource(StatefulIngestionSourceBase):
                 if definition.get("name", "") == definition_name:
                     return definition.get("source", "")

-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Definition",
                 message="Unable to retrieve definition from Mode.",
-                context=f"Definition Name: {definition_name}, Error: {str(
+                context=f"Definition Name: {definition_name}, Error: {str(e)}",
             )
         return None

@@ -930,16 +935,13 @@ class ModeSource(StatefulIngestionSourceBase):

         dataset_props = DatasetPropertiesClass(
             name=report_info.get("name") if is_mode_dataset else query_data.get("name"),
-            description=
-            ``` sql
-            {query_data.get("raw_query")}
-            ```
-            """,
+            description=None,
             externalUrl=externalUrl,
             customProperties=self.get_custom_props_from_dict(
                 query_data,
                 [
-                    "id"
+                    "id",
+                    "created_at",
                     "updated_at",
                     "last_run_id",
                     "data_source_id",
@@ -949,7 +951,6 @@ class ModeSource(StatefulIngestionSourceBase):
                 ],
             ),
         )
-
         yield (
             MetadataChangeProposalWrapper(
                 entityUrn=query_urn,
@@ -957,6 +958,16 @@ class ModeSource(StatefulIngestionSourceBase):
             ).as_workunit()
         )

+        if raw_query := query_data.get("raw_query"):
+            yield MetadataChangeProposalWrapper(
+                entityUrn=query_urn,
+                aspect=ViewPropertiesClass(
+                    viewLogic=raw_query,
+                    viewLanguage=QueryLanguageClass.SQL,
+                    materialized=False,
+                ),
+            ).as_workunit()
+
         if is_mode_dataset:
             space_container_key = self.gen_space_key(space_token)
             yield from add_dataset_to_container(
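With this change the raw SQL moves out of the dataset description and into a `ViewProperties` aspect. A sketch of constructing the same aspect in isolation; the URN and query text are hypothetical:

```python
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import QueryLanguageClass, ViewPropertiesClass

mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:mode,example.query.1,PROD)",  # hypothetical
    aspect=ViewPropertiesClass(
        viewLogic="SELECT id, total FROM orders",  # hypothetical raw query
        viewLanguage=QueryLanguageClass.SQL,
        materialized=False,
    ),
)
print(mcp.aspectName)  # viewProperties (inferred from the aspect type)
```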
@@ -1375,11 +1386,11 @@ class ModeSource(StatefulIngestionSourceBase):
                 f"{self.workspace_uri}/spaces/{space_token}/reports"
             )
             reports = reports_json.get("_embedded", {}).get("reports", {})
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Reports for Space",
                 message="Unable to retrieve reports for space token.",
-                context=f"Space Token: {space_token}, Error: {str(
+                context=f"Space Token: {space_token}, Error: {str(e)}",
             )
         return reports

@@ -1393,11 +1404,11 @@ class ModeSource(StatefulIngestionSourceBase):
             url = f"{self.workspace_uri}/spaces/{space_token}/datasets"
             datasets_json = self._get_request_json(url)
             datasets = datasets_json.get("_embedded", {}).get("reports", [])
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Datasets for Space",
                 message=f"Unable to retrieve datasets for space token {space_token}.",
-                context=f"Error: {str(
+                context=f"Error: {str(e)}",
             )
         return datasets

@@ -1409,11 +1420,11 @@ class ModeSource(StatefulIngestionSourceBase):
                 f"{self.workspace_uri}/reports/{report_token}/queries"
             )
             queries = queries_json.get("_embedded", {}).get("queries", {})
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Queries",
                 message="Unable to retrieve queries for report token.",
-                context=f"Report Token: {report_token}, Error: {str(
+                context=f"Report Token: {report_token}, Error: {str(e)}",
             )
         return queries

@@ -1426,11 +1437,11 @@ class ModeSource(StatefulIngestionSourceBase):
                 f"{self.workspace_uri}/reports/{report_token}/runs/{report_run_id}/query_runs{query_run_id}"
             )
             queries = queries_json.get("_embedded", {}).get("queries", {})
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Queries for Report",
                 message="Unable to retrieve queries for report token.",
-                context=f"Report Token:{report_token}, Error: {str(
+                context=f"Report Token:{report_token}, Error: {str(e)}",
             )
             return {}
         return queries
@@ -1444,13 +1455,13 @@ class ModeSource(StatefulIngestionSourceBase):
                 f"/queries/{query_token}/charts"
             )
             charts = charts_json.get("_embedded", {}).get("charts", {})
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Charts",
                 message="Unable to retrieve charts from Mode.",
                 context=f"Report Token: {report_token}, "
                 f"Query token: {query_token}, "
-                f"Error: {str(
+                f"Error: {str(e)}",
             )
         return charts

@@ -1470,6 +1481,8 @@ class ModeSource(StatefulIngestionSourceBase):
             response = self.session.get(
                 url, timeout=self.config.api_options.timeout
             )
+            if response.status_code == 204:  # No content, don't parse json
+                return {}
             return response.json()
         except HTTPError as http_error:
             error_response = http_error.response
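The two-line guard above exists because requests' `Response.json()` raises a `JSONDecodeError` on the empty body of a 204 No Content response. A sketch of the pattern on its own, without the retry and error handling the real `_get_request_json` carries:

```python
import requests


def get_request_json(session: requests.Session, url: str, timeout: int = 40) -> dict:
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    if response.status_code == 204:  # No Content: empty body, json() would raise
        return {}
    return response.json()
```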
datahub/ingestion/source/powerbi/config.py CHANGED

@@ -9,7 +9,7 @@ from pydantic.class_validators import root_validator

 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
-from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
 from datahub.ingestion.source.common.subtypes import BIAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
@@ -232,19 +232,6 @@ def default_for_dataset_type_mapping() -> Dict[str, str]:
     return dict_


-class PlatformDetail(ConfigModel):
-    platform_instance: Optional[str] = pydantic.Field(
-        default=None,
-        description="DataHub platform instance name. To generate correct urn for upstream dataset, this should match "
-        "with platform instance name used in ingestion "
-        "recipe of other datahub sources.",
-    )
-    env: str = pydantic.Field(
-        default=builder.DEFAULT_ENV,
-        description="The environment that all assets produced by DataHub platform ingestion source belong to",
-    )
-
-
 class DataBricksPlatformDetail(PlatformDetail):
     """
     metastore is an additional field used in Databricks connector to generate the dataset urn
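`PlatformDetail` now lives in `datahub.configuration.source_common` rather than the PowerBI config module, so code that constructed it directly needs its import updated; the fields themselves (`platform_instance`, `env`) are unchanged. A sketch with example values:

```python
from datahub.configuration.source_common import PlatformDetail

detail = PlatformDetail(platform_instance="warehouse_prod", env="PROD")  # example values
print(detail.platform_instance, detail.env)
```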
datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py CHANGED

@@ -2,8 +2,8 @@ import logging
 from abc import ABC, abstractmethod
 from typing import Union

+from datahub.configuration.source_common import PlatformDetail
 from datahub.ingestion.source.powerbi.config import (
-    PlatformDetail,
     PowerBiDashboardSourceConfig,
     PowerBIPlatformDetail,
 )
datahub/ingestion/source/powerbi/m_query/pattern_handler.py CHANGED

@@ -5,13 +5,13 @@ from typing import Dict, List, Optional, Tuple, Type, cast

 from lark import Tree

+from datahub.configuration.source_common import PlatformDetail
 from datahub.emitter import mce_builder as builder
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.source.powerbi.config import (
     Constant,
     DataBricksPlatformDetail,
     DataPlatformPair,
-    PlatformDetail,
     PowerBiDashboardSourceConfig,
     PowerBiDashboardSourceReport,
     PowerBIPlatformDetail,
datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule CHANGED

@@ -21,6 +21,11 @@
 // | empty_string
 // | empty_string "," argument_list
 // - Added sql_string in any_literal
+// - Added WS_INLINE? in field expression
+// Added to ignore any comments
+// %ignore WS // Ignore whitespace
+// %ignore CPP_COMMENT // Ignore single-line comments
+// %ignore C_COMMENT // Ignore multi-line comments

 lexical_unit: lexical_elements?

@@ -245,6 +250,8 @@ operator_or_punctuator: ","
     | "=>"
     | ".."
     | "..."
+    | "{{"
+    | "}}"

 document: section_document
     | expression_document
@@ -275,6 +282,7 @@ expression: logical_or_expression
     | if_expression
     | error_raising_expression
     | error_handling_expression
+    | outer_expression


 logical_or_expression: logical_and_expression
@@ -376,6 +384,8 @@ sql_content: /(?:[^\"\\]|\\[\"]|\"\"|\#\(lf\))+/

 sql_string: "\"" sql_content "\""

+outer_expression: "{{" expression "}}"
+
 argument_list: WS_INLINE? expression
     | WS_INLINE? expression WS_INLINE? "," WS_INLINE? argument_list
     | WS_INLINE? sql_string
@@ -409,7 +419,7 @@ record_expression: "[" field_list? "]"
 field_list: field
     | field "," field_list

-field: field_name WS_INLINE? "=" WS_INLINE? expression
+field: WS_INLINE? field_name WS_INLINE? "=" WS_INLINE? expression

 field_name: generalized_identifier
     | quoted_identifier
@@ -621,4 +631,8 @@ any_literal: record_literal
 %import common.DIGIT
 %import common.LF
 %import common.CR
-%import common.ESCAPED_STRING
+%import common.ESCAPED_STRING
+
+%ignore WS // Ignore whitespace
+%ignore CPP_COMMENT // Ignore single-line comments
+%ignore C_COMMENT // Ignore multi-line comments
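The grammar edits above add `%ignore` directives (so whitespace and comments are skipped everywhere) and an `outer_expression` rule for `{{ ... }}` templating tokens. A toy Lark grammar, not the actual powerbi-lexical-grammar.rule, showing how both mechanisms behave:

```python
from lark import Lark

# Toy grammar: %ignore skips whitespace and comments globally, and a
# dedicated rule wraps an inner expression in "{{" ... "}}".
grammar = r"""
    start: outer_expression | NAME
    outer_expression: "{{" NAME "}}"

    NAME: /[A-Za-z_]\w*/

    %import common.WS
    %import common.CPP_COMMENT
    %import common.C_COMMENT
    %ignore WS
    %ignore CPP_COMMENT
    %ignore C_COMMENT
"""

parser = Lark(grammar)
print(parser.parse("{{ my_field }}  // trailing comment").pretty())
```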