acryl-datahub 1.0.0.2rc4__py3-none-any.whl → 1.0.0.3rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3rc1.dist-info}/METADATA +2433 -2433
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3rc1.dist-info}/RECORD +30 -28
- datahub/_version.py +1 -1
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
- datahub/ingestion/source/hex/api.py +1 -20
- datahub/ingestion/source/hex/query_fetcher.py +4 -1
- datahub/ingestion/source/sigma/config.py +75 -6
- datahub/ingestion/source/sigma/sigma.py +16 -1
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
- datahub/ingestion/source/snowflake/snowflake_query.py +1 -1
- datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
- datahub/metadata/_schema_classes.py +47 -2
- datahub/metadata/_urns/urn_defs.py +56 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +121 -85
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
- datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
- datahub/metadata/schemas/QueryProperties.avsc +4 -2
- datahub/metadata/schemas/SystemMetadata.avsc +86 -0
- datahub/sql_parsing/sqlglot_utils.py +16 -8
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sigma/sigma_api.py
@@ -95,22 +95,22 @@ class SigmaAPI:
         return get_response
 
     def get_workspace(self, workspace_id: str) -> Optional[Workspace]:
+        if workspace_id in self.workspaces:
+            return self.workspaces[workspace_id]
+
         logger.debug(f"Fetching workspace metadata with id '{workspace_id}'")
         try:
-            (… 11 removed lines not rendered in the original diff view …)
-            workspace = Workspace.parse_obj(response.json())
-            self.workspaces[workspace.workspaceId] = workspace
-            return workspace
+            response = self._get_api_call(
+                f"{self.config.api_url}/workspaces/{workspace_id}"
+            )
+            if response.status_code == 403:
+                logger.debug(f"Workspace {workspace_id} not accessible.")
+                self.report.non_accessible_workspaces_count += 1
+                return None
+            response.raise_for_status()
+            workspace = Workspace.parse_obj(response.json())
+            self.workspaces[workspace.workspaceId] = workspace
+            return workspace
         except Exception as e:
             self._log_http_error(
                 message=f"Unable to fetch workspace '{workspace_id}'. Exception: {e}"
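Note on the hunk above: get_workspace now memoizes results in self.workspaces and treats HTTP 403 as "skip this workspace" rather than a hard failure. A minimal standalone sketch of that cache-then-fetch pattern, using illustrative names rather than the real SigmaAPI internals:

from typing import Dict, Optional

import requests

_workspace_cache: Dict[str, dict] = {}

def get_workspace(api_url: str, workspace_id: str) -> Optional[dict]:
    # Serve repeated lookups from the in-memory cache.
    if workspace_id in _workspace_cache:
        return _workspace_cache[workspace_id]
    response = requests.get(f"{api_url}/workspaces/{workspace_id}")
    # A 403 means this token cannot see the workspace; skip it instead of
    # failing the whole ingestion run.
    if response.status_code == 403:
        return None
    response.raise_for_status()
    workspace = response.json()
    _workspace_cache[workspace_id] = workspace
    return workspace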
@@ -187,7 +187,9 @@ class SigmaAPI:
     @functools.lru_cache
     def _get_files_metadata(self, file_type: str) -> Dict[str, File]:
         logger.debug(f"Fetching file metadata with type {file_type}.")
-        file_url = url = f"{self.config.api_url}/files?permissionFilter=view&typeFilters={file_type}"
+        file_url = url = (
+            f"{self.config.api_url}/files?permissionFilter=view&typeFilters={file_type}"
+        )
         try:
             files_metadata: Dict[str, File] = {}
             while True:
@@ -225,31 +227,50 @@ class SigmaAPI:
                 for dataset_dict in response_dict[Constant.ENTRIES]:
                     dataset = SigmaDataset.parse_obj(dataset_dict)
 
-                    if dataset.datasetId in dataset_files_metadata:
-                        (… 18 removed lines not rendered in the original diff view …)
+                    if dataset.datasetId not in dataset_files_metadata:
+                        self.report.datasets.dropped(
+                            f"{dataset.name} ({dataset.datasetId}) (missing file metadata)"
+                        )
+                        continue
+
+                    dataset.workspaceId = dataset_files_metadata[
+                        dataset.datasetId
+                    ].workspaceId
+
+                    dataset.path = dataset_files_metadata[dataset.datasetId].path
+                    dataset.badge = dataset_files_metadata[dataset.datasetId].badge
+
+                    workspace = None
+                    if dataset.workspaceId:
+                        workspace = self.get_workspace(dataset.workspaceId)
+
+                    if workspace:
+                        if self.config.workspace_pattern.allowed(workspace.name):
+                            self.report.datasets.processed(
+                                f"{dataset.name} ({dataset.datasetId}) in {workspace.name}"
+                            )
+                            datasets.append(dataset)
+                        else:
+                            self.report.datasets.dropped(
+                                f"{dataset.name} ({dataset.datasetId}) in {workspace.name}"
+                            )
+                    elif self.config.ingest_shared_entities:
+                        # If no workspace for dataset we can consider it as shared entity
+                        self.report.datasets_without_workspace += 1
+                        self.report.datasets.processed(
+                            f"{dataset.name} ({dataset.datasetId}) in workspace id {dataset.workspaceId or 'unknown'}"
+                        )
+                        datasets.append(dataset)
+                    else:
+                        self.report.datasets.dropped(
+                            f"{dataset.name} ({dataset.datasetId}) in workspace id {dataset.workspaceId or 'unknown'}"
+                        )
 
                 if response_dict[Constant.NEXTPAGE]:
                     url = f"{dataset_url}?page={response_dict[Constant.NEXTPAGE]}"
                 else:
                     break
-
+
             return datasets
         except Exception as e:
             self._log_http_error(
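The allowed/dropped branching above hinges on workspace_pattern, which is DataHub's standard AllowDenyPattern regex filter. A small sketch of how that filter behaves; the pattern values are hypothetical:

from datahub.configuration.common import AllowDenyPattern

# Hypothetical recipe values: keep production workspaces, drop scratch ones.
pattern = AllowDenyPattern(allow=["Prod.*"], deny=[".*Scratch.*"])

assert pattern.allowed("Prod Analytics")      # matches allow
assert not pattern.allowed("Prod Scratch")    # matches deny, which wins
assert not pattern.allowed("Dev Sandbox")     # matches neither allow entry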
@@ -381,34 +402,54 @@ class SigmaAPI:
                 for workbook_dict in response_dict[Constant.ENTRIES]:
                     workbook = Workbook.parse_obj(workbook_dict)
 
-                    if workbook.workbookId in workbook_files_metadata:
-                        (… 5 removed lines not rendered in the original diff view …)
-                            workbook.workbookId
-                        (… 15 removed lines not rendered in the original diff view …)
+                    if workbook.workbookId not in workbook_files_metadata:
+                        # Due to a bug in the Sigma API, it seems like the /files endpoint does not
+                        # return file metadata when the user has access via admin permissions. In
+                        # those cases, the user associated with the token needs to be manually added
+                        # to the workspace.
+                        self.report.workbooks.dropped(
+                            f"{workbook.name} ({workbook.workbookId}) (missing file metadata; path: {workbook.path}; likely need to manually add user to workspace)"
+                        )
+                        continue
+
+                    workbook.workspaceId = workbook_files_metadata[
+                        workbook.workbookId
+                    ].workspaceId
+
+                    workbook.badge = workbook_files_metadata[workbook.workbookId].badge
+
+                    workspace = None
+                    if workbook.workspaceId:
+                        workspace = self.get_workspace(workbook.workspaceId)
+
+                    if workspace:
+                        if self.config.workspace_pattern.allowed(workspace.name):
+                            self.report.workbooks.processed(
+                                f"{workbook.name} ({workbook.workbookId}) in {workspace.name}"
+                            )
+                            workbook.pages = self.get_workbook_pages(workbook)
+                            workbooks.append(workbook)
+                        else:
+                            self.report.workbooks.dropped(
+                                f"{workbook.name} ({workbook.workbookId}) in {workspace.name}"
+                            )
+                    elif self.config.ingest_shared_entities:
+                        # If no workspace for workbook we can consider it as shared entity
+                        self.report.workbooks_without_workspace += 1
+                        self.report.workbooks.processed(
+                            f"{workbook.name} ({workbook.workbookId}) in workspace id {workbook.workspaceId or 'unknown'}"
+                        )
+                        workbook.pages = self.get_workbook_pages(workbook)
+                        workbooks.append(workbook)
+                    else:
+                        self.report.workbooks.dropped(
+                            f"{workbook.name} ({workbook.workbookId}) in workspace id {workbook.workspaceId or 'unknown'}"
+                        )
 
                 if response_dict[Constant.NEXTPAGE]:
                     url = f"{workbook_url}?page={response_dict[Constant.NEXTPAGE]}"
                 else:
                     break
-            self.report.number_of_workbooks = len(workbooks)
             return workbooks
         except Exception as e:
             self._log_http_error(
datahub/ingestion/source/snowflake/snowflake_queries.py
@@ -515,7 +515,10 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
            # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
            # here
            query_id=get_query_fingerprint(
-                res["query_text"],
+                res["query_text"],
+                self.identifiers.platform,
+                fast=True,
+                secondary_id=res["query_secondary_fingerprint"],
            ),
            query_text=res["query_text"],
            upstreams=upstreams,
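For context on the hunk above: the extractor now passes the platform, requests the fast fingerprint, and threads through the new secondary fingerprint. A hedged usage sketch of the updated helper, assuming get_query_fingerprint in this release accepts the fast and secondary_id arguments exactly as called above:

from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint

fingerprint = get_query_fingerprint(
    "SELECT id FROM db.sch.users WHERE id = 42",
    platform="snowflake",
    fast=True,  # normalize and hash the text without a full sqlglot parse
    secondary_id="hex-project-hash",  # hypothetical secondary fingerprint value
)
print(fingerprint)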
@@ -654,7 +657,17 @@ WITH
 fingerprinted_queries as (
     SELECT *,
         -- TODO: Generate better fingerprints for each query by pushing down regex logic.
-        query_history.query_parameterized_hash as query_fingerprint
+        query_history.query_parameterized_hash as query_fingerprint,
+        -- Optional and additional hash to be used for query deduplication and final query identity
+        CASE
+            WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
+            -- Extract project id and hash it
+            THEN CAST(HASH(
+                REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
+                REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
+            ) AS VARCHAR)
+            ELSE NULL
+        END as query_secondary_fingerprint
     FROM
         snowflake.account_usage.query_history
     WHERE
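The CASE expression above computes a secondary fingerprint only for queries that carry a '-- Hex query metadata:' comment, by extracting the project_id and context fields and hashing them together. Roughly the same logic in Python, for illustration; sha256 stands in for Snowflake's HASH and the function name is ours:

import hashlib
import re
from typing import Optional

def hex_secondary_fingerprint(query_text: str) -> Optional[str]:
    if "-- Hex query metadata:" not in query_text:
        return None
    # Pull the JSON-ish fields out of the trailing metadata comment.
    project_id = re.search(r'"project_id"\s*:\s*"([^"]+)"', query_text)
    context = re.search(r'"context"\s*:\s*"([^"]+)"', query_text)
    combined = (
        (project_id.group(1) if project_id else "")
        + ":"
        + (context.group(1) if context else "")
    )
    return hashlib.sha256(combined.encode("utf-8")).hexdigest()

print(hex_secondary_fingerprint(
    'SELECT 1 -- Hex query metadata: {"project_id": "abc-123", "context": "SCHEDULED_RUN"}'
))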
@@ -670,11 +683,11 @@ fingerprinted_queries as (
             {time_bucket_size},
             CONVERT_TIMEZONE('UTC', start_time)
         ) AS bucket_start_time,
-        COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint) AS query_count,
+        COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
     FROM
         fingerprinted_queries
     QUALIFY
-        ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint ORDER BY start_time DESC) = 1
+        ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1
 )
 , raw_access_history AS (
     SELECT
@@ -714,6 +727,7 @@ fingerprinted_queries as (
         q.bucket_start_time,
         q.query_id,
         q.query_fingerprint,
+        q.query_secondary_fingerprint,
         q.query_count,
         q.session_id AS "SESSION_ID",
         q.start_time AS "QUERY_START_TIME",
datahub/ingestion/source/snowflake/snowflake_query.py
@@ -1000,4 +1000,4 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
         from_clause = (
             f"""FROM '{stream_pagination_marker}'""" if stream_pagination_marker else ""
         )
-        return f"""SHOW STREAMS IN DATABASE {db_name} LIMIT {limit} {from_clause};"""
+        return f"""SHOW STREAMS IN DATABASE "{db_name}" LIMIT {limit} {from_clause};"""
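The one-character fix above wraps the database name in double quotes. Unquoted Snowflake identifiers are upper-cased and limited to letters, digits, '_' and '$', so databases with mixed-case or special characters previously produced invalid SQL. A tiny sketch of the difference; the helper name is ours:

def show_streams_query(db_name: str, limit: int = 100) -> str:
    # Double quotes preserve case and allow characters like '-' in the name.
    return f'SHOW STREAMS IN DATABASE "{db_name}" LIMIT {limit};'

print(show_streams_query("my-analytics-db"))
# SHOW STREAMS IN DATABASE "my-analytics-db" LIMIT 100;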
datahub/ingestion/source/sql/stored_procedures/base.py
@@ -26,6 +26,7 @@ from datahub.metadata.schema_classes import (
     DataPlatformInstanceClass,
     DataTransformClass,
     DataTransformLogicClass,
+    QueryLanguageClass,
     QueryStatementClass,
     SubTypesClass,
 )
@@ -176,7 +177,17 @@ def _generate_job_workunits(
         DataTransformClass(
             queryStatement=QueryStatementClass(
                 value=procedure.procedure_definition,
-                language="SQL",
+                language=(
+                    QueryLanguageClass.SQL
+                    if procedure.language == "SQL"
+                    # The language field uses a pretty limited enum.
+                    # The "UNKNOWN" enum value is pretty new, so we don't want to
+                    # emit it until it has broader server-side support. As a
+                    # short-term solution, we map all languages to "SQL".
+                    # TODO: Once we've released server 1.1.0, we should change
+                    # this to be "UNKNOWN" for all languages except "SQL".
+                    else QueryLanguageClass.SQL
+                ),
             ),
         )
     ]
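As the inline comments note, both branches of the conditional above currently resolve to SQL; the structure only stages the future UNKNOWN mapping. Making the interim no-op explicit as a sketch; the helper name is ours:

from datahub.metadata.schema_classes import QueryLanguageClass

def map_procedure_language(language: str) -> str:
    # Today: everything maps to SQL, per the TODO above.
    # After server 1.1.0: non-SQL languages are intended to map to UNKNOWN.
    return QueryLanguageClass.SQL if language == "SQL" else QueryLanguageClass.SQL

assert map_procedure_language("JAVASCRIPT") == "SQL"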
datahub/metadata/_schema_classes.py
@@ -15442,6 +15442,35 @@ class DataHubIngestionSourceKeyClass(_Aspect):
         self._inner_dict['id'] = value
 
 
+class DataHubOpenAPISchemaKeyClass(_Aspect):
+    """Key for a Query"""
+
+
+    ASPECT_NAME = 'dataHubOpenAPISchemaKey'
+    ASPECT_INFO = {'keyForEntity': 'dataHubOpenAPISchema', 'entityCategory': 'internal', 'entityAspects': ['systemMetadata'], 'entityDoc': 'Contains aspects which are used in OpenAPI requests/responses which are not otherwise present in the data model.'}
+    RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.DataHubOpenAPISchemaKey")
+
+    def __init__(self,
+        id: str,
+    ):
+        super().__init__()
+
+        self.id = id
+
+    def _restore_defaults(self) -> None:
+        self.id = str()
+
+
+    @property
+    def id(self) -> str:
+        """A unique id for the DataHub OpenAPI schema."""
+        return self._inner_dict.get('id')  # type: ignore
+
+    @id.setter
+    def id(self, value: str) -> None:
+        self._inner_dict['id'] = value
+
+
 class DataHubPersonaKeyClass(_Aspect):
     """Key for a persona type"""
 
@@ -20128,10 +20157,14 @@ class PlatformEventHeaderClass(DictWrapper):
         self._inner_dict['timestampMillis'] = value
 
 
-class SystemMetadataClass(DictWrapper):
+class SystemMetadataClass(_Aspect):
     """Metadata associated with each metadata change that is processed by the system"""
-
+
+
+    ASPECT_NAME = 'systemMetadata'
+    ASPECT_INFO = {}
     RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.mxe.SystemMetadata")
+
     def __init__(self,
                  lastObserved: Optional[Union[int, None]]=None,
                  runId: Optional[Union[str, None]]=None,
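With SystemMetadataClass now subclassing _Aspect under the aspect name 'systemMetadata', it participates in ASPECT_CLASSES and AspectBag like any other aspect (see the registry hunks below). A minimal usage sketch based on the constructor shown above; the field values are made up:

from datahub.metadata.schema_classes import SystemMetadataClass

sm = SystemMetadataClass(lastObserved=1715000000000, runId="demo-ingestion-run")
print(SystemMetadataClass.ASPECT_NAME)  # systemMetadata
print(sm.lastObserved, sm.runId)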
@@ -21738,6 +21771,9 @@ class QueryLanguageClass(object):
     SQL = "SQL"
     """A SQL Query"""
 
+    UNKNOWN = "UNKNOWN"
+    """Unknown query language"""
+
 
 
 class QueryPropertiesClass(_Aspect):
@@ -26135,6 +26171,7 @@ __SCHEMA_TYPES = {
     'com.linkedin.pegasus2avro.metadata.key.DataHubActionKey': DataHubActionKeyClass,
     'com.linkedin.pegasus2avro.metadata.key.DataHubConnectionKey': DataHubConnectionKeyClass,
     'com.linkedin.pegasus2avro.metadata.key.DataHubIngestionSourceKey': DataHubIngestionSourceKeyClass,
+    'com.linkedin.pegasus2avro.metadata.key.DataHubOpenAPISchemaKey': DataHubOpenAPISchemaKeyClass,
     'com.linkedin.pegasus2avro.metadata.key.DataHubPersonaKey': DataHubPersonaKeyClass,
     'com.linkedin.pegasus2avro.metadata.key.DataHubPolicyKey': DataHubPolicyKeyClass,
     'com.linkedin.pegasus2avro.metadata.key.DataHubRetentionKey': DataHubRetentionKeyClass,
@@ -26620,6 +26657,7 @@ __SCHEMA_TYPES = {
     'DataHubActionKey': DataHubActionKeyClass,
     'DataHubConnectionKey': DataHubConnectionKeyClass,
     'DataHubIngestionSourceKey': DataHubIngestionSourceKeyClass,
+    'DataHubOpenAPISchemaKey': DataHubOpenAPISchemaKeyClass,
     'DataHubPersonaKey': DataHubPersonaKeyClass,
     'DataHubPolicyKey': DataHubPolicyKeyClass,
     'DataHubRetentionKey': DataHubRetentionKeyClass,
@@ -26879,6 +26917,7 @@ ASPECT_CLASSES: List[Type[_Aspect]] = [
     ContainerClass,
     ContainerPropertiesClass,
     EditableContainerPropertiesClass,
+    SystemMetadataClass,
     DataHubSecretValueClass,
     DataHubUpgradeRequestClass,
     DataHubUpgradeResultClass,
@@ -26935,6 +26974,7 @@ ASPECT_CLASSES: List[Type[_Aspect]] = [
     MLModelKeyClass,
     NotebookKeyClass,
     RoleKeyClass,
+    DataHubOpenAPISchemaKeyClass,
     GlobalSettingsKeyClass,
     DatasetKeyClass,
     ChartKeyClass,
@@ -27102,6 +27142,7 @@ class AspectBag(TypedDict, total=False):
     container: ContainerClass
     containerProperties: ContainerPropertiesClass
     editableContainerProperties: EditableContainerPropertiesClass
+    systemMetadata: SystemMetadataClass
     dataHubSecretValue: DataHubSecretValueClass
     dataHubUpgradeRequest: DataHubUpgradeRequestClass
     dataHubUpgradeResult: DataHubUpgradeResultClass
@@ -27158,6 +27199,7 @@ class AspectBag(TypedDict, total=False):
     mlModelKey: MLModelKeyClass
     notebookKey: NotebookKeyClass
     roleKey: RoleKeyClass
+    dataHubOpenAPISchemaKey: DataHubOpenAPISchemaKeyClass
     globalSettingsKey: GlobalSettingsKeyClass
     datasetKey: DatasetKeyClass
     chartKey: ChartKeyClass
@@ -27292,6 +27334,7 @@ KEY_ASPECTS: Dict[str, Type[_Aspect]] = {
     'mlModel': MLModelKeyClass,
     'notebook': NotebookKeyClass,
     'role': RoleKeyClass,
+    'dataHubOpenAPISchema': DataHubOpenAPISchemaKeyClass,
     'globalSettings': GlobalSettingsKeyClass,
     'dataset': DatasetKeyClass,
     'chart': ChartKeyClass,
@@ -27352,6 +27395,7 @@ ENTITY_TYPE_NAMES: List[str] = [
     'mlModel',
     'notebook',
     'role',
+    'dataHubOpenAPISchema',
     'globalSettings',
     'dataset',
     'chart',
@@ -27411,6 +27455,7 @@ EntityTypeName = Literal[
     'mlModel',
     'notebook',
     'role',
+    'dataHubOpenAPISchema',
     'globalSettings',
     'dataset',
     'chart',
datahub/metadata/_urns/urn_defs.py
@@ -594,6 +594,62 @@ class RoleUrn(_SpecificUrn):
     def id(self) -> str:
         return self._entity_ids[0]
 
+if TYPE_CHECKING:
+    from datahub.metadata.schema_classes import DataHubOpenAPISchemaKeyClass
+
+class DataHubOpenAPISchemaUrn(_SpecificUrn):
+    ENTITY_TYPE: ClassVar[Literal["dataHubOpenAPISchema"]] = "dataHubOpenAPISchema"
+    _URN_PARTS: ClassVar[int] = 1
+
+    def __init__(self, id: Union["DataHubOpenAPISchemaUrn", str], *, _allow_coercion: bool = True) -> None:
+        if _allow_coercion:
+            # Field coercion logic (if any is required).
+            if isinstance(id, str):
+                if id.startswith('urn:li:'):
+                    try:
+                        id = DataHubOpenAPISchemaUrn.from_string(id)
+                    except InvalidUrnError:
+                        raise InvalidUrnError(f'Expecting a DataHubOpenAPISchemaUrn but got {id}')
+                else:
+                    id = UrnEncoder.encode_string(id)
+
+        # Validation logic.
+        if not id:
+            raise InvalidUrnError("DataHubOpenAPISchemaUrn id cannot be empty")
+        if isinstance(id, DataHubOpenAPISchemaUrn):
+            id = id.id
+        elif isinstance(id, Urn):
+            raise InvalidUrnError(f'Expecting a DataHubOpenAPISchemaUrn but got {id}')
+        if UrnEncoder.contains_reserved_char(id):
+            raise InvalidUrnError(f'DataHubOpenAPISchemaUrn id contains reserved characters')
+
+        super().__init__(self.ENTITY_TYPE, [id])
+
+    @classmethod
+    def _parse_ids(cls, entity_ids: List[str]) -> "DataHubOpenAPISchemaUrn":
+        if len(entity_ids) != cls._URN_PARTS:
+            raise InvalidUrnError(f"DataHubOpenAPISchemaUrn should have {cls._URN_PARTS} parts, got {len(entity_ids)}: {entity_ids}")
+        return cls(id=entity_ids[0], _allow_coercion=False)
+
+    @classmethod
+    def underlying_key_aspect_type(cls) -> Type["DataHubOpenAPISchemaKeyClass"]:
+        from datahub.metadata.schema_classes import DataHubOpenAPISchemaKeyClass
+
+        return DataHubOpenAPISchemaKeyClass
+
+    def to_key_aspect(self) -> "DataHubOpenAPISchemaKeyClass":
+        from datahub.metadata.schema_classes import DataHubOpenAPISchemaKeyClass
+
+        return DataHubOpenAPISchemaKeyClass(id=self.id)
+
+    @classmethod
+    def from_key_aspect(cls, key_aspect: "DataHubOpenAPISchemaKeyClass") -> "DataHubOpenAPISchemaUrn":
+        return cls(id=key_aspect.id)
+
+    @property
+    def id(self) -> str:
+        return self._entity_ids[0]
+
 if TYPE_CHECKING:
     from datahub.metadata.schema_classes import GlobalSettingsKeyClass
 
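A hedged usage sketch for the new urn type, based on the class added above. The id value is made up, and the datahub.metadata.urns re-export path is an assumption:

from datahub.metadata.urns import DataHubOpenAPISchemaUrn  # assumed re-export path

urn = DataHubOpenAPISchemaUrn("systemMetadata")  # hypothetical schema id
print(urn.urn())           # urn:li:dataHubOpenAPISchema:systemMetadata
key = urn.to_key_aspect()  # DataHubOpenAPISchemaKeyClass from the hunk above
print(key.id)              # systemMetadata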
datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py
@@ -19,6 +19,7 @@ from ......schema_classes import DataHubAccessTokenKeyClass
 from ......schema_classes import DataHubActionKeyClass
 from ......schema_classes import DataHubConnectionKeyClass
 from ......schema_classes import DataHubIngestionSourceKeyClass
+from ......schema_classes import DataHubOpenAPISchemaKeyClass
 from ......schema_classes import DataHubPersonaKeyClass
 from ......schema_classes import DataHubPolicyKeyClass
 from ......schema_classes import DataHubRetentionKeyClass
@@ -72,6 +73,7 @@ DataHubAccessTokenKey = DataHubAccessTokenKeyClass
 DataHubActionKey = DataHubActionKeyClass
 DataHubConnectionKey = DataHubConnectionKeyClass
 DataHubIngestionSourceKey = DataHubIngestionSourceKeyClass
+DataHubOpenAPISchemaKey = DataHubOpenAPISchemaKeyClass
 DataHubPersonaKey = DataHubPersonaKeyClass
 DataHubPolicyKey = DataHubPolicyKeyClass
 DataHubRetentionKey = DataHubRetentionKeyClass