acryl-datahub 1.1.0.1rc6__py3-none-any.whl → 1.1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0.1rc6.dist-info → acryl_datahub-1.1.0.2.dist-info}/METADATA +2522 -2522
- {acryl_datahub-1.1.0.1rc6.dist-info → acryl_datahub-1.1.0.2.dist-info}/RECORD +25 -23
- datahub/_version.py +1 -1
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +10 -10
- datahub/api/entities/external/unity_catalog_external_entites.py +5 -2
- datahub/emitter/rest_emitter.py +29 -4
- datahub/ingestion/graph/client.py +2 -0
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/sink/datahub_rest.py +1 -0
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +488 -243
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/sql/mssql/source.py +207 -18
- datahub/ingestion/source/unity/source.py +2 -3
- datahub/metadata/_internal_schema_classes.py +499 -499
- datahub/metadata/_urns/urn_defs.py +1766 -1766
- datahub/metadata/schema.avsc +17480 -17093
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/main_client.py +3 -3
- {acryl_datahub-1.1.0.1rc6.dist-info → acryl_datahub-1.1.0.2.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.1rc6.dist-info → acryl_datahub-1.1.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0.1rc6.dist-info → acryl_datahub-1.1.0.2.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.1rc6.dist-info → acryl_datahub-1.1.0.2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/aws/tag_entities.py (new file)
@@ -0,0 +1,292 @@
+import logging
+from typing import List, Optional
+
+from pydantic import BaseModel
+
+from datahub.api.entities.external.external_entities import (
+    ExternalEntity,
+    ExternalEntityId,
+    LinkedResourceSet,
+    PlatformResourceRepository,
+)
+from datahub.api.entities.external.lake_formation_external_entites import (
+    LakeFormationTag,
+)
+from datahub.api.entities.platformresource.platform_resource import (
+    PlatformResource,
+    PlatformResourceKey,
+    PlatformResourceSearchFields,
+)
+from datahub.metadata.urns import TagUrn
+from datahub.utilities.search_utils import ElasticDocumentQuery
+from datahub.utilities.urns.urn import Urn
+
+logger = logging.getLogger(__name__)
+
+
+class LakeFormationTagSyncContext(BaseModel):
+    platform_instance: Optional[str] = None
+    catalog: Optional[str] = None
+
+
+class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
+    """
+    A LakeFormationTagPlatformResourceId is a unique identifier for a Lake Formation tag.
+    """
+
+    tag_key: str
+    tag_value: Optional[str] = None
+    platform_instance: Optional[str]
+    catalog: Optional[str] = None
+    exists_in_lake_formation: bool = False
+    persisted: bool = False
+
+    def __hash__(self) -> int:
+        return hash(self.to_platform_resource_key().id)
+
+    # this is a hack to make sure the property is a string and not a private pydantic field
+    @staticmethod
+    def _RESOURCE_TYPE() -> str:
+        return "LakeFormationTagPlatformResource"
+
+    def to_platform_resource_key(self) -> PlatformResourceKey:
+        return PlatformResourceKey(
+            platform="glue",
+            resource_type=str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
+            primary_key=f"{self.catalog}.{self.tag_key}:{self.tag_value}"
+            if self.catalog
+            else f"{self.tag_key}:{self.tag_value}",
+            platform_instance=self.platform_instance,
+        )
+
+    @classmethod
+    def from_tag(
+        cls,
+        tag: LakeFormationTag,
+        platform_instance: Optional[str],
+        platform_resource_repository: PlatformResourceRepository,
+        catalog: Optional[str] = None,
+        exists_in_lake_formation: bool = False,
+    ) -> "LakeFormationTagPlatformResourceId":
+        """
+        Creates a LakeFormationTagPlatformResourceId from a LakeFormationTag.
+        """
+
+        existing_platform_resource = cls.search_by_urn(
+            tag.to_datahub_tag_urn().urn(),
+            platform_resource_repository=platform_resource_repository,
+            tag_sync_context=LakeFormationTagSyncContext(
+                platform_instance=platform_instance,
+                catalog=catalog,
+            ),
+        )
+        if existing_platform_resource:
+            logger.info(
+                f"Found existing LakeFormationTagPlatformResourceId for tag {tag.key}: {existing_platform_resource}"
+            )
+            return existing_platform_resource
+
+        return LakeFormationTagPlatformResourceId(
+            tag_key=tag.key,
+            tag_value=tag.value if tag.value is not None else None,
+            platform_instance=platform_instance,
+            exists_in_lake_formation=exists_in_lake_formation,
+            catalog=catalog,
+            persisted=False,
+        )
+
+    @classmethod
+    def search_by_urn(
+        cls,
+        urn: str,
+        platform_resource_repository: PlatformResourceRepository,
+        tag_sync_context: LakeFormationTagSyncContext,
+    ) -> Optional["LakeFormationTagPlatformResourceId"]:
+        mapped_tags = [
+            t
+            for t in platform_resource_repository.search_by_filter(
+                ElasticDocumentQuery.create_from(
+                    (
+                        PlatformResourceSearchFields.RESOURCE_TYPE,
+                        str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
+                    ),
+                    (PlatformResourceSearchFields.SECONDARY_KEYS, urn),
+                )
+            )
+        ]
+        logger.info(
+            f"Found {len(mapped_tags)} mapped tags for URN {urn}. {mapped_tags}"
+        )
+        if len(mapped_tags) > 0:
+            for platform_resource in mapped_tags:
+                if (
+                    platform_resource.resource_info
+                    and platform_resource.resource_info.value
+                ):
+                    lake_formation_tag_platform_resource = (
+                        LakeFormationTagPlatformResource(
+                            **platform_resource.resource_info.value.as_pydantic_object(
+                                LakeFormationTagPlatformResource
+                            ).dict()
+                        )
+                    )
+                    if (
+                        lake_formation_tag_platform_resource.id.platform_instance
+                        == tag_sync_context.platform_instance
+                        and lake_formation_tag_platform_resource.id.catalog
+                        == tag_sync_context.catalog
+                    ):
+                        lake_formation_tag_id = lake_formation_tag_platform_resource.id
+                        lake_formation_tag_id.exists_in_lake_formation = True
+                        lake_formation_tag_id.persisted = True
+                        return lake_formation_tag_id
+                else:
+                    logger.warning(
+                        f"Platform resource {platform_resource} does not have a resource_info value"
+                    )
+                    continue
+
+        # If we reach here, it means we did not find a mapped tag for the URN
+        logger.info(
+            f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new LakeFormationTagPlatformResourceId."
+        )
+        return None
+
+    @classmethod
+    def from_datahub_urn(
+        cls,
+        urn: str,
+        platform_resource_repository: PlatformResourceRepository,
+        tag_sync_context: LakeFormationTagSyncContext,
+    ) -> "LakeFormationTagPlatformResourceId":
+        """
+        Creates a LakeFormationTagPlatformResourceId from a DataHub URN.
+        """
+        # First we check if we already have a mapped platform resource for this
+        # urn that is of the type LakeFormationTagPlatformResource.
+        # If we do, we can use it to create the LakeFormationTagPlatformResourceId.
+        # Else, we need to generate a new LakeFormationTagPlatformResourceId.
+        existing_platform_resource_id = cls.search_by_urn(
+            urn, platform_resource_repository, tag_sync_context
+        )
+        if existing_platform_resource_id:
+            logger.info(
+                f"Found existing LakeFormationTagPlatformResourceId for URN {urn}: {existing_platform_resource_id}"
+            )
+            return existing_platform_resource_id
+
+        # Otherwise, we need to create a new LakeFormationTagPlatformResourceId
+        new_tag_id = cls.generate_tag_id(tag_sync_context, urn)
+        if new_tag_id:
+            # we then check if this tag has already been ingested as a platform
+            # resource in the platform resource repository
+            resource_key = platform_resource_repository.get(
+                new_tag_id.to_platform_resource_key()
+            )
+            if resource_key:
+                logger.info(
+                    f"Tag {new_tag_id} already exists in platform resource repository with {resource_key}"
+                )
+                new_tag_id.exists_in_lake_formation = (
+                    True  # TODO: Check if this is a safe assumption
+                )
+            return new_tag_id
+        raise ValueError(
+            f"Unable to create LakeFormationTagPlatformResourceId from DataHub URN: {urn}"
+        )
+
+    @classmethod
+    def generate_tag_id(
+        cls, tag_sync_context: LakeFormationTagSyncContext, urn: str
+    ) -> "LakeFormationTagPlatformResourceId":
+        parsed_urn = Urn.from_string(urn)
+        entity_type = parsed_urn.entity_type
+        if entity_type == "tag":
+            new_tag_id = LakeFormationTagPlatformResourceId.from_datahub_tag(
+                TagUrn.from_string(urn), tag_sync_context
+            )
+        else:
+            raise ValueError(f"Unsupported entity type {entity_type} for URN {urn}")
+        return new_tag_id
+
+    @classmethod
+    def from_datahub_tag(
+        cls, tag_urn: TagUrn, tag_sync_context: LakeFormationTagSyncContext
+    ) -> "LakeFormationTagPlatformResourceId":
+        tag = LakeFormationTag.from_urn(tag_urn)
+
+        return LakeFormationTagPlatformResourceId(
+            tag_key=str(tag.key),
+            tag_value=str(tag.value),
+            platform_instance=tag_sync_context.platform_instance,
+            catalog=tag_sync_context.catalog,
+            exists_in_lake_formation=False,
+        )
+
+
+class LakeFormationTagPlatformResource(BaseModel, ExternalEntity):
+    datahub_urns: LinkedResourceSet
+    managed_by_datahub: bool
+    id: LakeFormationTagPlatformResourceId
+    allowed_values: Optional[List[str]]
+
+    def get_id(self) -> ExternalEntityId:
+        return self.id
+
+    def is_managed_by_datahub(self) -> bool:
+        return self.managed_by_datahub
+
+    def datahub_linked_resources(self) -> LinkedResourceSet:
+        return self.datahub_urns
+
+    def as_platform_resource(self) -> PlatformResource:
+        return PlatformResource.create(
+            key=self.id.to_platform_resource_key(),
+            secondary_keys=[u for u in self.datahub_urns.urns],
+            value=self,
+        )
+
+    @classmethod
+    def get_from_datahub(
+        cls,
+        lake_formation_tag_id: LakeFormationTagPlatformResourceId,
+        platform_resource_repository: PlatformResourceRepository,
+        managed_by_datahub: bool = False,
+    ) -> "LakeFormationTagPlatformResource":
+        # Search for linked DataHub URNs
+        platform_resources = [
+            r
+            for r in platform_resource_repository.search_by_filter(
+                ElasticDocumentQuery.create_from(
+                    (
+                        PlatformResourceSearchFields.RESOURCE_TYPE,
+                        str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
+                    ),
+                    (
+                        PlatformResourceSearchFields.PRIMARY_KEY,
+                        f"{lake_formation_tag_id.tag_key}/{lake_formation_tag_id.tag_value}",
+                    ),
+                )
+            )
+        ]
+        for platform_resource in platform_resources:
+            if (
+                platform_resource.resource_info
+                and platform_resource.resource_info.value
+            ):
+                lf_tag = LakeFormationTagPlatformResource(
+                    **platform_resource.resource_info.value.as_pydantic_object(
+                        LakeFormationTagPlatformResource
+                    ).dict()
+                )
+                if (
+                    lf_tag.id.platform_instance
+                    == lake_formation_tag_id.platform_instance
+                    and lf_tag.id.catalog == lake_formation_tag_id.catalog
+                ):
+                    return lf_tag
+        return cls(
+            id=lake_formation_tag_id,
+            datahub_urns=LinkedResourceSet(urns=[]),
+            managed_by_datahub=managed_by_datahub,
+            allowed_values=None,
+        )
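
For orientation, here is a minimal, hypothetical usage sketch of the new classes (assuming the module path shown in the file list above and an already-constructed PlatformResourceRepository; the platform instance and catalog values are invented):

from datahub.api.entities.external.external_entities import PlatformResourceRepository
from datahub.ingestion.source.aws.tag_entities import (
    LakeFormationTagPlatformResource,
    LakeFormationTagPlatformResourceId,
    LakeFormationTagSyncContext,
)


def resolve_lf_tag(
    urn: str, repository: PlatformResourceRepository
) -> LakeFormationTagPlatformResource:
    # Map a DataHub tag URN to its Lake Formation tag id, reusing an existing
    # mapping from the platform resource repository when one is found.
    context = LakeFormationTagSyncContext(
        platform_instance="prod", catalog="123456789012"
    )
    tag_id = LakeFormationTagPlatformResourceId.from_datahub_urn(
        urn,
        platform_resource_repository=repository,
        tag_sync_context=context,
    )
    # Fetch the full platform resource, or a fresh, unlinked one if none exists yet.
    return LakeFormationTagPlatformResource.get_from_datahub(
        tag_id, platform_resource_repository=repository
    )
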
datahub/ingestion/source/sql/mssql/source.py
@@ -323,9 +323,11 @@ class SQLServerSource(SQLAlchemySource):
         try:
             yield from self.loop_jobs(inspector, self.config)
         except Exception as e:
-            self.report.
-                "jobs",
-
+            self.report.failure(
+                message="Failed to list jobs",
+                title="SQL Server Jobs Extraction",
+                context="Error occurred during database-level job extraction",
+                exc=e,
             )

     def get_schema_level_workunits(
@@ -343,12 +345,158 @@ class SQLServerSource(SQLAlchemySource):
         try:
             yield from self.loop_stored_procedures(inspector, schema, self.config)
         except Exception as e:
-            self.report.
-            "
-
+            self.report.failure(
+                message="Failed to list stored procedures",
+                title="SQL Server Stored Procedures Extraction",
+                context="Error occurred during schema-level stored procedure extraction",
+                exc=e,
             )

+    def _detect_rds_environment(self, conn: Connection) -> bool:
+        """
+        Detect if we're running in an RDS/managed environment vs on-premises.
+        Returns True if RDS/managed, False if on-premises.
+        """
+        try:
+            # Try to access system tables directly - this typically fails in RDS
+            conn.execute("SELECT TOP 1 * FROM msdb.dbo.sysjobs")
+            logger.debug(
+                "Direct table access successful - likely on-premises environment"
+            )
+            return False
+        except Exception:
+            logger.debug("Direct table access failed - likely RDS/managed environment")
+            return True
+
     def _get_jobs(self, conn: Connection, db_name: str) -> Dict[str, Dict[str, Any]]:
+        """
+        Get job information with environment detection to choose optimal method first.
+        """
+        jobs: Dict[str, Dict[str, Any]] = {}
+
+        # Detect environment to choose optimal method first
+        is_rds = self._detect_rds_environment(conn)
+
+        if is_rds:
+            # Managed environment - try stored procedures first
+            try:
+                jobs = self._get_jobs_via_stored_procedures(conn, db_name)
+                logger.info(
+                    "Successfully retrieved jobs using stored procedures (managed environment)"
+                )
+                return jobs
+            except Exception as sp_error:
+                logger.warning(
+                    f"Failed to retrieve jobs via stored procedures in managed environment: {sp_error}"
+                )
+                # Try direct query as fallback (might work in some managed environments)
+                try:
+                    jobs = self._get_jobs_via_direct_query(conn, db_name)
+                    logger.info(
+                        "Successfully retrieved jobs using direct query fallback in managed environment"
+                    )
+                    return jobs
+                except Exception as direct_error:
+                    self.report.failure(
+                        message="Failed to retrieve jobs in managed environment",
+                        title="SQL Server Jobs Extraction",
+                        context="Both stored procedures and direct query methods failed",
+                        exc=direct_error,
+                    )
+        else:
+            # On-premises environment - try direct query first (usually faster)
+            try:
+                jobs = self._get_jobs_via_direct_query(conn, db_name)
+                logger.info(
+                    "Successfully retrieved jobs using direct query (on-premises environment)"
+                )
+                return jobs
+            except Exception as direct_error:
+                logger.warning(
+                    f"Failed to retrieve jobs via direct query in on-premises environment: {direct_error}"
+                )
+                # Try stored procedures as fallback
+                try:
+                    jobs = self._get_jobs_via_stored_procedures(conn, db_name)
+                    logger.info(
+                        "Successfully retrieved jobs using stored procedures fallback in on-premises environment"
+                    )
+                    return jobs
+                except Exception as sp_error:
+                    self.report.failure(
+                        message="Failed to retrieve jobs in on-premises environment",
+                        title="SQL Server Jobs Extraction",
+                        context="Both direct query and stored procedures methods failed",
+                        exc=sp_error,
+                    )
+
+        return jobs
+
+    def _get_jobs_via_stored_procedures(
+        self, conn: Connection, db_name: str
+    ) -> Dict[str, Dict[str, Any]]:
+        jobs: Dict[str, Dict[str, Any]] = {}
+
+        # First, get all jobs
+        jobs_result = conn.execute("EXEC msdb.dbo.sp_help_job")
+        jobs_data = {}
+
+        for row in jobs_result:
+            job_id = str(row["job_id"])
+            jobs_data[job_id] = {
+                "job_id": job_id,
+                "name": row["name"],
+                "description": row.get("description", ""),
+                "date_created": row.get("date_created"),
+                "date_modified": row.get("date_modified"),
+                "enabled": row.get("enabled", 1),
+            }
+
+        # Now get job steps for each job, filtering by database
+        for job_id, job_info in jobs_data.items():
+            try:
+                # Get steps for this specific job
+                steps_result = conn.execute(
+                    f"EXEC msdb.dbo.sp_help_jobstep @job_id = '{job_id}'"
+                )
+
+                job_steps = {}
+                for step_row in steps_result:
+                    # Only include steps that run against our target database
+                    step_database = step_row.get("database_name", "")
+                    if step_database.lower() == db_name.lower() or not step_database:
+                        step_data = {
+                            "job_id": job_id,
+                            "job_name": job_info["name"],
+                            "description": job_info["description"],
+                            "date_created": job_info["date_created"],
+                            "date_modified": job_info["date_modified"],
+                            "step_id": step_row["step_id"],
+                            "step_name": step_row["step_name"],
+                            "subsystem": step_row.get("subsystem", ""),
+                            "command": step_row.get("command", ""),
+                            "database_name": step_database,
+                        }
+                        job_steps[step_row["step_id"]] = step_data
+
+                # Only add job if it has relevant steps
+                if job_steps:
+                    jobs[job_info["name"]] = job_steps
+
+            except Exception as step_error:
+                logger.warning(
+                    f"Failed to get steps for job {job_info['name']}: {step_error}"
+                )
+                continue
+
+        return jobs
+
+    def _get_jobs_via_direct_query(
+        self, conn: Connection, db_name: str
+    ) -> Dict[str, Dict[str, Any]]:
+        """
+        Original method using direct table access for on-premises SQL Server.
+        """
         jobs_data = conn.execute(
             f"""
             SELECT
@@ -371,6 +519,7 @@ class SQLServerSource(SQLAlchemySource):
             where database_name = '{db_name}'
             """
         )
+
         jobs: Dict[str, Dict[str, Any]] = {}
         for row in jobs_data:
             step_data = dict(
@@ -383,11 +532,13 @@ class SQLServerSource(SQLAlchemySource):
                 step_name=row["step_name"],
                 subsystem=row["subsystem"],
                 command=row["command"],
+                database_name=row["database_name"],
             )
             if row["name"] in jobs:
                 jobs[row["name"]][row["step_id"]] = step_data
             else:
                 jobs[row["name"]] = {row["step_id"]: step_data}
+
         return jobs

     def loop_jobs(
@@ -397,21 +548,59 @@ class SQLServerSource(SQLAlchemySource):
     ) -> Iterable[MetadataWorkUnit]:
         """
         Loop MS SQL jobs as dataFlow-s.
-
+        Now supports both managed and on-premises SQL Server.
         """
         db_name = self.get_db_name(inspector)
-
-
-
-
-
-
-
-
+
+        try:
+            with inspector.engine.connect() as conn:
+                jobs = self._get_jobs(conn, db_name)
+
+                if not jobs:
+                    logger.info(f"No jobs found for database: {db_name}")
+                    return
+
+                logger.info(f"Found {len(jobs)} jobs for database: {db_name}")
+
+                for job_name, job_steps in jobs.items():
+                    try:
+                        job = MSSQLJob(
+                            name=job_name,
+                            env=sql_config.env,
+                            db=db_name,
+                            platform_instance=sql_config.platform_instance,
+                        )
+                        data_flow = MSSQLDataFlow(entity=job)
+                        yield from self.construct_flow_workunits(data_flow=data_flow)
+                        yield from self.loop_job_steps(job, job_steps)
+
+                    except Exception as job_error:
+                        logger.warning(f"Failed to process job {job_name}: {job_error}")
+                        self.report.warning(
+                            message=f"Failed to process job {job_name}",
+                            title="SQL Server Jobs Extraction",
+                            context="Error occurred while processing individual job",
+                            exc=job_error,
+                        )
+                        continue
+
+        except Exception as e:
+            error_message = f"Failed to retrieve jobs for database {db_name}: {e}"
+            logger.error(error_message)
+
+            # Provide specific guidance for permission issues
+            if "permission" in str(e).lower() or "denied" in str(e).lower():
+                permission_guidance = (
+                    "For managed SQL Server services, ensure the following permissions are granted:\n"
+                    "GRANT EXECUTE ON msdb.dbo.sp_help_job TO datahub_read;\n"
+                    "GRANT EXECUTE ON msdb.dbo.sp_help_jobstep TO datahub_read;\n"
+                    "For on-premises SQL Server, you may also need:\n"
+                    "GRANT SELECT ON msdb.dbo.sysjobs TO datahub_read;\n"
+                    "GRANT SELECT ON msdb.dbo.sysjobsteps TO datahub_read;"
                 )
-
-
-
+                logger.info(permission_guidance)
+
+            raise e

     def loop_job_steps(
         self, job: MSSQLJob, job_steps: Dict[str, Any]
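
For reference, a small illustrative example (all values invented) of the nested mapping that _get_jobs returns and loop_jobs consumes — job name, then step id, then the step attributes collected by either retrieval method:

from typing import Any, Dict

# Illustrative only: the shape of the jobs mapping produced by _get_jobs().
jobs: Dict[str, Dict[str, Any]] = {
    "nightly_etl": {
        1: {
            "job_id": "8d2f1c4e-0a58-4b11-9c47-3f1d2b6a7e90",
            "job_name": "nightly_etl",
            "description": "Refresh reporting tables",
            "date_created": "2024-01-01 00:00:00",
            "date_modified": "2024-06-01 00:00:00",
            "step_id": 1,
            "step_name": "load_staging",
            "subsystem": "TSQL",
            "command": "EXEC dbo.refresh_reporting_tables",
            "database_name": "analytics",
        }
    }
}
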
datahub/ingestion/source/unity/source.py
@@ -785,7 +785,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             description=schema.comment,
             owner_urn=self.get_owner_urn(schema.owner),
             external_url=f"{self.external_url_base}/{schema.catalog.name}/{schema.name}",
-            tags=[tag.to_datahub_tag_urn().
+            tags=[tag.to_datahub_tag_urn().name for tag in schema_tags]
             if schema_tags
             else None,
         )
@@ -830,7 +830,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             description=catalog.comment,
             owner_urn=self.get_owner_urn(catalog.owner),
             external_url=f"{self.external_url_base}/{catalog.name}",
-            tags=[tag.to_datahub_tag_urn().
+            tags=[tag.to_datahub_tag_urn().name for tag in catalog_tags]
             if catalog_tags
             else None,
         )
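
The two Unity Catalog hunks above now pass the plain tag name (tag.to_datahub_tag_urn().name) for schema and catalog tags. A quick sketch of what TagUrn exposes, using an invented tag name:

from datahub.metadata.urns import TagUrn

# Illustrative: TagUrn.name recovers the plain tag name from a tag URN string.
tag_urn = TagUrn.from_string("urn:li:tag:pii")
assert tag_urn.name == "pii"
assert tag_urn.urn() == "urn:li:tag:pii"
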
@@ -1083,7 +1083,6 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             )
         else:
             if tags is not None:
-                logger.debug(f"Column tags are: {tags}")
                 attribution = MetadataAttribution(
                     source="urn:li:dataPlatform:unity-catalog",
                     actor="urn:li:corpuser:datahub",