acryl-datahub 1.0.0.1rc7__py3-none-any.whl → 1.0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (76)
  1. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/METADATA +2561 -2561
  2. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/RECORD +75 -73
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/datajob/dataflow.py +15 -0
  5. datahub/api/entities/datajob/datajob.py +17 -0
  6. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  7. datahub/api/entities/dataset/dataset.py +2 -2
  8. datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
  9. datahub/cli/ingest_cli.py +4 -4
  10. datahub/cli/migrate.py +6 -6
  11. datahub/configuration/common.py +1 -1
  12. datahub/emitter/mcp_builder.py +4 -0
  13. datahub/ingestion/api/common.py +9 -0
  14. datahub/ingestion/api/source.py +4 -1
  15. datahub/ingestion/api/source_helpers.py +26 -1
  16. datahub/ingestion/graph/client.py +104 -0
  17. datahub/ingestion/run/pipeline.py +0 -6
  18. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  19. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  20. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  21. datahub/ingestion/source/fivetran/fivetran.py +1 -0
  22. datahub/ingestion/source/fivetran/fivetran_log_api.py +1 -1
  23. datahub/ingestion/source/hex/constants.py +5 -0
  24. datahub/ingestion/source/hex/hex.py +150 -22
  25. datahub/ingestion/source/hex/mapper.py +28 -2
  26. datahub/ingestion/source/hex/model.py +10 -2
  27. datahub/ingestion/source/hex/query_fetcher.py +300 -0
  28. datahub/ingestion/source/iceberg/iceberg.py +106 -18
  29. datahub/ingestion/source/kafka/kafka.py +1 -4
  30. datahub/ingestion/source/kafka_connect/sink_connectors.py +1 -1
  31. datahub/ingestion/source/kafka_connect/source_connectors.py +1 -1
  32. datahub/ingestion/source/looker/looker_source.py +2 -3
  33. datahub/ingestion/source/mlflow.py +6 -7
  34. datahub/ingestion/source/mode.py +2 -2
  35. datahub/ingestion/source/nifi.py +3 -3
  36. datahub/ingestion/source/openapi.py +3 -3
  37. datahub/ingestion/source/openapi_parser.py +8 -8
  38. datahub/ingestion/source/powerbi/config.py +1 -1
  39. datahub/ingestion/source/powerbi/powerbi.py +16 -3
  40. datahub/ingestion/source/redshift/profile.py +2 -2
  41. datahub/ingestion/source/sigma/sigma.py +6 -2
  42. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -1
  43. datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
  44. datahub/ingestion/source/sql/trino.py +4 -3
  45. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  46. datahub/ingestion/source/superset.py +108 -81
  47. datahub/ingestion/source/tableau/tableau.py +4 -4
  48. datahub/ingestion/source/tableau/tableau_common.py +2 -2
  49. datahub/ingestion/source/unity/source.py +1 -1
  50. datahub/ingestion/source/vertexai/vertexai.py +7 -7
  51. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  52. datahub/ingestion/transformer/add_dataset_ownership.py +1 -1
  53. datahub/ingestion/transformer/dataset_domain.py +1 -1
  54. datahub/lite/lite_util.py +2 -2
  55. datahub/metadata/_schema_classes.py +47 -2
  56. datahub/metadata/_urns/urn_defs.py +56 -0
  57. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  58. datahub/metadata/schema.avsc +121 -85
  59. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  60. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  61. datahub/metadata/schemas/FormInfo.avsc +5 -0
  62. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  63. datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  64. datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
  65. datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  66. datahub/metadata/schemas/QueryProperties.avsc +4 -2
  67. datahub/metadata/schemas/SystemMetadata.avsc +86 -0
  68. datahub/testing/mcp_diff.py +1 -1
  69. datahub/utilities/file_backed_collections.py +6 -6
  70. datahub/utilities/hive_schema_to_avro.py +2 -2
  71. datahub/utilities/ingest_utils.py +2 -2
  72. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  73. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/WHEEL +0 -0
  74. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/entry_points.txt +0 -0
  75. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/licenses/LICENSE +0 -0
  76. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/top_level.txt +0 -0
datahub/api/entities/structuredproperties/structuredproperties.py CHANGED
@@ -43,7 +43,7 @@ class AllowedValue(ConfigModel):
 
 
 VALID_ENTITY_TYPE_URNS = [
-    Urn.make_entity_type_urn(entity_type) for entity_type in URN_TYPES.keys()
+    Urn.make_entity_type_urn(entity_type) for entity_type in URN_TYPES
 ]
 _VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {', '.join(VALID_ENTITY_TYPE_URNS)}, etc... Ensure that the entity type is valid."
 
datahub/cli/ingest_cli.py CHANGED
@@ -216,9 +216,9 @@ def run(
 @click.option(
     "--executor-id",
     type=str,
-    default="default",
     help="Executor id to route execution requests to. Do not use this unless you have configured a custom executor.",
     required=False,
+    default=None,
 )
 @click.option(
     "--cli-version",
@@ -239,7 +239,7 @@ def run(
     type=str,
     help="Timezone for the schedule in 'America/New_York' format. Uses UTC by default.",
     required=False,
-    default="UTC",
+    default=None,
 )
 @click.option(
     "--debug", type=bool, help="Should we debug.", required=False, default=False
@@ -255,10 +255,10 @@ def deploy(
     name: Optional[str],
     config: str,
     urn: Optional[str],
-    executor_id: str,
+    executor_id: Optional[str],
     cli_version: Optional[str],
     schedule: Optional[str],
-    time_zone: str,
+    time_zone: Optional[str],
     extra_pip: Optional[str],
     debug: bool = False,
 ) -> None:
datahub/cli/migrate.py CHANGED
@@ -76,13 +76,13 @@ class MigrationReport:
     def __repr__(self) -> str:
         repr = f"{self._get_prefix()}Migration Report:\n--------------\n"
         repr += f"{self._get_prefix()}Migration Run Id: {self.run_id}\n"
-        repr += f"{self._get_prefix()}Num entities created = {len(set([x[0] for x in self.entities_created.keys()]))}\n"
-        repr += f"{self._get_prefix()}Num entities affected = {len(set([x[0] for x in self.entities_affected.keys()]))}\n"
-        repr += f"{self._get_prefix()}Num entities {'kept' if self.keep else 'migrated'} = {len(set([x[0] for x in self.entities_migrated.keys()]))}\n"
+        repr += f"{self._get_prefix()}Num entities created = {len(set([x[0] for x in self.entities_created]))}\n"
+        repr += f"{self._get_prefix()}Num entities affected = {len(set([x[0] for x in self.entities_affected]))}\n"
+        repr += f"{self._get_prefix()}Num entities {'kept' if self.keep else 'migrated'} = {len(set([x[0] for x in self.entities_migrated]))}\n"
         repr += f"{self._get_prefix()}Details:\n"
-        repr += f"{self._get_prefix()}New Entities Created: {set([x[0] for x in self.entities_created.keys()]) or 'None'}\n"
-        repr += f"{self._get_prefix()}External Entities Affected: {set([x[0] for x in self.entities_affected.keys()]) or 'None'}\n"
-        repr += f"{self._get_prefix()}Old Entities {'Kept' if self.keep else 'Migrated'} = {set([x[0] for x in self.entities_migrated.keys()]) or 'None'}\n"
+        repr += f"{self._get_prefix()}New Entities Created: {set([x[0] for x in self.entities_created]) or 'None'}\n"
+        repr += f"{self._get_prefix()}External Entities Affected: {set([x[0] for x in self.entities_affected]) or 'None'}\n"
+        repr += f"{self._get_prefix()}Old Entities {'Kept' if self.keep else 'Migrated'} = {set([x[0] for x in self.entities_migrated]) or 'None'}\n"
         return repr
 
 
datahub/configuration/common.py CHANGED
@@ -317,7 +317,7 @@ class KeyValuePattern(ConfigModel):
         return KeyValuePattern()
 
     def value(self, string: str) -> List[str]:
-        matching_keys = [key for key in self.rules.keys() if re.match(key, string)]
+        matching_keys = [key for key in self.rules if re.match(key, string)]
         if not matching_keys:
             return []
         elif self.first_match_only:
datahub/emitter/mcp_builder.py CHANGED
@@ -137,6 +137,10 @@ class ProjectIdKey(ContainerKey):
     project_id: str
 
 
+class ExperimentKey(ContainerKey):
+    id: str
+
+
 class MetastoreKey(ContainerKey):
     metastore: str
 
datahub/ingestion/api/common.py CHANGED
@@ -12,6 +12,9 @@ if TYPE_CHECKING:
 
 T = TypeVar("T")
 
+if TYPE_CHECKING:
+    from datahub.ingestion.run.pipeline_config import FlagsConfig
+
 
 @dataclass
 class RecordEnvelope(Generic[T]):
@@ -60,6 +63,12 @@ class PipelineContext:
 
         self._set_dataset_urn_to_lower_if_needed()
 
+    @property
+    def flags(self) -> "FlagsConfig":
+        from datahub.ingestion.run.pipeline_config import FlagsConfig
+
+        return self.pipeline_config.flags if self.pipeline_config else FlagsConfig()
+
     def _set_dataset_urn_to_lower_if_needed(self) -> None:
         # TODO: Get rid of this function once lower-casing is the standard.
         if self.graph:
datahub/ingestion/api/source.py CHANGED
@@ -39,6 +39,7 @@ from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
 from datahub.ingestion.api.report import Report
 from datahub.ingestion.api.source_helpers import (
+    AutoSystemMetadata,
     auto_browse_path_v2,
     auto_fix_duplicate_schema_field_paths,
     auto_fix_empty_field_paths,
@@ -475,8 +476,10 @@ class Source(Closeable, metaclass=ABCMeta):
         return stream
 
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        workunit_processors = self.get_workunit_processors()
+        workunit_processors.append(AutoSystemMetadata(self.ctx).stamp)
         return self._apply_workunit_processors(
-            self.get_workunit_processors(), auto_workunit(self.get_workunits_internal())
+            workunit_processors, auto_workunit(self.get_workunits_internal())
         )
 
     def get_workunits_internal(
datahub/ingestion/api/source_helpers.py CHANGED
@@ -13,9 +13,14 @@ from typing import (
 )
 
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
-from datahub.emitter.mce_builder import make_dataplatform_instance_urn, parse_ts_millis
+from datahub.emitter.mce_builder import (
+    get_sys_time,
+    make_dataplatform_instance_urn,
+    parse_ts_millis,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import entity_supports_aspect
+from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.schema_classes import (
     BrowsePathEntryClass,
@@ -544,3 +549,23 @@ def _prepend_platform_instance(
         return [BrowsePathEntryClass(id=urn, urn=urn)] + entries
 
     return entries
+
+
+class AutoSystemMetadata:
+    def __init__(self, ctx: PipelineContext):
+        self.ctx = ctx
+
+    def stamp(self, stream: Iterable[MetadataWorkUnit]) -> Iterable[MetadataWorkUnit]:
+        for wu in stream:
+            yield self.stamp_wu(wu)
+
+    def stamp_wu(self, wu: MetadataWorkUnit) -> MetadataWorkUnit:
+        if self.ctx.flags.set_system_metadata:
+            if not wu.metadata.systemMetadata:
+                wu.metadata.systemMetadata = SystemMetadataClass()
+            wu.metadata.systemMetadata.runId = self.ctx.run_id
+            if not wu.metadata.systemMetadata.lastObserved:
+                wu.metadata.systemMetadata.lastObserved = get_sys_time()
+            if self.ctx.flags.set_system_metadata_pipeline_name:
+                wu.metadata.systemMetadata.pipelineName = self.ctx.pipeline_name
+        return wu
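
The AutoSystemMetadata helper above is wired in through Source.get_workunits (see the datahub/ingestion/api/source.py hunk), where its stamp method is appended to the list of workunit processors. A workunit processor is simply a callable that takes a stream of workunits and yields a (possibly modified) stream. A minimal sketch of that chaining pattern, assuming a list of such callables; this is an illustration, not the library's internal _apply_workunit_processors:

    from typing import Callable, Iterable, List

    from datahub.ingestion.api.workunit import MetadataWorkUnit

    # Illustrative only: each processor lazily wraps the previous stream, so a
    # stamping step like AutoSystemMetadata(ctx).stamp can be chained with others.
    WorkUnitProcessor = Callable[[Iterable[MetadataWorkUnit]], Iterable[MetadataWorkUnit]]


    def apply_processors(
        processors: List[WorkUnitProcessor],
        stream: Iterable[MetadataWorkUnit],
    ) -> Iterable[MetadataWorkUnit]:
        for processor in processors:
            stream = processor(stream)
        return stream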
datahub/ingestion/graph/client.py CHANGED
@@ -27,6 +27,7 @@ from pydantic import BaseModel
 from requests.models import HTTPError
 from typing_extensions import deprecated
 
+from datahub._codegen.aspect import _Aspect
 from datahub.cli import config_utils
 from datahub.configuration.common import ConfigModel, GraphError, OperationalError
 from datahub.emitter.aspect import TIMESERIES_ASPECT_MAP
@@ -1697,6 +1698,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
 
         return res["runAssertionsForAsset"]
 
+    @deprecated("Use get_entities instead which returns typed aspects")
     def get_entities_v2(
         self,
         entity_name: str,
@@ -1736,6 +1738,108 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
                 retval[entity_urn][aspect_key] = aspect_value
         return retval
 
+    def get_entities(
+        self,
+        entity_name: str,
+        urns: List[str],
+        aspects: Optional[List[str]] = None,
+        with_system_metadata: bool = False,
+    ) -> Dict[str, Dict[str, Tuple[_Aspect, Optional[SystemMetadataClass]]]]:
+        """
+        Get entities using the OpenAPI v3 endpoint, deserializing aspects into typed objects.
+
+        Args:
+            entity_name: The entity type name
+            urns: List of entity URNs to fetch
+            aspects: Optional list of aspect names to fetch. If None, all aspects will be fetched.
+            with_system_metadata: If True, return system metadata along with each aspect.
+
+        Returns:
+            A dictionary mapping URNs to a dictionary of aspect name to tuples of
+            (typed aspect object, system metadata). If with_system_metadata is False,
+            the system metadata in the tuple will be None.
+        """
+        aspects = aspects or []
+
+        request_payload = []
+        for urn in urns:
+            entity_request: Dict[str, Any] = {"urn": urn}
+            for aspect_name in aspects:
+                entity_request[aspect_name] = {}
+            request_payload.append(entity_request)
+
+        headers: Dict[str, Any] = {
+            "Accept": "application/json",
+            "Content-Type": "application/json",
+        }
+
+        url = f"{self.config.server}/openapi/v3/entity/{entity_name}/batchGet"
+        if with_system_metadata:
+            url += "?systemMetadata=true"
+
+        response = self._session.post(
+            url, data=json.dumps(request_payload), headers=headers
+        )
+        response.raise_for_status()
+        entities = response.json()
+
+        result: Dict[str, Dict[str, Tuple[_Aspect, Optional[SystemMetadataClass]]]] = {}
+
+        for entity in entities:
+            entity_urn = entity.get("urn")
+            if entity_urn is None:
+                logger.warning(
+                    f"Missing URN in entity response: {entity}, skipping deserialization"
+                )
+                continue
+
+            entity_aspects: Dict[
+                str, Tuple[_Aspect, Optional[SystemMetadataClass]]
+            ] = {}
+
+            for aspect_name, aspect_obj in entity.items():
+                if aspect_name == "urn":
+                    continue
+
+                aspect_class = ASPECT_NAME_MAP.get(aspect_name)
+                if aspect_class is None:
+                    logger.warning(
+                        f"Unknown aspect type {aspect_name}, skipping deserialization"
+                    )
+                    continue
+
+                aspect_value = aspect_obj.get("value")
+                if aspect_value is None:
+                    logger.warning(
+                        f"Unknown aspect value for aspect {aspect_name}, skipping deserialization"
+                    )
+                    continue
+
+                try:
+                    post_json_obj = post_json_transform(aspect_value)
+                    typed_aspect = aspect_class.from_obj(post_json_obj)
+                    assert isinstance(typed_aspect, aspect_class) and isinstance(
+                        typed_aspect, _Aspect
+                    )
+
+                    system_metadata = None
+                    if with_system_metadata:
+                        system_metadata_obj = aspect_obj.get("systemMetadata")
+                        if system_metadata_obj:
+                            system_metadata = SystemMetadataClass.from_obj(
+                                system_metadata_obj
+                            )
+
+                    entity_aspects[aspect_name] = (typed_aspect, system_metadata)
+                except Exception as e:
+                    logger.error(f"Error deserializing aspect {aspect_name}: {e}")
+                    raise
+
+            if entity_aspects:
+                result[entity_urn] = entity_aspects
+
+        return result
+
     def upsert_custom_assertion(
         self,
         urn: Optional[str],
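
The new DataHubGraph.get_entities above batches URNs against the OpenAPI v3 batchGet endpoint and deserializes each aspect into its typed class. A minimal usage sketch, assuming a reachable DataHub instance; the server address, URN, and aspect names are illustrative:

    from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig

    graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

    # Illustrative URN and aspect names; any valid entity URN and aspect list works.
    urn = "urn:li:dataset:(urn:li:dataPlatform:hive,example_db.example_table,PROD)"
    entities = graph.get_entities(
        entity_name="dataset",
        urns=[urn],
        aspects=["datasetProperties", "status"],
        with_system_metadata=True,
    )
    for aspect_name, (aspect, system_metadata) in entities.get(urn, {}).items():
        # aspect is a typed aspect object; system_metadata is a SystemMetadataClass or None.
        print(aspect_name, type(aspect).__name__, system_metadata)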
datahub/ingestion/run/pipeline.py CHANGED
@@ -39,9 +39,6 @@ from datahub.ingestion.run.sink_callback import DeadLetterQueueCallback, Logging
 from datahub.ingestion.sink.datahub_rest import DatahubRestSink
 from datahub.ingestion.sink.sink_registry import sink_registry
 from datahub.ingestion.source.source_registry import source_registry
-from datahub.ingestion.transformer.system_metadata_transformer import (
-    SystemMetadataTransformer,
-)
 from datahub.ingestion.transformer.transform_registry import transform_registry
 from datahub.sdk._attribution import KnownAttribution, change_default_attribution
 from datahub.telemetry import stats
@@ -286,9 +283,6 @@ class Pipeline:
                 f"Transformer type:{transformer_type},{transformer_class} configured"
             )
 
-        # Add the system metadata transformer at the end of the list.
-        self.transformers.append(SystemMetadataTransformer(self.ctx))
-
     def _configure_reporting(self, report_to: Optional[str]) -> None:
         if self.dry_run:
             # In dry run mode, we don't want to report anything.
datahub/ingestion/source/aws/sagemaker_processors/models.py CHANGED
@@ -323,7 +323,7 @@ class ModelProcessor:
         model_training_jobs = model_training_jobs.union(
             {
                 job_urn
-                for job_urn, job_direction in data_url_matched_jobs.keys()
+                for job_urn, job_direction in data_url_matched_jobs
                 if job_direction == JobDirection.TRAINING
             }
         )
@@ -331,7 +331,7 @@ class ModelProcessor:
         model_downstream_jobs = model_downstream_jobs.union(
             {
                 job_urn
-                for job_urn, job_direction in data_url_matched_jobs.keys()
+                for job_urn, job_direction in data_url_matched_jobs
                 if job_direction == JobDirection.DOWNSTREAM
             }
         )
@@ -368,7 +368,7 @@ class ModelProcessor:
         model_training_jobs = model_training_jobs.union(
             {
                 job_urn
-                for job_urn, job_direction in name_matched_jobs.keys()
+                for job_urn, job_direction in name_matched_jobs
                 if job_direction == JobDirection.TRAINING
             }
         )
@@ -376,7 +376,7 @@ class ModelProcessor:
        model_downstream_jobs = model_downstream_jobs.union(
             {
                 job_urn
-                for job_urn, job_direction in name_matched_jobs.keys()
+                for job_urn, job_direction in name_matched_jobs
                 if job_direction == JobDirection.DOWNSTREAM
             }
         )
datahub/ingestion/source/bigquery_v2/lineage.py CHANGED
@@ -375,7 +375,7 @@ class BigqueryLineageExtractor:
                 memory_footprint.total_size(lineage)
             )
 
-        for lineage_key in lineage.keys():
+        for lineage_key in lineage:
             # For views, we do not use the upstreams obtained by parsing audit logs
             # as they may contain indirectly referenced tables.
             if (
datahub/ingestion/source/dynamodb/dynamodb.py CHANGED
@@ -362,7 +362,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
         if self.config.include_table_item is None:
             return
         dataset_name = f"{region}.{table_name}"
-        if dataset_name not in self.config.include_table_item.keys():
+        if dataset_name not in self.config.include_table_item:
             return
         primary_key_list = self.config.include_table_item.get(dataset_name)
         assert isinstance(primary_key_list, List)
datahub/ingestion/source/fivetran/fivetran.py CHANGED
@@ -215,6 +215,7 @@ class FivetranSource(StatefulIngestionSourceBase):
         datajob = DataJob(
             id=connector.connector_id,
             flow_urn=dataflow_urn,
+            platform_instance=self.config.platform_instance,
             name=connector.connector_name,
             owners={owner_email} if owner_email else set(),
         )
datahub/ingestion/source/fivetran/fivetran_log_api.py CHANGED
@@ -190,7 +190,7 @@ class FivetranLogAPI:
         jobs: List[Job] = []
         if connector_sync_log is None:
             return jobs
-        for sync_id in connector_sync_log.keys():
+        for sync_id in connector_sync_log:
             if len(connector_sync_log[sync_id]) != 2:
                 # If both sync-start and sync-end event log not present for this sync that means sync is still in progress
                 continue
datahub/ingestion/source/hex/constants.py CHANGED
@@ -1,3 +1,8 @@
+from datahub.metadata.urns import DataPlatformUrn
+
 HEX_PLATFORM_NAME = "hex"
+HEX_PLATFORM_URN = DataPlatformUrn(platform_name=HEX_PLATFORM_NAME)
 HEX_API_BASE_URL_DEFAULT = "https://app.hex.tech/api/v1"
 HEX_API_PAGE_SIZE_DEFAULT = 100
+
+DATAHUB_API_PAGE_SIZE_DEFAULT = 100
datahub/ingestion/source/hex/hex.py CHANGED
@@ -1,9 +1,12 @@
+from dataclasses import dataclass
+from datetime import datetime, timedelta, timezone
 from typing import Any, Dict, Iterable, List, Optional
 
-from pydantic import Field, SecretStr
+from pydantic import Field, SecretStr, root_validator
 from typing_extensions import assert_never
 
 from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.datetimes import parse_user_datetime
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
@@ -21,12 +24,17 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.hex.api import HexApi, HexApiReport
 from datahub.ingestion.source.hex.constants import (
+    DATAHUB_API_PAGE_SIZE_DEFAULT,
     HEX_API_BASE_URL_DEFAULT,
     HEX_API_PAGE_SIZE_DEFAULT,
     HEX_PLATFORM_NAME,
 )
 from datahub.ingestion.source.hex.mapper import Mapper
 from datahub.ingestion.source.hex.model import Component, Project
+from datahub.ingestion.source.hex.query_fetcher import (
+    HexQueryFetcher,
+    HexQueryFetcherReport,
+)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -34,9 +42,10 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
-    StatefulIngestionReport,
     StatefulIngestionSourceBase,
 )
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
+from datahub.sdk.main_client import DataHubClient
 
 
 class HexSourceConfig(
@@ -93,9 +102,73 @@ class HexSourceConfig(
         default=True,
         description="Set ownership identity from owner/creator email",
     )
+    include_lineage: bool = Field(
+        default=True,
+        description='Include Hex lineage, being fetched from DataHub. See "Limitations" section in the docs for more details about the limitations of this feature.',
+    )
+    lineage_start_time: Optional[datetime] = Field(
+        default=None,
+        description="Earliest date of lineage to consider. Default: 1 day before lineage end time. You can specify absolute time like '2023-01-01' or relative time like '-7 days' or '-7d'.",
+    )
+    lineage_end_time: Optional[datetime] = Field(
+        default=None,
+        description="Latest date of lineage to consider. Default: Current time in UTC. You can specify absolute time like '2023-01-01' or relative time like '-1 day' or '-1d'.",
+    )
+    datahub_page_size: int = Field(
+        default=DATAHUB_API_PAGE_SIZE_DEFAULT,
+        description="Number of items to fetch per DataHub API call.",
+    )
+
+    @root_validator(pre=True)
+    def validate_lineage_times(cls, data: Dict[str, Any]) -> Dict[str, Any]:
+        # lineage_end_time default = now
+        if "lineage_end_time" not in data or data["lineage_end_time"] is None:
+            data["lineage_end_time"] = datetime.now(tz=timezone.utc)
+        # if string is given, parse it
+        if isinstance(data["lineage_end_time"], str):
+            data["lineage_end_time"] = parse_user_datetime(data["lineage_end_time"])
+        # if no timezone is given, assume UTC
+        if data["lineage_end_time"].tzinfo is None:
+            data["lineage_end_time"] = data["lineage_end_time"].replace(
+                tzinfo=timezone.utc
+            )
+        # at this point, we ensure there is a non null datetime with UTC timezone for lineage_end_time
+        assert (
+            data["lineage_end_time"]
+            and isinstance(data["lineage_end_time"], datetime)
+            and data["lineage_end_time"].tzinfo is not None
+            and data["lineage_end_time"].tzinfo == timezone.utc
+        )
+
+        # lineage_start_time default = lineage_end_time - 1 day
+        if "lineage_start_time" not in data or data["lineage_start_time"] is None:
+            data["lineage_start_time"] = data["lineage_end_time"] - timedelta(days=1)
+        # if string is given, parse it
+        if isinstance(data["lineage_start_time"], str):
+            data["lineage_start_time"] = parse_user_datetime(data["lineage_start_time"])
+        # if no timezone is given, assume UTC
+        if data["lineage_start_time"].tzinfo is None:
+            data["lineage_start_time"] = data["lineage_start_time"].replace(
+                tzinfo=timezone.utc
+            )
+        # at this point, we ensure there is a non null datetime with UTC timezone for lineage_start_time
+        assert (
+            data["lineage_start_time"]
+            and isinstance(data["lineage_start_time"], datetime)
+            and data["lineage_start_time"].tzinfo is not None
+            and data["lineage_start_time"].tzinfo == timezone.utc
+        )
+
+        return data
 
 
-class HexReport(StaleEntityRemovalSourceReport, HexApiReport):
+@dataclass
+class HexReport(
+    StaleEntityRemovalSourceReport,
+    HexApiReport,
+    IngestionStageReport,
+    HexQueryFetcherReport,
+):
     pass
 
 
@@ -110,7 +183,7 @@ class HexSource(StatefulIngestionSourceBase):
     def __init__(self, config: HexSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.source_config = config
-        self.report = HexReport()
+        self.report: HexReport = HexReport()
         self.platform = HEX_PLATFORM_NAME
         self.hex_api = HexApi(
             report=self.report,
@@ -129,6 +202,28 @@ class HexSource(StatefulIngestionSourceBase):
             categories_as_tags=self.source_config.categories_as_tags,
             set_ownership_from_email=self.source_config.set_ownership_from_email,
         )
+        self.project_registry: Dict[str, Project] = {}
+        self.component_registry: Dict[str, Component] = {}
+
+        self.datahub_client: Optional[DataHubClient] = None
+        self.query_fetcher: Optional[HexQueryFetcher] = None
+        if self.source_config.include_lineage:
+            graph = ctx.require_graph("Lineage")
+            assert self.source_config.lineage_start_time and isinstance(
+                self.source_config.lineage_start_time, datetime
+            )
+            assert self.source_config.lineage_end_time and isinstance(
+                self.source_config.lineage_end_time, datetime
+            )
+            self.datahub_client = DataHubClient(graph=graph)
+            self.query_fetcher = HexQueryFetcher(
+                datahub_client=self.datahub_client,
+                workspace_name=self.source_config.workspace_name,
+                start_datetime=self.source_config.lineage_start_time,
+                end_datetime=self.source_config.lineage_end_time,
+                report=self.report,
+                page_size=self.source_config.datahub_page_size,
+            )
 
     @classmethod
     def create(cls, config_dict: Dict[str, Any], ctx: PipelineContext) -> "HexSource":
@@ -143,25 +238,58 @@ class HexSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]
 
-    def get_report(self) -> StatefulIngestionReport:
+    def get_report(self) -> HexReport:
         return self.report
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        yield from self.mapper.map_workspace()
-
-        for project_or_component in self.hex_api.fetch_projects():
-            if isinstance(project_or_component, Project):
-                if self.source_config.project_title_pattern.allowed(
-                    project_or_component.title
-                ):
-                    yield from self.mapper.map_project(project=project_or_component)
-            elif isinstance(project_or_component, Component):
-                if (
-                    self.source_config.include_components
-                    and self.source_config.component_title_pattern.allowed(
+        with self.report.new_stage("Fetch Hex assets from Hex API"):
+            for project_or_component in self.hex_api.fetch_projects():
+                if isinstance(project_or_component, Project):
+                    if self.source_config.project_title_pattern.allowed(
                         project_or_component.title
-                    )
-                ):
-                    yield from self.mapper.map_component(component=project_or_component)
-            else:
-                assert_never(project_or_component)
+                    ):
+                        self.project_registry[project_or_component.id] = (
+                            project_or_component
+                        )
+                elif isinstance(project_or_component, Component):
+                    if (
+                        self.source_config.include_components
+                        and self.source_config.component_title_pattern.allowed(
+                            project_or_component.title
+                        )
+                    ):
+                        self.component_registry[project_or_component.id] = (
+                            project_or_component
+                        )
+                else:
+                    assert_never(project_or_component)
+
+        if self.source_config.include_lineage:
+            assert self.datahub_client and self.query_fetcher
+
+            with self.report.new_stage(
+                "Fetch Hex lineage from existing Queries in DataHub"
+            ):
+                for query_metadata in self.query_fetcher.fetch():
+                    project = self.project_registry.get(query_metadata.hex_project_id)
+                    if project:
+                        project.upstream_datasets.extend(
+                            query_metadata.dataset_subjects
+                        )
+                        project.upstream_schema_fields.extend(
+                            query_metadata.schema_field_subjects
+                        )
+                    else:
+                        self.report.report_warning(
+                            title="Missing project for lineage",
+                            message="Lineage missed because missed project, likely due to filter patterns or deleted project.",
+                            context=str(query_metadata),
+                        )
+
+        with self.report.new_stage("Emit"):
+            yield from self.mapper.map_workspace()
+
+            for project in self.project_registry.values():
+                yield from self.mapper.map_project(project=project)
+            for component in self.component_registry.values():
+                yield from self.mapper.map_component(component=component)
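
The lineage options added to HexSourceConfig above accept absolute or relative time strings and are normalized to UTC by the validate_lineage_times validator. A hedged configuration sketch, where the workspace name and token are placeholders and the set of required fields may vary by release:

    from datahub.ingestion.source.hex.hex import HexSourceConfig

    config = HexSourceConfig.parse_obj(
        {
            "workspace_name": "my-hex-workspace",  # placeholder
            "token": "hex-api-token-placeholder",  # placeholder
            "include_lineage": True,
            "lineage_start_time": "-7 days",  # relative form; absolute dates like "2023-01-01" also work
            # "lineage_end_time" omitted: defaults to the current time in UTC
            "datahub_page_size": 100,
        }
    )
    # Both bounds end up as timezone-aware UTC datetimes after validation.
    print(config.lineage_start_time, config.lineage_end_time)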