acryl-datahub 1.0.0.1rc6__py3-none-any.whl → 1.0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (82)
  1. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/METADATA +2557 -2557
  2. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/RECORD +81 -79
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/datajob/dataflow.py +15 -0
  5. datahub/api/entities/datajob/datajob.py +17 -0
  6. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  7. datahub/api/entities/dataset/dataset.py +2 -2
  8. datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
  9. datahub/cli/ingest_cli.py +4 -4
  10. datahub/cli/migrate.py +6 -6
  11. datahub/configuration/common.py +1 -1
  12. datahub/emitter/mcp_builder.py +4 -0
  13. datahub/errors.py +4 -0
  14. datahub/ingestion/api/common.py +9 -0
  15. datahub/ingestion/api/source.py +6 -2
  16. datahub/ingestion/api/source_helpers.py +35 -2
  17. datahub/ingestion/graph/client.py +122 -7
  18. datahub/ingestion/graph/filters.py +41 -16
  19. datahub/ingestion/run/pipeline.py +0 -6
  20. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  21. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  22. datahub/ingestion/source/cassandra/cassandra.py +1 -10
  23. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  24. datahub/ingestion/source/fivetran/fivetran.py +1 -0
  25. datahub/ingestion/source/fivetran/fivetran_log_api.py +1 -1
  26. datahub/ingestion/source/hex/constants.py +5 -0
  27. datahub/ingestion/source/hex/hex.py +150 -22
  28. datahub/ingestion/source/hex/mapper.py +28 -2
  29. datahub/ingestion/source/hex/model.py +10 -2
  30. datahub/ingestion/source/hex/query_fetcher.py +300 -0
  31. datahub/ingestion/source/iceberg/iceberg.py +106 -18
  32. datahub/ingestion/source/kafka/kafka.py +1 -4
  33. datahub/ingestion/source/kafka_connect/sink_connectors.py +1 -1
  34. datahub/ingestion/source/kafka_connect/source_connectors.py +1 -1
  35. datahub/ingestion/source/looker/looker_source.py +2 -3
  36. datahub/ingestion/source/mlflow.py +6 -7
  37. datahub/ingestion/source/mode.py +2 -2
  38. datahub/ingestion/source/nifi.py +3 -3
  39. datahub/ingestion/source/openapi.py +3 -3
  40. datahub/ingestion/source/openapi_parser.py +8 -8
  41. datahub/ingestion/source/powerbi/config.py +1 -1
  42. datahub/ingestion/source/powerbi/powerbi.py +16 -3
  43. datahub/ingestion/source/redshift/profile.py +2 -2
  44. datahub/ingestion/source/sigma/sigma.py +6 -2
  45. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -1
  46. datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
  47. datahub/ingestion/source/sql/trino.py +4 -3
  48. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  49. datahub/ingestion/source/superset.py +108 -81
  50. datahub/ingestion/source/tableau/tableau.py +4 -4
  51. datahub/ingestion/source/tableau/tableau_common.py +2 -2
  52. datahub/ingestion/source/unity/source.py +1 -1
  53. datahub/ingestion/source/vertexai/vertexai.py +7 -7
  54. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  55. datahub/ingestion/transformer/add_dataset_ownership.py +1 -1
  56. datahub/ingestion/transformer/dataset_domain.py +1 -1
  57. datahub/lite/lite_util.py +2 -2
  58. datahub/metadata/_schema_classes.py +47 -2
  59. datahub/metadata/_urns/urn_defs.py +56 -0
  60. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  61. datahub/metadata/schema.avsc +121 -85
  62. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  63. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  64. datahub/metadata/schemas/FormInfo.avsc +5 -0
  65. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  66. datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  67. datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
  68. datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  69. datahub/metadata/schemas/QueryProperties.avsc +4 -2
  70. datahub/metadata/schemas/SystemMetadata.avsc +86 -0
  71. datahub/sdk/search_client.py +81 -8
  72. datahub/sdk/search_filters.py +73 -11
  73. datahub/testing/mcp_diff.py +1 -1
  74. datahub/utilities/file_backed_collections.py +6 -6
  75. datahub/utilities/hive_schema_to_avro.py +2 -2
  76. datahub/utilities/ingest_utils.py +2 -2
  77. datahub/utilities/threaded_iterator_executor.py +16 -3
  78. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  79. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/WHEEL +0 -0
  80. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/entry_points.txt +0 -0
  81. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/licenses/LICENSE +0 -0
  82. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/cassandra/cassandra.py
@@ -123,16 +123,7 @@ class CassandraSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]
 
-    def get_workunits_internal(
-        self,
-    ) -> Iterable[MetadataWorkUnit]:
-        for metadata in self._get_metadata():
-            if isinstance(metadata, MetadataWorkUnit):
-                yield metadata
-            else:
-                yield from metadata.as_workunits()
-
-    def _get_metadata(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
+    def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         if not self.cassandra_api.authenticate():
             return
         keyspaces: List[CassandraKeyspace] = self.cassandra_api.get_keyspaces()
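
The removed wrapper shows what the source used to do with the mixed stream: pass MetadataWorkUnit objects through and expand SDK entities via as_workunits(). After this change, get_workunits_internal yields the mixed stream directly and the conversion happens downstream. A minimal sketch of that normalization, mirroring the removed lines (the helper name is hypothetical, not the framework's actual function):

    from typing import Iterable, Union

    from datahub.ingestion.api.workunit import MetadataWorkUnit

    def flatten_to_workunits(
        stream: Iterable[Union[MetadataWorkUnit, "Entity"]],
    ) -> Iterable[MetadataWorkUnit]:
        for item in stream:
            if isinstance(item, MetadataWorkUnit):
                # already a workunit: pass it through unchanged
                yield item
            else:
                # SDK entity: expand it into workunits, as the removed code did
                yield from item.as_workunits()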

datahub/ingestion/source/dynamodb/dynamodb.py
@@ -362,7 +362,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
         if self.config.include_table_item is None:
             return
         dataset_name = f"{region}.{table_name}"
-        if dataset_name not in self.config.include_table_item.keys():
+        if dataset_name not in self.config.include_table_item:
             return
         primary_key_list = self.config.include_table_item.get(dataset_name)
         assert isinstance(primary_key_list, List)

datahub/ingestion/source/fivetran/fivetran.py
@@ -215,6 +215,7 @@ class FivetranSource(StatefulIngestionSourceBase):
         datajob = DataJob(
             id=connector.connector_id,
             flow_urn=dataflow_urn,
+            platform_instance=self.config.platform_instance,
             name=connector.connector_name,
             owners={owner_email} if owner_email else set(),
         )

datahub/ingestion/source/fivetran/fivetran_log_api.py
@@ -190,7 +190,7 @@ class FivetranLogAPI:
         jobs: List[Job] = []
         if connector_sync_log is None:
             return jobs
-        for sync_id in connector_sync_log.keys():
+        for sync_id in connector_sync_log:
             if len(connector_sync_log[sync_id]) != 2:
                 # If both sync-start and sync-end event log not present for this sync that means sync is still in progress
                 continue

datahub/ingestion/source/hex/constants.py
@@ -1,3 +1,8 @@
+from datahub.metadata.urns import DataPlatformUrn
+
 HEX_PLATFORM_NAME = "hex"
+HEX_PLATFORM_URN = DataPlatformUrn(platform_name=HEX_PLATFORM_NAME)
 HEX_API_BASE_URL_DEFAULT = "https://app.hex.tech/api/v1"
 HEX_API_PAGE_SIZE_DEFAULT = 100
+
+DATAHUB_API_PAGE_SIZE_DEFAULT = 100
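
The new HEX_PLATFORM_URN simply wraps the existing platform name in a typed URN. For reference, a small check of what it stringifies to, assuming the standard urn:li:dataPlatform:<name> encoding:

    from datahub.metadata.urns import DataPlatformUrn

    # Same construction as HEX_PLATFORM_URN above.
    assert DataPlatformUrn(platform_name="hex").urn() == "urn:li:dataPlatform:hex"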

datahub/ingestion/source/hex/hex.py
@@ -1,9 +1,12 @@
+from dataclasses import dataclass
+from datetime import datetime, timedelta, timezone
 from typing import Any, Dict, Iterable, List, Optional
 
-from pydantic import Field, SecretStr
+from pydantic import Field, SecretStr, root_validator
 from typing_extensions import assert_never
 
 from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.datetimes import parse_user_datetime
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
@@ -21,12 +24,17 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.hex.api import HexApi, HexApiReport
 from datahub.ingestion.source.hex.constants import (
+    DATAHUB_API_PAGE_SIZE_DEFAULT,
     HEX_API_BASE_URL_DEFAULT,
     HEX_API_PAGE_SIZE_DEFAULT,
     HEX_PLATFORM_NAME,
 )
 from datahub.ingestion.source.hex.mapper import Mapper
 from datahub.ingestion.source.hex.model import Component, Project
+from datahub.ingestion.source.hex.query_fetcher import (
+    HexQueryFetcher,
+    HexQueryFetcherReport,
+)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -34,9 +42,10 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
-    StatefulIngestionReport,
     StatefulIngestionSourceBase,
 )
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
+from datahub.sdk.main_client import DataHubClient
 
 
 class HexSourceConfig(
@@ -93,9 +102,73 @@ class HexSourceConfig(
         default=True,
         description="Set ownership identity from owner/creator email",
     )
+    include_lineage: bool = Field(
+        default=True,
+        description='Include Hex lineage, being fetched from DataHub. See "Limitations" section in the docs for more details about the limitations of this feature.',
+    )
+    lineage_start_time: Optional[datetime] = Field(
+        default=None,
+        description="Earliest date of lineage to consider. Default: 1 day before lineage end time. You can specify absolute time like '2023-01-01' or relative time like '-7 days' or '-7d'.",
+    )
+    lineage_end_time: Optional[datetime] = Field(
+        default=None,
+        description="Latest date of lineage to consider. Default: Current time in UTC. You can specify absolute time like '2023-01-01' or relative time like '-1 day' or '-1d'.",
+    )
+    datahub_page_size: int = Field(
+        default=DATAHUB_API_PAGE_SIZE_DEFAULT,
+        description="Number of items to fetch per DataHub API call.",
+    )
+
+    @root_validator(pre=True)
+    def validate_lineage_times(cls, data: Dict[str, Any]) -> Dict[str, Any]:
+        # lineage_end_time default = now
+        if "lineage_end_time" not in data or data["lineage_end_time"] is None:
+            data["lineage_end_time"] = datetime.now(tz=timezone.utc)
+        # if string is given, parse it
+        if isinstance(data["lineage_end_time"], str):
+            data["lineage_end_time"] = parse_user_datetime(data["lineage_end_time"])
+        # if no timezone is given, assume UTC
+        if data["lineage_end_time"].tzinfo is None:
+            data["lineage_end_time"] = data["lineage_end_time"].replace(
+                tzinfo=timezone.utc
+            )
+        # at this point, we ensure there is a non null datetime with UTC timezone for lineage_end_time
+        assert (
+            data["lineage_end_time"]
+            and isinstance(data["lineage_end_time"], datetime)
+            and data["lineage_end_time"].tzinfo is not None
+            and data["lineage_end_time"].tzinfo == timezone.utc
+        )
+
+        # lineage_start_time default = lineage_end_time - 1 day
+        if "lineage_start_time" not in data or data["lineage_start_time"] is None:
+            data["lineage_start_time"] = data["lineage_end_time"] - timedelta(days=1)
+        # if string is given, parse it
+        if isinstance(data["lineage_start_time"], str):
+            data["lineage_start_time"] = parse_user_datetime(data["lineage_start_time"])
+        # if no timezone is given, assume UTC
+        if data["lineage_start_time"].tzinfo is None:
+            data["lineage_start_time"] = data["lineage_start_time"].replace(
+                tzinfo=timezone.utc
+            )
+        # at this point, we ensure there is a non null datetime with UTC timezone for lineage_start_time
+        assert (
+            data["lineage_start_time"]
+            and isinstance(data["lineage_start_time"], datetime)
+            and data["lineage_start_time"].tzinfo is not None
+            and data["lineage_start_time"].tzinfo == timezone.utc
+        )
+
+        return data
 
 
-class HexReport(StaleEntityRemovalSourceReport, HexApiReport):
+@dataclass
+class HexReport(
+    StaleEntityRemovalSourceReport,
+    HexApiReport,
+    IngestionStageReport,
+    HexQueryFetcherReport,
+):
     pass
 
 
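
The pre-validator above resolves the lineage window before normal field parsing. A rough sketch of how the defaults come out when neither bound is configured (plain datetime arithmetic mirroring the validator, not a call into it):

    from datetime import datetime, timedelta, timezone

    lineage_end_time = datetime.now(tz=timezone.utc)           # default end: now, in UTC
    lineage_start_time = lineage_end_time - timedelta(days=1)  # default start: end - 1 day

    # A string such as "2023-01-01" or "-7 days" would instead go through
    # parse_user_datetime() and be coerced to UTC when no timezone is given.
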
@@ -110,7 +183,7 @@ class HexSource(StatefulIngestionSourceBase):
     def __init__(self, config: HexSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.source_config = config
-        self.report = HexReport()
+        self.report: HexReport = HexReport()
         self.platform = HEX_PLATFORM_NAME
         self.hex_api = HexApi(
             report=self.report,
@@ -129,6 +202,28 @@ class HexSource(StatefulIngestionSourceBase):
             categories_as_tags=self.source_config.categories_as_tags,
             set_ownership_from_email=self.source_config.set_ownership_from_email,
         )
+        self.project_registry: Dict[str, Project] = {}
+        self.component_registry: Dict[str, Component] = {}
+
+        self.datahub_client: Optional[DataHubClient] = None
+        self.query_fetcher: Optional[HexQueryFetcher] = None
+        if self.source_config.include_lineage:
+            graph = ctx.require_graph("Lineage")
+            assert self.source_config.lineage_start_time and isinstance(
+                self.source_config.lineage_start_time, datetime
+            )
+            assert self.source_config.lineage_end_time and isinstance(
+                self.source_config.lineage_end_time, datetime
+            )
+            self.datahub_client = DataHubClient(graph=graph)
+            self.query_fetcher = HexQueryFetcher(
+                datahub_client=self.datahub_client,
+                workspace_name=self.source_config.workspace_name,
+                start_datetime=self.source_config.lineage_start_time,
+                end_datetime=self.source_config.lineage_end_time,
+                report=self.report,
+                page_size=self.source_config.datahub_page_size,
+            )
 
     @classmethod
     def create(cls, config_dict: Dict[str, Any], ctx: PipelineContext) -> "HexSource":
@@ -143,25 +238,58 @@ class HexSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]
 
-    def get_report(self) -> StatefulIngestionReport:
+    def get_report(self) -> HexReport:
         return self.report
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        yield from self.mapper.map_workspace()
-
-        for project_or_component in self.hex_api.fetch_projects():
-            if isinstance(project_or_component, Project):
-                if self.source_config.project_title_pattern.allowed(
-                    project_or_component.title
-                ):
-                    yield from self.mapper.map_project(project=project_or_component)
-            elif isinstance(project_or_component, Component):
-                if (
-                    self.source_config.include_components
-                    and self.source_config.component_title_pattern.allowed(
+        with self.report.new_stage("Fetch Hex assets from Hex API"):
+            for project_or_component in self.hex_api.fetch_projects():
+                if isinstance(project_or_component, Project):
+                    if self.source_config.project_title_pattern.allowed(
                         project_or_component.title
-                    )
-                ):
-                    yield from self.mapper.map_component(component=project_or_component)
-            else:
-                assert_never(project_or_component)
+                    ):
+                        self.project_registry[project_or_component.id] = (
+                            project_or_component
+                        )
+                elif isinstance(project_or_component, Component):
+                    if (
+                        self.source_config.include_components
+                        and self.source_config.component_title_pattern.allowed(
+                            project_or_component.title
+                        )
+                    ):
+                        self.component_registry[project_or_component.id] = (
+                            project_or_component
+                        )
+                else:
+                    assert_never(project_or_component)
+
+        if self.source_config.include_lineage:
+            assert self.datahub_client and self.query_fetcher
+
+            with self.report.new_stage(
+                "Fetch Hex lineage from existing Queries in DataHub"
+            ):
+                for query_metadata in self.query_fetcher.fetch():
+                    project = self.project_registry.get(query_metadata.hex_project_id)
+                    if project:
+                        project.upstream_datasets.extend(
+                            query_metadata.dataset_subjects
+                        )
+                        project.upstream_schema_fields.extend(
+                            query_metadata.schema_field_subjects
+                        )
+                    else:
+                        self.report.report_warning(
+                            title="Missing project for lineage",
+                            message="Lineage missed because missed project, likely due to filter patterns or deleted project.",
+                            context=str(query_metadata),
+                        )
+
+        with self.report.new_stage("Emit"):
+            yield from self.mapper.map_workspace()
+
+            for project in self.project_registry.values():
+                yield from self.mapper.map_project(project=project)
+            for component in self.component_registry.values():
+                yield from self.mapper.map_component(component=component)

datahub/ingestion/source/hex/mapper.py
@@ -1,6 +1,6 @@
 import logging
 from datetime import datetime
-from typing import Iterable, List, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple, Union
 
 from datahub._codegen.aspect import (
     _Aspect,  # TODO: is there a better import than this one?
@@ -46,6 +46,7 @@ from datahub.metadata.schema_classes import (
     DashboardInfoClass,
     DashboardUsageStatisticsClass,
     DataPlatformInstanceClass,
+    EdgeClass,
     GlobalTagsClass,
     OwnerClass,
     OwnershipClass,
@@ -53,7 +54,14 @@ from datahub.metadata.schema_classes import (
     TagAssociationClass,
     TimeWindowSizeClass,
 )
-from datahub.metadata.urns import ContainerUrn, CorpUserUrn, DashboardUrn, Urn
+from datahub.metadata.urns import (
+    ContainerUrn,
+    CorpUserUrn,
+    DashboardUrn,
+    DatasetUrn,
+    SchemaFieldUrn,
+    Urn,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -116,6 +124,8 @@ class Mapper:
             ),
             externalUrl=f"{self._base_url}/{self._workspace_name}/hex/{project.id}",
             customProperties=dict(id=project.id),
+            datasetEdges=self._dataset_edges(project.upstream_datasets),
+            # TODO: support schema field upstream, maybe InputFields?
         )
 
         subtypes = SubTypesClass(
@@ -343,6 +353,22 @@ class Mapper:
             else None,
         )
 
+    def _dataset_edges(
+        self, upstream: List[Union[DatasetUrn, SchemaFieldUrn]]
+    ) -> Optional[List[EdgeClass]]:
+        # TBC: is there support for CLL in Dashboards? for the moment, skip SchemaFieldUrns
+        return (
+            [
+                EdgeClass(
+                    destinationUrn=upstream_urn.urn(),
+                )
+                for upstream_urn in upstream
+                if isinstance(upstream_urn, DatasetUrn)
+            ]
+            if upstream
+            else None
+        )
+
     def _yield_mcps(
         self, entity_urn: Urn, aspects: List[Optional[_Aspect]]
     ) -> Iterable[MetadataWorkUnit]:
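
_dataset_edges keeps only dataset-level upstreams for the dashboard's datasetEdges and skips schema-field URNs for now (see the TBC note above). A hypothetical call, assuming a Mapper instance named mapper and that DatasetUrn/SchemaFieldUrn accept (platform, name, env) and (parent, field_path) respectively:

    from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn

    table = DatasetUrn(platform="snowflake", name="db.schema.table", env="PROD")
    column = SchemaFieldUrn(table, "col_a")

    # Only the DatasetUrn becomes an EdgeClass; the SchemaFieldUrn is dropped.
    edges = mapper._dataset_edges([table, column])
    # -> [EdgeClass(destinationUrn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)")]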

datahub/ingestion/source/hex/model.py
@@ -1,6 +1,8 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from datetime import datetime
-from typing import List, Optional
+from typing import List, Optional, Union
+
+from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn
 
 
 @dataclass
@@ -51,6 +53,12 @@ class Project:
     creator: Optional[Owner] = None
     owner: Optional[Owner] = None
    analytics: Optional[Analytics] = None
+    upstream_datasets: List[Union[DatasetUrn, SchemaFieldUrn]] = field(
+        default_factory=list
+    )
+    upstream_schema_fields: List[Union[DatasetUrn, SchemaFieldUrn]] = field(
+        default_factory=list
+    )
 
 
 @dataclass

datahub/ingestion/source/hex/query_fetcher.py
@@ -0,0 +1,300 @@
+import logging
+import re
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Dict, Iterable, List, Optional, Tuple
+
+from datahub.ingestion.api.source import SourceReport
+from datahub.ingestion.source.hex.constants import (
+    DATAHUB_API_PAGE_SIZE_DEFAULT,
+    HEX_PLATFORM_URN,
+)
+from datahub.metadata.schema_classes import QueryPropertiesClass, QuerySubjectsClass
+from datahub.metadata.urns import DatasetUrn, QueryUrn, SchemaFieldUrn
+from datahub.sdk.main_client import DataHubClient
+from datahub.sdk.search_filters import FilterDsl as F
+from datahub.utilities.time import datetime_to_ts_millis
+
+logger = logging.getLogger(__name__)
+
+# Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
+# Only match metadata with "context": "SCHEDULED_RUN" to filter out non-scheduled runs
+HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "SCHEDULED_RUN".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
+
+
+@dataclass
+class QueryResponse:
+    """This is the public response model for the HexQueryFetcher."""
+
+    urn: QueryUrn
+    hex_project_id: str
+    dataset_subjects: List[DatasetUrn] = field(default_factory=list)
+    schema_field_subjects: List[SchemaFieldUrn] = field(default_factory=list)
+
+
+@dataclass
+class HexQueryFetcherReport(SourceReport):
+    start_datetime: Optional[datetime] = None
+    end_datetime: Optional[datetime] = None
+    fetched_query_urns: int = 0
+    fetched_query_objects: int = 0
+    filtered_out_queries_missing_metadata: int = 0
+    filtered_out_queries_different_workspace: int = 0
+    filtered_out_queries_no_match: int = 0
+    filtered_out_queries_no_subjects: int = 0
+    total_queries: int = 0
+    total_dataset_subjects: int = 0
+    total_schema_field_subjects: int = 0
+    num_calls_fetch_query_entities: int = 0
+
+
+class HexQueryFetcher:
+    def __init__(
+        self,
+        datahub_client: DataHubClient,
+        workspace_name: str,
+        start_datetime: datetime,
+        end_datetime: datetime,
+        report: HexQueryFetcherReport,
+        page_size: int = DATAHUB_API_PAGE_SIZE_DEFAULT,
+    ):
+        self.datahub_client = datahub_client
+        self.workspace_name = workspace_name
+        self.start_datetime = start_datetime
+        self.end_datetime = end_datetime
+        self.report = report
+        self.page_size = page_size
+
+        self.report.start_datetime = start_datetime
+        self.report.end_datetime = end_datetime
+
+    def fetch(self) -> Iterable[QueryResponse]:
+        try:
+            query_urns = self._fetch_query_urns_filter_hex_and_last_modified()
+            assert all(isinstance(urn, QueryUrn) for urn in query_urns)
+            self.report.fetched_query_urns = len(query_urns)
+
+            entities_by_urn = self._fetch_query_entities(query_urns)
+            self.report.fetched_query_objects = len(entities_by_urn)
+        except Exception as e:
+            self.report.failure(
+                title="Error fetching Queries for lineage",
+                message="Error fetching Queries will result on missing lineage",
+                context=str(
+                    dict(
+                        workspace_name=self.workspace_name,
+                        start_datetime=self.start_datetime,
+                        end_datetime=self.end_datetime,
+                    )
+                ),
+                exc=e,
+            )
+        else:
+            if not query_urns or not entities_by_urn:
+                self.report.warning(
+                    title="No Queries found with Hex as origin",
+                    message="No lineage because of no Queries found with Hex as origin in the given time range; you may consider extending the time range to fetch more queries.",
+                    context=str(
+                        dict(
+                            workspace_name=self.workspace_name,
+                            start_datetime=self.start_datetime,
+                            end_datetime=self.end_datetime,
+                        )
+                    ),
+                )
+                return
+
+            for query_urn, (
+                query_properties,
+                query_subjects,
+            ) in entities_by_urn.items():
+                maybe_query_response = self._build_query_response(
+                    query_urn=query_urn,
+                    query_properties=query_properties,
+                    query_subjects=query_subjects,
+                )
+                if maybe_query_response:
+                    yield maybe_query_response
+
+    def _fetch_query_entities(
+        self, query_urns: List[QueryUrn]
+    ) -> Dict[
+        QueryUrn, Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]]
+    ]:
+        entities_by_urn: Dict[
+            QueryUrn,
+            Tuple[Optional[QueryPropertiesClass], Optional[QuerySubjectsClass]],
+        ] = {}
+        for i in range(0, len(query_urns), self.page_size):
+            batch = query_urns[i : i + self.page_size]
+
+            logger.debug(f"Fetching query entities for {len(batch)} queries: {batch}")
+            entities = self.datahub_client._graph.get_entities(
+                entity_name=QueryUrn.ENTITY_TYPE,
+                urns=[urn.urn() for urn in batch],
+                aspects=[
+                    QueryPropertiesClass.ASPECT_NAME,
+                    QuerySubjectsClass.ASPECT_NAME,
+                ],
+                with_system_metadata=False,
+            )
+            self.report.num_calls_fetch_query_entities += 1
+            logger.debug(f"Get entities response: {entities}")
+
+            for urn, entity in entities.items():
+                query_urn = QueryUrn.from_string(urn)
+
+                properties_tuple = entity.get(
+                    QueryPropertiesClass.ASPECT_NAME, (None, None)
+                )
+                query_properties: Optional[QueryPropertiesClass] = None
+                if properties_tuple and properties_tuple[0]:
+                    assert isinstance(properties_tuple[0], QueryPropertiesClass)
+                    query_properties = properties_tuple[0]
+
+                subjects_tuple = entity.get(
+                    QuerySubjectsClass.ASPECT_NAME, (None, None)
+                )
+                query_subjects: Optional[QuerySubjectsClass] = None
+                if subjects_tuple and subjects_tuple[0]:
+                    assert isinstance(subjects_tuple[0], QuerySubjectsClass)
+                    query_subjects = subjects_tuple[0]
+
+                entities_by_urn[query_urn] = (query_properties, query_subjects)
+
+        return entities_by_urn
+
+    def _fetch_query_urns_filter_hex_and_last_modified(self) -> List[QueryUrn]:
+        last_modified_start_at_millis = datetime_to_ts_millis(self.start_datetime)
+        last_modified_end_at_millis = datetime_to_ts_millis(self.end_datetime)
+
+        urns = self.datahub_client.search.get_urns(
+            filter=F.and_(
+                F.entity_type(QueryUrn.ENTITY_TYPE),
+                F.custom_filter("origin", "EQUAL", [HEX_PLATFORM_URN.urn()]),
+                F.custom_filter(
+                    "lastModifiedAt",
+                    "GREATER_THAN_OR_EQUAL_TO",
+                    [str(last_modified_start_at_millis)],
+                ),
+                F.custom_filter(
+                    "lastModifiedAt",
+                    "LESS_THAN_OR_EQUAL_TO",
+                    [str(last_modified_end_at_millis)],
+                ),
+            ),
+        )
+        logger.debug(f"Get URNS by filter: {urns}")
+        return [QueryUrn.from_string(urn.urn()) for urn in urns]
+
+    def _extract_hex_metadata(self, sql_statement: str) -> Optional[Tuple[str, str]]:
+        """
+        Extract project ID and workspace name from SQL statement.
+
+        Looks for Hex metadata in SQL comments in the format:
+        -- Hex query metadata: {"project_id": "...", "project_url": "https://app.hex.tech/{workspace_name}/hex/..."}
+
+        Example:
+        -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f"}
+
+        # TODO: Consider supporting multiline metadata format in the future:
+        # -- Hex query metadata: {
+        # --   "categories": ["Scratchpad"],
+        # --   "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf",
+        # --   ...
+        # -- }
+
+        Returns:
+            A tuple of (project_id, workspace_name) if both are successfully extracted
+            None if extraction fails for any reason
+        """
+        # Extract both project_id and workspace name in a single regex operation
+        match = re.search(HEX_METADATA_PATTERN, sql_statement)
+
+        if not match:
+            self.report.filtered_out_queries_no_match += 1
+            return None
+
+        try:
+            project_id = match.group(1)
+            workspace_name = match.group(2)
+            return project_id, workspace_name
+        except (IndexError, AttributeError) as e:
+            self.report.warning(
+                title="Failed to extract information from Hex query metadata",
+                message="Failed to extract information from Hex query metadata will result on missing lineage",
+                context=sql_statement,
+                exc=e,
+            )
+
+        return None
+
+    def _build_query_response(
+        self,
+        query_urn: QueryUrn,
+        query_properties: Optional[QueryPropertiesClass],
+        query_subjects: Optional[QuerySubjectsClass],
+    ) -> Optional[QueryResponse]:
+        # Skip if missing required aspects
+        if (
+            not query_properties
+            or not query_properties.statement
+            or not query_properties.statement.value
+            or not query_subjects
+            or query_subjects.subjects is None  # empty list is allowed
+        ):
+            logger.debug(
+                f"Skipping query {query_urn} - missing required fields: {(query_properties, query_subjects)}"
+            )
+            self.report.filtered_out_queries_missing_metadata += 1
+            return None
+
+        # Extract hex metadata (project_id and workspace_name)
+        metadata_result = self._extract_hex_metadata(query_properties.statement.value)
+        if not metadata_result:
+            logger.debug(f"Skipping query {query_urn} - failed to extract Hex metadata")
+            self.report.filtered_out_queries_missing_metadata += 1
+            return None
+
+        hex_project_id, workspace_from_url = metadata_result
+
+        # Validate workspace
+        if workspace_from_url != self.workspace_name:
+            logger.debug(
+                f"Skipping query {query_urn} - workspace '{workspace_from_url}' doesn't match '{self.workspace_name}'"
+            )
+            self.report.filtered_out_queries_different_workspace += 1
+            return None
+
+        # Extract subjects
+        dataset_subjects: List[DatasetUrn] = []
+        schema_field_subjects: List[SchemaFieldUrn] = []
+        for subject in query_subjects.subjects:
+            if subject.entity and subject.entity.startswith("urn:li:dataset:"):
+                dataset_subjects.append(DatasetUrn.from_string(subject.entity))
+            elif subject.entity and subject.entity.startswith("urn:li:schemaField:"):
+                schema_field_subjects.append(SchemaFieldUrn.from_string(subject.entity))
+
+        if not dataset_subjects and not schema_field_subjects:
+            self.report.filtered_out_queries_no_subjects += 1
+            return None
+
+        # Create response
+        response = QueryResponse(
+            urn=query_urn,
+            hex_project_id=hex_project_id,
+            dataset_subjects=dataset_subjects,
+            schema_field_subjects=schema_field_subjects,
+        )
+        logger.debug(
+            f"Succesfully extracted {len(dataset_subjects)} dataset subjects and {len(schema_field_subjects)} schema field subjects for query {query_urn}: {dataset_subjects} {schema_field_subjects}"
+        )
+        self.report.total_queries += 1
+        self.report.total_dataset_subjects += len(dataset_subjects)
+        self.report.total_schema_field_subjects += len(schema_field_subjects)
+
+        logger.debug(
+            f"Processed query {query_urn} with Hex project ID {hex_project_id}"
+        )
+
+        return response
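
HEX_METADATA_PATTERN at the top of the new module does the heavy lifting for lineage: it pulls the Hex project id and workspace name out of the comment that Hex appends to scheduled-run queries. A quick check against the example comment from the _extract_hex_metadata docstring:

    import re

    from datahub.ingestion.source.hex.query_fetcher import HEX_METADATA_PATTERN

    sql = (
        '-- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", '
        '"connection": "Long Tail Companions", "context": "SCHEDULED_RUN", '
        '"project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", '
        '"project_url": "https://app.hex.tech/acryl-partnership/hex/'
        'd73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId='
        '67c38da0-e631-4005-9750-5bdae2a2ef3f"}'
    )

    match = re.search(HEX_METADATA_PATTERN, sql)
    assert match is not None
    assert match.group(1) == "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf"  # project_id
    assert match.group(2) == "acryl-partnership"                     # workspace name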