acryl-datahub 1.1.0.5rc2__py3-none-any.whl → 1.1.0.5rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/METADATA +2550 -2550
- {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/RECORD +42 -35
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/ingestion/api/report.py +123 -2
- datahub/ingestion/api/source.py +45 -44
- datahub/ingestion/autogenerated/lineage_helper.py +193 -0
- datahub/ingestion/run/pipeline.py +6 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +4 -4
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/mock_data/datahub_mock_data.py +11 -15
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +5 -1
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/vertica.py +2 -1
- datahub/ingestion/source/unity/source.py +36 -20
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/metadata/_internal_schema_classes.py +601 -0
- datahub/metadata/_urns/urn_defs.py +112 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +383 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +25 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +202 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +25 -0
- datahub/sdk/datajob.py +39 -15
- datahub/specific/dataproduct.py +4 -0
- {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/top_level.txt +0 -0
datahub/ingestion/autogenerated/lineage_helper.py (new file)
@@ -0,0 +1,193 @@
+import json
+import logging
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Set
+
+from datahub.utilities.urns.urn import guess_entity_type
+
+logger = logging.getLogger(__name__)
+
+# Global cache for lineage data to avoid repeated file reads
+_lineage_data: Optional[Dict] = None
+
+
+def _load_lineage_data() -> Dict:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Load lineage data from the autogenerated lineage.json file.
+
+    Returns:
+        Dict containing the lineage information
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+    """
+    global _lineage_data
+
+    if _lineage_data is not None:
+        return _lineage_data
+
+    # Get the path to lineage.json relative to this file
+    current_file = Path(__file__)
+    lineage_file = current_file.parent / "lineage.json"
+
+    if not lineage_file.exists():
+        raise FileNotFoundError(f"Lineage file not found: {lineage_file}")
+
+    try:
+        with open(lineage_file, "r") as f:
+            _lineage_data = json.load(f)
+            return _lineage_data
+    except json.JSONDecodeError as e:
+        raise json.JSONDecodeError(
+            f"Failed to parse lineage.json: {e}", e.doc, e.pos
+        ) from e
+
+
+def get_lineage_fields(entity_type: str, aspect_name: str) -> List[Dict]:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Get lineage fields for a specific entity type and aspect.
+
+    Args:
+        entity_type: The entity type (e.g., 'dataset', 'dataJob')
+        aspect_name: The aspect name (e.g., 'upstreamLineage', 'dataJobInputOutput')
+
+    Returns:
+        List of lineage field dictionaries, each containing:
+        - name: field name
+        - path: dot-notation path to the field
+        - isLineage: boolean indicating if it's lineage
+        - relationship: relationship information
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+    """
+    lineage_data = _load_lineage_data()
+
+    entity_data = lineage_data.get("entities", {}).get(entity_type, {})
+    aspect_data = entity_data.get(aspect_name, {})
+
+    return aspect_data.get("fields", [])
+
+
+def is_lineage_field(urn: str, aspect_name: str, field_path: str) -> bool:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Check if a specific field path is lineage-related.
+
+    Args:
+        urn: The entity URN (e.g., 'urn:li:dataset:(urn:li:dataPlatform:mysql,test_db.test_table,PROD)')
+        aspect_name: The aspect name (e.g., 'upstreamLineage', 'dataJobInputOutput')
+        field_path: The dot-notation path to the field (e.g., 'upstreams.dataset')
+
+    Returns:
+        True if the field is lineage-related, False otherwise
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+        AssertionError: If URN doesn't start with 'urn:li:'
+    """
+    entity_type = guess_entity_type(urn)
+    lineage_fields = get_lineage_fields(entity_type, aspect_name)
+
+    for field in lineage_fields:
+        if field.get("path") == field_path:
+            return field.get("isLineage", False)
+
+    return False
+
+
+def has_lineage(urn: str, aspect: Any) -> bool:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Check if an aspect has any lineage fields.
+
+    Args:
+        urn: The entity URN (e.g., 'urn:li:dataset:(urn:li:dataPlatform:mysql,test_db.test_table,PROD)')
+        aspect: The aspect object
+
+    Returns:
+        True if the aspect has lineage fields, False otherwise
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+        AssertionError: If URN doesn't start with 'urn:li:'
+    """
+    entity_type = guess_entity_type(urn)
+    aspect_class = getattr(aspect, "__class__", None)
+    aspect_name = (
+        aspect_class.__name__ if aspect_class is not None else str(type(aspect))
+    )
+
+    lineage_fields = get_lineage_fields(entity_type, aspect_name)
+    return len(lineage_fields) > 0
+
+
+def has_lineage_aspect(entity_type: str, aspect_name: str) -> bool:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Check if an aspect has any lineage fields.
+
+    Args:
+        entity_type: The entity type (e.g., 'dataset', 'dataJob')
+        aspect_name: The aspect name (e.g., 'upstreamLineage', 'dataJobInputOutput')
+
+    Returns:
+        True if the aspect has lineage fields, False otherwise
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+    """
+    lineage_fields = get_lineage_fields(entity_type, aspect_name)
+    return len(lineage_fields) > 0
+
+
+def get_all_lineage_aspects(entity_type: str) -> Set[str]:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Get all aspects that have lineage fields for a given entity type.
+
+    Args:
+        entity_type: The entity type (e.g., 'dataset', 'dataJob')
+
+    Returns:
+        Set of aspect names that have lineage fields
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+    """
+    lineage_data = _load_lineage_data()
+
+    entity_data = lineage_data.get("entities", {}).get(entity_type, {})
+    lineage_aspects = set()
+
+    for aspect_name, aspect_data in entity_data.items():
+        if aspect_data.get("fields"):
+            lineage_aspects.add(aspect_name)
+
+    return lineage_aspects
+
+
+def clear_cache() -> None:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Clear the internal cache of lineage data.
+
+    This is useful for testing or when the lineage.json file has been updated.
+    """
+    global _lineage_data
+    _lineage_data = None
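For orientation, a minimal usage sketch of the new helper module. The entity type, aspect name, field path, and URN are the examples quoted in the docstrings above; the real keys depend on the bundled lineage.json, and the import path assumes the autogenerated package is importable as listed in RECORD.

    from datahub.ingestion.autogenerated import lineage_helper

    # Docstring examples reused as illustrative inputs.
    if lineage_helper.has_lineage_aspect("dataset", "upstreamLineage"):
        for field in lineage_helper.get_lineage_fields("dataset", "upstreamLineage"):
            print(field.get("path"), field.get("isLineage"))

    urn = "urn:li:dataset:(urn:li:dataPlatform:mysql,test_db.test_table,PROD)"
    print(lineage_helper.is_lineage_field(urn, "upstreamLineage", "upstreams.dataset"))

    # Reset the module-level cache, e.g. between tests.
    lineage_helper.clear_cache()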
datahub/ingestion/run/pipeline.py
@@ -578,11 +578,17 @@ class Pipeline:
         sink_failures = len(self.sink.get_report().failures)
         sink_warnings = len(self.sink.get_report().warnings)
         global_warnings = len(get_global_warnings())
+        source_aspects = self.source.get_report().get_aspects_dict()
+        source_aspects_by_subtype = (
+            self.source.get_report().get_aspects_by_subtypes_dict()
+        )
 
         telemetry_instance.ping(
             "ingest_stats",
             {
                 "source_type": self.source_type,
+                "source_aspects": source_aspects,
+                "source_aspects_by_subtype": source_aspects_by_subtype,
                 "sink_type": self.sink_type,
                 "transformer_types": [
                     transformer.type for transformer in self.config.transformers or []
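The shapes of get_aspects_dict() and get_aspects_by_subtypes_dict() are defined by the report.py changes (+123 lines, not shown here). A hypothetical sketch of what the extra telemetry fields might carry, assuming simple aspect-name-to-count mappings keyed by entity type and subtype:

    # Hypothetical shapes only; the real structure comes from SourceReport in report.py.
    source_aspects = {"datasetProperties": 120, "schemaMetadata": 118, "upstreamLineage": 40}
    source_aspects_by_subtype = {
        "dataset": {
            "Table": {"datasetProperties": 100, "schemaMetadata": 100},
            "View": {"datasetProperties": 20, "schemaMetadata": 18},
        }
    }
    payload = {
        "source_type": "bigquery",
        "source_aspects": source_aspects,
        "source_aspects_by_subtype": source_aspects_by_subtype,
    }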
datahub/ingestion/source/bigquery_v2/profiler.py
@@ -189,6 +189,7 @@ WHERE
 
         if len(profile_requests) == 0:
             return
+
         yield from self.generate_profile_workunits(
             profile_requests,
             max_workers=self.config.profiling.max_workers,
@@ -226,10 +227,11 @@ WHERE
             db_name, schema_name, bq_table, self.config.profiling.partition_datetime
         )
 
-        if partition
+        # For partitioned tables, if it has a row count but not a valid partition, that means something went wrong with the partition detection.
+        if partition is None and bq_table.partition_info and bq_table.rows_count:
             self.report.report_warning(
                 title="Profile skipped for partitioned table",
-                message="profile skipped as
+                message="profile skipped as partition id or type was invalid",
                 context=profile_request.pretty_name,
             )
             return None
datahub/ingestion/source/bigquery_v2/queries.py
@@ -45,12 +45,12 @@ SELECT
   tos.OPTION_VALUE as comment,
   t.is_insertable_into,
   t.ddl,
-  ts.row_count,
-  ts.size_bytes as
+  ts.row_count as row_count,
+  ts.size_bytes as size_bytes,
   p.num_partitions,
   p.max_partition_id,
-  p.active_billable_bytes,
-  p.long_term_billable_bytes,
+  p.active_billable_bytes as active_billable_bytes,
+  -- IFNULL(p.long_term_billable_bytes, 0) as long_term_billable_bytes,
   REGEXP_EXTRACT(t.table_name, r"(?:(?:.+\\D)[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$") as table_suffix,
   REGEXP_REPLACE(t.table_name, r"(?:[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$", "") as table_base
 
datahub/ingestion/source/common/subtypes.py
@@ -26,6 +26,8 @@ class DatasetSubTypes(StrEnum):
     NEO4J_RELATIONSHIP = "Neo4j Relationship"
     SNOWFLAKE_STREAM = "Snowflake Stream"
     API_ENDPOINT = "API Endpoint"
+    SLACK_CHANNEL = "Slack Channel"
+    PROJECTIONS = "Projections"
 
     # TODO: Create separate entity...
     NOTEBOOK = "Notebook"
datahub/ingestion/source/fivetran/fivetran.py
@@ -1,8 +1,8 @@
 import logging
-from typing import Dict, Iterable, List, Optional
+from typing import Dict, Iterable, List, Optional, Union
 
 import datahub.emitter.mce_builder as builder
-from datahub.api.entities.datajob import
+from datahub.api.entities.datajob import DataJob as DataJobV1
 from datahub.api.entities.dataprocess.dataprocess_instance import (
     DataProcessInstance,
     InstanceRunResult,
@@ -42,8 +42,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     FineGrainedLineageDownstreamType,
     FineGrainedLineageUpstreamType,
 )
-from datahub.
-from datahub.
+from datahub.metadata.urns import CorpUserUrn, DataFlowUrn, DatasetUrn
+from datahub.sdk.dataflow import DataFlow
+from datahub.sdk.datajob import DataJob
+from datahub.sdk.entity import Entity
 
 # Logger instance
 logger = logging.getLogger(__name__)
@@ -75,8 +77,8 @@ class FivetranSource(StatefulIngestionSourceBase):
         self.audit_log = FivetranLogAPI(self.config.fivetran_log_config)
 
     def _extend_lineage(self, connector: Connector, datajob: DataJob) -> Dict[str, str]:
-        input_dataset_urn_list: List[DatasetUrn] = []
-        output_dataset_urn_list: List[DatasetUrn] = []
+        input_dataset_urn_list: List[Union[str, DatasetUrn]] = []
+        output_dataset_urn_list: List[Union[str, DatasetUrn]] = []
         fine_grained_lineage: List[FineGrainedLineage] = []
 
         # TODO: Once Fivetran exposes the database via the API, we shouldn't ask for it via config.
@@ -178,9 +180,9 @@ class FivetranSource(StatefulIngestionSourceBase):
             )
         )
 
-        datajob.
-        datajob.
-        datajob.
+        datajob.set_inlets(input_dataset_urn_list)
+        datajob.set_outlets(output_dataset_urn_list)
+        datajob.set_fine_grained_lineages(fine_grained_lineage)
 
         return dict(
             **{
@@ -197,10 +199,10 @@ class FivetranSource(StatefulIngestionSourceBase):
 
     def _generate_dataflow_from_connector(self, connector: Connector) -> DataFlow:
         return DataFlow(
-
-
+            platform=Constant.ORCHESTRATOR,
+            name=connector.connector_id,
             env=self.config.env,
-
+            display_name=connector.connector_name,
             platform_instance=self.config.platform_instance,
         )
 
@@ -213,11 +215,11 @@ class FivetranSource(StatefulIngestionSourceBase):
         )
         owner_email = self.audit_log.get_user_email(connector.user_id)
         datajob = DataJob(
-
+            name=connector.connector_id,
             flow_urn=dataflow_urn,
             platform_instance=self.config.platform_instance,
-
-            owners=
+            display_name=connector.connector_name,
+            owners=[CorpUserUrn(owner_email)] if owner_email else None,
         )
 
         # Map connector source and destination table with dataset entity
@@ -232,16 +234,24 @@ class FivetranSource(StatefulIngestionSourceBase):
             "sync_frequency": str(connector.sync_frequency),
             "destination_id": connector.destination_id,
         }
-
-
-            **lineage_properties,
-        }
+
+        datajob.set_custom_properties({**connector_properties, **lineage_properties})
 
         return datajob
 
     def _generate_dpi_from_job(self, job: Job, datajob: DataJob) -> DataProcessInstance:
+        # hack: convert to old instance for DataProcessInstance.from_datajob compatibility
+        datajob_v1 = DataJobV1(
+            id=datajob.name,
+            flow_urn=datajob.flow_urn,
+            platform_instance=self.config.platform_instance,
+            name=datajob.name,
+            inlets=datajob.inlets,
+            outlets=datajob.outlets,
+            fine_grained_lineages=datajob.fine_grained_lineages,
+        )
         return DataProcessInstance.from_datajob(
-            datajob=
+            datajob=datajob_v1,
             id=job.job_id,
             clone_inlets=True,
             clone_outlets=True,
@@ -278,17 +288,15 @@ class FivetranSource(StatefulIngestionSourceBase):
 
     def _get_connector_workunits(
         self, connector: Connector
-    ) -> Iterable[MetadataWorkUnit]:
+    ) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         self.report.report_connectors_scanned()
         # Create dataflow entity with same name as connector name
         dataflow = self._generate_dataflow_from_connector(connector)
-
-        yield mcp.as_workunit()
+        yield dataflow
 
         # Map Fivetran's connector entity with Datahub's datajob entity
         datajob = self._generate_datajob_from_connector(connector)
-
-        yield mcp.as_workunit()
+        yield datajob
 
         # Map Fivetran's job/sync history entity with Datahub's data process entity
         if len(connector.jobs) >= MAX_JOBS_PER_CONNECTOR:
@@ -310,7 +318,7 @@ class FivetranSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]
 
-    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+    def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         """
         Datahub Ingestion framework invoke this method
         """
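Taken together, these hunks move the Fivetran source from hand-built MCP workunits to yielding SDK DataFlow/DataJob entities directly. A rough sketch of the new construction pattern, with made-up connector values; the constructor arguments and setters are the ones visible in the diff, but treat this as an illustration rather than the source's exact code:

    from datahub.metadata.urns import CorpUserUrn, DataFlowUrn, DatasetUrn
    from datahub.sdk.dataflow import DataFlow
    from datahub.sdk.datajob import DataJob

    # Illustrative values; the real ones come from the Fivetran log/API responses.
    flow = DataFlow(
        platform="fivetran",
        name="connector_123",
        env="PROD",
        display_name="My Postgres Connector",
    )
    job = DataJob(
        name="connector_123",
        flow_urn=DataFlowUrn(orchestrator="fivetran", flow_id="connector_123", cluster="PROD"),
        display_name="My Postgres Connector",
        owners=[CorpUserUrn("owner@example.com")],
    )
    job.set_inlets([DatasetUrn(platform="postgres", name="db.public.src_table", env="PROD")])
    job.set_outlets([DatasetUrn(platform="snowflake", name="db.schema.dst_table", env="PROD")])
    job.set_custom_properties({"destination_id": "dest_456"})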
datahub/ingestion/source/hex/api.py
@@ -5,7 +5,9 @@ from typing import Any, Dict, Generator, List, Optional, Union
 
 import requests
 from pydantic import BaseModel, Field, ValidationError, validator
+from requests.adapters import HTTPAdapter
 from typing_extensions import assert_never
+from urllib3.util.retry import Retry
 
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.source.hex.constants import (
@@ -220,6 +222,7 @@ class HexApi:
         self.base_url = base_url
         self.report = report
         self.page_size = page_size
+        self.session = self._create_retry_session()
 
     def _list_projects_url(self):
         return f"{self.base_url}/projects"
@@ -227,6 +230,28 @@ class HexApi:
     def _auth_header(self):
         return {"Authorization": f"Bearer {self.token}"}
 
+    def _create_retry_session(self) -> requests.Session:
+        """Create a requests session with retry logic for rate limiting.
+
+        Hex API rate limit: 60 requests per minute
+        https://learn.hex.tech/docs/api/api-overview#kernel-and-rate-limits
+        """
+        session = requests.Session()
+
+        # Configure retry strategy for 429 (Too Many Requests) with exponential backoff
+        retry_strategy = Retry(
+            total=5,  # Maximum number of retries
+            status_forcelist=[429],  # Only retry on 429 status code
+            backoff_factor=2,  # Exponential backoff: 2, 4, 8, 16, 32 seconds
+            raise_on_status=True,  # Raise exception after max retries
+        )
+
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        session.mount("http://", adapter)
+        session.mount("https://", adapter)
+
+        return session
+
     def fetch_projects(
         self,
         include_components: bool = True,
@@ -259,7 +284,7 @@ class HexApi:
             logger.debug(f"Fetching projects page with params: {params}")
             self.report.fetch_projects_page_calls += 1
             try:
-                response =
+                response = self.session.get(
                     url=self._list_projects_url(),
                     headers=self._auth_header(),
                     params=params,