acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (205)
  1. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
  2. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
  3. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +31 -7
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +8 -5
  46. datahub/ingestion/source/dbt/dbt_core.py +11 -9
  47. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  48. datahub/ingestion/source/delta_lake/config.py +8 -1
  49. datahub/ingestion/source/delta_lake/report.py +4 -2
  50. datahub/ingestion/source/delta_lake/source.py +20 -5
  51. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  52. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  53. datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
  54. datahub/ingestion/source/elastic_search.py +26 -6
  55. datahub/ingestion/source/feast.py +27 -8
  56. datahub/ingestion/source/file.py +6 -3
  57. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  58. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  59. datahub/ingestion/source/ge_data_profiler.py +12 -15
  60. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  61. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  62. datahub/ingestion/source/identity/okta.py +37 -7
  63. datahub/ingestion/source/kafka/kafka.py +1 -1
  64. datahub/ingestion/source/kafka_connect/common.py +2 -7
  65. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  66. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  67. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  68. datahub/ingestion/source/looker/looker_common.py +6 -5
  69. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  70. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  71. datahub/ingestion/source/looker/looker_source.py +1 -1
  72. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  73. datahub/ingestion/source/looker/lookml_source.py +3 -2
  74. datahub/ingestion/source/metabase.py +57 -35
  75. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  76. datahub/ingestion/source/metadata/lineage.py +2 -2
  77. datahub/ingestion/source/mlflow.py +365 -35
  78. datahub/ingestion/source/mode.py +18 -8
  79. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  80. datahub/ingestion/source/nifi.py +37 -11
  81. datahub/ingestion/source/openapi.py +1 -1
  82. datahub/ingestion/source/openapi_parser.py +49 -17
  83. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  84. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  85. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  86. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  87. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  88. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  89. datahub/ingestion/source/preset.py +7 -4
  90. datahub/ingestion/source/pulsar.py +3 -2
  91. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  92. datahub/ingestion/source/redash.py +31 -7
  93. datahub/ingestion/source/redshift/config.py +4 -0
  94. datahub/ingestion/source/redshift/datashares.py +236 -0
  95. datahub/ingestion/source/redshift/lineage.py +6 -2
  96. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  97. datahub/ingestion/source/redshift/profile.py +1 -1
  98. datahub/ingestion/source/redshift/query.py +133 -33
  99. datahub/ingestion/source/redshift/redshift.py +46 -73
  100. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  101. datahub/ingestion/source/redshift/report.py +3 -0
  102. datahub/ingestion/source/s3/config.py +5 -5
  103. datahub/ingestion/source/s3/source.py +20 -41
  104. datahub/ingestion/source/salesforce.py +550 -275
  105. datahub/ingestion/source/schema_inference/object.py +1 -1
  106. datahub/ingestion/source/sigma/sigma.py +1 -1
  107. datahub/ingestion/source/slack/slack.py +31 -10
  108. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  109. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  110. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  111. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  112. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  113. datahub/ingestion/source/sql/athena.py +10 -16
  114. datahub/ingestion/source/sql/druid.py +1 -5
  115. datahub/ingestion/source/sql/hive.py +15 -6
  116. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  117. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  118. datahub/ingestion/source/sql/mssql/source.py +11 -5
  119. datahub/ingestion/source/sql/oracle.py +127 -63
  120. datahub/ingestion/source/sql/sql_common.py +16 -18
  121. datahub/ingestion/source/sql/sql_types.py +2 -2
  122. datahub/ingestion/source/sql/teradata.py +19 -5
  123. datahub/ingestion/source/sql/trino.py +2 -2
  124. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  125. datahub/ingestion/source/superset.py +222 -62
  126. datahub/ingestion/source/tableau/tableau.py +22 -6
  127. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  128. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  129. datahub/ingestion/source/unity/source.py +11 -1
  130. datahub/ingestion/source/vertexai.py +697 -0
  131. datahub/ingestion/source_config/pulsar.py +3 -1
  132. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  133. datahub/lite/duckdb_lite.py +3 -10
  134. datahub/lite/lite_local.py +1 -1
  135. datahub/lite/lite_util.py +4 -3
  136. datahub/metadata/_schema_classes.py +714 -417
  137. datahub/metadata/_urns/urn_defs.py +1673 -1649
  138. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  139. datahub/metadata/schema.avsc +16438 -16603
  140. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  141. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  142. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  143. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  144. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  145. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  146. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  147. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  148. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  149. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  150. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  151. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  152. datahub/metadata/schemas/DomainKey.avsc +2 -1
  153. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  154. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  155. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  156. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  157. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  158. datahub/metadata/schemas/InputFields.avsc +3 -1
  159. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  160. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  162. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  163. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  164. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  165. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  166. datahub/metadata/schemas/PostKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  168. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  169. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  170. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  171. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  172. datahub/pydantic/__init__.py +0 -0
  173. datahub/pydantic/compat.py +58 -0
  174. datahub/sdk/__init__.py +30 -12
  175. datahub/sdk/_all_entities.py +1 -1
  176. datahub/sdk/_attribution.py +4 -0
  177. datahub/sdk/_shared.py +258 -16
  178. datahub/sdk/_utils.py +35 -0
  179. datahub/sdk/container.py +30 -6
  180. datahub/sdk/dataset.py +118 -20
  181. datahub/sdk/{_entity.py → entity.py} +24 -1
  182. datahub/sdk/entity_client.py +1 -1
  183. datahub/sdk/main_client.py +23 -0
  184. datahub/sdk/resolver_client.py +17 -29
  185. datahub/sdk/search_client.py +50 -0
  186. datahub/sdk/search_filters.py +374 -0
  187. datahub/specific/dataset.py +3 -4
  188. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  189. datahub/sql_parsing/schema_resolver.py +1 -1
  190. datahub/sql_parsing/split_statements.py +220 -126
  191. datahub/sql_parsing/sql_parsing_common.py +7 -0
  192. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  193. datahub/sql_parsing/sqlglot_utils.py +1 -4
  194. datahub/testing/check_sql_parser_result.py +5 -6
  195. datahub/testing/compare_metadata_json.py +7 -6
  196. datahub/testing/pytest_hooks.py +56 -0
  197. datahub/upgrade/upgrade.py +2 -2
  198. datahub/utilities/file_backed_collections.py +3 -14
  199. datahub/utilities/ingest_utils.py +106 -0
  200. datahub/utilities/mapping.py +1 -1
  201. datahub/utilities/memory_footprint.py +3 -2
  202. datahub/utilities/sentinels.py +22 -0
  203. datahub/utilities/unified_diff.py +5 -1
  204. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  205. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dbt/dbt_common.py

@@ -357,6 +357,11 @@ class DBTCommonConfig(
         default=True,
         description="When enabled, includes the compiled code in the emitted metadata.",
     )
+    include_database_name: bool = Field(
+        default=True,
+        description="Whether to add database name to the table urn. "
+        "Set to False to skip it for engines like AWS Athena where it's not required.",
+    )

     @validator("target_platform")
     def validate_target_platform_value(cls, target_platform: str) -> str:
@@ -1028,7 +1033,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             cll_nodes.add(dbt_name)
             schema_nodes.add(dbt_name)

-        for dbt_name in all_nodes_map.keys():
+        for dbt_name in all_nodes_map:
             if self._is_allowed_node(dbt_name):
                 add_node_to_cll_list(dbt_name)

@@ -1769,10 +1774,8 @@ class DBTSourceBase(StatefulIngestionSourceBase):
         logger.debug(
             f"Owner after applying owner extraction pattern:'{self.config.owner_extraction_pattern}' is '{owner}'."
         )
-        if isinstance(owner, list):
-            owners = owner
-        else:
-            owners = [owner]
+        owners = owner if isinstance(owner, list) else [owner]
+
         for owner in owners:
             if self.config.strip_user_ids_from_email:
                 owner = owner.split("@")[0]
datahub/ingestion/source/dbt/dbt_core.py

@@ -167,6 +167,7 @@ def extract_dbt_entities(
     use_identifiers: bool,
     tag_prefix: str,
     only_include_if_in_catalog: bool,
+    include_database_name: bool,
     report: DBTSourceReport,
 ) -> List[DBTNode]:
     sources_by_id = {x["unique_id"]: x for x in sources_results}
@@ -267,7 +268,7 @@ def extract_dbt_entities(
             dbt_name=key,
             dbt_adapter=manifest_adapter,
             dbt_package_name=manifest_node.get("package_name"),
-            database=manifest_node["database"],
+            database=manifest_node["database"] if include_database_name else None,
             schema=manifest_node["schema"],
             name=name,
             alias=manifest_node.get("alias"),
@@ -543,14 +544,15 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         all_catalog_entities = {**catalog_nodes, **catalog_sources}

         nodes = extract_dbt_entities(
-            all_manifest_entities,
-            all_catalog_entities,
-            sources_results,
-            manifest_adapter,
-            self.config.use_identifiers,
-            self.config.tag_prefix,
-            self.config.only_include_if_in_catalog,
-            self.report,
+            all_manifest_entities=all_manifest_entities,
+            all_catalog_entities=all_catalog_entities,
+            sources_results=sources_results,
+            manifest_adapter=manifest_adapter,
+            use_identifiers=self.config.use_identifiers,
+            tag_prefix=self.config.tag_prefix,
+            only_include_if_in_catalog=self.config.only_include_if_in_catalog,
+            include_database_name=self.config.include_database_name,
+            report=self.report,
         )

         return (
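
Note: the new include_database_name flag defined on DBTCommonConfig above is threaded through extract_dbt_entities here. A minimal usage sketch follows, assuming the flag is set through the standard programmatic recipe API; the file paths and server URL are placeholders, not values taken from this release.

# Sketch: drop database names from dbt dataset urns, e.g. for AWS Athena.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "dbt",
            "config": {
                "manifest_path": "./target/manifest.json",
                "catalog_path": "./target/catalog.json",
                "target_platform": "athena",
                "include_database_name": False,  # new in this release
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()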
datahub/ingestion/source/dbt/dbt_tests.py

@@ -57,15 +57,11 @@ def _get_name_for_relationship_test(kw_args: Dict[str, str]) -> Optional[str]:
         # base assertions are violated, bail early
         return None
     m = re.match(r"^ref\(\'(.*)\'\)$", destination_ref)
-    if m:
-        destination_table = m.group(1)
-    else:
-        destination_table = destination_ref
+    destination_table = m.group(1) if m else destination_ref
+
     m = re.search(r"ref\(\'(.*)\'\)", source_ref)
-    if m:
-        source_table = m.group(1)
-    else:
-        source_table = source_ref
+    source_table = m.group(1) if m else source_ref
+
     return f"{source_table}.{column_name} referential integrity to {destination_table}.{dest_field_name}"


datahub/ingestion/source/delta_lake/config.py

@@ -13,6 +13,9 @@ from datahub.configuration.source_common import (
 )
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
 from datahub.ingestion.source.aws.s3_util import is_s3_uri
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+)

 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
@@ -35,7 +38,11 @@ class S3(ConfigModel):
     )


-class DeltaLakeSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
+class DeltaLakeSourceConfig(
+    PlatformInstanceConfigMixin,
+    EnvConfigMixin,
+    StatefulIngestionConfigBase,
+):
     base_path: str = Field(
         description="Path to table (s3 or local file system). If path is not a delta table path "
         "then all subfolders will be scanned to detect and ingest delta tables."
datahub/ingestion/source/delta_lake/report.py

@@ -1,12 +1,14 @@
 import dataclasses
 from dataclasses import field as dataclass_field

-from datahub.ingestion.api.source import SourceReport
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalSourceReport,
+)
 from datahub.utilities.lossy_collections import LossyList


 @dataclasses.dataclass
-class DeltaLakeSourceReport(SourceReport):
+class DeltaLakeSourceReport(StaleEntityRemovalSourceReport):
     files_scanned = 0
     filtered: LossyList[str] = dataclass_field(default_factory=LossyList)

datahub/ingestion/source/delta_lake/source.py

@@ -2,7 +2,7 @@ import json
 import logging
 import os
 import time
-from typing import Dict, Iterable, List
+from typing import Dict, Iterable, List, Optional
 from urllib.parse import urlparse

 from deltalake import DeltaTable
@@ -21,7 +21,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_boto_utils import get_s3_tags
 from datahub.ingestion.source.aws.s3_util import (
@@ -36,6 +36,12 @@ from datahub.ingestion.source.delta_lake.delta_lake_utils import (
     read_delta_table,
 )
 from datahub.ingestion.source.delta_lake.report import DeltaLakeSourceReport
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import Status
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
@@ -79,7 +85,7 @@ OPERATION_STATEMENT_TYPES = {
 @config_class(DeltaLakeSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.TAGS, "Can extract S3 object/bucket tags if enabled")
-class DeltaLakeSource(Source):
+class DeltaLakeSource(StatefulIngestionSourceBase):
     """
     This plugin extracts:
     - Column types and schema associated with each delta table
@@ -100,9 +106,10 @@ class DeltaLakeSource(Source):
     storage_options: Dict[str, str]

     def __init__(self, config: DeltaLakeSourceConfig, ctx: PipelineContext):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
+        self.ctx = ctx
         self.source_config = config
-        self.report = DeltaLakeSourceReport()
+        self.report: DeltaLakeSourceReport = DeltaLakeSourceReport()
         if self.source_config.is_s3:
             if (
                 self.source_config.s3 is None
@@ -331,6 +338,14 @@ class DeltaLakeSource(Source):
         for folder in os.listdir(path):
             yield os.path.join(path, folder)

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.container_WU_creator = ContainerWUCreator(
             self.source_config.platform,
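
Note: with DeltaLakeSourceConfig now inheriting StatefulIngestionConfigBase and the source registering StaleEntityRemovalHandler, stale-entity removal can be switched on from the recipe. A source-side sketch with placeholder values; the stateful_ingestion keys mirror those used by other stateful DataHub sources, and a named pipeline plus a state provider are still required for the state to take effect.

# Sketch of a delta-lake source config that opts into stale-entity removal.
source_config = {
    "type": "delta-lake",
    "config": {
        "base_path": "s3://my-bucket/delta-tables/",  # placeholder
        "stateful_ingestion": {
            "enabled": True,
            "remove_stale_metadata": True,
        },
    },
}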
datahub/ingestion/source/dremio/dremio_api.py

@@ -271,12 +271,12 @@ class DremioAPIOperations:
                 self.cancel_query(job_id)
                 raise DremioAPIException(
                     f"Query execution timed out after {timeout} seconds"
-                )
+                ) from None
             except RuntimeError as e:
-                raise DremioAPIException(f"{str(e)}")
+                raise DremioAPIException() from e

         except requests.RequestException as e:
-            raise DremioAPIException(f"Error executing query: {str(e)}")
+            raise DremioAPIException("Error executing query") from e

     def fetch_results(self, job_id: str) -> List[Dict]:
         """Fetch job results with status checking"""
@@ -683,11 +683,7 @@ class DremioAPIOperations:
         # Add end anchor for exact matching
         regex_pattern = regex_pattern + "$"

-        for path in paths:
-            if re.match(regex_pattern, path, re.IGNORECASE):
-                return True
-
-        return False
+        return any(re.match(regex_pattern, path, re.IGNORECASE) for path in paths)

     def should_include_container(self, path: List[str], name: str) -> bool:
         """
datahub/ingestion/source/dremio/dremio_aspects.py

@@ -116,10 +116,7 @@ class SchemaFieldTypeMapper:
         data_type = data_type.lower()
         type_class = cls.FIELD_TYPE_MAPPING.get(data_type, NullTypeClass)

-        if data_size:
-            native_data_type = f"{data_type}({data_size})"
-        else:
-            native_data_type = data_type
+        native_data_type = f"{data_type}({data_size})" if data_size else data_type

         try:
             schema_field_type = SchemaFieldDataTypeClass(type=type_class())
@@ -168,8 +165,9 @@ class DremioAspects:
         )

     def get_container_urn(
-        self, name: Optional[str] = None, path: Optional[List[str]] = []
+        self, name: Optional[str] = None, path: Optional[List[str]] = None
     ) -> str:
+        path = path or []
         container_key = self.get_container_key(name, path)
         return container_key.as_urn()

datahub/ingestion/source/dynamodb/dynamodb.py

@@ -165,6 +165,10 @@ _attribute_type_to_field_type_mapping: Dict[str, Type] = {
     SourceCapability.PLATFORM_INSTANCE,
     "By default, platform_instance will use the AWS account id",
 )
+@capability(
+    SourceCapability.CLASSIFICATION,
+    "Optionally enabled via `classification.enabled`",
+)
 class DynamoDBSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:
@@ -242,8 +246,10 @@ class DynamoDBSource(StatefulIngestionSourceBase):
             platform=self.platform,
             platform_instance=platform_instance,
             name=dataset_name,
+            env=self.config.env,
         )
         dataset_properties = DatasetPropertiesClass(
+            name=table_name,
             tags=[],
             customProperties={
                 "table.arn": table_info["TableArn"],
datahub/ingestion/source/elastic_search.py

@@ -32,9 +32,17 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
     is_profiling_enabled,
@@ -188,7 +196,7 @@ class ElasticToSchemaFieldConverter:


 @dataclass
-class ElasticsearchSourceReport(SourceReport):
+class ElasticsearchSourceReport(StaleEntityRemovalSourceReport):
     index_scanned: int = 0
     filtered: LossyList[str] = field(default_factory=LossyList)

@@ -240,7 +248,11 @@ def collapse_urn(urn: str, collapse_urns: CollapseUrns) -> str:
     )


-class ElasticsearchSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
+class ElasticsearchSourceConfig(
+    StatefulIngestionConfigBase,
+    PlatformInstanceConfigMixin,
+    EnvConfigMixin,
+):
     host: str = Field(
         default="localhost:9200", description="The elastic search host URI."
     )
@@ -337,7 +349,7 @@ class ElasticsearchSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
 @config_class(ElasticsearchSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
-class ElasticsearchSource(Source):
+class ElasticsearchSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:

@@ -346,7 +358,7 @@ class ElasticsearchSource(Source):
     """

     def __init__(self, config: ElasticsearchSourceConfig, ctx: PipelineContext):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.source_config = config
         self.client = Elasticsearch(
             self.source_config.host,
@@ -361,7 +373,7 @@ class ElasticsearchSource(Source):
             ssl_assert_fingerprint=self.source_config.ssl_assert_fingerprint,
             url_prefix=self.source_config.url_prefix,
         )
-        self.report = ElasticsearchSourceReport()
+        self.report: ElasticsearchSourceReport = ElasticsearchSourceReport()
         self.data_stream_partition_count: Dict[str, int] = defaultdict(int)
         self.platform: str = "elasticsearch"
         self.cat_response: Optional[List[Dict[str, Any]]] = None
@@ -373,6 +385,14 @@ class ElasticsearchSource(Source):
         config = ElasticsearchSourceConfig.parse_obj(config_dict)
         return cls(config, ctx)

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         indices = self.client.indices.get_alias()
         for index in indices:
datahub/ingestion/source/feast.py

@@ -20,7 +20,6 @@ from feast.data_source import DataSource
 from pydantic import Field

 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -31,8 +30,16 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import MLFeatureDataType
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
     MLFeatureSnapshot,
@@ -86,7 +93,9 @@ _field_type_mapping: Dict[Union[ValueType, feast.types.FeastType], str] = {
 }


-class FeastRepositorySourceConfig(ConfigModel):
+class FeastRepositorySourceConfig(
+    StatefulIngestionConfigBase,
+):
     path: str = Field(description="Path to Feast repository")
     fs_yaml_file: Optional[str] = Field(
         default=None,
@@ -122,7 +131,7 @@ class FeastRepositorySourceConfig(ConfigModel):
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @dataclass
-class FeastRepositorySource(Source):
+class FeastRepositorySource(StatefulIngestionSourceBase):
     """
     This plugin extracts:

@@ -135,13 +144,14 @@ class FeastRepositorySource(Source):

     platform = "feast"
     source_config: FeastRepositorySourceConfig
-    report: SourceReport
+    report: StaleEntityRemovalSourceReport
     feature_store: FeatureStore

     def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.source_config = config
-        self.report = SourceReport()
+        self.ctx = ctx
+        self.report = StaleEntityRemovalSourceReport()
         self.feature_store = FeatureStore(
             repo_path=self.source_config.path,
             fs_yaml_file=self.source_config.fs_yaml_file,
@@ -158,7 +168,8 @@ class FeastRepositorySource(Source):

         if ml_feature_data_type is None:
             self.report.report_warning(
-                parent_name, f"unable to map type {field_type} to metadata schema"
+                "unable to map type",
+                f"unable to map type {field_type} to metadata schema to parent: {parent_name}",
             )

             ml_feature_data_type = MLFeatureDataType.UNKNOWN
@@ -456,6 +467,14 @@ class FeastRepositorySource(Source):
         config = FeastRepositorySourceConfig.parse_obj(config_dict)
         return cls(config, ctx)

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         for feature_view in self.feature_store.list_feature_views():
             for entity_name in feature_view.entities:
datahub/ingestion/source/file.py

@@ -351,7 +351,7 @@ class GenericFileSource(StatefulIngestionSourceBase, TestableSource):
                 self.report.add_deserialize_time(deserialize_duration)
                 yield i, item
         except Exception as e:
-            self.report.report_failure(f"path-{i}", str(e))
+            self.report.report_failure(f"{file_status.path}-{i}", str(e))

     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
@@ -410,10 +410,13 @@ def _from_obj_for_file(
         item = MetadataChangeEvent.from_obj(obj)
     elif "aspect" in obj:
         item = MetadataChangeProposalWrapper.from_obj(obj)
-    else:
+    elif "bucket" in obj:
         item = UsageAggregationClass.from_obj(obj)
+    else:
+        raise ValueError(f"Unknown object type: {obj}")
+
     if not item.validate():
-        raise ValueError(f"failed to parse: {obj}")
+        raise ValueError(f"Failed to parse: {obj}")

     if isinstance(item, UsageAggregationClass):
         logger.warning(f"Dropping deprecated UsageAggregationClass: {item}")
datahub/ingestion/source/gc/dataprocess_cleanup.py

@@ -498,7 +498,7 @@ class DataProcessCleanup:
         # Delete empty dataflows if needed
         if self.config.delete_empty_data_flows:
             deleted_data_flows: int = 0
-            for key in dataFlows.keys():
+            for key in dataFlows:
                 if not dataJobs.get(key) or len(dataJobs[key]) == 0:
                     logger.info(
                         f"Deleting dataflow {key} because there are not datajobs"
datahub/ingestion/source/gc/execution_request_cleanup.py

@@ -130,8 +130,9 @@ class DatahubExecutionRequestCleanup:
         )

     def _scroll_execution_requests(
-        self, overrides: Dict[str, Any] = {}
+        self, overrides: Optional[Dict[str, Any]] = None
     ) -> Iterator[CleanupRecord]:
+        overrides = overrides or {}
         headers: Dict[str, Any] = {
             "Accept": "application/json",
             "Content-Type": "application/json",
datahub/ingestion/source/ge_data_profiler.py

@@ -170,14 +170,10 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> int
             ).select_from(self._table)
         )
         return convert_to_json_serializable(element_values.fetchone()[0])
-    elif self.engine.dialect.name.lower() == BIGQUERY:
-        element_values = self.engine.execute(
-            sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
-                self._table
-            )
-        )
-        return convert_to_json_serializable(element_values.fetchone()[0])
-    elif self.engine.dialect.name.lower() == SNOWFLAKE:
+    elif (
+        self.engine.dialect.name.lower() == BIGQUERY
+        or self.engine.dialect.name.lower() == SNOWFLAKE
+    ):
         element_values = self.engine.execute(
             sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
                 self._table
@@ -381,13 +377,14 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
             col = col_dict["name"]
             self.column_types[col] = str(col_dict["type"])
             # We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
-            if not self.config._allow_deny_patterns.allowed(
-                f"{self.dataset_name}.{col}"
+            if (
+                not self.config._allow_deny_patterns.allowed(
+                    f"{self.dataset_name}.{col}"
+                )
+                or not self.config.profile_nested_fields
+                and "." in col
             ):
                 ignored_columns_by_pattern.append(col)
-            # We try to ignore nested columns as well
-            elif not self.config.profile_nested_fields and "." in col:
-                ignored_columns_by_pattern.append(col)
             elif col_dict.get("type") and self._should_ignore_column(col_dict["type"]):
                 ignored_columns_by_type.append(col)
             else:
@@ -605,7 +602,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         if not self.config.include_field_median_value:
             return
         try:
-            if self.dataset.engine.dialect.name.lower() == SNOWFLAKE:
+            if self.dataset.engine.dialect.name.lower() in [SNOWFLAKE, DATABRICKS]:
                 column_profile.median = str(
                     self.dataset.engine.execute(
                         sa.select([sa.func.median(sa.column(column))]).select_from(
@@ -1408,7 +1405,7 @@ class DatahubGEProfiler:
             },
         )

-        if platform == BIGQUERY or platform == DATABRICKS:
+        if platform in (BIGQUERY, DATABRICKS):
             # This is done as GE makes the name as DATASET.TABLE
             # but we want it to be PROJECT.DATASET.TABLE instead for multi-project setups
             name_parts = pretty_name.split(".")
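
Note on the merged column-filter condition in the _SingleDatasetProfiler hunk above: since and binds tighter than or, A or B and C parses as A or (B and C), so the refactor preserves the original two-branch behavior (pattern-denied columns, or nested columns when nested profiling is off). A plain-Python check of that equivalence:

def ignored(denied: bool, profile_nested: bool, is_nested: bool) -> bool:
    return denied or not profile_nested and is_nested

assert ignored(True, True, False) is True    # denied by pattern
assert ignored(False, False, True) is True   # nested, nested profiling off
assert ignored(False, True, True) is False   # nested, but profiling enabled
assert ignored(False, False, False) is False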
datahub/ingestion/source/iceberg/iceberg.py

@@ -2,8 +2,9 @@ import json
 import logging
 import threading
 import uuid
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Tuple

+from dateutil import parser as dateutil_parser
 from pyiceberg.catalog import Catalog
 from pyiceberg.exceptions import (
     NoSuchIcebergTableError,
@@ -81,6 +82,7 @@ from datahub.metadata.schema_classes import (
     OwnerClass,
     OwnershipClass,
     OwnershipTypeClass,
+    TimeStampClass,
 )
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
@@ -183,16 +185,9 @@ class IcebergSource(StatefulIngestionSourceBase):
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         thread_local = threading.local()

-        def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
-            LOGGER.debug(f"Processing dataset for path {dataset_path}")
-            dataset_name = ".".join(dataset_path)
-            if not self.config.table_pattern.allowed(dataset_name):
-                # Dataset name is rejected by pattern, report as dropped.
-                self.report.report_dropped(dataset_name)
-                LOGGER.debug(
-                    f"Skipping table {dataset_name} due to not being allowed by the config pattern"
-                )
-                return
+        def _try_processing_dataset(
+            dataset_path: Tuple[str, ...], dataset_name: str
+        ) -> Iterable[MetadataWorkUnit]:
             try:
                 if not hasattr(thread_local, "local_catalog"):
                     LOGGER.debug(
@@ -248,10 +243,31 @@ class IcebergSource(StatefulIngestionSourceBase):
                 LOGGER.warning(
                     f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
                 )
+            except ValueError as e:
+                if "Could not initialize FileIO" not in str(e):
+                    raise
+                self.report.warning(
+                    "Could not initialize FileIO",
+                    f"Could not initialize FileIO for {dataset_path} due to: {e}",
+                )
+
+        def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
+            try:
+                LOGGER.debug(f"Processing dataset for path {dataset_path}")
+                dataset_name = ".".join(dataset_path)
+                if not self.config.table_pattern.allowed(dataset_name):
+                    # Dataset name is rejected by pattern, report as dropped.
+                    self.report.report_dropped(dataset_name)
+                    LOGGER.debug(
+                        f"Skipping table {dataset_name} due to not being allowed by the config pattern"
+                    )
+                    return
+
+                yield from _try_processing_dataset(dataset_path, dataset_name)
             except Exception as e:
                 self.report.report_failure(
                     "general",
-                    f"Failed to create workunit for dataset {dataset_name}: {e}",
+                    f"Failed to create workunit for dataset {dataset_path}: {e}",
                 )
                 LOGGER.exception(
                     f"Exception while processing table {dataset_path}, skipping it.",
@@ -288,6 +304,7 @@ class IcebergSource(StatefulIngestionSourceBase):
         )

         # Dataset properties aspect.
+        additional_properties = {}
         custom_properties = table.metadata.properties.copy()
         custom_properties["location"] = table.metadata.location
         custom_properties["format-version"] = str(table.metadata.format_version)
@@ -299,10 +316,27 @@ class IcebergSource(StatefulIngestionSourceBase):
             custom_properties["manifest-list"] = (
                 table.current_snapshot().manifest_list
             )
+            additional_properties["lastModified"] = TimeStampClass(
+                int(table.current_snapshot().timestamp_ms)
+            )
+        if "created-at" in custom_properties:
+            try:
+                dt = dateutil_parser.isoparse(custom_properties["created-at"])
+                additional_properties["created"] = TimeStampClass(
+                    int(dt.timestamp() * 1000)
+                )
+            except Exception as ex:
+                LOGGER.warning(
+                    f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
+                )
+
         dataset_properties = DatasetPropertiesClass(
             name=table.name()[-1],
             description=table.metadata.properties.get("comment", None),
             customProperties=custom_properties,
+            lastModified=additional_properties.get("lastModified"),
+            created=additional_properties.get("created"),
+            qualifiedName=dataset_name,
         )
         dataset_snapshot.aspects.append(dataset_properties)
         # Dataset ownership aspect.
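
Note: the created-at handling added above parses the table property with dateutil and converts it to epoch milliseconds before wrapping it in TimeStampClass. A standalone illustration with an arbitrary sample value:

from dateutil import parser as dateutil_parser

created_at = "2024-05-01T12:30:00+00:00"  # sample value, not from this release
dt = dateutil_parser.isoparse(created_at)
created_millis = int(dt.timestamp() * 1000)
assert created_millis == 1714566600000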