acryl-datahub 0.14.1.13rc9__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (133)
  1. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2348 -2298
  2. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +130 -125
  3. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/common/subtypes.py +2 -0
  35. datahub/ingestion/source/csv_enricher.py +1 -1
  36. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  37. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  38. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  39. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  40. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  41. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  42. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  43. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  44. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  45. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  46. datahub/ingestion/source/elastic_search.py +1 -1
  47. datahub/ingestion/source/feast.py +97 -6
  48. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  49. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  50. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  51. datahub/ingestion/source/ge_data_profiler.py +23 -1
  52. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  53. datahub/ingestion/source/kafka/kafka.py +39 -19
  54. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  55. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  56. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  57. datahub/ingestion/source/looker/view_upstream.py +65 -30
  58. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  59. datahub/ingestion/source/mode.py +0 -23
  60. datahub/ingestion/source/neo4j/__init__.py +0 -0
  61. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  62. datahub/ingestion/source/powerbi/__init__.py +0 -1
  63. datahub/ingestion/source/powerbi/config.py +3 -3
  64. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  65. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  66. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  67. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  68. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  69. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  70. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  71. datahub/ingestion/source/preset.py +1 -0
  72. datahub/ingestion/source/pulsar.py +21 -2
  73. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  74. datahub/ingestion/source/redash.py +13 -63
  75. datahub/ingestion/source/redshift/config.py +1 -0
  76. datahub/ingestion/source/redshift/redshift.py +3 -0
  77. datahub/ingestion/source/s3/source.py +2 -3
  78. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  79. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  80. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  81. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  82. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  83. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  84. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  85. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  86. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  87. datahub/ingestion/source/sql/athena.py +46 -22
  88. datahub/ingestion/source/sql/mssql/source.py +0 -2
  89. datahub/ingestion/source/sql/sql_common.py +34 -21
  90. datahub/ingestion/source/sql/sql_report.py +1 -0
  91. datahub/ingestion/source/sql/sql_types.py +85 -8
  92. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  93. datahub/ingestion/source/superset.py +215 -65
  94. datahub/ingestion/source/tableau/tableau.py +237 -76
  95. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  96. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  97. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  98. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  99. datahub/ingestion/source/unity/proxy_types.py +1 -0
  100. datahub/ingestion/source/unity/source.py +4 -0
  101. datahub/ingestion/source/unity/usage.py +20 -11
  102. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  103. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  104. datahub/integrations/assertion/common.py +1 -1
  105. datahub/lite/duckdb_lite.py +12 -17
  106. datahub/metadata/_schema_classes.py +512 -392
  107. datahub/metadata/_urns/urn_defs.py +1355 -1355
  108. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  109. datahub/metadata/schema.avsc +17222 -17499
  110. datahub/metadata/schemas/FormInfo.avsc +4 -0
  111. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  112. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  113. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  114. datahub/specific/chart.py +0 -39
  115. datahub/specific/dashboard.py +0 -39
  116. datahub/specific/datajob.py +7 -57
  117. datahub/sql_parsing/schema_resolver.py +23 -0
  118. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  119. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  120. datahub/sql_parsing/sqlglot_utils.py +8 -2
  121. datahub/telemetry/telemetry.py +23 -9
  122. datahub/testing/compare_metadata_json.py +1 -1
  123. datahub/testing/doctest.py +12 -0
  124. datahub/utilities/file_backed_collections.py +35 -2
  125. datahub/utilities/partition_executor.py +1 -1
  126. datahub/utilities/urn_encoder.py +2 -1
  127. datahub/utilities/urns/_urn_base.py +1 -1
  128. datahub/utilities/urns/structured_properties_urn.py +1 -1
  129. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  130. datahub/utilities/sql_parser.py +0 -94
  131. datahub/utilities/sql_parser_base.py +0 -21
  132. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  133. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/gc/datahub_gc.py

@@ -65,18 +65,18 @@ class DataHubGcSourceConfig(ConfigModel):
         description="Sleep between truncation monitoring.",
     )

-    dataprocess_cleanup: Optional[DataProcessCleanupConfig] = Field(
-        default=None,
+    dataprocess_cleanup: DataProcessCleanupConfig = Field(
+        default_factory=DataProcessCleanupConfig,
         description="Configuration for data process cleanup",
     )

-    soft_deleted_entities_cleanup: Optional[SoftDeletedEntitiesCleanupConfig] = Field(
-        default=None,
+    soft_deleted_entities_cleanup: SoftDeletedEntitiesCleanupConfig = Field(
+        default_factory=SoftDeletedEntitiesCleanupConfig,
         description="Configuration for soft deleted entities cleanup",
     )

-    execution_request_cleanup: Optional[DatahubExecutionRequestCleanupConfig] = Field(
-        default=None,
+    execution_request_cleanup: DatahubExecutionRequestCleanupConfig = Field(
+        default_factory=DatahubExecutionRequestCleanupConfig,
         description="Configuration for execution request cleanup",
     )

@@ -108,28 +108,22 @@ class DataHubGcSource(Source):
         self.ctx = ctx
         self.config = config
         self.report = DataHubGcSourceReport()
+        self.report.event_not_produced_warn = False
         self.graph = ctx.require_graph("The DataHubGc source")
-        self.dataprocess_cleanup: Optional[DataProcessCleanup] = None
-        self.soft_deleted_entities_cleanup: Optional[SoftDeletedEntitiesCleanup] = None
-        self.execution_request_cleanup: Optional[DatahubExecutionRequestCleanup] = None
-
-        if self.config.dataprocess_cleanup:
-            self.dataprocess_cleanup = DataProcessCleanup(
-                ctx, self.config.dataprocess_cleanup, self.report, self.config.dry_run
-            )
-        if self.config.soft_deleted_entities_cleanup:
-            self.soft_deleted_entities_cleanup = SoftDeletedEntitiesCleanup(
-                ctx,
-                self.config.soft_deleted_entities_cleanup,
-                self.report,
-                self.config.dry_run,
-            )
-        if self.config.execution_request_cleanup:
-            self.execution_request_cleanup = DatahubExecutionRequestCleanup(
-                config=self.config.execution_request_cleanup,
-                graph=self.graph,
-                report=self.report,
-            )
+        self.dataprocess_cleanup = DataProcessCleanup(
+            ctx, self.config.dataprocess_cleanup, self.report, self.config.dry_run
+        )
+        self.soft_deleted_entities_cleanup = SoftDeletedEntitiesCleanup(
+            ctx,
+            self.config.soft_deleted_entities_cleanup,
+            self.report,
+            self.config.dry_run,
+        )
+        self.execution_request_cleanup = DatahubExecutionRequestCleanup(
+            config=self.config.execution_request_cleanup,
+            graph=self.graph,
+            report=self.report,
+        )

     @classmethod
     def create(cls, config_dict, ctx):
@@ -144,15 +138,32 @@ class DataHubGcSource(Source):
         self,
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.cleanup_expired_tokens:
-            self.revoke_expired_tokens()
+            try:
+                self.revoke_expired_tokens()
+            except Exception as e:
+                self.report.failure("While trying to cleanup expired token ", exc=e)
         if self.config.truncate_indices:
-            self.truncate_indices()
-        if self.dataprocess_cleanup:
-            yield from self.dataprocess_cleanup.get_workunits_internal()
-        if self.soft_deleted_entities_cleanup:
-            self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
-        if self.execution_request_cleanup:
-            self.execution_request_cleanup.run()
+            try:
+                self.truncate_indices()
+            except Exception as e:
+                self.report.failure("While trying to truncate indices ", exc=e)
+        if self.config.soft_deleted_entities_cleanup.enabled:
+            try:
+                self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
+            except Exception as e:
+                self.report.failure(
+                    "While trying to cleanup soft deleted entities ", exc=e
+                )
+        if self.config.execution_request_cleanup.enabled:
+            try:
+                self.execution_request_cleanup.run()
+            except Exception as e:
+                self.report.failure("While trying to cleanup execution request ", exc=e)
+        if self.config.dataprocess_cleanup.enabled:
+            try:
+                yield from self.dataprocess_cleanup.get_workunits_internal()
+            except Exception as e:
+                self.report.failure("While trying to cleanup data process ", exc=e)
         yield from []

     def truncate_indices(self) -> None:
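The net effect of the datahub_gc.py hunks above: the three cleanup sub-configs are always present (built via default_factory), the cleanup helpers are always constructed, and whether each step actually runs is decided by per-step enabled flags added further down in this diff, with failures reported instead of aborting the whole run. A minimal sketch of that pydantic pattern, using simplified stand-in models rather than the real DataHub classes:

# Simplified stand-ins -- not the actual DataHub classes -- showing the
# default_factory + enabled-flag pattern adopted by DataHubGcSourceConfig above.
from pydantic import BaseModel, Field


class CleanupStepConfig(BaseModel):
    enabled: bool = True
    retention_days: int = 10


class GcConfig(BaseModel):
    # The sub-config is always instantiated, so callers no longer need None checks.
    dataprocess_cleanup: CleanupStepConfig = Field(default_factory=CleanupStepConfig)


cfg = GcConfig()  # omitting the section in a recipe now yields defaults, not None
assert cfg.dataprocess_cleanup.enabled is True

# A step is switched off via its enabled flag rather than by omitting its section.
cfg = GcConfig(dataprocess_cleanup={"enabled": False})
assert cfg.dataprocess_cleanup.enabled is False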
datahub/ingestion/source/gc/dataprocess_cleanup.py

@@ -98,6 +98,9 @@ query getDataJobRuns($dataJobUrn: String!, $start: Int!, $count: Int!) {


 class DataProcessCleanupConfig(ConfigModel):
+    enabled: bool = Field(
+        default=True, description="Whether to do data process cleanup."
+    )
     retention_days: Optional[int] = Field(
         10,
         description="Number of days to retain metadata in DataHub",
@@ -114,11 +117,11 @@ class DataProcessCleanupConfig(ConfigModel):
     )

     delete_empty_data_jobs: bool = Field(
-        True, description="Wether to delete Data Jobs without runs"
+        False, description="Whether to delete Data Jobs without runs"
     )

     delete_empty_data_flows: bool = Field(
-        True, description="Wether to delete Data Flows without runs"
+        False, description="Whether to delete Data Flows without runs"
     )

     hard_delete_entities: bool = Field(
@@ -128,7 +131,7 @@ class DataProcessCleanupConfig(ConfigModel):

     batch_size: int = Field(
         500,
-        description="The number of entities to get in a batch from GraphQL",
+        description="The number of entities to get in a batch from API",
     )

     max_workers: int = Field(
@@ -173,9 +176,9 @@ class DataProcessCleanup:
     """
     This source is a maintenance source which cleans up old/unused aspects.

-    Currently it only supports:.
+    Currently it only supports:
         - DataFlow
-        -DataJob
+        - DataJob
         - DataProcessInstance

     """
@@ -207,23 +210,34 @@ class DataProcessCleanup:
         assert self.ctx.graph
         dpis = []
         start = 0
+        # This graphql endpoint doesn't support scrolling and therefore after 10k DPIs it causes performance issues on ES
+        # Therefore, we are limiting the max DPIs to 9000
+        max_item = 9000
         while True:
-            job_query_result = self.ctx.graph.execute_graphql(
-                DATA_PROCESS_INSTANCES_QUERY,
-                {"dataJobUrn": job_urn, "start": start, "count": batch_size},
-            )
-            job_data = job_query_result.get("dataJob")
-            if not job_data:
-                raise ValueError(f"Error getting job {job_urn}")
-
-            runs_data = job_data.get("runs")
-            if not runs_data:
-                raise ValueError(f"Error getting runs for {job_urn}")
-
-            runs = runs_data.get("runs")
-            dpis.extend(runs)
-            start += batch_size
-            if len(runs) < batch_size:
+            try:
+                job_query_result = self.ctx.graph.execute_graphql(
+                    DATA_PROCESS_INSTANCES_QUERY,
+                    {"dataJobUrn": job_urn, "start": start, "count": batch_size},
+                )
+                job_data = job_query_result.get("dataJob")
+                if not job_data:
+                    logger.error(f"Error getting job {job_urn}")
+                    break
+
+                runs_data = job_data.get("runs")
+                if not runs_data:
+                    logger.error(f"Error getting runs for {job_urn}")
+                    break
+
+                runs = runs_data.get("runs")
+                dpis.extend(runs)
+                start += batch_size
+                if len(runs) < batch_size or start >= max_item:
+                    break
+            except Exception as e:
+                self.report.failure(
+                    f"Exception while fetching DPIs for job {job_urn}:", exc=e
+                )
                 break
         return dpis

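fetch_dpis above pages through dataProcessInstance runs with an offset-based GraphQL query, so the rewrite stops as soon as a page comes back short or the 9000-item cap is reached, and turns per-job errors into report entries instead of exceptions. The same pagination pattern in isolation, where fetch_page is a hypothetical stand-in for the GraphQL call rather than DataHub code:

# Standalone sketch of the offset-based pagination pattern used in fetch_dpis above.
from typing import Callable, Dict, List


def fetch_all(
    fetch_page: Callable[[int, int], List[Dict]],
    batch_size: int = 500,
    max_item: int = 9000,  # stop before deep offsets start hurting Elasticsearch
) -> List[Dict]:
    results: List[Dict] = []
    start = 0
    while True:
        try:
            page = fetch_page(start, batch_size)
        except Exception:
            # On error, keep whatever was collected so far instead of failing the run.
            break
        results.extend(page)
        start += batch_size
        # Stop when a short page signals the end, or when the cap is reached.
        if len(page) < batch_size or start >= max_item:
            break
    return results


# Usage with a fake page source of 1200 items:
data = list(range(1200))
pages = fetch_all(lambda start, count: [{"i": i} for i in data[start : start + count]])
assert len(pages) == 1200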
@@ -243,9 +257,14 @@ class DataProcessCleanup:
                futures[future] = dpi

            for future in as_completed(futures):
-                deleted_count_last_n += 1
-                futures[future]["deleted"] = True
-
+                try:
+                    future.result()
+                    deleted_count_last_n += 1
+                    futures[future]["deleted"] = True
+                except Exception as e:
+                    self.report.report_failure(
+                        f"Exception while deleting DPI: {e}", exc=e
+                    )
                if deleted_count_last_n % self.config.batch_size == 0:
                    logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
                    if self.config.delay:
@@ -267,7 +286,7 @@ class DataProcessCleanup:

        if self.dry_run:
            logger.info(
-                f"Dry run is on otherwise it would have deleted {urn} with hard deletion is{self.config.hard_delete_entities}"
+                f"Dry run is on otherwise it would have deleted {urn} with hard deletion is {self.config.hard_delete_entities}"
            )
            return

@@ -277,7 +296,12 @@ class DataProcessCleanup:
        assert self.ctx.graph

        dpis = self.fetch_dpis(job.urn, self.config.batch_size)
-        dpis.sort(key=lambda x: x["created"]["time"], reverse=True)
+        dpis.sort(
+            key=lambda x: x["created"]["time"]
+            if x.get("created") and x["created"].get("time")
+            else 0,
+            reverse=True,
+        )

        with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
            if self.config.keep_last_n:
@@ -309,15 +333,23 @@ class DataProcessCleanup:
            if dpi.get("deleted"):
                continue

-            if dpi["created"]["time"] < retention_time * 1000:
+            if (
+                not dpi.get("created")
+                or not dpi["created"].get("time")
+                or dpi["created"]["time"] < retention_time * 1000
+            ):
                future = executor.submit(
                    self.delete_entity, dpi["urn"], "dataprocessInstance"
                )
                futures[future] = dpi

        for future in as_completed(futures):
-            deleted_count_retention += 1
-            futures[future]["deleted"] = True
+            try:
+                future.result()
+                deleted_count_retention += 1
+                futures[future]["deleted"] = True
+            except Exception as e:
+                self.report.report_failure(f"Exception while deleting DPI: {e}", exc=e)

            if deleted_count_retention % self.config.batch_size == 0:
                logger.info(
@@ -328,9 +360,12 @@ class DataProcessCleanup:
                    logger.info(f"Sleeping for {self.config.delay} seconds")
                    time.sleep(self.config.delay)

-        logger.info(
-            f"Deleted {deleted_count_retention} DPIs from {job.urn} due to retention"
-        )
+        if deleted_count_retention > 0:
+            logger.info(
+                f"Deleted {deleted_count_retention} DPIs from {job.urn} due to retention"
+            )
+        else:
+            logger.debug(f"No DPIs to delete from {job.urn} due to retention")

    def get_data_flows(self) -> Iterable[DataFlowEntity]:
        assert self.ctx.graph
@@ -339,17 +374,26 @@ class DataProcessCleanup:
        previous_scroll_id: Optional[str] = None

        while True:
-            result = self.ctx.graph.execute_graphql(
-                DATAFLOW_QUERY,
-                {
-                    "query": "*",
-                    "scrollId": scroll_id if scroll_id else None,
-                    "batchSize": self.config.batch_size,
-                },
-            )
+            result = None
+            try:
+                result = self.ctx.graph.execute_graphql(
+                    DATAFLOW_QUERY,
+                    {
+                        "query": "*",
+                        "scrollId": scroll_id if scroll_id else None,
+                        "batchSize": self.config.batch_size,
+                    },
+                )
+            except Exception as e:
+                self.report.failure(
+                    f"While trying to get dataflows with {scroll_id}", exc=e
+                )
+                break
+
            scrollAcrossEntities = result.get("scrollAcrossEntities")
            if not scrollAcrossEntities:
                raise ValueError("Missing scrollAcrossEntities in response")
+            logger.info(f"Got {scrollAcrossEntities.get('count')} DataFlow entities")

            scroll_id = scrollAcrossEntities.get("nextScrollId")
            for flow in scrollAcrossEntities.get("searchResults"):
@@ -366,6 +410,8 @@ class DataProcessCleanup:
            previous_scroll_id = scroll_id

    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        if not self.config.enabled:
+            return []
        assert self.ctx.graph

        dataFlows: Dict[str, DataFlowEntity] = {}
@@ -373,17 +419,26 @@ class DataProcessCleanup:
            dataFlows[flow.urn] = flow

        scroll_id: Optional[str] = None
+        previous_scroll_id: Optional[str] = None
+
        dataJobs: Dict[str, List[DataJobEntity]] = defaultdict(list)
        deleted_jobs: int = 0
+
        while True:
-            result = self.ctx.graph.execute_graphql(
-                DATAJOB_QUERY,
-                {
-                    "query": "*",
-                    "scrollId": scroll_id if scroll_id else None,
-                    "batchSize": self.config.batch_size,
-                },
-            )
+            try:
+                result = self.ctx.graph.execute_graphql(
+                    DATAJOB_QUERY,
+                    {
+                        "query": "*",
+                        "scrollId": scroll_id if scroll_id else None,
+                        "batchSize": self.config.batch_size,
+                    },
+                )
+            except Exception as e:
+                self.report.failure(
+                    f"While trying to get data jobs with {scroll_id}", exc=e
+                )
+                break
            scrollAcrossEntities = result.get("scrollAcrossEntities")
            if not scrollAcrossEntities:
                raise ValueError("Missing scrollAcrossEntities in response")
@@ -404,7 +459,9 @@ class DataProcessCleanup:
                try:
                    self.delete_dpi_from_datajobs(datajob_entity)
                except Exception as e:
-                    logger.error(f"While trying to delete {datajob_entity} got {e}")
+                    self.report.failure(
+                        f"While trying to delete {datajob_entity} ", exc=e
+                    )
                if (
                    datajob_entity.total_runs == 0
                    and self.config.delete_empty_data_jobs
@@ -419,9 +476,11 @@ class DataProcessCleanup:
                else:
                    dataJobs[datajob_entity.flow_urn].append(datajob_entity)

-            if not scroll_id:
+            if not scroll_id or previous_scroll_id == scroll_id:
                break

+            previous_scroll_id = scroll_id
+
        logger.info(f"Deleted {deleted_jobs} DataJobs")
        # Delete empty dataflows if needed
        if self.config.delete_empty_data_flows:
@@ -436,4 +495,5 @@ class DataProcessCleanup:
                if deleted_jobs % self.config.batch_size == 0:
                    logger.info(f"Deleted {deleted_data_flows} DataFlows")
        logger.info(f"Deleted {deleted_data_flows} DataFlows")
+
        return []
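The DataJob scroll loop above now tracks previous_scroll_id and bails out when the server hands back the same cursor twice, so a stuck scroll can no longer spin forever. The guard in isolation, where scroll_page is a hypothetical stand-in for execute_graphql plus the scrollAcrossEntities unpacking:

# Isolated sketch of the repeated-scroll-id guard added to the scroll loops above.
from typing import Callable, Dict, List, Optional, Tuple


def scroll_all(
    scroll_page: Callable[[Optional[str]], Tuple[Optional[str], List[Dict]]],
) -> List[Dict]:
    results: List[Dict] = []
    scroll_id: Optional[str] = None
    previous_scroll_id: Optional[str] = None
    while True:
        scroll_id, items = scroll_page(scroll_id)
        results.extend(items)
        # Stop on a missing scroll id *or* when the cursor stops advancing.
        if not scroll_id or previous_scroll_id == scroll_id:
            break
        previous_scroll_id = scroll_id
    return results


# Usage with a fake two-page source:
pages = {None: ("s1", [{"a": 1}]), "s1": (None, [{"a": 2}])}
assert scroll_all(lambda sid: pages[sid]) == [{"a": 1}, {"a": 2}]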
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -20,6 +20,9 @@ logger = logging.getLogger(__name__)


 class SoftDeletedEntitiesCleanupConfig(ConfigModel):
+    enabled: bool = Field(
+        default=True, description="Whether to do soft deletion cleanup."
+    )
     retention_days: Optional[int] = Field(
         10,
         description="Number of days to retain metadata in DataHub",
@@ -60,7 +63,7 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
         description="Query to filter entities",
     )
     limit_entities_delete: Optional[int] = Field(
-        10000, description="Max number of entities to delete."
+        25000, description="Max number of entities to delete."
     )

     runtime_limit_seconds: Optional[int] = Field(
@@ -104,7 +107,7 @@ class SoftDeletedEntitiesCleanup:
     def delete_entity(self, urn: str) -> None:
         assert self.ctx.graph

-        entity_urn = Urn.create_from_string(urn)
+        entity_urn = Urn.from_string(urn)
         self.report.num_soft_deleted_entity_removed += 1
         self.report.num_soft_deleted_entity_removed_by_type[entity_urn.entity_type] = (
             self.report.num_soft_deleted_entity_removed_by_type.get(
@@ -156,6 +159,8 @@ class SoftDeletedEntitiesCleanup:
                 self.delete_entity(urn)

     def cleanup_soft_deleted_entities(self) -> None:
+        if not self.config.enabled:
+            return
         assert self.ctx.graph
         start_time = time.time()

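delete_entity above switches from the older Urn.create_from_string spelling to Urn.from_string; both parse a urn string and expose entity_type, which the per-type counters rely on. A small hedged usage example, with a placeholder dataset urn:

# Hedged example of the Urn.from_string call used in delete_entity above;
# the urn value below is a placeholder.
from datahub.utilities.urns.urn import Urn

entity_urn = Urn.from_string(
    "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD)"
)
print(entity_urn.entity_type)  # -> "dataset"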
datahub/ingestion/source/ge_data_profiler.py

@@ -57,7 +57,11 @@ from datahub.ingestion.source.profiling.common import (
     convert_to_cardinality,
 )
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
-from datahub.metadata.com.linkedin.pegasus2avro.schema import EditableSchemaMetadata
+from datahub.ingestion.source.sql.sql_types import resolve_sql_type
+from datahub.metadata.com.linkedin.pegasus2avro.schema import (
+    EditableSchemaMetadata,
+    NumberType,
+)
 from datahub.metadata.schema_classes import (
     DatasetFieldProfileClass,
     DatasetProfileClass,
@@ -361,6 +365,8 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
     platform: str
     env: str

+    column_types: Dict[str, str] = dataclasses.field(default_factory=dict)
+
     def _get_columns_to_profile(self) -> List[str]:
         if not self.config.any_field_level_metrics_enabled():
             return []
@@ -374,6 +380,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):

         for col_dict in self.dataset.columns:
             col = col_dict["name"]
+            self.column_types[col] = str(col_dict["type"])
             # We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
             if not self.config._allow_deny_patterns.allowed(
                 f"{self.dataset_name}.{col}"
@@ -430,6 +437,21 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
             self.dataset, column
         )

+        if column_spec.type_ == ProfilerDataType.UNKNOWN:
+            try:
+                datahub_field_type = resolve_sql_type(
+                    self.column_types[column], self.dataset.engine.dialect.name.lower()
+                )
+            except Exception as e:
+                logger.debug(
+                    f"Error resolving sql type {self.column_types[column]}: {e}"
+                )
+                datahub_field_type = None
+            if datahub_field_type is None:
+                return
+            if isinstance(datahub_field_type, NumberType):
+                column_spec.type_ = ProfilerDataType.NUMERIC
+
     @_run_with_query_combiner
     def _get_column_cardinality(
         self, column_spec: _SingleColumnSpec, column: str
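The profiler change above keeps the raw SQL type string for every column and, when GE classifies a column as UNKNOWN, asks resolve_sql_type to re-resolve it, upgrading the column to NUMERIC only when a NumberType comes back. A self-contained sketch of that fallback idea, with a toy type table standing in for the real resolve_sql_type:

# Self-contained sketch of the UNKNOWN-type fallback added above; the real code
# calls datahub's resolve_sql_type() instead of this toy lookup table.
from enum import Enum
from typing import Optional


class ProfilerDataType(Enum):
    UNKNOWN = "unknown"
    NUMERIC = "numeric"


NUMERIC_SQL_TYPES = {"int", "integer", "bigint", "decimal", "numeric", "double", "float"}


def resolve_numeric_fallback(current: ProfilerDataType, raw_sql_type: str) -> ProfilerDataType:
    """Upgrade UNKNOWN to NUMERIC when the raw column type looks numeric."""
    if current is not ProfilerDataType.UNKNOWN:
        return current
    base_type: Optional[str] = raw_sql_type.split("(")[0].strip().lower() or None
    if base_type in NUMERIC_SQL_TYPES:
        return ProfilerDataType.NUMERIC
    return current


assert resolve_numeric_fallback(ProfilerDataType.UNKNOWN, "DECIMAL(10,2)") is ProfilerDataType.NUMERIC
assert resolve_numeric_fallback(ProfilerDataType.UNKNOWN, "VARIANT") is ProfilerDataType.UNKNOWN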
datahub/ingestion/source/iceberg/iceberg.py

@@ -9,6 +9,7 @@ from pyiceberg.exceptions import (
     NoSuchIcebergTableError,
     NoSuchNamespaceError,
     NoSuchPropertyException,
+    NoSuchTableError,
 )
 from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
 from pyiceberg.table import Table
@@ -104,7 +105,7 @@ logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default.")
 @capability(
     SourceCapability.OWNERSHIP,
-    "Optionally enabled via configuration by specifying which Iceberg table property holds user or group ownership.",
+    "Automatically ingests ownership information from table properties based on `user_ownership_property` and `group_ownership_property`",
 )
 @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
 class IcebergSource(StatefulIngestionSourceBase):
@@ -192,9 +193,7 @@ class IcebergSource(StatefulIngestionSourceBase):
                 table = thread_local.local_catalog.load_table(dataset_path)
                 time_taken = timer.elapsed_seconds()
                 self.report.report_table_load_time(time_taken)
-                LOGGER.debug(
-                    f"Loaded table: {table.identifier}, time taken: {time_taken}"
-                )
+                LOGGER.debug(f"Loaded table: {table.name()}, time taken: {time_taken}")
             yield from self._create_iceberg_workunit(dataset_name, table)
         except NoSuchPropertyException as e:
             self.report.report_warning(
@@ -206,12 +205,20 @@ class IcebergSource(StatefulIngestionSourceBase):
             )
         except NoSuchIcebergTableError as e:
             self.report.report_warning(
-                "no-iceberg-table",
+                "not-an-iceberg-table",
                 f"Failed to create workunit for {dataset_name}. {e}",
             )
             LOGGER.warning(
                 f"NoSuchIcebergTableError while processing table {dataset_path}, skipping it.",
             )
+        except NoSuchTableError as e:
+            self.report.report_warning(
+                "no-such-table",
+                f"Failed to create workunit for {dataset_name}. {e}",
+            )
+            LOGGER.warning(
+                f"NoSuchTableError while processing table {dataset_path}, skipping it.",
+            )
         except Exception as e:
             self.report.report_failure("general", f"Failed to create workunit: {e}")
             LOGGER.exception(
datahub/ingestion/source/kafka/kafka.py

@@ -141,6 +141,10 @@ class KafkaSourceConfig(
         default=False,
         description="Disables the utilization of the TopicRecordNameStrategy for Schema Registry subjects. For more information, visit: https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#handling-differences-between-preregistered-and-client-derived-schemas:~:text=io.confluent.kafka.serializers.subject.TopicRecordNameStrategy",
     )
+    ingest_schemas_as_entities: bool = pydantic.Field(
+        default=False,
+        description="Enables ingesting schemas from schema registry as separate entities, in addition to the topics",
+    )


 def get_kafka_consumer(
@@ -148,7 +152,7 @@ def get_kafka_consumer(
 ) -> confluent_kafka.Consumer:
     consumer = confluent_kafka.Consumer(
         {
-            "group.id": "test",
+            "group.id": "datahub-kafka-ingestion",
             "bootstrap.servers": connection.bootstrap,
             **connection.consumer_config,
         }
@@ -164,6 +168,25 @@ def get_kafka_consumer(
     return consumer


+def get_kafka_admin_client(
+    connection: KafkaConsumerConnectionConfig,
+) -> AdminClient:
+    client = AdminClient(
+        {
+            "group.id": "datahub-kafka-ingestion",
+            "bootstrap.servers": connection.bootstrap,
+            **connection.consumer_config,
+        }
+    )
+    if CallableConsumerConfig.is_callable_config(connection.consumer_config):
+        # As per documentation, we need to explicitly call the poll method to make sure OAuth callback gets executed
+        # https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#kafka-client-configuration
+        logger.debug("Initiating polling for kafka admin client")
+        client.poll(timeout=30)
+        logger.debug("Initiated polling for kafka admin client")
+    return client
+
+
 @dataclass
 class KafkaSourceReport(StaleEntityRemovalSourceReport):
     topics_scanned: int = 0
@@ -278,13 +301,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
     def init_kafka_admin_client(self) -> None:
         try:
             # TODO: Do we require separate config than existing consumer_config ?
-            self.admin_client = AdminClient(
-                {
-                    "group.id": "test",
-                    "bootstrap.servers": self.source_config.connection.bootstrap,
-                    **self.source_config.connection.consumer_config,
-                }
-            )
+            self.admin_client = get_kafka_admin_client(self.source_config.connection)
         except Exception as e:
             logger.debug(e, exc_info=e)
             self.report.report_warning(
@@ -330,17 +347,20 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
             else:
                 self.report.report_dropped(topic)

-        # Get all subjects from schema registry and ingest them as SCHEMA DatasetSubTypes
-        for subject in self.schema_registry_client.get_subjects():
-            try:
-                yield from self._extract_record(
-                    subject, True, topic_detail=None, extra_topic_config=None
-                )
-            except Exception as e:
-                logger.warning(f"Failed to extract subject {subject}", exc_info=True)
-                self.report.report_warning(
-                    "subject", f"Exception while extracting topic {subject}: {e}"
-                )
+        if self.source_config.ingest_schemas_as_entities:
+            # Get all subjects from schema registry and ingest them as SCHEMA DatasetSubTypes
+            for subject in self.schema_registry_client.get_subjects():
+                try:
+                    yield from self._extract_record(
+                        subject, True, topic_detail=None, extra_topic_config=None
+                    )
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to extract subject {subject}", exc_info=True
+                    )
+                    self.report.report_warning(
+                        "subject", f"Exception while extracting topic {subject}: {e}"
+                    )

     def _extract_record(
         self,
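With the kafka.py changes above, schema-registry subjects are only emitted as their own dataset entities when the new ingest_schemas_as_entities flag is turned on, and both the consumer and the shared admin-client helper use the datahub-kafka-ingestion group id instead of "test". A hedged recipe sketch showing how the flag might be enabled through the programmatic Pipeline API; the broker, registry, and GMS endpoints below are placeholders:

# Hedged sketch: a minimal kafka ingestion recipe enabling the new
# ingest_schemas_as_entities flag. All URLs are placeholders.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "kafka",
            "config": {
                "connection": {
                    "bootstrap": "localhost:9092",
                    "schema_registry_url": "http://localhost:8081",
                },
                # New in this release: also ingest registry subjects as entities.
                "ingest_schemas_as_entities": True,
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()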