acryl-datahub 0.15.0.2rc6__py3-none-any.whl → 0.15.0.2rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/METADATA +2513 -2521
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/RECORD +168 -168
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/cli/cli_utils.py +1 -1
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/ingest_cli.py +25 -15
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +3 -3
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/entrypoints.py +6 -0
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +4 -6
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +11 -11
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/config.py +4 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/gc/datahub_gc.py +1 -0
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +17 -5
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +2 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +3 -3
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +3 -3
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/mlflow.py +4 -4
- datahub/ingestion/source/mode.py +5 -5
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -26
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +12 -12
- datahub/ingestion/source/redshift/usage.py +8 -8
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/sql_types.py +0 -1
- datahub/ingestion/source/sql/teradata.py +16 -3
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/tableau/tableau.py +245 -101
- datahub/ingestion/source/tableau/tableau_common.py +5 -2
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +3 -3
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +1 -1
- datahub/metadata/schema.avsc +6 -2
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/secret/datahub_secrets_client.py +12 -21
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
- datahub/sql_parsing/sqlglot_lineage.py +3 -3
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +10 -10
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/_urn_base.py +28 -5
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/csv_enricher.py

@@ -371,11 +371,11 @@ class CSVEnricherSource(Source):
         domain: Optional[str],
         description: Optional[str],
     ) -> Iterable[MetadataWorkUnit]:
-        maybe_terms_wu: Optional[
-            MetadataWorkUnit
-        ] = self.get_resource_glossary_terms_work_unit(
-            entity_urn=entity_urn,
-            term_associations=term_associations,
+        maybe_terms_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_glossary_terms_work_unit(
+                entity_urn=entity_urn,
+                term_associations=term_associations,
+            )
         )
         if maybe_terms_wu:
             self.report.num_glossary_term_workunits_produced += 1
@@ -389,31 +389,31 @@ class CSVEnricherSource(Source):
             self.report.num_tag_workunits_produced += 1
             yield maybe_tags_wu

-        maybe_owners_wu: Optional[
-            MetadataWorkUnit
-        ] = self.get_resource_owners_work_unit(
-            entity_urn=entity_urn,
-            owners=owners,
+        maybe_owners_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_owners_work_unit(
+                entity_urn=entity_urn,
+                owners=owners,
+            )
         )
         if maybe_owners_wu:
             self.report.num_owners_workunits_produced += 1
             yield maybe_owners_wu

-        maybe_domain_wu: Optional[
-            MetadataWorkUnit
-        ] = self.get_resource_domain_work_unit(
-            entity_urn=entity_urn,
-            domain=domain,
+        maybe_domain_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_domain_work_unit(
+                entity_urn=entity_urn,
+                domain=domain,
+            )
         )
         if maybe_domain_wu:
             self.report.num_domain_workunits_produced += 1
             yield maybe_domain_wu

-        maybe_description_wu: Optional[
-            MetadataWorkUnit
-        ] = self.get_resource_description_work_unit(
-            entity_urn=entity_urn,
-            description=description,
+        maybe_description_wu: Optional[MetadataWorkUnit] = (
+            self.get_resource_description_work_unit(
+                entity_urn=entity_urn,
+                description=description,
+            )
         )
         if maybe_description_wu:
             self.report.num_description_workunits_produced += 1
@@ -426,9 +426,9 @@ class CSVEnricherSource(Source):
         needs_write: bool,
     ) -> Tuple[EditableSchemaMetadataClass, bool]:
         field_path: str = sub_resource_row.field_path
-        term_associations: List[
-            GlossaryTermAssociationClass
-        ] = sub_resource_row.term_associations
+        term_associations: List[GlossaryTermAssociationClass] = (
+            sub_resource_row.term_associations
+        )
         tag_associations: List[TagAssociationClass] = sub_resource_row.tag_associations
         description: Optional[str] = sub_resource_row.description
         has_terms: bool = len(term_associations) > 0
@@ -517,9 +517,9 @@ class CSVEnricherSource(Source):
         # Boolean field to tell whether we need to write an MCPW.
         needs_write = False

-        current_editable_schema_metadata: Optional[
-            EditableSchemaMetadataClass
-        ] = None
+        current_editable_schema_metadata: Optional[EditableSchemaMetadataClass] = (
+            None
+        )
         if self.ctx.graph and not self.should_overwrite:
             # Fetch the current editable schema metadata
             current_editable_schema_metadata = self.ctx.graph.get_aspect(
@@ -655,9 +655,9 @@ class CSVEnricherSource(Source):
             entity_urn = row["resource"]
             entity_type = Urn.from_string(row["resource"]).get_type()

-            term_associations: List[
-                GlossaryTermAssociationClass
-            ] = self.maybe_extract_glossary_terms(row)
+            term_associations: List[GlossaryTermAssociationClass] = (
+                self.maybe_extract_glossary_terms(row)
+            )
             tag_associations: List[TagAssociationClass] = self.maybe_extract_tags(row)
             owners: List[OwnerClass] = self.maybe_extract_owners(row, is_resource_row)

datahub/ingestion/source/datahub/datahub_database_reader.py

@@ -152,7 +152,9 @@ class DataHubDatabaseReader:
     ) -> Iterable[Dict[str, Any]]:
         with self.engine.connect() as conn:
             if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
-                with …
+                with (
+                    conn.begin()
+                ):  # Transaction required for PostgreSQL server-side cursor
                     # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
                     # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
                     conn = conn.execution_options(
@@ -222,7 +224,7 @@ class DataHubDatabaseReader:
                 )
             except Exception as e:
                 logger.warning(
-                    f"…"
+                    f"Failed to parse metadata for {row['urn']}: {e}", exc_info=True
                 )
                 self.report.num_database_parse_errors += 1
                 self.report.database_parse_errors.setdefault(
datahub/ingestion/source/dbt/dbt_cloud.py

@@ -194,20 +194,20 @@ _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS = """

 _DBT_FIELDS_BY_TYPE = {
     "models": f"""
-        {…}
-        {…}
-        {…}
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+        {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
         dependsOn
         materializedType
     """,
     "seeds": f"""
-        {…}
-        {…}
-        {…}
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+        {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
     """,
     "sources": f"""
-        {…}
-        {…}
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
         identifier
         sourceName
         sourceDescription
@@ -218,9 +218,9 @@ _DBT_FIELDS_BY_TYPE = {
         loader
     """,
     "snapshots": f"""
-        {…}
-        {…}
-        {…}
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+        {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
         parentsSources {{
             uniqueId
         }}
@@ -229,7 +229,7 @@ _DBT_FIELDS_BY_TYPE = {
         }}
     """,
     "tests": f"""
-        {…}
+        {_DBT_GRAPHQL_COMMON_FIELDS}
         state
         columnName
         status
@@ -315,7 +315,7 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
             res = response.json()
             if "errors" in res:
                 raise ValueError(
-                    f"…"
+                    f"Unable to fetch metadata from dbt Cloud: {res['errors']}"
                 )
             data = res["data"]
         except JSONDecodeError as e:
datahub/ingestion/source/dbt/dbt_common.py

@@ -506,16 +506,18 @@ class DBTNode:
     materialization: Optional[str]  # table, view, ephemeral, incremental, snapshot
     # see https://docs.getdbt.com/reference/artifacts/manifest-json
     catalog_type: Optional[str]
-    missing_from_catalog: bool  # indicates if the node was missing from the catalog.json
+    missing_from_catalog: (
+        bool  # indicates if the node was missing from the catalog.json
+    )

     owner: Optional[str]

     columns: List[DBTColumn] = field(default_factory=list)
     upstream_nodes: List[str] = field(default_factory=list)  # list of upstream dbt_name
     upstream_cll: List[DBTColumnLineageInfo] = field(default_factory=list)
-    raw_sql_parsing_result: Optional[
-        SqlParsingResult
-    ] = None  # only set for nodes that don't depend on ephemeral models
+    raw_sql_parsing_result: Optional[SqlParsingResult] = (
+        None  # only set for nodes that don't depend on ephemeral models
+    )
     cll_debug_info: Optional[SqlParsingDebugInfo] = None

     meta: Dict[str, Any] = field(default_factory=dict)
@@ -869,10 +871,10 @@ class DBTSourceBase(StatefulIngestionSourceBase):
                     "platform": DBT_PLATFORM,
                     "name": node.dbt_name,
                     "instance": self.config.platform_instance,
+                    # Ideally we'd include the env unconditionally. However, we started out
+                    # not including env in the guid, so we need to maintain backwards compatibility
+                    # with existing PROD assertions.
                     **(
-                        # Ideally we'd include the env unconditionally. However, we started out
-                        # not including env in the guid, so we need to maintain backwards compatibility
-                        # with existing PROD assertions.
                         {"env": self.config.env}
                         if self.config.env != mce_builder.DEFAULT_ENV
                         and self.config.include_env_in_assertion_guid
datahub/ingestion/source/dremio/dremio_api.py

@@ -181,7 +181,7 @@ class DremioAPIOperations:
             return

        # On-prem Dremio authentication (PAT or Basic Auth)
-        for …
+        for _ in range(1, self._retry_count + 1):
            try:
                if connection_args.authentication_method == "PAT":
                    self.session.headers.update(
@@ -191,9 +191,9 @@ class DremioAPIOperations:
                     )
                     return
                 else:
-                    assert (
-                        connection_args.username and connection_args.password
-                    ), "Username and password are required for authentication"
+                    assert connection_args.username and connection_args.password, (
+                        "Username and password are required for authentication"
+                    )
                     host = connection_args.hostname
                     port = connection_args.port
                     protocol = "https" if connection_args.tls else "http"
datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py

@@ -101,9 +101,9 @@ class DremioToDataHubSourceTypeMapping:
         Add a new source type if not in the map (e.g., Dremio ARP).
         """
         dremio_source_type = dremio_source_type.upper()
-        DremioToDataHubSourceTypeMapping.SOURCE_TYPE_MAPPING[
-            dremio_source_type
-        ] = datahub_source_type
+        DremioToDataHubSourceTypeMapping.SOURCE_TYPE_MAPPING[dremio_source_type] = (
+            datahub_source_type
+        )

         if category:
             if category.lower() == "file_object_storage":
datahub/ingestion/source/elastic_search.py

@@ -111,10 +111,10 @@ class ElasticToSchemaFieldConverter:

     @staticmethod
     def get_column_type(elastic_column_type: str) -> SchemaFieldDataType:
-        type_class: Optional[
-            Type
-        ] = ElasticToSchemaFieldConverter._field_type_to_schema_field_type.get(
-            elastic_column_type
+        type_class: Optional[Type] = (
+            ElasticToSchemaFieldConverter._field_type_to_schema_field_type.get(
+                elastic_column_type
+            )
         )
         if type_class is None:
             logger.warning(
datahub/ingestion/source/gc/datahub_gc.py

@@ -292,6 +292,7 @@ class DataHubGcSource(Source):
             tokens = list_access_tokens.get("tokens", [])
             total = list_access_tokens.get("total", 0)
             if tokens == []:
+                # Due to a server bug we cannot rely on just total
                 break
             for token in tokens:
                 self.report.expired_tokens_revoked += 1
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -99,6 +99,7 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):

 @dataclass
 class SoftDeletedEntitiesReport(SourceReport):
+    num_calls_made: Dict[str, int] = field(default_factory=dict)
     num_entities_found: Dict[str, int] = field(default_factory=dict)
     num_soft_deleted_entity_processed: int = 0
     num_soft_deleted_retained_due_to_age: int = 0
@@ -154,9 +155,9 @@ class SoftDeletedEntitiesCleanup:
         current_count = self.report.num_hard_deleted_by_type.get(entity_type, 0)
         self.report.num_hard_deleted_by_type[entity_type] = current_count + 1
         if entity_type not in self.report.sample_hard_deleted_aspects_by_type:
-            self.report.sample_hard_deleted_aspects_by_type[
-                entity_type
-            ] = LossyList()
+            self.report.sample_hard_deleted_aspects_by_type[entity_type] = (
+                LossyList()
+            )
         self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)

     def delete_entity(self, urn: str) -> None:
@@ -242,6 +243,11 @@ class SoftDeletedEntitiesCleanup:

         while True:
             try:
+                if entity_type not in self.report.num_calls_made:
+                    self.report.num_calls_made[entity_type] = 1
+                else:
+                    self.report.num_calls_made[entity_type] += 1
+                self._print_report()
                 result = self.ctx.graph.execute_graphql(
                     graphql_query,
                     {
@@ -270,7 +276,13 @@ class SoftDeletedEntitiesCleanup:
                 )
                 break
             scroll_across_entities = result.get("scrollAcrossEntities")
-            if not scroll_across_entities …
+            if not scroll_across_entities:
+                break
+            search_results = scroll_across_entities.get("searchResults")
+            count = scroll_across_entities.get("count")
+            if not count or not search_results:
+                # Due to a server bug we cannot rely on just count as it was returning response like this
+                # {'count': 1, 'nextScrollId': None, 'searchResults': []}
                 break
             if entity_type == "DATA_PROCESS_INSTANCE":
                 # Temp workaround. See note in beginning of the function
@@ -282,7 +294,7 @@ class SoftDeletedEntitiesCleanup:
             self.report.num_entities_found[entity_type] += scroll_across_entities.get(
                 "count"
             )
-            for query in …
+            for query in search_results:
                 yield query["entity"]["urn"]

     def _get_urns(self) -> Iterable[str]:
datahub/ingestion/source/gcs/gcs_source.py

@@ -141,8 +141,9 @@ class GCSSource(StatefulIngestionSourceBase):
         source.source_config.platform = PLATFORM_GCS

         source.is_s3_platform = lambda: True  # type: ignore
-        source.create_s3_path = lambda bucket_name, key: unquote(
-            …
+        source.create_s3_path = lambda bucket_name, key: unquote(  # type: ignore
+            f"s3://{bucket_name}/{key}"
+        )
         return source

     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
datahub/ingestion/source/ge_data_profiler.py

@@ -267,7 +267,6 @@ def _is_single_row_query_method(query: Any) -> bool:
         "get_column_max",
         "get_column_mean",
         "get_column_stdev",
-        "get_column_stdev",
         "get_column_nonnull_count",
         "get_column_unique_count",
     }
@@ -328,7 +327,7 @@ def _is_single_row_query_method(query: Any) -> bool:


 def _run_with_query_combiner(
-    method: Callable[Concatenate["_SingleDatasetProfiler", P], None]
+    method: Callable[Concatenate["_SingleDatasetProfiler", P], None],
 ) -> Callable[Concatenate["_SingleDatasetProfiler", P], None]:
     @functools.wraps(method)
     def inner(
@@ -1538,9 +1537,7 @@ def create_bigquery_temp_table(
     query_job: Optional["google.cloud.bigquery.job.query.QueryJob"] = (
         # In google-cloud-bigquery 3.15.0, the _query_job attribute was
         # made public and renamed to query_job.
-        cursor.query_job
-        if hasattr(cursor, "query_job")
-        else cursor._query_job  # type: ignore[attr-defined]
+        cursor.query_job if hasattr(cursor, "query_job") else cursor._query_job  # type: ignore[attr-defined]
     )
     assert query_job
     temp_destination_table = query_job.destination
datahub/ingestion/source/ge_profiling_config.py

@@ -220,9 +220,9 @@ class GEProfilingConfig(GEProfilingBaseConfig):
                 )
                 values[field_level_metric] = False

-            assert (
-                max_num_fields_to_profile is None
-            ), f"{max_num_fields_to_profile_key} should be set to None"
+            assert max_num_fields_to_profile is None, (
+                f"{max_num_fields_to_profile_key} should be set to None"
+            )

         # Disable expensive queries.
         if values.get("turn_off_expensive_profiling_metrics"):
datahub/ingestion/source/iceberg/iceberg.py

@@ -296,9 +296,9 @@ class IcebergSource(StatefulIngestionSourceBase):
             custom_properties["snapshot-id"] = str(
                 table.current_snapshot().snapshot_id
             )
-            custom_properties[
-                "manifest-list"
-            ] = table.current_snapshot().manifest_list
+            custom_properties["manifest-list"] = (
+                table.current_snapshot().manifest_list
+            )
         dataset_properties = DatasetPropertiesClass(
             name=table.name()[-1],
             description=table.metadata.properties.get("comment", None),
datahub/ingestion/source/identity/azure_ad.py

@@ -354,9 +354,9 @@ class AzureADSource(StatefulIngestionSourceBase):
                 yield MetadataWorkUnit(id=group_status_wu_id, mcp=group_status_mcp)

         # Populate GroupMembership Aspects for CorpUsers
-        datahub_corp_user_urn_to_group_membership: Dict[
-            str, GroupMembershipClass
-        ] = defaultdict(lambda: GroupMembershipClass(groups=[]))
+        datahub_corp_user_urn_to_group_membership: Dict[str, GroupMembershipClass] = (
+            defaultdict(lambda: GroupMembershipClass(groups=[]))
+        )
         if (
             self.config.ingest_group_membership
             and len(self.selected_azure_ad_groups) > 0
datahub/ingestion/source/identity/okta.py

@@ -344,9 +344,9 @@ class OktaSource(StatefulIngestionSourceBase):
                 ).as_workunit()

         # Step 2: Populate GroupMembership Aspects for CorpUsers
-        datahub_corp_user_urn_to_group_membership: Dict[
-            str, GroupMembershipClass
-        ] = defaultdict(lambda: GroupMembershipClass(groups=[]))
+        datahub_corp_user_urn_to_group_membership: Dict[str, GroupMembershipClass] = (
+            defaultdict(lambda: GroupMembershipClass(groups=[]))
+        )
         if self.config.ingest_group_membership and okta_groups is not None:
             # Fetch membership for each group.
             for okta_group in okta_groups:
datahub/ingestion/source/kafka/kafka.py

@@ -419,10 +419,10 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
         custom_props = self.build_custom_properties(
             topic, topic_detail, extra_topic_config
         )
-        schema_name: Optional[
-            str
-        ] = self.schema_registry_client._get_subject_for_topic(
-            topic, is_key_schema=False
+        schema_name: Optional[str] = (
+            self.schema_registry_client._get_subject_for_topic(
+                topic, is_key_schema=False
+            )
         )
         if schema_name is not None:
             custom_props["Schema Name"] = schema_name
@@ -610,11 +610,13 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):

     def fetch_topic_configurations(self, topics: List[str]) -> Dict[str, dict]:
         logger.info("Fetching config details for all topics")
-        configs: Dict[
-            ConfigResource, concurrent.futures.Future
-        ] = self.admin_client.describe_configs(
-            resources=[ConfigResource(ConfigResource.Type.TOPIC, t) for t in topics],
-            request_timeout=self.source_config.connection.client_timeout_seconds,
+        configs: Dict[ConfigResource, concurrent.futures.Future] = (
+            self.admin_client.describe_configs(
+                resources=[
+                    ConfigResource(ConfigResource.Type.TOPIC, t) for t in topics
+                ],
+                request_timeout=self.source_config.connection.client_timeout_seconds,
+            )
         )
         logger.debug("Waiting for config details futures to complete")
         concurrent.futures.wait(configs.values())
datahub/ingestion/source/kafka_connect/kafka_connect.py

@@ -110,9 +110,8 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             connector_manifest = self._get_connector_manifest(
                 connector_name, connector_url
             )
-            if (
-                connector_manifest …
-                or not self.config.connector_patterns.allowed(connector_manifest.name)
+            if connector_manifest is None or not self.config.connector_patterns.allowed(
+                connector_manifest.name
             ):
                 self.report.report_dropped(connector_name)
                 continue
datahub/ingestion/source/kafka_connect/sink_connectors.py

@@ -199,9 +199,9 @@ class BigQuerySinkConnector(BaseConnector):
             transforms.append(transform)
             for key in self.connector_manifest.config.keys():
                 if key.startswith(f"transforms.{name}."):
-                    transform[
-                        key.replace(f"transforms.{name}.", "")
-                    ] = self.connector_manifest.config[key]
+                    transform[key.replace(f"transforms.{name}.", "")] = (
+                        self.connector_manifest.config[key]
+                    )

         if "defaultDataset" in connector_manifest.config:
             defaultDataset = connector_manifest.config["defaultDataset"]
datahub/ingestion/source/kafka_connect/source_connectors.py

@@ -123,9 +123,9 @@ class ConfluentJDBCSourceConnector(BaseConnector):
             transforms.append(transform)
             for key in self.connector_manifest.config.keys():
                 if key.startswith(f"transforms.{name}."):
-                    transform[
-                        key.replace(f"transforms.{name}.", "")
-                    ] = self.connector_manifest.config[key]
+                    transform[key.replace(f"transforms.{name}.", "")] = (
+                        self.connector_manifest.config[key]
+                    )

         return self.JdbcParser(
             db_connection_url,
datahub/ingestion/source/looker/looker_common.py

@@ -596,9 +596,9 @@ class LookerUtil:

     @staticmethod
     def _extract_view_from_field(field: str) -> str:
-        assert (
-            field.count(".") == 1
-        ), f"Error: A field must be prefixed by a view name, field is: {field}"
+        assert field.count(".") == 1, (
+            f"Error: A field must be prefixed by a view name, field is: {field}"
+        )
         return field.split(".")[0]

     @staticmethod
@@ -815,9 +815,9 @@ class LookerExplore:
     project_name: Optional[str] = None
     label: Optional[str] = None
     description: Optional[str] = None
-    upstream_views: Optional[
-        List[ProjectInclude]
-    ] = None  # captures the view name(s) this explore is derived from
+    upstream_views: Optional[List[ProjectInclude]] = (
+        None  # captures the view name(s) this explore is derived from
+    )
     upstream_views_file_path: Dict[str, Optional[str]] = dataclasses_field(
         default_factory=dict
     )  # view_name is key and file_path is value. A single file may contains multiple views
@@ -889,7 +889,7 @@ class LookerExplore:
                     upstream_views.extend(parsed_explore.upstream_views or [])
                 else:
                     logger.warning(
-                        f"…"
+                        f"Could not find extended explore {extended_explore} for explore {dict['name']} in model {model_name}"
                     )
             else:
                 # we only fallback to the view_names list if this is not an extended explore
@@ -903,7 +903,7 @@ class LookerExplore:
                     )
                     if not info:
                         logger.warning(
-                            f"…"
+                            f"Could not resolve view {view_name} for explore {dict['name']} in model {model_name}"
                         )
                     else:
                         upstream_views.append(
@@ -935,9 +935,9 @@ class LookerExplore:
         try:
             explore = client.lookml_model_explore(model, explore_name)
             views: Set[str] = set()
-            lkml_fields: List[
-                LookmlModelExploreField
-            ] = explore_field_set_to_lkml_fields(explore)
+            lkml_fields: List[LookmlModelExploreField] = (
+                explore_field_set_to_lkml_fields(explore)
+            )

             if explore.view_name is not None and explore.view_name != explore.name:
                 # explore is not named after a view and is instead using a from field, which is modeled as view_name.
@@ -1034,9 +1034,9 @@ class LookerExplore:
                 if measure_field.name is None:
                     continue
                 else:
-                    field_name_vs_raw_explore_field[
-                        measure_field.name
-                    ] = measure_field
+                    field_name_vs_raw_explore_field[measure_field.name] = (
+                        measure_field
+                    )

                 view_fields.append(
                     ViewField(
@@ -1072,11 +1072,11 @@ class LookerExplore:
             if view_project_map:
                 logger.debug(f"views and their projects: {view_project_map}")

-            upstream_views_file_path: Dict[
-                str, Optional[str]
-            ] = create_upstream_views_file_path_map(
-                lkml_fields=lkml_fields,
-                view_names=views,
+            upstream_views_file_path: Dict[str, Optional[str]] = (
+                create_upstream_views_file_path_map(
+                    lkml_fields=lkml_fields,
+                    view_names=views,
+                )
             )
             if upstream_views_file_path:
                 logger.debug(f"views and their file-paths: {upstream_views_file_path}")
datahub/ingestion/source/looker/looker_config.py

@@ -166,9 +166,9 @@ def _get_generic_definition(
     # e.g. spark1 or hive2 or druid_18
     platform = re.sub(r"[0-9]+", "", dialect_name.split("_")[0])

-    assert (
-        platform is not None
-    ), f"Failed to extract a valid platform from connection {looker_connection}"
+    assert platform is not None, (
+        f"Failed to extract a valid platform from connection {looker_connection}"
+    )
     db = looker_connection.database
     schema = looker_connection.schema  # ok for this to be None
     return platform, db, schema