acryl-datahub 1.0.0rc6__py3-none-any.whl → 1.0.0rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/METADATA +2490 -2490
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/RECORD +74 -74
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/cli/docker_cli.py +1 -1
- datahub/cli/iceberg_cli.py +1 -1
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -1
- datahub/ingestion/graph/client.py +16 -7
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_api.py +2 -1
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/dbt/dbt_common.py +1 -1
- datahub/ingestion/source/dremio/dremio_api.py +3 -3
- datahub/ingestion/source/dremio/dremio_aspects.py +2 -1
- datahub/ingestion/source/file.py +5 -2
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +11 -14
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +31 -20
- datahub/ingestion/source/identity/okta.py +1 -3
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/source_connectors.py +4 -7
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +54 -32
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mode.py +1 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +6 -3
- datahub/ingestion/source/openapi_parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/pulsar.py +2 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +2 -1
- datahub/ingestion/source/s3/config.py +2 -4
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +1 -1
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +2 -2
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/sql_common.py +2 -2
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +4 -2
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/superset.py +65 -37
- datahub/ingestion/source/tableau/tableau.py +3 -6
- datahub/ingestion/source/tableau/tableau_common.py +2 -1
- datahub/lite/duckdb_lite.py +5 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/sdk/dataset.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/top_level.txt +0 -0
datahub/ingestion/graph/client.py
CHANGED

@@ -330,7 +330,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         aspect_type_name: Optional[str] = None,
         version: int = 0,
     ) -> Optional[Aspect]:
-        assert aspect_type.ASPECT_NAME
+        assert aspect == aspect_type.ASPECT_NAME
         return self.get_aspect(
             entity_urn=entity_urn,
             aspect_type=aspect_type,
@@ -1547,7 +1547,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         return fragment

     def _run_assertion_build_params(
-        self, params: Optional[Dict[str, str]] =
+        self, params: Optional[Dict[str, str]] = None
     ) -> List[Any]:
         if params is None:
             return []
@@ -1566,9 +1566,11 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         self,
         urn: str,
         save_result: bool = True,
-        parameters: Optional[Dict[str, str]] =
+        parameters: Optional[Dict[str, str]] = None,
         async_flag: bool = False,
     ) -> Dict:
+        if parameters is None:
+            parameters = {}
         params = self._run_assertion_build_params(parameters)
         graph_query: str = """
             %s
@@ -1597,9 +1599,11 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         self,
         urns: List[str],
         save_result: bool = True,
-        parameters: Optional[Dict[str, str]] =
+        parameters: Optional[Dict[str, str]] = None,
         async_flag: bool = False,
     ) -> Dict:
+        if parameters is None:
+            parameters = {}
         params = self._run_assertion_build_params(parameters)
         graph_query: str = """
             %s
@@ -1636,10 +1640,14 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
     def run_assertions_for_asset(
         self,
         urn: str,
-        tag_urns: Optional[List[str]] =
-        parameters: Optional[Dict[str, str]] =
+        tag_urns: Optional[List[str]] = None,
+        parameters: Optional[Dict[str, str]] = None,
         async_flag: bool = False,
     ) -> Dict:
+        if tag_urns is None:
+            tag_urns = []
+        if parameters is None:
+            parameters = {}
         params = self._run_assertion_build_params(parameters)
         graph_query: str = """
             %s
@@ -1677,9 +1685,10 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         self,
         entity_name: str,
         urns: List[str],
-        aspects: List[str] =
+        aspects: Optional[List[str]] = None,
         with_system_metadata: bool = False,
     ) -> Dict[str, Any]:
+        aspects = aspects or []
         payload = {
             "urns": urns,
             "aspectNames": aspects,
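The client.py hunks above restore default argument values using the `None`-sentinel idiom rather than mutable defaults. A minimal sketch of why that idiom matters; the function and argument names here are illustrative, not DataHub APIs:

```python
from typing import List, Optional


def add_tag_bad(tag: str, tags: List[str] = []) -> List[str]:
    # The default list is created once, at function definition time,
    # so every call without an argument mutates the same shared object.
    tags.append(tag)
    return tags


def add_tag_good(tag: str, tags: Optional[List[str]] = None) -> List[str]:
    # None is only a sentinel; a fresh list is created on each call.
    if tags is None:
        tags = []
    tags.append(tag)
    return tags


print(add_tag_bad("a"), add_tag_bad("b"))    # ['a', 'b'] ['a', 'b'] -- shared state
print(add_tag_good("a"), add_tag_good("b"))  # ['a'] ['b']
```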
datahub/ingestion/graph/entity_versioning.py
CHANGED

@@ -93,7 +93,7 @@ class EntityVersioningAPI(DataHubGraphProtocol):
         try:
             return response["linkAssetVersion"]["urn"]
         except KeyError:
-            raise ValueError(f"Unexpected response: {response}")
+            raise ValueError(f"Unexpected response: {response}") from None

     def link_asset_to_versioned_asset(
         self,
@@ -165,7 +165,7 @@ class EntityVersioningAPI(DataHubGraphProtocol):
         try:
             return response["unlinkAssetVersion"]["urn"]
         except KeyError:
-            raise ValueError(f"Unexpected response: {response}")
+            raise ValueError(f"Unexpected response: {response}") from None

     def unlink_latest_asset_from_version_set(
         self, version_set_urn: str
@@ -198,4 +198,4 @@ class EntityVersioningAPI(DataHubGraphProtocol):
         try:
             return response["unlinkAssetVersion"]["urn"]
         except KeyError:
-            raise ValueError(f"Unexpected response: {response}")
+            raise ValueError(f"Unexpected response: {response}") from None
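The `from None` additions above, and the `from e` additions in dremio_api.py and kafka.py further down, control Python exception chaining. A small illustration of the difference, using made-up functions rather than DataHub's own:

```python
def lookup(response: dict) -> str:
    try:
        return response["linkAssetVersion"]["urn"]
    except KeyError:
        # "from None" suppresses the implicit "During handling of the above
        # exception, another exception occurred" chain in the traceback.
        raise ValueError(f"Unexpected response: {response}") from None


def parse(raw: str) -> int:
    try:
        return int(raw)
    except ValueError as e:
        # "from e" keeps the original error attached as __cause__,
        # so the root cause still shows up in the traceback.
        raise RuntimeError(f"Could not parse {raw!r}") from e
```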
datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py
CHANGED

@@ -163,12 +163,7 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
                 key: DatahubIngestionRunSummaryProvider._convert_sets_to_lists(value)
                 for key, value in obj.items()
             }
-        elif isinstance(obj, list):
-            return [
-                DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
-                for element in obj
-            ]
-        elif isinstance(obj, set):
+        elif isinstance(obj, list) or isinstance(obj, set):
             return [
                 DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
                 for element in obj
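The `_convert_sets_to_lists` change above folds the list and set branches into one. A simplified, self-contained sketch of what such a recursive helper does (not the exact DataHub implementation):

```python
from typing import Any


def convert_sets_to_lists(obj: Any) -> Any:
    # JSON has no set type, so sets become lists; dicts and lists/sets are
    # walked recursively, everything else passes through unchanged.
    if isinstance(obj, dict):
        return {key: convert_sets_to_lists(value) for key, value in obj.items()}
    elif isinstance(obj, (list, set)):
        return [convert_sets_to_lists(element) for element in obj]
    return obj


print(convert_sets_to_lists({"tags": {"pii", "gold"}, "nested": [{"ids": {1, 2}}]}))
```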
@@ -144,10 +144,8 @@ class DataLakeSourceConfig(
         return path_specs

     @pydantic.validator("platform", always=True)
-    def platform_not_empty(cls, platform:
-        inferred_platform = values.get(
-            "platform", None
-        )  # we may have inferred it above
+    def platform_not_empty(cls, platform: Any, values: dict) -> str:
+        inferred_platform = values.get("platform")  # we may have inferred it above
         platform = platform or inferred_platform
         if not platform:
             raise ValueError("platform must not be empty")
datahub/ingestion/source/bigquery_v2/bigquery_audit.py
CHANGED

@@ -165,7 +165,7 @@ class BigQueryTableRef:
     @classmethod
     def from_spec_obj(cls, spec: dict) -> "BigQueryTableRef":
         for key in ["projectId", "datasetId", "tableId"]:
-            if key not in spec
+            if key not in spec:
                 raise ValueError(f"invalid BigQuery table reference dict: {spec}")

         return cls(

datahub/ingestion/source/bigquery_v2/bigquery_schema.py
CHANGED

@@ -344,7 +344,7 @@ class BigQuerySchemaApi:
         with_partitions: bool = False,
     ) -> Iterator[BigqueryTable]:
         with PerfTimer() as current_timer:
-            filter_clause: str = ", ".join(f"'{table}'" for table in tables
+            filter_clause: str = ", ".join(f"'{table}'" for table in tables)

             if with_partitions:
                 query_template = BigqueryQuery.tables_for_dataset
datahub/ingestion/source/cassandra/cassandra_api.py
CHANGED

@@ -159,7 +159,8 @@ class CassandraAPI:
             self.report.failure(message="Failed to authenticate to Cassandra", exc=e)
             return False

-    def get(self, query: str, parameters: Optional[List] =
+    def get(self, query: str, parameters: Optional[List] = None) -> List:
+        parameters = parameters or []
         if not self._cassandra_session:
             return []

datahub/ingestion/source/csv_enricher.py
CHANGED

@@ -314,7 +314,7 @@ class CSVEnricherSource(Source):
             "datajob": EditableDataJobPropertiesClass,
             "dataflow": EditableDataFlowPropertiesClass,
             "notebook": EditableNotebookPropertiesClass,
-        }.get(entityType
+        }.get(entityType)

         if not entityClass:
             raise ValueError(
@@ -640,8 +640,8 @@ class CSVEnricherSource(Source):
                )
            except Exception as e:
                raise ConfigurationError(
-                    f"Cannot read remote file {self.config.filename}
-                )
+                    f"Cannot read remote file {self.config.filename}: {e}"
+                ) from e
            else:
                with open(pathlib.Path(self.config.filename), encoding="utf-8-sig") as f:
                    rows = list(csv.DictReader(f, delimiter=self.config.delimiter))
datahub/ingestion/source/dbt/dbt_common.py
CHANGED

@@ -1033,7 +1033,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
            cll_nodes.add(dbt_name)
            schema_nodes.add(dbt_name)

-        for dbt_name in all_nodes_map
+        for dbt_name in all_nodes_map:
            if self._is_allowed_node(dbt_name):
                add_node_to_cll_list(dbt_name)

datahub/ingestion/source/dremio/dremio_api.py
CHANGED

@@ -271,12 +271,12 @@ class DremioAPIOperations:
                self.cancel_query(job_id)
                raise DremioAPIException(
                    f"Query execution timed out after {timeout} seconds"
-                )
+                ) from None
            except RuntimeError as e:
-                raise DremioAPIException(
+                raise DremioAPIException() from e

            except requests.RequestException as e:
-                raise DremioAPIException(
+                raise DremioAPIException("Error executing query") from e

    def fetch_results(self, job_id: str) -> List[Dict]:
        """Fetch job results with status checking"""
datahub/ingestion/source/dremio/dremio_aspects.py
CHANGED

@@ -168,8 +168,9 @@ class DremioAspects:
        )

    def get_container_urn(
-        self, name: Optional[str] = None, path: Optional[List[str]] =
+        self, name: Optional[str] = None, path: Optional[List[str]] = None
    ) -> str:
+        path = path or []
        container_key = self.get_container_key(name, path)
        return container_key.as_urn()

datahub/ingestion/source/file.py
CHANGED

@@ -410,10 +410,13 @@ def _from_obj_for_file(
        item = MetadataChangeEvent.from_obj(obj)
    elif "aspect" in obj:
        item = MetadataChangeProposalWrapper.from_obj(obj)
-
+    elif "bucket" in obj:
        item = UsageAggregationClass.from_obj(obj)
+    else:
+        raise ValueError(f"Unknown object type: {obj}")
+
    if not item.validate():
-        raise ValueError(f"
+        raise ValueError(f"Failed to parse: {obj}")

    if isinstance(item, UsageAggregationClass):
        logger.warning(f"Dropping deprecated UsageAggregationClass: {item}")
datahub/ingestion/source/gc/dataprocess_cleanup.py
CHANGED

@@ -498,7 +498,7 @@ class DataProcessCleanup:
        # Delete empty dataflows if needed
        if self.config.delete_empty_data_flows:
            deleted_data_flows: int = 0
-            for key in dataFlows
+            for key in dataFlows:
                if not dataJobs.get(key) or len(dataJobs[key]) == 0:
                    logger.info(
                        f"Deleting dataflow {key} because there are not datajobs"
datahub/ingestion/source/gc/execution_request_cleanup.py
CHANGED

@@ -130,8 +130,9 @@ class DatahubExecutionRequestCleanup:
        )

    def _scroll_execution_requests(
-        self, overrides: Dict[str, Any] =
+        self, overrides: Optional[Dict[str, Any]] = None
    ) -> Iterator[CleanupRecord]:
+        overrides = overrides or {}
        headers: Dict[str, Any] = {
            "Accept": "application/json",
            "Content-Type": "application/json",
datahub/ingestion/source/ge_data_profiler.py
CHANGED

@@ -170,14 +170,10 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> int:
            ).select_from(self._table)
        )
        return convert_to_json_serializable(element_values.fetchone()[0])
-    elif
-
-
-
-        )
-        )
-        return convert_to_json_serializable(element_values.fetchone()[0])
-    elif self.engine.dialect.name.lower() == SNOWFLAKE:
+    elif (
+        self.engine.dialect.name.lower() == BIGQUERY
+        or self.engine.dialect.name.lower() == SNOWFLAKE
+    ):
        element_values = self.engine.execute(
            sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
                self._table
@@ -381,13 +377,14 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
            col = col_dict["name"]
            self.column_types[col] = str(col_dict["type"])
            # We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
-            if
-
+            if (
+                not self.config._allow_deny_patterns.allowed(
+                    f"{self.dataset_name}.{col}"
+                )
+                or not self.config.profile_nested_fields
+                and "." in col
            ):
                ignored_columns_by_pattern.append(col)
-            # We try to ignore nested columns as well
-            elif not self.config.profile_nested_fields and "." in col:
-                ignored_columns_by_pattern.append(col)
            elif col_dict.get("type") and self._should_ignore_column(col_dict["type"]):
                ignored_columns_by_type.append(col)
            else:
@@ -1408,7 +1405,7 @@ class DatahubGEProfiler:
            },
        )

-        if platform
+        if platform in (BIGQUERY, DATABRICKS):
            # This is done as GE makes the name as DATASET.TABLE
            # but we want it to be PROJECT.DATASET.TABLE instead for multi-project setups
            name_parts = pretty_name.split(".")
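The first profiler hunk above routes both BigQuery and Snowflake through the same APPROX_COUNT_DISTINCT branch. A rough sketch of how such a statement can be built with SQLAlchemy 1.4+; the table and column names are invented, and `sa.func` renders whatever function name it is given:

```python
import sqlalchemy as sa

# Hypothetical table/column; APPROX_COUNT_DISTINCT is emitted verbatim,
# so this only works against dialects that support the function.
stmt = sa.select(
    sa.func.APPROX_COUNT_DISTINCT(sa.column("user_id"))
).select_from(sa.table("events"))

print(stmt)  # SELECT APPROX_COUNT_DISTINCT(user_id) ... FROM events
```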
datahub/ingestion/source/iceberg/iceberg.py
CHANGED

@@ -2,8 +2,9 @@ import json
 import logging
 import threading
 import uuid
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Tuple

+from dateutil import parser as dateutil_parser
 from pyiceberg.catalog import Catalog
 from pyiceberg.exceptions import (
     NoSuchIcebergTableError,
@@ -81,6 +82,7 @@ from datahub.metadata.schema_classes import (
     OwnerClass,
     OwnershipClass,
     OwnershipTypeClass,
+    TimeStampClass,
 )
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
@@ -183,16 +185,9 @@ class IcebergSource(StatefulIngestionSourceBase):
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         thread_local = threading.local()

-        def
-
-
-            if not self.config.table_pattern.allowed(dataset_name):
-                # Dataset name is rejected by pattern, report as dropped.
-                self.report.report_dropped(dataset_name)
-                LOGGER.debug(
-                    f"Skipping table {dataset_name} due to not being allowed by the config pattern"
-                )
-                return
+        def _try_processing_dataset(
+            dataset_path: Tuple[str, ...], dataset_name: str
+        ) -> Iterable[MetadataWorkUnit]:
             try:
                 if not hasattr(thread_local, "local_catalog"):
                     LOGGER.debug(
@@ -248,10 +243,31 @@
                 LOGGER.warning(
                     f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
                 )
+            except ValueError as e:
+                if "Could not initialize FileIO" not in str(e):
+                    raise
+                self.report.warning(
+                    "Could not initialize FileIO",
+                    f"Could not initialize FileIO for {dataset_path} due to: {e}",
+                )
+
+        def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
+            try:
+                LOGGER.debug(f"Processing dataset for path {dataset_path}")
+                dataset_name = ".".join(dataset_path)
+                if not self.config.table_pattern.allowed(dataset_name):
+                    # Dataset name is rejected by pattern, report as dropped.
+                    self.report.report_dropped(dataset_name)
+                    LOGGER.debug(
+                        f"Skipping table {dataset_name} due to not being allowed by the config pattern"
+                    )
+                    return
+
+                yield from _try_processing_dataset(dataset_path, dataset_name)
             except Exception as e:
                 self.report.report_failure(
                     "general",
-                    f"Failed to create workunit for dataset {
+                    f"Failed to create workunit for dataset {dataset_path}: {e}",
                 )
                 LOGGER.exception(
                     f"Exception while processing table {dataset_path}, skipping it.",
@@ -288,6 +304,7 @@
         )

         # Dataset properties aspect.
+        additional_properties = {}
         custom_properties = table.metadata.properties.copy()
         custom_properties["location"] = table.metadata.location
         custom_properties["format-version"] = str(table.metadata.format_version)
@@ -299,10 +316,27 @@
             custom_properties["manifest-list"] = (
                 table.current_snapshot().manifest_list
             )
+            additional_properties["lastModified"] = TimeStampClass(
+                int(table.current_snapshot().timestamp_ms)
+            )
+        if "created-at" in custom_properties:
+            try:
+                dt = dateutil_parser.isoparse(custom_properties["created-at"])
+                additional_properties["created"] = TimeStampClass(
+                    int(dt.timestamp() * 1000)
+                )
+            except Exception as ex:
+                LOGGER.warning(
+                    f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
+                )
+
         dataset_properties = DatasetPropertiesClass(
             name=table.name()[-1],
             description=table.metadata.properties.get("comment", None),
             customProperties=custom_properties,
+            lastModified=additional_properties.get("lastModified"),
+            created=additional_properties.get("created"),
+            qualifiedName=dataset_name,
         )
         dataset_snapshot.aspects.append(dataset_properties)
         # Dataset ownership aspect.
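The new `created-at` handling in iceberg.py parses an ISO-8601 table property into epoch milliseconds before passing it to `TimeStampClass`. A standalone sketch of that conversion, with an invented property value:

```python
from dateutil import parser as dateutil_parser

# e.g. the value of an Iceberg "created-at" table property (invented here)
created_at = "2024-11-05T14:30:00+00:00"

dt = dateutil_parser.isoparse(created_at)
created_millis = int(dt.timestamp() * 1000)
print(created_millis)  # epoch milliseconds, the value passed to TimeStampClass above
```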
datahub/ingestion/source/iceberg/iceberg_common.py
CHANGED

@@ -1,4 +1,5 @@
 import logging
+import threading
 from dataclasses import dataclass, field
 from typing import Any, Dict, Optional

@@ -156,18 +157,21 @@ class TopTableTimings:
     def __init__(self, size: int = 10):
         self._size = size
         self.top_entites = SortedList(key=lambda x: -x.get(self._VALUE_FIELD, 0))
+        self._lock = threading.Lock()

     def add(self, entity: Dict[str, Any]) -> None:
         if self._VALUE_FIELD not in entity:
             return
-        self.
-
-        self.top_entites.
+        with self._lock:
+            self.top_entites.add(entity)
+            if len(self.top_entites) > self._size:
+                self.top_entites.pop()

     def __str__(self) -> str:
-
-
-
+        with self._lock:
+            if len(self.top_entites) == 0:
+                return "no timings reported"
+            return str(list(self.top_entites))


 class TimingClass:
@@ -175,24 +179,31 @@ class TimingClass:

     def __init__(self):
         self.times = SortedList()
+        self._lock = threading.Lock()

     def add_timing(self, t: float) -> None:
-        self.
+        with self._lock:
+            self.times.add(t)

     def __str__(self) -> str:
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with self._lock:
+            if len(self.times) == 0:
+                return "no timings reported"
+            total = sum(self.times)
+            avg = total / len(self.times)
+            return str(
+                {
+                    "average_time": format_timespan(avg, detailed=True, max_units=3),
+                    "min_time": format_timespan(
+                        self.times[0], detailed=True, max_units=3
+                    ),
+                    "max_time": format_timespan(
+                        self.times[-1], detailed=True, max_units=3
+                    ),
+                    # total_time does not provide correct information in case we run in more than 1 thread
+                    "total_time": format_timespan(total, detailed=True, max_units=3),
+                }
+            )


 @dataclass
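The iceberg_common.py hunks above guard a shared `SortedList` with a `threading.Lock` so timings can be recorded from multiple worker threads. A minimal sketch of the same pattern, assuming the `SortedList` in question is `sortedcontainers.SortedList`; the class and method names here are illustrative:

```python
import threading

from sortedcontainers import SortedList


class TimingRecorder:
    """Collects timings from multiple threads; all access goes through one lock."""

    def __init__(self) -> None:
        self.times = SortedList()
        self._lock = threading.Lock()

    def add_timing(self, t: float) -> None:
        with self._lock:
            self.times.add(t)

    def summary(self) -> str:
        with self._lock:
            if not self.times:
                return "no timings reported"
            return f"min={self.times[0]:.3f}s max={self.times[-1]:.3f}s n={len(self.times)}"
```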
datahub/ingestion/source/identity/okta.py
CHANGED

@@ -568,9 +568,7 @@ class OktaSource(StatefulIngestionSourceBase):
        if (
            self.config.include_deprovisioned_users is False
            and okta_user.status == UserStatus.DEPROVISIONED
-        )
-            return False
-        elif (
+        ) or (
            self.config.include_suspended_users is False
            and okta_user.status == UserStatus.SUSPENDED
        ):
datahub/ingestion/source/kafka/kafka.py
CHANGED

@@ -272,7 +272,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
            return schema_registry_class.create(config, report)
        except Exception as e:
            logger.debug(e, exc_info=e)
-            raise ImportError(config.schema_registry_class)
+            raise ImportError(config.schema_registry_class) from e

    def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext):
        super().__init__(config, ctx)
datahub/ingestion/source/kafka_connect/source_connectors.py
CHANGED

@@ -447,13 +447,10 @@ class DebeziumSourceConnector(BaseConnector):
    ) -> DebeziumParser:
        connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "")

-        if
-
-
-
-                database_name=None,
-            )
-        elif connector_class == "MySqlConnector":
+        if (
+            connector_class == "io.debezium.connector.mysql.MySqlConnector"
+            or connector_class == "MySqlConnector"
+        ):
            parser = self.DebeziumParser(
                source_platform="mysql",
                server_name=self.get_server_name(connector_manifest),
datahub/ingestion/source/looker/looker_file_loader.py
CHANGED

@@ -33,14 +33,14 @@ class LookerViewFileLoader:
        base_projects_folder: Dict[str, pathlib.Path],
        reporter: LookMLSourceReport,
        source_config: LookMLSourceConfig,
-        manifest_constants: Dict[str, LookerConstant] =
+        manifest_constants: Optional[Dict[str, LookerConstant]] = None,
    ) -> None:
        self.viewfile_cache: Dict[str, Optional[LookerViewFile]] = {}
        self._root_project_name = root_project_name
        self._base_projects_folder = base_projects_folder
        self.reporter = reporter
        self.source_config = source_config
-        self.manifest_constants = manifest_constants
+        self.manifest_constants = manifest_constants or {}

    def _load_viewfile(
        self, project_name: str, path: str, reporter: LookMLSourceReport
datahub/ingestion/source/looker/looker_lib_wrapper.py
CHANGED

@@ -205,8 +205,9 @@ class LookerAPI:
    def folder_ancestors(
        self,
        folder_id: str,
-        fields: Union[str, List[str]] =
+        fields: Optional[Union[str, List[str]]] = None,
    ) -> Sequence[Folder]:
+        fields = fields or ["id", "name", "parent_id"]
        self.client_stats.folder_calls += 1
        try:
            return self.client.folder_ancestors(
datahub/ingestion/source/looker/looker_template_language.py
CHANGED

@@ -464,9 +464,10 @@ def process_lookml_template_language(
    source_config: LookMLSourceConfig,
    view_lkml_file_dict: dict,
    reporter: LookMLSourceReport,
-    manifest_constants: Dict[str, "LookerConstant"] =
+    manifest_constants: Optional[Dict[str, "LookerConstant"]] = None,
    resolve_constants: bool = False,
) -> None:
+    manifest_constants = manifest_constants or {}
    if "views" not in view_lkml_file_dict:
        return

@@ -507,9 +508,10 @@ def load_and_preprocess_file(
    path: Union[str, pathlib.Path],
    source_config: LookMLSourceConfig,
    reporter: LookMLSourceReport,
-    manifest_constants: Dict[str, "LookerConstant"] =
+    manifest_constants: Optional[Dict[str, "LookerConstant"]] = None,
    resolve_constants: bool = False,
) -> dict:
+    manifest_constants = manifest_constants or {}
    parsed = load_lkml(path)

    process_lookml_template_language(
datahub/ingestion/source/looker/lookml_source.py
CHANGED

@@ -501,7 +501,7 @@ class LookMLSource(StatefulIngestionSourceBase):
            raise ValueError(
                f"Could not locate a project name for model {model_name}. Consider configuring a static project name "
                f"in your config file"
-            )
+            ) from None

    def get_manifest_if_present(self, folder: pathlib.Path) -> Optional[LookerManifest]:
        manifest_file = folder / "manifest.lkml"
@@ -1006,8 +1006,9 @@ class LookMLSource(StatefulIngestionSourceBase):
    def report_skipped_unreachable_views(
        self,
        viewfile_loader: LookerViewFileLoader,
-        processed_view_map: Dict[str, Set[str]] =
+        processed_view_map: Optional[Dict[str, Set[str]]] = None,
    ) -> None:
+        processed_view_map = processed_view_map or {}
        view_files: Dict[str, List[pathlib.Path]] = {}
        for project, folder_path in self.base_projects_folder.items():
            folder = pathlib.Path(folder_path)