acryl-datahub 1.0.0.2rc3__py3-none-any.whl → 1.0.0.2rc5__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only.
This release has been flagged as potentially problematic.
- {acryl_datahub-1.0.0.2rc3.dist-info → acryl_datahub-1.0.0.2rc5.dist-info}/METADATA +2564 -2564
- {acryl_datahub-1.0.0.2rc3.dist-info → acryl_datahub-1.0.0.2rc5.dist-info}/RECORD +61 -60
- datahub/_version.py +1 -1
- datahub/api/entities/datajob/dataflow.py +15 -0
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataset/dataset.py +2 -2
- datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
- datahub/cli/migrate.py +6 -6
- datahub/configuration/common.py +1 -1
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/source.py +4 -1
- datahub/ingestion/api/source_helpers.py +26 -1
- datahub/ingestion/run/pipeline.py +0 -6
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/fivetran/fivetran.py +1 -0
- datahub/ingestion/source/fivetran/fivetran_log_api.py +1 -1
- datahub/ingestion/source/hex/query_fetcher.py +4 -1
- datahub/ingestion/source/iceberg/iceberg.py +97 -9
- datahub/ingestion/source/kafka/kafka.py +1 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +1 -1
- datahub/ingestion/source/kafka_connect/source_connectors.py +1 -1
- datahub/ingestion/source/looker/looker_source.py +2 -3
- datahub/ingestion/source/mlflow.py +3 -0
- datahub/ingestion/source/mode.py +2 -2
- datahub/ingestion/source/nifi.py +3 -3
- datahub/ingestion/source/openapi.py +3 -3
- datahub/ingestion/source/openapi_parser.py +8 -8
- datahub/ingestion/source/powerbi/config.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +2 -2
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -1
- datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
- datahub/ingestion/source/tableau/tableau.py +4 -4
- datahub/ingestion/source/tableau/tableau_common.py +2 -2
- datahub/ingestion/source/unity/source.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +1 -1
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/_schema_classes.py +47 -2
- datahub/metadata/_urns/urn_defs.py +56 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +121 -85
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
- datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
- datahub/metadata/schemas/QueryProperties.avsc +4 -2
- datahub/metadata/schemas/SystemMetadata.avsc +86 -0
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +6 -6
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0.2rc3.dist-info → acryl_datahub-1.0.0.2rc5.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.2rc3.dist-info → acryl_datahub-1.0.0.2rc5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.2rc3.dist-info → acryl_datahub-1.0.0.2rc5.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.2rc3.dist-info → acryl_datahub-1.0.0.2rc5.dist-info}/top_level.txt +0 -0

datahub/ingestion/run/pipeline.py
CHANGED

@@ -39,9 +39,6 @@ from datahub.ingestion.run.sink_callback import DeadLetterQueueCallback, Logging
 from datahub.ingestion.sink.datahub_rest import DatahubRestSink
 from datahub.ingestion.sink.sink_registry import sink_registry
 from datahub.ingestion.source.source_registry import source_registry
-from datahub.ingestion.transformer.system_metadata_transformer import (
-    SystemMetadataTransformer,
-)
 from datahub.ingestion.transformer.transform_registry import transform_registry
 from datahub.sdk._attribution import KnownAttribution, change_default_attribution
 from datahub.telemetry import stats
@@ -286,9 +283,6 @@ class Pipeline:
                     f"Transformer type:{transformer_type},{transformer_class} configured"
                 )
 
-        # Add the system metadata transformer at the end of the list.
-        self.transformers.append(SystemMetadataTransformer(self.ctx))
-
     def _configure_reporting(self, report_to: Optional[str]) -> None:
        if self.dry_run:
            # In dry run mode, we don't want to report anything.
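The pipeline no longer appends a SystemMetadataTransformer (the transformer module itself is removed in this release); systemMetadata stamping now happens through work-unit processors, as the AutoSystemMetadata usage in the iceberg.py hunks further down shows. A minimal sketch, assuming only the AutoSystemMetadata helper and its stamp_wu method that appear in those hunks, of stamping work units on the source side:

from typing import Iterable

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.source_helpers import AutoSystemMetadata
from datahub.ingestion.api.workunit import MetadataWorkUnit


def stamp_workunits(
    ctx: PipelineContext, workunits: Iterable[MetadataWorkUnit]
) -> Iterable[MetadataWorkUnit]:
    # AutoSystemMetadata and stamp_wu are taken from the iceberg.py hunks below;
    # this wrapper function is illustrative and not part of the package.
    stamper = AutoSystemMetadata(ctx)
    for wu in workunits:
        yield stamper.stamp_wu(wu)
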
datahub/ingestion/source/aws/sagemaker_processors/models.py
CHANGED

@@ -323,7 +323,7 @@ class ModelProcessor:
             model_training_jobs = model_training_jobs.union(
                 {
                     job_urn
-                    for job_urn, job_direction in data_url_matched_jobs
+                    for job_urn, job_direction in data_url_matched_jobs
                     if job_direction == JobDirection.TRAINING
                 }
             )
@@ -331,7 +331,7 @@ class ModelProcessor:
             model_downstream_jobs = model_downstream_jobs.union(
                 {
                     job_urn
-                    for job_urn, job_direction in data_url_matched_jobs
+                    for job_urn, job_direction in data_url_matched_jobs
                     if job_direction == JobDirection.DOWNSTREAM
                 }
             )
@@ -368,7 +368,7 @@ class ModelProcessor:
             model_training_jobs = model_training_jobs.union(
                 {
                     job_urn
-                    for job_urn, job_direction in name_matched_jobs
+                    for job_urn, job_direction in name_matched_jobs
                     if job_direction == JobDirection.TRAINING
                 }
             )
@@ -376,7 +376,7 @@ class ModelProcessor:
             model_downstream_jobs = model_downstream_jobs.union(
                 {
                     job_urn
-                    for job_urn, job_direction in name_matched_jobs
+                    for job_urn, job_direction in name_matched_jobs
                     if job_direction == JobDirection.DOWNSTREAM
                 }
             )

datahub/ingestion/source/bigquery_v2/lineage.py
CHANGED

@@ -375,7 +375,7 @@ class BigqueryLineageExtractor:
                 memory_footprint.total_size(lineage)
             )
 
-        for lineage_key in lineage
+        for lineage_key in lineage:
            # For views, we do not use the upstreams obtained by parsing audit logs
            # as they may contain indirectly referenced tables.
            if (

datahub/ingestion/source/dynamodb/dynamodb.py
CHANGED

@@ -362,7 +362,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
         if self.config.include_table_item is None:
             return
         dataset_name = f"{region}.{table_name}"
-        if dataset_name not in self.config.include_table_item
+        if dataset_name not in self.config.include_table_item:
             return
         primary_key_list = self.config.include_table_item.get(dataset_name)
         assert isinstance(primary_key_list, List)

datahub/ingestion/source/fivetran/fivetran.py
CHANGED

@@ -215,6 +215,7 @@ class FivetranSource(StatefulIngestionSourceBase):
         datajob = DataJob(
             id=connector.connector_id,
             flow_urn=dataflow_urn,
+            platform_instance=self.config.platform_instance,
             name=connector.connector_name,
             owners={owner_email} if owner_email else set(),
         )

datahub/ingestion/source/fivetran/fivetran_log_api.py
CHANGED

@@ -190,7 +190,7 @@ class FivetranLogAPI:
         jobs: List[Job] = []
         if connector_sync_log is None:
             return jobs
-        for sync_id in connector_sync_log
+        for sync_id in connector_sync_log:
             if len(connector_sync_log[sync_id]) != 2:
                 # If both sync-start and sync-end event log not present for this sync that means sync is still in progress
                 continue

datahub/ingestion/source/hex/query_fetcher.py
CHANGED

@@ -18,7 +18,8 @@ from datahub.utilities.time import datetime_to_ts_millis
 logger = logging.getLogger(__name__)
 
 # Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
-
+# Only match metadata with "context": "SCHEDULED_RUN" to filter out non-scheduled runs
+HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "SCHEDULED_RUN".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
 
 
 @dataclass
@@ -39,6 +40,7 @@ class HexQueryFetcherReport(SourceReport):
     fetched_query_objects: int = 0
     filtered_out_queries_missing_metadata: int = 0
     filtered_out_queries_different_workspace: int = 0
+    filtered_out_queries_no_match: int = 0
     filtered_out_queries_no_subjects: int = 0
     total_queries: int = 0
     total_dataset_subjects: int = 0
@@ -210,6 +212,7 @@ class HexQueryFetcher:
         match = re.search(HEX_METADATA_PATTERN, sql_statement)
 
         if not match:
+            self.report.filtered_out_queries_no_match += 1
             return None
 
         try:

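The new HEX_METADATA_PATTERN only matches single-line Hex metadata comments whose "context" is "SCHEDULED_RUN", and it captures the project id plus the workspace name embedded in the project URL. A small, self-contained sketch of what the pattern extracts, using made-up project values:

import re

HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "SCHEDULED_RUN".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'

# Hypothetical SQL statement carrying a Hex metadata comment; the ids and URL are invented.
sample_sql = (
    '-- Hex query metadata: {"context": "SCHEDULED_RUN", "project_id": "abc123", '
    '"project_url": "https://app.hex.tech/my-workspace/hex/abc123"}\nSELECT 1'
)

match = re.search(HEX_METADATA_PATTERN, sample_sql)
assert match is not None
print(match.group(1))  # abc123 (project_id)
print(match.group(2))  # my-workspace (workspace name taken from the project URL)
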
datahub/ingestion/source/iceberg/iceberg.py
CHANGED

@@ -2,6 +2,7 @@ import json
 import logging
 import threading
 import uuid
+from functools import partial
 from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 from dateutil import parser as dateutil_parser
@@ -47,6 +48,12 @@ from datahub.emitter.mce_builder import (
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import NamespaceKey
+from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
+    auto_patch_last_modified,
+)
+from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
+    EnsureAspectSizeProcessor,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -57,6 +64,14 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
+from datahub.ingestion.api.source_helpers import (
+    AutoSystemMetadata,
+    auto_fix_duplicate_schema_field_paths,
+    auto_fix_empty_field_paths,
+    auto_lowercase_urns,
+    auto_materialize_referenced_tags_terms,
+    auto_workunit_reporter,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.extractor import schema_util
 from datahub.ingestion.source.common.subtypes import (
@@ -82,6 +97,8 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaMetadata,
 )
 from datahub.metadata.schema_classes import (
+    BrowsePathEntryClass,
+    BrowsePathsV2Class,
     ContainerClass,
     DataPlatformInstanceClass,
     DatasetPropertiesClass,
@@ -134,6 +151,7 @@ class IcebergSource(StatefulIngestionSourceBase):
         super().__init__(config, ctx)
         self.report: IcebergSourceReport = IcebergSourceReport()
         self.config: IcebergSourceConfig = config
+        self.ctx: PipelineContext = ctx
 
     @classmethod
     def create(cls, config_dict: Dict, ctx: PipelineContext) -> "IcebergSource":
@@ -141,8 +159,47 @@ class IcebergSource(StatefulIngestionSourceBase):
         return cls(config, ctx)
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        # This source needs to overwrite standard `get_workunit_processor`, because it is unique in terms of usage
+        # of parallelism. Because of this, 2 processors won't work as expected:
+        # 1. browse_path_processor - it needs aspects for a single entity to be continuous - which is not guaranteed
+        #    in this source
+        # 2. automatic stamping with systemMetadata - in current implementation of the Source class this processor
+        #    would have been applied in a thread (single) shared between the source, processors and transformers.
+        #    Since the metadata scraping happens in separate threads, this could lead to difference between
+        #    time used by systemMetadata and actual time at which metadata was read
+        auto_lowercase_dataset_urns: Optional[MetadataWorkUnitProcessor] = None
+        if (
+            self.ctx.pipeline_config
+            and self.ctx.pipeline_config.source
+            and self.ctx.pipeline_config.source.config
+            and (
+                (
+                    hasattr(
+                        self.ctx.pipeline_config.source.config,
+                        "convert_urns_to_lowercase",
+                    )
+                    and self.ctx.pipeline_config.source.config.convert_urns_to_lowercase
+                )
+                or (
+                    hasattr(self.ctx.pipeline_config.source.config, "get")
+                    and self.ctx.pipeline_config.source.config.get(
+                        "convert_urns_to_lowercase"
+                    )
+                )
+            )
+        ):
+            auto_lowercase_dataset_urns = auto_lowercase_urns
+
         return [
-
+            auto_lowercase_dataset_urns,
+            auto_materialize_referenced_tags_terms,
+            partial(
+                auto_fix_duplicate_schema_field_paths, platform=self._infer_platform()
+            ),
+            partial(auto_fix_empty_field_paths, platform=self._infer_platform()),
+            partial(auto_workunit_reporter, self.get_report()),
+            auto_patch_last_modified,
+            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
             StaleEntityRemovalHandler.create(
                 self, self.config, self.ctx
             ).workunit_processor,
@@ -208,6 +265,12 @@ class IcebergSource(StatefulIngestionSourceBase):
             )
             thread_local.local_catalog = self.config.get_catalog()
 
+        if not hasattr(thread_local, "stamping_processor"):
+            LOGGER.debug(
+                f"Didn't find stamping_processor in thread_local ({thread_local}), initializing new workunit processor"
+            )
+            thread_local.stamping_processor = AutoSystemMetadata(self.ctx)
+
         with PerfTimer() as timer:
             table = thread_local.local_catalog.load_table(dataset_path)
             time_taken = timer.elapsed_seconds()
@@ -224,9 +287,11 @@ class IcebergSource(StatefulIngestionSourceBase):
             for aspect in self._create_iceberg_table_aspects(
                 dataset_name, table, namespace_urn
             ):
-                yield
-
-
+                yield thread_local.stamping_processor.stamp_wu(
+                    MetadataChangeProposalWrapper(
+                        entityUrn=dataset_urn, aspect=aspect
+                    ).as_workunit()
+                )
         except NoSuchPropertyException as e:
             self.report.warning(
                 title="Unable to process table",
@@ -308,6 +373,7 @@ class IcebergSource(StatefulIngestionSourceBase):
             return
 
         try:
+            stamping_processor = AutoSystemMetadata(self.ctx)
             namespace_ids = self._get_namespaces(catalog)
             namespaces: List[Tuple[Identifier, str]] = []
             for namespace in namespace_ids:
@@ -323,9 +389,11 @@ class IcebergSource(StatefulIngestionSourceBase):
                 )
                 namespaces.append((namespace, namespace_urn))
                 for aspect in self._create_iceberg_namespace_aspects(namespace):
-                    yield
-
-
+                    yield stamping_processor.stamp_wu(
+                        MetadataChangeProposalWrapper(
+                            entityUrn=namespace_urn, aspect=aspect
+                        ).as_workunit()
+                    )
             LOGGER.debug("Namespaces ingestion completed")
         except Exception as e:
             self.report.report_failure(
@@ -366,7 +434,9 @@ class IcebergSource(StatefulIngestionSourceBase):
             yield dataset_ownership
 
         yield self._create_schema_metadata(dataset_name, table)
-
+        dpi = self._get_dataplatform_instance_aspect()
+        yield dpi
+        yield self._create_browse_paths_aspect(dpi.instance, str(namespace_urn))
         yield ContainerClass(container=str(namespace_urn))
 
         self.report.report_table_processing_time(
@@ -377,6 +447,22 @@ class IcebergSource(StatefulIngestionSourceBase):
             profiler = IcebergProfiler(self.report, self.config.profiling)
             yield from profiler.profile_table(dataset_name, table)
 
+    def _create_browse_paths_aspect(
+        self,
+        platform_instance_urn: Optional[str] = None,
+        container_urn: Optional[str] = None,
+    ) -> BrowsePathsV2Class:
+        path = []
+        if platform_instance_urn:
+            path.append(
+                BrowsePathEntryClass(
+                    id=platform_instance_urn, urn=platform_instance_urn
+                )
+            )
+        if container_urn:
+            path.append(BrowsePathEntryClass(id=container_urn, urn=container_urn))
+        return BrowsePathsV2Class(path=path)
+
     def _get_partition_aspect(self, table: Table) -> Optional[str]:
         """Extracts partition information from the provided table and returns a JSON array representing the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) of the table.
         Each element of the returned array represents a field in the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) that follows [Appendix-C](https://iceberg.apache.org/spec/?#appendix-c-json-serialization) of the Iceberg specification.
@@ -530,7 +616,9 @@ class IcebergSource(StatefulIngestionSourceBase):
             name=namespace_repr, qualifiedName=namespace_repr, env=self.config.env
         )
         yield SubTypes(typeNames=[DatasetContainerSubTypes.NAMESPACE])
-
+        dpi = self._get_dataplatform_instance_aspect()
+        yield dpi
+        yield self._create_browse_paths_aspect(dpi.instance)
 
 
 class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):

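The Iceberg source loads tables on a thread pool, so per-thread state, the catalog client and now the AutoSystemMetadata stamper, is created lazily the first time each worker needs it. A stripped-down sketch of that lazy thread-local initialization pattern (the names here are illustrative, not the actual source code):

import threading
from concurrent.futures import ThreadPoolExecutor

thread_local = threading.local()


def _get_worker_state() -> dict:
    # Mirrors the hasattr(thread_local, "stamping_processor") check above:
    # each worker thread builds its own state on first use and reuses it afterwards.
    if not hasattr(thread_local, "state"):
        thread_local.state = {"worker": threading.current_thread().name}
    return thread_local.state


def process(item: int) -> str:
    return f"item {item} handled by {_get_worker_state()['worker']}"


with ThreadPoolExecutor(max_workers=4) as pool:
    for line in pool.map(process, range(8)):
        print(line)
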
datahub/ingestion/source/kafka/kafka.py
CHANGED

@@ -568,10 +568,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
 
         for config_key in KafkaTopicConfigKeys:
             try:
-                if
-                config_key in topic_config.keys()
-                and topic_config[config_key] is not None
-                ):
+                if config_key in topic_config and topic_config[config_key] is not None:
                     config_value = topic_config[config_key].value
                     custom_props[config_key] = (
                         config_value

datahub/ingestion/source/kafka_connect/sink_connectors.py
CHANGED

@@ -197,7 +197,7 @@ class BigQuerySinkConnector(BaseConnector):
         for name in transform_names:
             transform = {"name": name}
             transforms.append(transform)
-            for key in self.connector_manifest.config
+            for key in self.connector_manifest.config:
                 if key.startswith(f"transforms.{name}."):
                     transform[key.replace(f"transforms.{name}.", "")] = (
                         self.connector_manifest.config[key]

datahub/ingestion/source/kafka_connect/source_connectors.py
CHANGED

@@ -121,7 +121,7 @@ class ConfluentJDBCSourceConnector(BaseConnector):
         for name in transform_names:
             transform = {"name": name}
             transforms.append(transform)
-            for key in self.connector_manifest.config
+            for key in self.connector_manifest.config:
                 if key.startswith(f"transforms.{name}."):
                     transform[key.replace(f"transforms.{name}.", "")] = (
                         self.connector_manifest.config[key]

datahub/ingestion/source/looker/looker_source.py
CHANGED

@@ -363,7 +363,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         filters: MutableMapping[str, Any] = (
             query.filters if query.filters is not None else {}
         )
-        for field in filters
+        for field in filters:
             if field is None:
                 continue
 
@@ -877,8 +877,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         # fine to set them to None.
         # TODO: Track project names for each explore.
         explores_to_fetch = [
-            (None, model, explore)
-            for (model, explore) in self.reachable_explores.keys()
+            (None, model, explore) for (model, explore) in self.reachable_explores
         ]
         explores_to_fetch.sort()
 

datahub/ingestion/source/mlflow.py
CHANGED

@@ -36,6 +36,7 @@ from datahub.ingestion.source.common.subtypes import MLAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
+    StatefulStaleMetadataRemovalConfig,
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
@@ -119,6 +120,8 @@ class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
         default=None, description="Password for MLflow authentication"
     )
 
+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
+
 
 @dataclass
 class MLflowRegisteredModelStageInfo:

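With stateful_ingestion now exposed on MLflowConfig, stale-entity removal can be enabled for the MLflow source the same way as for other stateful sources. A hypothetical configuration fragment, written as a Python dict; the tracking URI is made up and the surrounding recipe keys are assumed rather than taken from this diff:

# Hypothetical MLflow source configuration enabling stateful ingestion.
# "enabled" comes from the standard StatefulStaleMetadataRemovalConfig imported above.
mlflow_source = {
    "type": "mlflow",
    "config": {
        "tracking_uri": "http://localhost:5000",  # made-up tracking server
        "stateful_ingestion": {"enabled": True},
    },
}
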
datahub/ingestion/source/mode.py
CHANGED

@@ -899,7 +899,7 @@ class ModeSource(StatefulIngestionSourceBase):
         for match in matches:
             definition = Template(source=match).render()
             parameters = yaml.safe_load(definition)
-            for key in parameters
+            for key in parameters:
                 jinja_params[key] = parameters[key].get("default", "")
 
         normalized_query = re.sub(
@@ -1601,7 +1601,7 @@ class ModeSource(StatefulIngestionSourceBase):
 
     def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
         # Space/collection -> report -> query -> Chart
-        for space_token in self.space_tokens
+        for space_token in self.space_tokens:
             reports = self._get_reports(space_token)
             for report in reports:
                 report_token = report.get("token", "")

datahub/ingestion/source/nifi.py
CHANGED

@@ -703,7 +703,7 @@ class NifiSource(StatefulIngestionSourceBase):
             if (
                 component.nifi_type is NifiType.PROCESSOR
                 and component.type
-                not in NifiProcessorProvenanceEventAnalyzer.KNOWN_INGRESS_EGRESS_PROCESORS
+                not in NifiProcessorProvenanceEventAnalyzer.KNOWN_INGRESS_EGRESS_PROCESORS
             ) or component.nifi_type not in [
                 NifiType.PROCESSOR,
                 NifiType.REMOTE_INPUT_PORT,
@@ -977,7 +977,7 @@ class NifiSource(StatefulIngestionSourceBase):
             )
 
         for incoming_from in incoming:
-            if incoming_from in self.nifi_flow.remotely_accessible_ports
+            if incoming_from in self.nifi_flow.remotely_accessible_ports:
                 dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[incoming_from].name}"
                 dataset_urn = builder.make_dataset_urn(
                     NIFI, dataset_name, self.config.env
@@ -994,7 +994,7 @@ class NifiSource(StatefulIngestionSourceBase):
             )
 
         for outgoing_to in outgoing:
-            if outgoing_to in self.nifi_flow.remotely_accessible_ports
+            if outgoing_to in self.nifi_flow.remotely_accessible_ports:
                 dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[outgoing_to].name}"
                 dataset_urn = builder.make_dataset_urn(
                     NIFI, dataset_name, self.config.env

datahub/ingestion/source/openapi.py
CHANGED

@@ -102,7 +102,7 @@ class OpenApiConfig(ConfigModel):
             # details there once, and then use that session for all requests.
             self.token = f"Bearer {self.bearer_token}"
         else:
-            assert "url_complement" in self.get_token
+            assert "url_complement" in self.get_token, (
                 "When 'request_type' is set to 'get', an url_complement is needed for the request."
             )
             if self.get_token["request_type"] == "get":
@@ -317,7 +317,7 @@ class APISource(Source, ABC):
             yield wu
 
             # Handle schema metadata if available
-            if "data" in endpoint_dets
+            if "data" in endpoint_dets:
                 # we are lucky! data is defined in the swagger for this endpoint
                 schema_metadata = set_metadata(dataset_name, endpoint_dets["data"])
                 wu = MetadataWorkUnit(
@@ -371,7 +371,7 @@ class APISource(Source, ABC):
                 else:
                     self.report_bad_responses(response.status_code, type=endpoint_k)
             else:
-                if endpoint_k not in config.forced_examples
+                if endpoint_k not in config.forced_examples:
                     # start guessing...
                     url_guess = try_guessing(endpoint_k, root_dataset_samples)
                     tot_url = clean_url(config.url + self.url_basepath + url_guess)

datahub/ingestion/source/openapi_parser.py
CHANGED

@@ -128,18 +128,18 @@ def get_endpoints(sw_dict: dict) -> dict:
 
     for p_k, p_o in sw_dict["paths"].items():
         method = list(p_o)[0]
-        if "200" in p_o[method]["responses"]
+        if "200" in p_o[method]["responses"]:
             base_res = p_o[method]["responses"]["200"]
-        elif 200 in p_o[method]["responses"]
+        elif 200 in p_o[method]["responses"]:
             # if you read a plain yml file the 200 will be an integer
             base_res = p_o[method]["responses"][200]
         else:
             # the endpoint does not have a 200 response
             continue
 
-        if "description" in p_o[method]
+        if "description" in p_o[method]:
             desc = p_o[method]["description"]
-        elif "summary" in p_o[method]
+        elif "summary" in p_o[method]:
             desc = p_o[method]["summary"]
         else: # still testing
             desc = ""
@@ -156,7 +156,7 @@ def get_endpoints(sw_dict: dict) -> dict:
             url_details[p_k]["data"] = example_data
 
         # checking whether there are defined parameters to execute the call...
-        if "parameters" in p_o[method]
+        if "parameters" in p_o[method]:
             url_details[p_k]["parameters"] = p_o[method]["parameters"]
 
     return dict(sorted(url_details.items()))
@@ -169,7 +169,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
     data = {}
     if "content" in base_res:
         res_cont = base_res["content"]
-        if "application/json" in res_cont
+        if "application/json" in res_cont:
             ex_field = None
             if "example" in res_cont["application/json"]:
                 ex_field = "example"
@@ -186,7 +186,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
                 logger.warning(
                     f"Field in swagger file does not give consistent data --- {key}"
                 )
-        elif "text/csv" in res_cont
+        elif "text/csv" in res_cont:
             data = res_cont["text/csv"]["schema"]
     elif "examples" in base_res:
         data = base_res["examples"]["application/json"]
@@ -239,7 +239,7 @@ def guessing_url_name(url: str, examples: dict) -> str:
 
     # substituting the parameter's name w the value
     for name, clean_name in zip(needed_n, cleaned_needed_n):
-        if clean_name in examples[ex2use]
+        if clean_name in examples[ex2use]:
             guessed_url = re.sub(name, str(examples[ex2use][clean_name]), guessed_url)
 
     return guessed_url

datahub/ingestion/source/powerbi/config.py
CHANGED

@@ -555,7 +555,7 @@ class PowerBiDashboardSourceConfig(
     def map_data_platform(cls, value):
         # For backward compatibility convert input PostgreSql to PostgreSQL
         # PostgreSQL is name of the data-platform in M-Query
-        if "PostgreSql" in value
+        if "PostgreSql" in value:
             platform_name = value["PostgreSql"]
             del value["PostgreSql"]
             value["PostgreSQL"] = platform_name

datahub/ingestion/source/powerbi/powerbi.py
CHANGED

@@ -263,7 +263,7 @@ class Mapper:
         for upstream_dpt in lineage.upstreams:
             if (
                 upstream_dpt.data_platform_pair.powerbi_data_platform_name
-                not in self.__config.dataset_type_mapping
+                not in self.__config.dataset_type_mapping
             ):
                 logger.debug(
                     f"Skipping upstream table for {ds_urn}. The platform {upstream_dpt.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping",
@@ -1353,7 +1353,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
             for data_platform in SupportedDataPlatform
         ]
 
-        for key in self.source_config.dataset_type_mapping
+        for key in self.source_config.dataset_type_mapping:
             if key not in powerbi_data_platforms:
                 raise ValueError(f"PowerBI DataPlatform {key} is not supported")
 

datahub/ingestion/source/redshift/profile.py
CHANGED

@@ -42,9 +42,9 @@ class RedshiftProfiler(GenericProfiler):
                 "max_overflow", self.config.profiling.max_workers
             )
 
-        for db in tables
+        for db in tables:
             profile_requests = []
-            for schema in tables.get(db, {})
+            for schema in tables.get(db, {}):
                 if not self.config.schema_pattern.allowed(schema):
                     continue
                 for table in tables[db].get(schema, {}):

datahub/ingestion/source/snowflake/snowflake_utils.py
CHANGED

@@ -77,7 +77,7 @@ class SnowsightUrlBuilder:
         region: str,
     ) -> Tuple[str, str]:
         cloud: str
-        if region in SNOWFLAKE_REGION_CLOUD_REGION_MAPPING
+        if region in SNOWFLAKE_REGION_CLOUD_REGION_MAPPING:
             cloud, cloud_region_id = SNOWFLAKE_REGION_CLOUD_REGION_MAPPING[region]
         elif region.startswith(("aws_", "gcp_", "azure_")):
             # e.g. aws_us_west_2, gcp_us_central1, azure_northeurope

datahub/ingestion/source/sql/stored_procedures/base.py
CHANGED

@@ -26,6 +26,7 @@ from datahub.metadata.schema_classes import (
     DataPlatformInstanceClass,
     DataTransformClass,
     DataTransformLogicClass,
+    QueryLanguageClass,
     QueryStatementClass,
     SubTypesClass,
 )
@@ -176,7 +177,17 @@ def _generate_job_workunits(
             DataTransformClass(
                 queryStatement=QueryStatementClass(
                     value=procedure.procedure_definition,
-                    language=
+                    language=(
+                        QueryLanguageClass.SQL
+                        if procedure.language == "SQL"
+                        # The language field uses a pretty limited enum.
+                        # The "UNKNOWN" enum value is pretty new, so we don't want to
+                        # emit it until it has broader server-side support. As a
+                        # short-term solution, we map all languages to "SQL".
+                        # TODO: Once we've released server 1.1.0, we should change
+                        # this to be "UNKNOWN" for all languages except "SQL".
+                        else QueryLanguageClass.SQL
+                    ),
                 ),
             )
         ]

datahub/ingestion/source/tableau/tableau.py
CHANGED

@@ -1623,7 +1623,7 @@ class TableauSiteSource:
         # if multiple project has name C. Ideal solution is to use projectLuidWithin to avoid duplicate project,
         # however Tableau supports projectLuidWithin in Tableau Cloud June 2022 / Server 2022.3 and later.
         project_luid: Optional[str] = self._get_workbook_project_luid(workbook)
-        if project_luid not in self.tableau_project_registry
+        if project_luid not in self.tableau_project_registry:
             wrk_name: Optional[str] = workbook.get(c.NAME)
             wrk_id: Optional[str] = workbook.get(c.ID)
             prj_name: Optional[str] = workbook.get(c.PROJECT_NAME)
@@ -2253,7 +2253,7 @@ class TableauSiteSource:
         # It is possible due to https://github.com/tableau/server-client-python/issues/1210
         if (
             ds.get(c.LUID)
-            and ds[c.LUID] not in self.datasource_project_map
+            and ds[c.LUID] not in self.datasource_project_map
             and self.report.get_all_datasources_query_failed
         ):
             logger.debug(
@@ -2265,7 +2265,7 @@ class TableauSiteSource:
 
         if (
             ds.get(c.LUID)
-            and ds[c.LUID] in self.datasource_project_map
+            and ds[c.LUID] in self.datasource_project_map
             and self.datasource_project_map[ds[c.LUID]] in self.tableau_project_registry
         ):
             return self.datasource_project_map[ds[c.LUID]]
@@ -3252,7 +3252,7 @@ class TableauSiteSource:
 
         parent_key = None
         project_luid: Optional[str] = self._get_workbook_project_luid(workbook)
-        if project_luid and project_luid in self.tableau_project_registry
+        if project_luid and project_luid in self.tableau_project_registry:
             parent_key = self.gen_project_key(project_luid)
         else:
             workbook_id: Optional[str] = workbook.get(c.ID)

datahub/ingestion/source/tableau/tableau_common.py
CHANGED

@@ -774,7 +774,7 @@ def get_overridden_info(
     if (
         lineage_overrides is not None
         and lineage_overrides.platform_override_map is not None
-        and original_platform in lineage_overrides.platform_override_map
+        and original_platform in lineage_overrides.platform_override_map
     ):
         platform = lineage_overrides.platform_override_map[original_platform]
 
@@ -782,7 +782,7 @@ def get_overridden_info(
         lineage_overrides is not None
         and lineage_overrides.database_override_map is not None
         and upstream_db is not None
-        and upstream_db in lineage_overrides.database_override_map
+        and upstream_db in lineage_overrides.database_override_map
     ):
         upstream_db = lineage_overrides.database_override_map[upstream_db]
 