acryl-datahub 1.0.0.2rc2__py3-none-any.whl → 1.0.0.2rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of acryl-datahub has been flagged as potentially problematic.

Files changed (49)
  1. {acryl_datahub-1.0.0.2rc2.dist-info → acryl_datahub-1.0.0.2rc4.dist-info}/METADATA +2499 -2499
  2. {acryl_datahub-1.0.0.2rc2.dist-info → acryl_datahub-1.0.0.2rc4.dist-info}/RECORD +48 -49
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/datajob/dataflow.py +15 -0
  5. datahub/api/entities/datajob/datajob.py +17 -0
  6. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  7. datahub/api/entities/dataset/dataset.py +2 -2
  8. datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
  9. datahub/cli/migrate.py +6 -6
  10. datahub/configuration/common.py +1 -1
  11. datahub/ingestion/api/common.py +9 -0
  12. datahub/ingestion/api/source.py +4 -1
  13. datahub/ingestion/api/source_helpers.py +26 -1
  14. datahub/ingestion/run/pipeline.py +0 -6
  15. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  16. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  17. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  18. datahub/ingestion/source/fivetran/fivetran.py +1 -0
  19. datahub/ingestion/source/fivetran/fivetran_log_api.py +1 -1
  20. datahub/ingestion/source/iceberg/iceberg.py +97 -9
  21. datahub/ingestion/source/kafka/kafka.py +1 -4
  22. datahub/ingestion/source/kafka_connect/sink_connectors.py +1 -1
  23. datahub/ingestion/source/kafka_connect/source_connectors.py +1 -1
  24. datahub/ingestion/source/looker/looker_source.py +2 -3
  25. datahub/ingestion/source/mlflow.py +3 -0
  26. datahub/ingestion/source/mode.py +2 -2
  27. datahub/ingestion/source/nifi.py +3 -3
  28. datahub/ingestion/source/openapi.py +3 -3
  29. datahub/ingestion/source/openapi_parser.py +8 -8
  30. datahub/ingestion/source/powerbi/config.py +1 -1
  31. datahub/ingestion/source/powerbi/powerbi.py +2 -2
  32. datahub/ingestion/source/redshift/profile.py +2 -2
  33. datahub/ingestion/source/sigma/sigma.py +6 -2
  34. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -1
  35. datahub/ingestion/source/tableau/tableau.py +4 -4
  36. datahub/ingestion/source/tableau/tableau_common.py +2 -2
  37. datahub/ingestion/source/unity/source.py +1 -1
  38. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  39. datahub/ingestion/transformer/add_dataset_ownership.py +1 -1
  40. datahub/ingestion/transformer/dataset_domain.py +1 -1
  41. datahub/lite/lite_util.py +2 -2
  42. datahub/testing/mcp_diff.py +1 -1
  43. datahub/utilities/file_backed_collections.py +6 -6
  44. datahub/utilities/hive_schema_to_avro.py +2 -2
  45. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  46. {acryl_datahub-1.0.0.2rc2.dist-info → acryl_datahub-1.0.0.2rc4.dist-info}/WHEEL +0 -0
  47. {acryl_datahub-1.0.0.2rc2.dist-info → acryl_datahub-1.0.0.2rc4.dist-info}/entry_points.txt +0 -0
  48. {acryl_datahub-1.0.0.2rc2.dist-info → acryl_datahub-1.0.0.2rc4.dist-info}/licenses/LICENSE +0 -0
  49. {acryl_datahub-1.0.0.2rc2.dist-info → acryl_datahub-1.0.0.2rc4.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/bigquery_v2/lineage.py CHANGED
@@ -375,7 +375,7 @@ class BigqueryLineageExtractor:
                 memory_footprint.total_size(lineage)
             )
 
-        for lineage_key in lineage.keys():
+        for lineage_key in lineage:
             # For views, we do not use the upstreams obtained by parsing audit logs
             # as they may contain indirectly referenced tables.
             if (
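
Note: most of the one-line changes in this release follow the same pattern as the hunk above: a redundant .keys() call is dropped, because iterating a dict and testing membership with "in" already operate on its keys. A minimal, self-contained illustration (the dict below is invented for demonstration):

    # Iterating a dict yields its keys, so .keys() is redundant here.
    lineage = {"urn:li:dataset:a": ["urn:li:dataset:b"], "urn:li:dataset:c": []}

    for lineage_key in lineage:
        print(lineage_key)

    # Membership tests behave identically with or without .keys().
    assert "urn:li:dataset:a" in lineage
    assert ("urn:li:dataset:a" in lineage) == ("urn:li:dataset:a" in lineage.keys())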
datahub/ingestion/source/dynamodb/dynamodb.py CHANGED
@@ -362,7 +362,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
         if self.config.include_table_item is None:
             return
         dataset_name = f"{region}.{table_name}"
-        if dataset_name not in self.config.include_table_item.keys():
+        if dataset_name not in self.config.include_table_item:
             return
         primary_key_list = self.config.include_table_item.get(dataset_name)
         assert isinstance(primary_key_list, List)
datahub/ingestion/source/fivetran/fivetran.py CHANGED
@@ -215,6 +215,7 @@ class FivetranSource(StatefulIngestionSourceBase):
         datajob = DataJob(
             id=connector.connector_id,
             flow_urn=dataflow_urn,
+            platform_instance=self.config.platform_instance,
             name=connector.connector_name,
             owners={owner_email} if owner_email else set(),
         )
datahub/ingestion/source/fivetran/fivetran_log_api.py CHANGED
@@ -190,7 +190,7 @@ class FivetranLogAPI:
         jobs: List[Job] = []
         if connector_sync_log is None:
             return jobs
-        for sync_id in connector_sync_log.keys():
+        for sync_id in connector_sync_log:
             if len(connector_sync_log[sync_id]) != 2:
                 # If both sync-start and sync-end event log not present for this sync that means sync is still in progress
                 continue
datahub/ingestion/source/iceberg/iceberg.py CHANGED
@@ -2,6 +2,7 @@ import json
 import logging
 import threading
 import uuid
+from functools import partial
 from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 from dateutil import parser as dateutil_parser
@@ -47,6 +48,12 @@ from datahub.emitter.mce_builder import (
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import NamespaceKey
+from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
+    auto_patch_last_modified,
+)
+from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
+    EnsureAspectSizeProcessor,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -57,6 +64,14 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
+from datahub.ingestion.api.source_helpers import (
+    AutoSystemMetadata,
+    auto_fix_duplicate_schema_field_paths,
+    auto_fix_empty_field_paths,
+    auto_lowercase_urns,
+    auto_materialize_referenced_tags_terms,
+    auto_workunit_reporter,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.extractor import schema_util
 from datahub.ingestion.source.common.subtypes import (
@@ -82,6 +97,8 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaMetadata,
 )
 from datahub.metadata.schema_classes import (
+    BrowsePathEntryClass,
+    BrowsePathsV2Class,
     ContainerClass,
     DataPlatformInstanceClass,
     DatasetPropertiesClass,
@@ -134,6 +151,7 @@ class IcebergSource(StatefulIngestionSourceBase):
         super().__init__(config, ctx)
         self.report: IcebergSourceReport = IcebergSourceReport()
         self.config: IcebergSourceConfig = config
+        self.ctx: PipelineContext = ctx
 
     @classmethod
     def create(cls, config_dict: Dict, ctx: PipelineContext) -> "IcebergSource":
@@ -141,8 +159,47 @@ class IcebergSource(StatefulIngestionSourceBase):
         return cls(config, ctx)
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        # This source needs to overwrite standard `get_workunit_processor`, because it is unique in terms of usage
+        # of parallelism. Because of this, 2 processors won't work as expected:
+        # 1. browse_path_processor - it needs aspects for a single entity to be continuous - which is not guaranteed
+        #    in this source
+        # 2. automatic stamping with systemMetadata - in current implementation of the Source class this processor
+        #    would have been applied in a thread (single) shared between the source, processors and transformers.
+        #    Since the metadata scraping happens in separate threads, this could lead to difference between
+        #    time used by systemMetadata and actual time at which metadata was read
+        auto_lowercase_dataset_urns: Optional[MetadataWorkUnitProcessor] = None
+        if (
+            self.ctx.pipeline_config
+            and self.ctx.pipeline_config.source
+            and self.ctx.pipeline_config.source.config
+            and (
+                (
+                    hasattr(
+                        self.ctx.pipeline_config.source.config,
+                        "convert_urns_to_lowercase",
+                    )
+                    and self.ctx.pipeline_config.source.config.convert_urns_to_lowercase
+                )
+                or (
+                    hasattr(self.ctx.pipeline_config.source.config, "get")
+                    and self.ctx.pipeline_config.source.config.get(
+                        "convert_urns_to_lowercase"
+                    )
+                )
+            )
+        ):
+            auto_lowercase_dataset_urns = auto_lowercase_urns
+
         return [
-            *super().get_workunit_processors(),
+            auto_lowercase_dataset_urns,
+            auto_materialize_referenced_tags_terms,
+            partial(
+                auto_fix_duplicate_schema_field_paths, platform=self._infer_platform()
+            ),
+            partial(auto_fix_empty_field_paths, platform=self._infer_platform()),
+            partial(auto_workunit_reporter, self.get_report()),
+            auto_patch_last_modified,
+            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
             StaleEntityRemovalHandler.create(
                 self, self.config, self.ctx
             ).workunit_processor,
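
Note: the list returned by get_workunit_processors is applied, in order, to the stream of work units emitted by the source; functools.partial is used above to bind extra arguments (the platform, the report) so that each entry stays a single-argument callable, and None entries are skipped. A rough sketch of that chaining idea, using hypothetical processor names rather than the actual framework code:

    from functools import partial
    from typing import Callable, Dict, Iterable, List, Optional

    # Stand-ins for this sketch only; the real type is MetadataWorkUnit.
    WorkUnit = Dict[str, str]
    Processor = Callable[[Iterable[WorkUnit]], Iterable[WorkUnit]]

    def tag_platform(stream: Iterable[WorkUnit], platform: str) -> Iterable[WorkUnit]:
        # Hypothetical processor: annotate every work unit with a platform name.
        for wu in stream:
            yield {**wu, "platform": platform}

    def apply_processors(
        stream: Iterable[WorkUnit], processors: List[Optional[Processor]]
    ) -> Iterable[WorkUnit]:
        # None entries (disabled processors) are simply skipped.
        for processor in processors:
            if processor is not None:
                stream = processor(stream)
        return stream

    processors: List[Optional[Processor]] = [
        None,
        partial(tag_platform, platform="iceberg"),
    ]
    print(list(apply_processors([{"id": "wu-1"}], processors)))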
@@ -208,6 +265,12 @@ class IcebergSource(StatefulIngestionSourceBase):
             )
             thread_local.local_catalog = self.config.get_catalog()
 
+        if not hasattr(thread_local, "stamping_processor"):
+            LOGGER.debug(
+                f"Didn't find stamping_processor in thread_local ({thread_local}), initializing new workunit processor"
+            )
+            thread_local.stamping_processor = AutoSystemMetadata(self.ctx)
+
         with PerfTimer() as timer:
             table = thread_local.local_catalog.load_table(dataset_path)
             time_taken = timer.elapsed_seconds()
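
Note: the hasattr check above lazily creates one AutoSystemMetadata stamper per worker thread, mirroring how the catalog is cached, so that system-metadata timestamps are assigned in the thread that actually reads the table. The underlying pattern is ordinary threading.local lazy initialization; a small standalone sketch (the Stamper class is a placeholder):

    import threading

    thread_local = threading.local()

    class Stamper:
        # Placeholder for a per-thread helper such as a system-metadata stamper.
        def __init__(self) -> None:
            self.owner = threading.current_thread().name

    def get_stamper() -> Stamper:
        # Attributes on a threading.local() are visible only to the thread that
        # set them, so the first call in each thread creates that thread's Stamper.
        if not hasattr(thread_local, "stamper"):
            thread_local.stamper = Stamper()
        return thread_local.stamper

    threads = [threading.Thread(target=get_stamper) for _ in range(3)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()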
@@ -224,9 +287,11 @@ class IcebergSource(StatefulIngestionSourceBase):
             for aspect in self._create_iceberg_table_aspects(
                 dataset_name, table, namespace_urn
             ):
-                yield MetadataChangeProposalWrapper(
-                    entityUrn=dataset_urn, aspect=aspect
-                ).as_workunit()
+                yield thread_local.stamping_processor.stamp_wu(
+                    MetadataChangeProposalWrapper(
+                        entityUrn=dataset_urn, aspect=aspect
+                    ).as_workunit()
+                )
         except NoSuchPropertyException as e:
             self.report.warning(
                 title="Unable to process table",
@@ -308,6 +373,7 @@ class IcebergSource(StatefulIngestionSourceBase):
             return
 
         try:
+            stamping_processor = AutoSystemMetadata(self.ctx)
             namespace_ids = self._get_namespaces(catalog)
             namespaces: List[Tuple[Identifier, str]] = []
             for namespace in namespace_ids:
@@ -323,9 +389,11 @@ class IcebergSource(StatefulIngestionSourceBase):
                 )
                 namespaces.append((namespace, namespace_urn))
                 for aspect in self._create_iceberg_namespace_aspects(namespace):
-                    yield MetadataChangeProposalWrapper(
-                        entityUrn=namespace_urn, aspect=aspect
-                    ).as_workunit()
+                    yield stamping_processor.stamp_wu(
+                        MetadataChangeProposalWrapper(
+                            entityUrn=namespace_urn, aspect=aspect
+                        ).as_workunit()
+                    )
             LOGGER.debug("Namespaces ingestion completed")
         except Exception as e:
             self.report.report_failure(
@@ -366,7 +434,9 @@ class IcebergSource(StatefulIngestionSourceBase):
             yield dataset_ownership
 
         yield self._create_schema_metadata(dataset_name, table)
-        yield self._get_dataplatform_instance_aspect()
+        dpi = self._get_dataplatform_instance_aspect()
+        yield dpi
+        yield self._create_browse_paths_aspect(dpi.instance, str(namespace_urn))
         yield ContainerClass(container=str(namespace_urn))
 
         self.report.report_table_processing_time(
@@ -377,6 +447,22 @@ class IcebergSource(StatefulIngestionSourceBase):
             profiler = IcebergProfiler(self.report, self.config.profiling)
             yield from profiler.profile_table(dataset_name, table)
 
+    def _create_browse_paths_aspect(
+        self,
+        platform_instance_urn: Optional[str] = None,
+        container_urn: Optional[str] = None,
+    ) -> BrowsePathsV2Class:
+        path = []
+        if platform_instance_urn:
+            path.append(
+                BrowsePathEntryClass(
+                    id=platform_instance_urn, urn=platform_instance_urn
+                )
+            )
+        if container_urn:
+            path.append(BrowsePathEntryClass(id=container_urn, urn=container_urn))
+        return BrowsePathsV2Class(path=path)
+
     def _get_partition_aspect(self, table: Table) -> Optional[str]:
         """Extracts partition information from the provided table and returns a JSON array representing the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) of the table.
         Each element of the returned array represents a field in the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) that follows [Appendix-C](https://iceberg.apache.org/spec/?#appendix-c-json-serialization) of the Iceberg specification.
@@ -530,7 +616,9 @@ class IcebergSource(StatefulIngestionSourceBase):
             name=namespace_repr, qualifiedName=namespace_repr, env=self.config.env
         )
         yield SubTypes(typeNames=[DatasetContainerSubTypes.NAMESPACE])
-        yield self._get_dataplatform_instance_aspect()
+        dpi = self._get_dataplatform_instance_aspect()
+        yield dpi
+        yield self._create_browse_paths_aspect(dpi.instance)
 
 
 class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):
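
Note: the new _create_browse_paths_aspect simply lists the platform instance (when one is configured) followed by the namespace container, so Iceberg assets still get an explicit browse path even though the generic browse-path processor is skipped. For reference, the resulting aspect looks roughly like this (the urns below are made up for illustration):

    from datahub.metadata.schema_classes import BrowsePathEntryClass, BrowsePathsV2Class

    # Hypothetical urns, for illustration only.
    instance_urn = "urn:li:dataPlatformInstance:(urn:li:dataPlatform:iceberg,prod)"
    namespace_urn = "urn:li:container:iceberg_namespace_example"

    aspect = BrowsePathsV2Class(
        path=[
            BrowsePathEntryClass(id=instance_urn, urn=instance_urn),
            BrowsePathEntryClass(id=namespace_urn, urn=namespace_urn),
        ]
    )
    print(aspect.path)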
datahub/ingestion/source/kafka/kafka.py CHANGED
@@ -568,10 +568,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
 
         for config_key in KafkaTopicConfigKeys:
             try:
-                if (
-                    config_key in topic_config.keys()
-                    and topic_config[config_key] is not None
-                ):
+                if config_key in topic_config and topic_config[config_key] is not None:
                     config_value = topic_config[config_key].value
                     custom_props[config_key] = (
                         config_value
datahub/ingestion/source/kafka_connect/sink_connectors.py CHANGED
@@ -197,7 +197,7 @@ class BigQuerySinkConnector(BaseConnector):
         for name in transform_names:
             transform = {"name": name}
             transforms.append(transform)
-            for key in self.connector_manifest.config.keys():
+            for key in self.connector_manifest.config:
                 if key.startswith(f"transforms.{name}."):
                     transform[key.replace(f"transforms.{name}.", "")] = (
                         self.connector_manifest.config[key]
datahub/ingestion/source/kafka_connect/source_connectors.py CHANGED
@@ -121,7 +121,7 @@ class ConfluentJDBCSourceConnector(BaseConnector):
         for name in transform_names:
             transform = {"name": name}
             transforms.append(transform)
-            for key in self.connector_manifest.config.keys():
+            for key in self.connector_manifest.config:
                 if key.startswith(f"transforms.{name}."):
                     transform[key.replace(f"transforms.{name}.", "")] = (
                         self.connector_manifest.config[key]
datahub/ingestion/source/looker/looker_source.py CHANGED
@@ -363,7 +363,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         filters: MutableMapping[str, Any] = (
             query.filters if query.filters is not None else {}
         )
-        for field in filters.keys():
+        for field in filters:
             if field is None:
                 continue
 
@@ -877,8 +877,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         # fine to set them to None.
         # TODO: Track project names for each explore.
         explores_to_fetch = [
-            (None, model, explore)
-            for (model, explore) in self.reachable_explores.keys()
+            (None, model, explore) for (model, explore) in self.reachable_explores
         ]
         explores_to_fetch.sort()
 
datahub/ingestion/source/mlflow.py CHANGED
@@ -36,6 +36,7 @@ from datahub.ingestion.source.common.subtypes import MLAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
+    StatefulStaleMetadataRemovalConfig,
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
@@ -119,6 +120,8 @@ class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
         default=None, description="Password for MLflow authentication"
     )
 
+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
+
 
 @dataclass
 class MLflowRegisteredModelStageInfo:
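
Note: exposing stateful_ingestion on MLflowConfig means stale-entity removal can now be enabled for the mlflow source in the same way as for other stateful sources. A minimal sketch of such a recipe, built programmatically; the endpoints and names below are placeholders and exact defaults may differ:

    from datahub.ingestion.run.pipeline import Pipeline

    # Sketch only: tracking_uri, server URL, and pipeline_name are placeholders.
    pipeline = Pipeline.create(
        {
            "pipeline_name": "mlflow-stateful-demo",
            "source": {
                "type": "mlflow",
                "config": {
                    "tracking_uri": "http://localhost:5000",
                    "stateful_ingestion": {"enabled": True, "remove_stale_metadata": True},
                },
            },
            "sink": {
                "type": "datahub-rest",
                "config": {"server": "http://localhost:8080"},
            },
        }
    )
    pipeline.run()
    pipeline.raise_from_status()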
datahub/ingestion/source/mode.py CHANGED
@@ -899,7 +899,7 @@ class ModeSource(StatefulIngestionSourceBase):
         for match in matches:
             definition = Template(source=match).render()
             parameters = yaml.safe_load(definition)
-            for key in parameters.keys():
+            for key in parameters:
                 jinja_params[key] = parameters[key].get("default", "")
 
         normalized_query = re.sub(
@@ -1601,7 +1601,7 @@ class ModeSource(StatefulIngestionSourceBase):
 
     def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
         # Space/collection -> report -> query -> Chart
-        for space_token in self.space_tokens.keys():
+        for space_token in self.space_tokens:
             reports = self._get_reports(space_token)
             for report in reports:
                 report_token = report.get("token", "")
datahub/ingestion/source/nifi.py CHANGED
@@ -703,7 +703,7 @@ class NifiSource(StatefulIngestionSourceBase):
             if (
                 component.nifi_type is NifiType.PROCESSOR
                 and component.type
-                not in NifiProcessorProvenanceEventAnalyzer.KNOWN_INGRESS_EGRESS_PROCESORS.keys()
+                not in NifiProcessorProvenanceEventAnalyzer.KNOWN_INGRESS_EGRESS_PROCESORS
             ) or component.nifi_type not in [
                 NifiType.PROCESSOR,
                 NifiType.REMOTE_INPUT_PORT,
@@ -977,7 +977,7 @@ class NifiSource(StatefulIngestionSourceBase):
             )
 
             for incoming_from in incoming:
-                if incoming_from in self.nifi_flow.remotely_accessible_ports.keys():
+                if incoming_from in self.nifi_flow.remotely_accessible_ports:
                     dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[incoming_from].name}"
                     dataset_urn = builder.make_dataset_urn(
                         NIFI, dataset_name, self.config.env
@@ -994,7 +994,7 @@ class NifiSource(StatefulIngestionSourceBase):
             )
 
             for outgoing_to in outgoing:
-                if outgoing_to in self.nifi_flow.remotely_accessible_ports.keys():
+                if outgoing_to in self.nifi_flow.remotely_accessible_ports:
                     dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[outgoing_to].name}"
                     dataset_urn = builder.make_dataset_urn(
                         NIFI, dataset_name, self.config.env
datahub/ingestion/source/openapi.py CHANGED
@@ -102,7 +102,7 @@ class OpenApiConfig(ConfigModel):
             # details there once, and then use that session for all requests.
             self.token = f"Bearer {self.bearer_token}"
         else:
-            assert "url_complement" in self.get_token.keys(), (
+            assert "url_complement" in self.get_token, (
                 "When 'request_type' is set to 'get', an url_complement is needed for the request."
            )
             if self.get_token["request_type"] == "get":
@@ -317,7 +317,7 @@ class APISource(Source, ABC):
             yield wu
 
             # Handle schema metadata if available
-            if "data" in endpoint_dets.keys():
+            if "data" in endpoint_dets:
                 # we are lucky! data is defined in the swagger for this endpoint
                 schema_metadata = set_metadata(dataset_name, endpoint_dets["data"])
                 wu = MetadataWorkUnit(
@@ -371,7 +371,7 @@ class APISource(Source, ABC):
                 else:
                     self.report_bad_responses(response.status_code, type=endpoint_k)
             else:
-                if endpoint_k not in config.forced_examples.keys():
+                if endpoint_k not in config.forced_examples:
                     # start guessing...
                     url_guess = try_guessing(endpoint_k, root_dataset_samples)
                     tot_url = clean_url(config.url + self.url_basepath + url_guess)
datahub/ingestion/source/openapi_parser.py CHANGED
@@ -128,18 +128,18 @@ def get_endpoints(sw_dict: dict) -> dict:
 
     for p_k, p_o in sw_dict["paths"].items():
         method = list(p_o)[0]
-        if "200" in p_o[method]["responses"].keys():
+        if "200" in p_o[method]["responses"]:
             base_res = p_o[method]["responses"]["200"]
-        elif 200 in p_o[method]["responses"].keys():
+        elif 200 in p_o[method]["responses"]:
             # if you read a plain yml file the 200 will be an integer
             base_res = p_o[method]["responses"][200]
         else:
             # the endpoint does not have a 200 response
             continue
 
-        if "description" in p_o[method].keys():
+        if "description" in p_o[method]:
             desc = p_o[method]["description"]
-        elif "summary" in p_o[method].keys():
+        elif "summary" in p_o[method]:
             desc = p_o[method]["summary"]
         else:  # still testing
             desc = ""
@@ -156,7 +156,7 @@ def get_endpoints(sw_dict: dict) -> dict:
             url_details[p_k]["data"] = example_data
 
         # checking whether there are defined parameters to execute the call...
-        if "parameters" in p_o[method].keys():
+        if "parameters" in p_o[method]:
             url_details[p_k]["parameters"] = p_o[method]["parameters"]
 
     return dict(sorted(url_details.items()))
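
Note: the membership checks above walk a parsed OpenAPI/Swagger document, picking the first method of each path, requiring a 200 response, and preferring description over summary. A tiny standalone illustration of the dict shape being inspected (the spec fragment is invented for demonstration):

    # Invented minimal spec fragment, already parsed into a Python dict.
    sw_dict = {
        "paths": {
            "/pets": {
                "get": {
                    "summary": "List pets",
                    "responses": {
                        "200": {"content": {"application/json": {"example": [{"id": 1}]}}}
                    },
                }
            }
        }
    }

    for p_k, p_o in sw_dict["paths"].items():
        method = list(p_o)[0]                      # first HTTP method defined for the path
        if "200" not in p_o[method]["responses"]:  # same membership test, no .keys()
            continue
        desc = p_o[method].get("description") or p_o[method].get("summary", "")
        print(p_k, method, desc)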
@@ -169,7 +169,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
     data = {}
     if "content" in base_res:
         res_cont = base_res["content"]
-        if "application/json" in res_cont.keys():
+        if "application/json" in res_cont:
             ex_field = None
             if "example" in res_cont["application/json"]:
                 ex_field = "example"
@@ -186,7 +186,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
                 logger.warning(
                     f"Field in swagger file does not give consistent data --- {key}"
                 )
-        elif "text/csv" in res_cont.keys():
+        elif "text/csv" in res_cont:
             data = res_cont["text/csv"]["schema"]
     elif "examples" in base_res:
         data = base_res["examples"]["application/json"]
@@ -239,7 +239,7 @@ def guessing_url_name(url: str, examples: dict) -> str:
 
     # substituting the parameter's name w the value
    for name, clean_name in zip(needed_n, cleaned_needed_n):
-        if clean_name in examples[ex2use].keys():
+        if clean_name in examples[ex2use]:
            guessed_url = re.sub(name, str(examples[ex2use][clean_name]), guessed_url)
 
    return guessed_url
datahub/ingestion/source/powerbi/config.py CHANGED
@@ -555,7 +555,7 @@ class PowerBiDashboardSourceConfig(
     def map_data_platform(cls, value):
         # For backward compatibility convert input PostgreSql to PostgreSQL
         # PostgreSQL is name of the data-platform in M-Query
-        if "PostgreSql" in value.keys():
+        if "PostgreSql" in value:
             platform_name = value["PostgreSql"]
             del value["PostgreSql"]
             value["PostgreSQL"] = platform_name
datahub/ingestion/source/powerbi/powerbi.py CHANGED
@@ -263,7 +263,7 @@ class Mapper:
         for upstream_dpt in lineage.upstreams:
             if (
                 upstream_dpt.data_platform_pair.powerbi_data_platform_name
-                not in self.__config.dataset_type_mapping.keys()
+                not in self.__config.dataset_type_mapping
             ):
                 logger.debug(
                     f"Skipping upstream table for {ds_urn}. The platform {upstream_dpt.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping",
@@ -1353,7 +1353,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
             for data_platform in SupportedDataPlatform
         ]
 
-        for key in self.source_config.dataset_type_mapping.keys():
+        for key in self.source_config.dataset_type_mapping:
             if key not in powerbi_data_platforms:
                 raise ValueError(f"PowerBI DataPlatform {key} is not supported")
 
datahub/ingestion/source/redshift/profile.py CHANGED
@@ -42,9 +42,9 @@ class RedshiftProfiler(GenericProfiler):
             "max_overflow", self.config.profiling.max_workers
         )
 
-        for db in tables.keys():
+        for db in tables:
             profile_requests = []
-            for schema in tables.get(db, {}).keys():
+            for schema in tables.get(db, {}):
                 if not self.config.schema_pattern.allowed(schema):
                     continue
                 for table in tables[db].get(schema, {}):
datahub/ingestion/source/sigma/sigma.py CHANGED
@@ -170,7 +170,9 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
             if self.config.workspace_pattern.allowed(workspace.name):
                 allowed_workspaces.append(workspace)
             else:
-                self.reporter.workspaces.dropped(workspace.workspaceId)
+                self.reporter.workspaces.dropped(
+                    f"{workspace.name} ({workspace.workspaceId})"
+                )
         logger.info(f"Number of allowed workspaces = {len(allowed_workspaces)}")
 
         return allowed_workspaces
@@ -661,7 +663,9 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
             yield from self._gen_workbook_workunit(workbook)
 
         for workspace in self._get_allowed_workspaces():
-            self.reporter.workspaces.processed(workspace.workspaceId)
+            self.reporter.workspaces.processed(
+                f"{workspace.name} ({workspace.workspaceId})"
+            )
             yield from self._gen_workspace_workunit(workspace)
         yield from self._gen_sigma_dataset_upstream_lineage_workunit()
 
datahub/ingestion/source/snowflake/snowflake_utils.py CHANGED
@@ -77,7 +77,7 @@ class SnowsightUrlBuilder:
         region: str,
     ) -> Tuple[str, str]:
         cloud: str
-        if region in SNOWFLAKE_REGION_CLOUD_REGION_MAPPING.keys():
+        if region in SNOWFLAKE_REGION_CLOUD_REGION_MAPPING:
             cloud, cloud_region_id = SNOWFLAKE_REGION_CLOUD_REGION_MAPPING[region]
         elif region.startswith(("aws_", "gcp_", "azure_")):
             # e.g. aws_us_west_2, gcp_us_central1, azure_northeurope
datahub/ingestion/source/tableau/tableau.py CHANGED
@@ -1623,7 +1623,7 @@ class TableauSiteSource:
         # if multiple project has name C. Ideal solution is to use projectLuidWithin to avoid duplicate project,
         # however Tableau supports projectLuidWithin in Tableau Cloud June 2022 / Server 2022.3 and later.
         project_luid: Optional[str] = self._get_workbook_project_luid(workbook)
-        if project_luid not in self.tableau_project_registry.keys():
+        if project_luid not in self.tableau_project_registry:
             wrk_name: Optional[str] = workbook.get(c.NAME)
             wrk_id: Optional[str] = workbook.get(c.ID)
             prj_name: Optional[str] = workbook.get(c.PROJECT_NAME)
@@ -2253,7 +2253,7 @@ class TableauSiteSource:
         # It is possible due to https://github.com/tableau/server-client-python/issues/1210
         if (
             ds.get(c.LUID)
-            and ds[c.LUID] not in self.datasource_project_map.keys()
+            and ds[c.LUID] not in self.datasource_project_map
             and self.report.get_all_datasources_query_failed
         ):
             logger.debug(
@@ -2265,7 +2265,7 @@ class TableauSiteSource:
 
         if (
             ds.get(c.LUID)
-            and ds[c.LUID] in self.datasource_project_map.keys()
+            and ds[c.LUID] in self.datasource_project_map
             and self.datasource_project_map[ds[c.LUID]] in self.tableau_project_registry
         ):
             return self.datasource_project_map[ds[c.LUID]]
@@ -3252,7 +3252,7 @@ class TableauSiteSource:
 
         parent_key = None
         project_luid: Optional[str] = self._get_workbook_project_luid(workbook)
-        if project_luid and project_luid in self.tableau_project_registry.keys():
+        if project_luid and project_luid in self.tableau_project_registry:
             parent_key = self.gen_project_key(project_luid)
         else:
             workbook_id: Optional[str] = workbook.get(c.ID)
datahub/ingestion/source/tableau/tableau_common.py CHANGED
@@ -774,7 +774,7 @@ def get_overridden_info(
     if (
         lineage_overrides is not None
         and lineage_overrides.platform_override_map is not None
-        and original_platform in lineage_overrides.platform_override_map.keys()
+        and original_platform in lineage_overrides.platform_override_map
     ):
         platform = lineage_overrides.platform_override_map[original_platform]
 
@@ -782,7 +782,7 @@ def get_overridden_info(
         lineage_overrides is not None
         and lineage_overrides.database_override_map is not None
         and upstream_db is not None
-        and upstream_db in lineage_overrides.database_override_map.keys()
+        and upstream_db in lineage_overrides.database_override_map
     ):
         upstream_db = lineage_overrides.database_override_map[upstream_db]
 
datahub/ingestion/source/unity/source.py CHANGED
@@ -1003,7 +1003,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             generate_usage_statistics=False,
             generate_operations=False,
         )
-        for dataset_name in self.view_definitions.keys():
+        for dataset_name in self.view_definitions:
             view_ref, view_definition = self.view_definitions[dataset_name]
             result = self._run_sql_parser(
                 view_ref,
datahub/ingestion/transformer/add_dataset_dataproduct.py CHANGED
@@ -54,7 +54,7 @@ class AddDatasetDataProduct(DatasetDataproductTransformer):
         data_products_container: Dict[str, DataProductPatchBuilder] = {}
         logger.debug("Generating dataproducts")
         is_container = self.config.is_container
-        for entity_urn in self.entity_map.keys():
+        for entity_urn in self.entity_map:
             data_product_urn = self.config.get_data_product_to_add(entity_urn)
             if data_product_urn:
                 if data_product_urn not in data_products:
datahub/ingestion/transformer/add_dataset_ownership.py CHANGED
@@ -86,7 +86,7 @@ class AddDatasetOwnership(OwnershipTransformer):
         logger.debug("Generating Ownership for containers")
         ownership_container_mapping: Dict[str, List[OwnerClass]] = {}
         for entity_urn, data_ownerships in (
-            (urn, self.config.get_owners_to_add(urn)) for urn in self.entity_map.keys()
+            (urn, self.config.get_owners_to_add(urn)) for urn in self.entity_map
         ):
             if not data_ownerships:
                 continue
datahub/ingestion/transformer/dataset_domain.py CHANGED
@@ -125,7 +125,7 @@ class AddDatasetDomain(DatasetDomainTransformer):
             return domain_mcps
 
         for entity_urn, domain_to_add in (
-            (urn, self.config.get_domains_to_add(urn)) for urn in self.entity_map.keys()
+            (urn, self.config.get_domains_to_add(urn)) for urn in self.entity_map
         ):
             if not domain_to_add or not domain_to_add.domains:
                 continue
datahub/lite/lite_util.py CHANGED
@@ -99,7 +99,7 @@ def get_datahub_lite(config_dict: dict, read_only: bool = False) -> "DataHubLite
         lite_class = lite_registry.get(lite_type)
     except KeyError as e:
         raise Exception(
-            f"Failed to find a registered lite implementation for {lite_type}. Valid values are {[k for k in lite_registry.mapping.keys()]}"
+            f"Failed to find a registered lite implementation for {lite_type}. Valid values are {[k for k in lite_registry.mapping]}"
         ) from e
 
     lite_specific_config = lite_class.get_config_class().parse_obj(
@@ -127,7 +127,7 @@ def get_datahub_lite(config_dict: dict, read_only: bool = False) -> "DataHubLite
             return lite
         else:
             raise Exception(
-                f"Failed to find a registered forwarding sink for type {lite_local_config.forward_to.type}. Valid values are {[k for k in sink_registry.mapping.keys()]}"
+                f"Failed to find a registered forwarding sink for type {lite_local_config.forward_to.type}. Valid values are {[k for k in sink_registry.mapping]}"
             )
     else:
         return lite
datahub/testing/mcp_diff.py CHANGED
@@ -189,7 +189,7 @@ class MCPDiff:
         """
         aspect_diffs = [v for d in self.aspect_changes.values() for v in d.values()]
         for aspect_diff in aspect_diffs:
-            for _, old, new in aspect_diff.aspects_changed.keys():
+            for _, old, new in aspect_diff.aspects_changed:
                 golden[old.delta_info.idx] = new.delta_info.original
 
         indices_to_remove = set()