acryl-datahub 1.0.0.1rc6__py3-none-any.whl → 1.0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (82)
  1. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/METADATA +2557 -2557
  2. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/RECORD +81 -79
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/datajob/dataflow.py +15 -0
  5. datahub/api/entities/datajob/datajob.py +17 -0
  6. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  7. datahub/api/entities/dataset/dataset.py +2 -2
  8. datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
  9. datahub/cli/ingest_cli.py +4 -4
  10. datahub/cli/migrate.py +6 -6
  11. datahub/configuration/common.py +1 -1
  12. datahub/emitter/mcp_builder.py +4 -0
  13. datahub/errors.py +4 -0
  14. datahub/ingestion/api/common.py +9 -0
  15. datahub/ingestion/api/source.py +6 -2
  16. datahub/ingestion/api/source_helpers.py +35 -2
  17. datahub/ingestion/graph/client.py +122 -7
  18. datahub/ingestion/graph/filters.py +41 -16
  19. datahub/ingestion/run/pipeline.py +0 -6
  20. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  21. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  22. datahub/ingestion/source/cassandra/cassandra.py +1 -10
  23. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  24. datahub/ingestion/source/fivetran/fivetran.py +1 -0
  25. datahub/ingestion/source/fivetran/fivetran_log_api.py +1 -1
  26. datahub/ingestion/source/hex/constants.py +5 -0
  27. datahub/ingestion/source/hex/hex.py +150 -22
  28. datahub/ingestion/source/hex/mapper.py +28 -2
  29. datahub/ingestion/source/hex/model.py +10 -2
  30. datahub/ingestion/source/hex/query_fetcher.py +300 -0
  31. datahub/ingestion/source/iceberg/iceberg.py +106 -18
  32. datahub/ingestion/source/kafka/kafka.py +1 -4
  33. datahub/ingestion/source/kafka_connect/sink_connectors.py +1 -1
  34. datahub/ingestion/source/kafka_connect/source_connectors.py +1 -1
  35. datahub/ingestion/source/looker/looker_source.py +2 -3
  36. datahub/ingestion/source/mlflow.py +6 -7
  37. datahub/ingestion/source/mode.py +2 -2
  38. datahub/ingestion/source/nifi.py +3 -3
  39. datahub/ingestion/source/openapi.py +3 -3
  40. datahub/ingestion/source/openapi_parser.py +8 -8
  41. datahub/ingestion/source/powerbi/config.py +1 -1
  42. datahub/ingestion/source/powerbi/powerbi.py +16 -3
  43. datahub/ingestion/source/redshift/profile.py +2 -2
  44. datahub/ingestion/source/sigma/sigma.py +6 -2
  45. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -1
  46. datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
  47. datahub/ingestion/source/sql/trino.py +4 -3
  48. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  49. datahub/ingestion/source/superset.py +108 -81
  50. datahub/ingestion/source/tableau/tableau.py +4 -4
  51. datahub/ingestion/source/tableau/tableau_common.py +2 -2
  52. datahub/ingestion/source/unity/source.py +1 -1
  53. datahub/ingestion/source/vertexai/vertexai.py +7 -7
  54. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  55. datahub/ingestion/transformer/add_dataset_ownership.py +1 -1
  56. datahub/ingestion/transformer/dataset_domain.py +1 -1
  57. datahub/lite/lite_util.py +2 -2
  58. datahub/metadata/_schema_classes.py +47 -2
  59. datahub/metadata/_urns/urn_defs.py +56 -0
  60. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  61. datahub/metadata/schema.avsc +121 -85
  62. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  63. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  64. datahub/metadata/schemas/FormInfo.avsc +5 -0
  65. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  66. datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  67. datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
  68. datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  69. datahub/metadata/schemas/QueryProperties.avsc +4 -2
  70. datahub/metadata/schemas/SystemMetadata.avsc +86 -0
  71. datahub/sdk/search_client.py +81 -8
  72. datahub/sdk/search_filters.py +73 -11
  73. datahub/testing/mcp_diff.py +1 -1
  74. datahub/utilities/file_backed_collections.py +6 -6
  75. datahub/utilities/hive_schema_to_avro.py +2 -2
  76. datahub/utilities/ingest_utils.py +2 -2
  77. datahub/utilities/threaded_iterator_executor.py +16 -3
  78. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  79. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/WHEEL +0 -0
  80. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/entry_points.txt +0 -0
  81. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/licenses/LICENSE +0 -0
  82. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/iceberg/iceberg.py

@@ -2,6 +2,7 @@ import json
 import logging
 import threading
 import uuid
+from functools import partial
 from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 from dateutil import parser as dateutil_parser
@@ -47,6 +48,12 @@ from datahub.emitter.mce_builder import (
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import NamespaceKey
+from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
+    auto_patch_last_modified,
+)
+from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
+    EnsureAspectSizeProcessor,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -57,6 +64,14 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
+from datahub.ingestion.api.source_helpers import (
+    AutoSystemMetadata,
+    auto_fix_duplicate_schema_field_paths,
+    auto_fix_empty_field_paths,
+    auto_lowercase_urns,
+    auto_materialize_referenced_tags_terms,
+    auto_workunit_reporter,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.extractor import schema_util
 from datahub.ingestion.source.common.subtypes import (
@@ -82,6 +97,8 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaMetadata,
 )
 from datahub.metadata.schema_classes import (
+    BrowsePathEntryClass,
+    BrowsePathsV2Class,
     ContainerClass,
     DataPlatformInstanceClass,
     DatasetPropertiesClass,
@@ -134,6 +151,7 @@ class IcebergSource(StatefulIngestionSourceBase):
         super().__init__(config, ctx)
         self.report: IcebergSourceReport = IcebergSourceReport()
         self.config: IcebergSourceConfig = config
+        self.ctx: PipelineContext = ctx
 
     @classmethod
     def create(cls, config_dict: Dict, ctx: PipelineContext) -> "IcebergSource":
@@ -141,8 +159,47 @@ class IcebergSource(StatefulIngestionSourceBase):
         return cls(config, ctx)
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        # This source needs to overwrite standard `get_workunit_processor`, because it is unique in terms of usage
+        # of parallelism. Because of this, 2 processors won't work as expected:
+        # 1. browse_path_processor - it needs aspects for a single entity to be continuous - which is not guaranteed
+        #    in this source
+        # 2. automatic stamping with systemMetadata - in current implementation of the Source class this processor
+        #    would have been applied in a thread (single) shared between the source, processors and transformers.
+        #    Since the metadata scraping happens in separate threads, this could lead to difference between
+        #    time used by systemMetadata and actual time at which metadata was read
+        auto_lowercase_dataset_urns: Optional[MetadataWorkUnitProcessor] = None
+        if (
+            self.ctx.pipeline_config
+            and self.ctx.pipeline_config.source
+            and self.ctx.pipeline_config.source.config
+            and (
+                (
+                    hasattr(
+                        self.ctx.pipeline_config.source.config,
+                        "convert_urns_to_lowercase",
+                    )
+                    and self.ctx.pipeline_config.source.config.convert_urns_to_lowercase
+                )
+                or (
+                    hasattr(self.ctx.pipeline_config.source.config, "get")
+                    and self.ctx.pipeline_config.source.config.get(
+                        "convert_urns_to_lowercase"
+                    )
+                )
+            )
+        ):
+            auto_lowercase_dataset_urns = auto_lowercase_urns
+
         return [
-            *super().get_workunit_processors(),
+            auto_lowercase_dataset_urns,
+            auto_materialize_referenced_tags_terms,
+            partial(
+                auto_fix_duplicate_schema_field_paths, platform=self._infer_platform()
+            ),
+            partial(auto_fix_empty_field_paths, platform=self._infer_platform()),
+            partial(auto_workunit_reporter, self.get_report()),
+            auto_patch_last_modified,
+            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
             StaleEntityRemovalHandler.create(
                 self, self.config, self.ctx
             ).workunit_processor,
@@ -208,6 +265,12 @@ class IcebergSource(StatefulIngestionSourceBase):
             )
             thread_local.local_catalog = self.config.get_catalog()
 
+        if not hasattr(thread_local, "stamping_processor"):
+            LOGGER.debug(
+                f"Didn't find stamping_processor in thread_local ({thread_local}), initializing new workunit processor"
+            )
+            thread_local.stamping_processor = AutoSystemMetadata(self.ctx)
+
         with PerfTimer() as timer:
             table = thread_local.local_catalog.load_table(dataset_path)
             time_taken = timer.elapsed_seconds()
@@ -224,9 +287,11 @@ class IcebergSource(StatefulIngestionSourceBase):
             for aspect in self._create_iceberg_table_aspects(
                 dataset_name, table, namespace_urn
             ):
-                yield MetadataChangeProposalWrapper(
-                    entityUrn=dataset_urn, aspect=aspect
-                ).as_workunit()
+                yield thread_local.stamping_processor.stamp_wu(
+                    MetadataChangeProposalWrapper(
+                        entityUrn=dataset_urn, aspect=aspect
+                    ).as_workunit()
+                )
         except NoSuchPropertyException as e:
             self.report.warning(
                 title="Unable to process table",
@@ -308,6 +373,7 @@ class IcebergSource(StatefulIngestionSourceBase):
             return
 
         try:
+            stamping_processor = AutoSystemMetadata(self.ctx)
             namespace_ids = self._get_namespaces(catalog)
             namespaces: List[Tuple[Identifier, str]] = []
             for namespace in namespace_ids:
@@ -323,9 +389,11 @@ class IcebergSource(StatefulIngestionSourceBase):
                 )
                 namespaces.append((namespace, namespace_urn))
                 for aspect in self._create_iceberg_namespace_aspects(namespace):
-                    yield MetadataChangeProposalWrapper(
-                        entityUrn=namespace_urn, aspect=aspect
-                    ).as_workunit()
+                    yield stamping_processor.stamp_wu(
+                        MetadataChangeProposalWrapper(
+                            entityUrn=namespace_urn, aspect=aspect
+                        ).as_workunit()
+                    )
             LOGGER.debug("Namespaces ingestion completed")
         except Exception as e:
             self.report.report_failure(
@@ -366,7 +434,9 @@ class IcebergSource(StatefulIngestionSourceBase):
             yield dataset_ownership
 
         yield self._create_schema_metadata(dataset_name, table)
-        yield self._get_dataplatform_instance_aspect()
+        dpi = self._get_dataplatform_instance_aspect()
+        yield dpi
+        yield self._create_browse_paths_aspect(dpi.instance, str(namespace_urn))
         yield ContainerClass(container=str(namespace_urn))
 
         self.report.report_table_processing_time(
@@ -377,6 +447,22 @@ class IcebergSource(StatefulIngestionSourceBase):
             profiler = IcebergProfiler(self.report, self.config.profiling)
             yield from profiler.profile_table(dataset_name, table)
 
+    def _create_browse_paths_aspect(
+        self,
+        platform_instance_urn: Optional[str] = None,
+        container_urn: Optional[str] = None,
+    ) -> BrowsePathsV2Class:
+        path = []
+        if platform_instance_urn:
+            path.append(
+                BrowsePathEntryClass(
+                    id=platform_instance_urn, urn=platform_instance_urn
+                )
+            )
+        if container_urn:
+            path.append(BrowsePathEntryClass(id=container_urn, urn=container_urn))
+        return BrowsePathsV2Class(path=path)
+
     def _get_partition_aspect(self, table: Table) -> Optional[str]:
         """Extracts partition information from the provided table and returns a JSON array representing the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) of the table.
         Each element of the returned array represents a field in the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) that follows [Appendix-C](https://iceberg.apache.org/spec/?#appendix-c-json-serialization) of the Iceberg specification.
@@ -425,23 +511,21 @@ class IcebergSource(StatefulIngestionSourceBase):
     def _get_dataset_properties_aspect(
        self, dataset_name: str, table: Table
    ) -> DatasetPropertiesClass:
-        additional_properties = {}
+        created: Optional[TimeStampClass] = None
         custom_properties = table.metadata.properties.copy()
         custom_properties["location"] = table.metadata.location
         custom_properties["format-version"] = str(table.metadata.format_version)
         custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
+        last_modified: Optional[int] = table.metadata.last_updated_ms
         if table.current_snapshot():
             custom_properties["snapshot-id"] = str(table.current_snapshot().snapshot_id)
             custom_properties["manifest-list"] = table.current_snapshot().manifest_list
-            additional_properties["lastModified"] = TimeStampClass(
-                int(table.current_snapshot().timestamp_ms)
-            )
+            if not last_modified:
+                last_modified = int(table.current_snapshot().timestamp_ms)
         if "created-at" in custom_properties:
             try:
                 dt = dateutil_parser.isoparse(custom_properties["created-at"])
-                additional_properties["created"] = TimeStampClass(
-                    int(dt.timestamp() * 1000)
-                )
+                created = TimeStampClass(int(dt.timestamp() * 1000))
             except Exception as ex:
                 LOGGER.warning(
                     f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
@@ -451,8 +535,10 @@ class IcebergSource(StatefulIngestionSourceBase):
             name=table.name()[-1],
             description=table.metadata.properties.get("comment", None),
             customProperties=custom_properties,
-            lastModified=additional_properties.get("lastModified"),
-            created=additional_properties.get("created"),
+            lastModified=TimeStampClass(last_modified)
+            if last_modified is not None
+            else None,
+            created=created,
             qualifiedName=dataset_name,
         )
 
@@ -530,7 +616,9 @@ class IcebergSource(StatefulIngestionSourceBase):
             name=namespace_repr, qualifiedName=namespace_repr, env=self.config.env
         )
         yield SubTypes(typeNames=[DatasetContainerSubTypes.NAMESPACE])
-        yield self._get_dataplatform_instance_aspect()
+        dpi = self._get_dataplatform_instance_aspect()
+        yield dpi
+        yield self._create_browse_paths_aspect(dpi.instance)
 
 
 class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):
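
For readers skimming the Iceberg changes: the new code keeps one systemMetadata stamper per worker thread so the stamp reflects when that thread actually read the table, as the comment in get_workunit_processors explains. A minimal sketch of that pattern, reusing AutoSystemMetadata and stamp_wu from the diff above (the helper and thread-local names are illustrative, not part of the package):

    import threading

    from datahub.ingestion.api.source_helpers import AutoSystemMetadata

    _local = threading.local()  # illustrative module-level thread-local store

    def stamp_in_worker_thread(ctx, workunit):
        # Lazily create one AutoSystemMetadata per thread, mirroring the
        # hasattr(thread_local, "stamping_processor") check in the diff above.
        if not hasattr(_local, "stamper"):
            _local.stamper = AutoSystemMetadata(ctx)
        return _local.stamper.stamp_wu(workunit)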

datahub/ingestion/source/kafka/kafka.py

@@ -568,10 +568,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
 
         for config_key in KafkaTopicConfigKeys:
             try:
-                if (
-                    config_key in topic_config.keys()
-                    and topic_config[config_key] is not None
-                ):
+                if config_key in topic_config and topic_config[config_key] is not None:
                     config_value = topic_config[config_key].value
                     custom_props[config_key] = (
                         config_value
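
The change above is one instance of a cleanup that recurs in the kafka-connect, looker, mode, nifi, openapi, powerbi, redshift and snowflake diffs below: membership tests on a dict already check its keys, so `key in d` behaves the same as `key in d.keys()` while skipping the extra keys-view object. A tiny illustration (the dict literal is made up):

    topic_config = {"retention.ms": "604800000"}  # made-up example value

    # A dict's __contains__ already looks at its keys, so both checks are
    # equivalent; dropping .keys() just avoids building the view.
    assert ("retention.ms" in topic_config) == ("retention.ms" in topic_config.keys())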

datahub/ingestion/source/kafka_connect/sink_connectors.py

@@ -197,7 +197,7 @@ class BigQuerySinkConnector(BaseConnector):
         for name in transform_names:
             transform = {"name": name}
             transforms.append(transform)
-            for key in self.connector_manifest.config.keys():
+            for key in self.connector_manifest.config:
                 if key.startswith(f"transforms.{name}."):
                     transform[key.replace(f"transforms.{name}.", "")] = (
                         self.connector_manifest.config[key]

datahub/ingestion/source/kafka_connect/source_connectors.py

@@ -121,7 +121,7 @@ class ConfluentJDBCSourceConnector(BaseConnector):
         for name in transform_names:
             transform = {"name": name}
             transforms.append(transform)
-            for key in self.connector_manifest.config.keys():
+            for key in self.connector_manifest.config:
                 if key.startswith(f"transforms.{name}."):
                     transform[key.replace(f"transforms.{name}.", "")] = (
                         self.connector_manifest.config[key]

datahub/ingestion/source/looker/looker_source.py

@@ -363,7 +363,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         filters: MutableMapping[str, Any] = (
             query.filters if query.filters is not None else {}
         )
-        for field in filters.keys():
+        for field in filters:
             if field is None:
                 continue
 
@@ -877,8 +877,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         # fine to set them to None.
         # TODO: Track project names for each explore.
         explores_to_fetch = [
-            (None, model, explore)
-            for (model, explore) in self.reachable_explores.keys()
+            (None, model, explore) for (model, explore) in self.reachable_explores
         ]
         explores_to_fetch.sort()
 

datahub/ingestion/source/mlflow.py

@@ -16,7 +16,7 @@ from datahub.api.entities.dataprocess.dataprocess_instance import (
 )
 from datahub.configuration.source_common import EnvConfigMixin
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import ContainerKey
+from datahub.emitter.mcp_builder import ExperimentKey
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -36,6 +36,7 @@ from datahub.ingestion.source.common.subtypes import MLAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
+    StatefulStaleMetadataRemovalConfig,
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
@@ -77,10 +78,6 @@ from datahub.sdk.dataset import Dataset
 T = TypeVar("T")
 
 
-class ContainerKeyWithId(ContainerKey):
-    id: str
-
-
 class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     tracking_uri: Optional[str] = Field(
         default=None,
@@ -123,6 +120,8 @@ class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
         default=None, description="Password for MLflow authentication"
     )
 
+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
+
 
 @dataclass
 class MLflowRegisteredModelStageInfo:
@@ -252,7 +251,7 @@ class MLflowSource(StatefulIngestionSourceBase):
         self, experiment: Experiment
     ) -> Iterable[MetadataWorkUnit]:
         experiment_container = Container(
-            container_key=ContainerKeyWithId(
+            container_key=ExperimentKey(
                 platform=str(DataPlatformUrn(platform_name=self.platform)),
                 id=experiment.name,
             ),
@@ -470,7 +469,7 @@ class MLflowSource(StatefulIngestionSourceBase):
     def _get_run_workunits(
         self, experiment: Experiment, run: Run
     ) -> Iterable[MetadataWorkUnit]:
-        experiment_key = ContainerKeyWithId(
+        experiment_key = ExperimentKey(
            platform=str(DataPlatformUrn(self.platform)), id=experiment.name
        )
 
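
The practical effect of swapping the local ContainerKeyWithId for ExperimentKey is that the experiment container and the runs filed under it now derive the container identity from the same key class in datahub.emitter.mcp_builder. A rough sketch of how such a key is built, assuming ExperimentKey keeps the platform/id fields used above and inherits an as_urn() helper from the ContainerKey base (the experiment name is hypothetical):

    from datahub.emitter.mcp_builder import ExperimentKey
    from datahub.metadata.urns import DataPlatformUrn

    key = ExperimentKey(
        platform=str(DataPlatformUrn("mlflow")),
        id="my-experiment",  # hypothetical experiment name
    )
    # _get_experiment_workunits and _get_run_workunits build this same key,
    # so both resolve to one container URN for the experiment.
    print(key.as_urn())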

datahub/ingestion/source/mode.py

@@ -899,7 +899,7 @@ class ModeSource(StatefulIngestionSourceBase):
         for match in matches:
             definition = Template(source=match).render()
             parameters = yaml.safe_load(definition)
-            for key in parameters.keys():
+            for key in parameters:
                 jinja_params[key] = parameters[key].get("default", "")
 
         normalized_query = re.sub(
@@ -1601,7 +1601,7 @@ class ModeSource(StatefulIngestionSourceBase):
 
     def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
         # Space/collection -> report -> query -> Chart
-        for space_token in self.space_tokens.keys():
+        for space_token in self.space_tokens:
             reports = self._get_reports(space_token)
             for report in reports:
                 report_token = report.get("token", "")

datahub/ingestion/source/nifi.py

@@ -703,7 +703,7 @@ class NifiSource(StatefulIngestionSourceBase):
         if (
             component.nifi_type is NifiType.PROCESSOR
             and component.type
-            not in NifiProcessorProvenanceEventAnalyzer.KNOWN_INGRESS_EGRESS_PROCESORS.keys()
+            not in NifiProcessorProvenanceEventAnalyzer.KNOWN_INGRESS_EGRESS_PROCESORS
         ) or component.nifi_type not in [
             NifiType.PROCESSOR,
             NifiType.REMOTE_INPUT_PORT,
@@ -977,7 +977,7 @@ class NifiSource(StatefulIngestionSourceBase):
             )
 
             for incoming_from in incoming:
-                if incoming_from in self.nifi_flow.remotely_accessible_ports.keys():
+                if incoming_from in self.nifi_flow.remotely_accessible_ports:
                     dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[incoming_from].name}"
                     dataset_urn = builder.make_dataset_urn(
                         NIFI, dataset_name, self.config.env
@@ -994,7 +994,7 @@ class NifiSource(StatefulIngestionSourceBase):
             )
 
             for outgoing_to in outgoing:
-                if outgoing_to in self.nifi_flow.remotely_accessible_ports.keys():
+                if outgoing_to in self.nifi_flow.remotely_accessible_ports:
                     dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[outgoing_to].name}"
                     dataset_urn = builder.make_dataset_urn(
                         NIFI, dataset_name, self.config.env

datahub/ingestion/source/openapi.py

@@ -102,7 +102,7 @@ class OpenApiConfig(ConfigModel):
             # details there once, and then use that session for all requests.
             self.token = f"Bearer {self.bearer_token}"
         else:
-            assert "url_complement" in self.get_token.keys(), (
+            assert "url_complement" in self.get_token, (
                 "When 'request_type' is set to 'get', an url_complement is needed for the request."
             )
             if self.get_token["request_type"] == "get":
@@ -317,7 +317,7 @@ class APISource(Source, ABC):
             yield wu
 
             # Handle schema metadata if available
-            if "data" in endpoint_dets.keys():
+            if "data" in endpoint_dets:
                 # we are lucky! data is defined in the swagger for this endpoint
                 schema_metadata = set_metadata(dataset_name, endpoint_dets["data"])
                 wu = MetadataWorkUnit(
@@ -371,7 +371,7 @@ class APISource(Source, ABC):
             else:
                 self.report_bad_responses(response.status_code, type=endpoint_k)
         else:
-            if endpoint_k not in config.forced_examples.keys():
+            if endpoint_k not in config.forced_examples:
                 # start guessing...
                 url_guess = try_guessing(endpoint_k, root_dataset_samples)
                 tot_url = clean_url(config.url + self.url_basepath + url_guess)

datahub/ingestion/source/openapi_parser.py

@@ -128,18 +128,18 @@ def get_endpoints(sw_dict: dict) -> dict:
 
     for p_k, p_o in sw_dict["paths"].items():
         method = list(p_o)[0]
-        if "200" in p_o[method]["responses"].keys():
+        if "200" in p_o[method]["responses"]:
             base_res = p_o[method]["responses"]["200"]
-        elif 200 in p_o[method]["responses"].keys():
+        elif 200 in p_o[method]["responses"]:
             # if you read a plain yml file the 200 will be an integer
             base_res = p_o[method]["responses"][200]
         else:
             # the endpoint does not have a 200 response
             continue
 
-        if "description" in p_o[method].keys():
+        if "description" in p_o[method]:
             desc = p_o[method]["description"]
-        elif "summary" in p_o[method].keys():
+        elif "summary" in p_o[method]:
             desc = p_o[method]["summary"]
         else:  # still testing
             desc = ""
@@ -156,7 +156,7 @@ def get_endpoints(sw_dict: dict) -> dict:
             url_details[p_k]["data"] = example_data
 
         # checking whether there are defined parameters to execute the call...
-        if "parameters" in p_o[method].keys():
+        if "parameters" in p_o[method]:
             url_details[p_k]["parameters"] = p_o[method]["parameters"]
 
     return dict(sorted(url_details.items()))
@@ -169,7 +169,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
     data = {}
     if "content" in base_res:
         res_cont = base_res["content"]
-        if "application/json" in res_cont.keys():
+        if "application/json" in res_cont:
             ex_field = None
             if "example" in res_cont["application/json"]:
                 ex_field = "example"
@@ -186,7 +186,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
             logger.warning(
                 f"Field in swagger file does not give consistent data --- {key}"
             )
-        elif "text/csv" in res_cont.keys():
+        elif "text/csv" in res_cont:
             data = res_cont["text/csv"]["schema"]
     elif "examples" in base_res:
         data = base_res["examples"]["application/json"]
@@ -239,7 +239,7 @@ def guessing_url_name(url: str, examples: dict) -> str:
 
     # substituting the parameter's name w the value
     for name, clean_name in zip(needed_n, cleaned_needed_n):
-        if clean_name in examples[ex2use].keys():
+        if clean_name in examples[ex2use]:
             guessed_url = re.sub(name, str(examples[ex2use][clean_name]), guessed_url)
 
     return guessed_url

datahub/ingestion/source/powerbi/config.py

@@ -555,7 +555,7 @@ class PowerBiDashboardSourceConfig(
     def map_data_platform(cls, value):
         # For backward compatibility convert input PostgreSql to PostgreSQL
         # PostgreSQL is name of the data-platform in M-Query
-        if "PostgreSql" in value.keys():
+        if "PostgreSql" in value:
             platform_name = value["PostgreSql"]
             del value["PostgreSql"]
             value["PostgreSQL"] = platform_name

datahub/ingestion/source/powerbi/powerbi.py

@@ -94,7 +94,7 @@ from datahub.metadata.schema_classes import (
     UpstreamLineageClass,
     ViewPropertiesClass,
 )
-from datahub.metadata.urns import ChartUrn
+from datahub.metadata.urns import ChartUrn, DatasetUrn
 from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo
 from datahub.utilities.dedup_list import deduplicate_list
 from datahub.utilities.urns.urn_iter import lowercase_dataset_urn
@@ -263,7 +263,7 @@ class Mapper:
         for upstream_dpt in lineage.upstreams:
             if (
                 upstream_dpt.data_platform_pair.powerbi_data_platform_name
-                not in self.__config.dataset_type_mapping.keys()
+                not in self.__config.dataset_type_mapping
             ):
                 logger.debug(
                     f"Skipping upstream table for {ds_urn}. The platform {upstream_dpt.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping",
@@ -1083,6 +1083,7 @@ class Mapper:
         report: powerbi_data_classes.Report,
         chart_mcps: List[MetadataChangeProposalWrapper],
         user_mcps: List[MetadataChangeProposalWrapper],
+        dataset_edges: List[EdgeClass],
     ) -> List[MetadataChangeProposalWrapper]:
         """
         Map PowerBi report to Datahub dashboard
@@ -1104,6 +1105,7 @@ class Mapper:
             charts=chart_urn_list,
             lastModified=ChangeAuditStamps(),
             dashboardUrl=report.webUrl,
+            datasetEdges=dataset_edges,
         )
 
         info_mcp = self.new_mcp(
@@ -1197,12 +1199,23 @@ class Mapper:
         ds_mcps = self.to_datahub_dataset(report.dataset, workspace)
         chart_mcps = self.pages_to_chart(report.pages, workspace, ds_mcps)
 
+        # collect all upstream datasets; using a set to retain unique urns
+        dataset_urns = {
+            dataset.entityUrn
+            for dataset in ds_mcps
+            if dataset.entityType == DatasetUrn.ENTITY_TYPE and dataset.entityUrn
+        }
+        dataset_edges = [
+            EdgeClass(destinationUrn=dataset_urn) for dataset_urn in dataset_urns
+        ]
+
         # Let's convert report to datahub dashboard
         report_mcps = self.report_to_dashboard(
             workspace=workspace,
             report=report,
             chart_mcps=chart_mcps,
             user_mcps=user_mcps,
+            dataset_edges=dataset_edges,
         )
 
         # Now add MCPs in sequence
@@ -1340,7 +1353,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
             for data_platform in SupportedDataPlatform
         ]
 
-        for key in self.source_config.dataset_type_mapping.keys():
+        for key in self.source_config.dataset_type_mapping:
             if key not in powerbi_data_platforms:
                 raise ValueError(f"PowerBI DataPlatform {key} is not supported")
 
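
The datasetEdges plumbing added above can be read as a small standalone step: from the dataset MCPs emitted for a report, keep the unique dataset URNs and wrap each in an EdgeClass so the DashboardInfo aspect records explicit dataset edges. A sketch of that step in isolation (the function name is illustrative; EdgeClass, DatasetUrn and MetadataChangeProposalWrapper are the same classes used in the diff):

    from typing import List

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.metadata.schema_classes import EdgeClass
    from datahub.metadata.urns import DatasetUrn

    def dataset_edges_from_mcps(
        ds_mcps: List[MetadataChangeProposalWrapper],
    ) -> List[EdgeClass]:
        # Keep only MCPs that target dataset entities, de-duplicated by URN.
        urns = {
            mcp.entityUrn
            for mcp in ds_mcps
            if mcp.entityType == DatasetUrn.ENTITY_TYPE and mcp.entityUrn
        }
        return [EdgeClass(destinationUrn=urn) for urn in urns]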

datahub/ingestion/source/redshift/profile.py

@@ -42,9 +42,9 @@ class RedshiftProfiler(GenericProfiler):
             "max_overflow", self.config.profiling.max_workers
         )
 
-        for db in tables.keys():
+        for db in tables:
             profile_requests = []
-            for schema in tables.get(db, {}).keys():
+            for schema in tables.get(db, {}):
                 if not self.config.schema_pattern.allowed(schema):
                     continue
                 for table in tables[db].get(schema, {}):

datahub/ingestion/source/sigma/sigma.py

@@ -170,7 +170,9 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
             if self.config.workspace_pattern.allowed(workspace.name):
                 allowed_workspaces.append(workspace)
             else:
-                self.reporter.workspaces.dropped(workspace.workspaceId)
+                self.reporter.workspaces.dropped(
+                    f"{workspace.name} ({workspace.workspaceId})"
+                )
         logger.info(f"Number of allowed workspaces = {len(allowed_workspaces)}")
 
         return allowed_workspaces
@@ -661,7 +663,9 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
             yield from self._gen_workbook_workunit(workbook)
 
         for workspace in self._get_allowed_workspaces():
-            self.reporter.workspaces.processed(workspace.workspaceId)
+            self.reporter.workspaces.processed(
+                f"{workspace.name} ({workspace.workspaceId})"
+            )
             yield from self._gen_workspace_workunit(workspace)
         yield from self._gen_sigma_dataset_upstream_lineage_workunit()
 

datahub/ingestion/source/snowflake/snowflake_utils.py

@@ -77,7 +77,7 @@ class SnowsightUrlBuilder:
         region: str,
     ) -> Tuple[str, str]:
         cloud: str
-        if region in SNOWFLAKE_REGION_CLOUD_REGION_MAPPING.keys():
+        if region in SNOWFLAKE_REGION_CLOUD_REGION_MAPPING:
             cloud, cloud_region_id = SNOWFLAKE_REGION_CLOUD_REGION_MAPPING[region]
         elif region.startswith(("aws_", "gcp_", "azure_")):
             # e.g. aws_us_west_2, gcp_us_central1, azure_northeurope

datahub/ingestion/source/sql/stored_procedures/base.py

@@ -26,6 +26,7 @@ from datahub.metadata.schema_classes import (
     DataPlatformInstanceClass,
     DataTransformClass,
     DataTransformLogicClass,
+    QueryLanguageClass,
     QueryStatementClass,
     SubTypesClass,
 )
@@ -176,7 +177,17 @@ def _generate_job_workunits(
         DataTransformClass(
             queryStatement=QueryStatementClass(
                 value=procedure.procedure_definition,
-                language=procedure.language,
+                language=(
+                    QueryLanguageClass.SQL
+                    if procedure.language == "SQL"
+                    # The language field uses a pretty limited enum.
+                    # The "UNKNOWN" enum value is pretty new, so we don't want to
+                    # emit it until it has broader server-side support. As a
+                    # short-term solution, we map all languages to "SQL".
+                    # TODO: Once we've released server 1.1.0, we should change
+                    # this to be "UNKNOWN" for all languages except "SQL".
+                    else QueryLanguageClass.SQL
+                ),
             ),
         )
     ]

datahub/ingestion/source/sql/trino.py

@@ -128,9 +128,10 @@ def get_table_comment(self, connection, table_name: str, schema: str = None, **k
     if catalog_name is None:
         raise exc.NoSuchTableError("catalog is required in connection")
     connector_name = get_catalog_connector_name(connection.engine, catalog_name)
-    if connector_name is None:
-        return {}
-    if connector_name in PROPERTIES_TABLE_SUPPORTED_CONNECTORS:
+    if (
+        connector_name is not None
+        and connector_name in PROPERTIES_TABLE_SUPPORTED_CONNECTORS
+    ):
         properties_table = self._get_full_table(f"{table_name}$properties", schema)
         query = f"SELECT * FROM {properties_table}"
         row = connection.execute(sql.text(query)).fetchone()

datahub/ingestion/source/state/stale_entity_removal_handler.py

@@ -45,7 +45,6 @@ class StatefulStaleMetadataRemovalConfig(StatefulIngestionConfig):
         description="Prevents large amount of soft deletes & the state from committing from accidental changes to the source configuration if the relative change percent in entities compared to the previous state is above the 'fail_safe_threshold'.",
         le=100.0,
         ge=0.0,
-        hidden_from_docs=True,
     )
 