acryl-datahub 1.0.0.1rc6__py3-none-any.whl → 1.0.0.2__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Note: this release of acryl-datahub has been flagged as potentially problematic.

Files changed (82)
  1. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/METADATA +2557 -2557
  2. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/RECORD +81 -79
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/datajob/dataflow.py +15 -0
  5. datahub/api/entities/datajob/datajob.py +17 -0
  6. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  7. datahub/api/entities/dataset/dataset.py +2 -2
  8. datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
  9. datahub/cli/ingest_cli.py +4 -4
  10. datahub/cli/migrate.py +6 -6
  11. datahub/configuration/common.py +1 -1
  12. datahub/emitter/mcp_builder.py +4 -0
  13. datahub/errors.py +4 -0
  14. datahub/ingestion/api/common.py +9 -0
  15. datahub/ingestion/api/source.py +6 -2
  16. datahub/ingestion/api/source_helpers.py +35 -2
  17. datahub/ingestion/graph/client.py +122 -7
  18. datahub/ingestion/graph/filters.py +41 -16
  19. datahub/ingestion/run/pipeline.py +0 -6
  20. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  21. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  22. datahub/ingestion/source/cassandra/cassandra.py +1 -10
  23. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  24. datahub/ingestion/source/fivetran/fivetran.py +1 -0
  25. datahub/ingestion/source/fivetran/fivetran_log_api.py +1 -1
  26. datahub/ingestion/source/hex/constants.py +5 -0
  27. datahub/ingestion/source/hex/hex.py +150 -22
  28. datahub/ingestion/source/hex/mapper.py +28 -2
  29. datahub/ingestion/source/hex/model.py +10 -2
  30. datahub/ingestion/source/hex/query_fetcher.py +300 -0
  31. datahub/ingestion/source/iceberg/iceberg.py +106 -18
  32. datahub/ingestion/source/kafka/kafka.py +1 -4
  33. datahub/ingestion/source/kafka_connect/sink_connectors.py +1 -1
  34. datahub/ingestion/source/kafka_connect/source_connectors.py +1 -1
  35. datahub/ingestion/source/looker/looker_source.py +2 -3
  36. datahub/ingestion/source/mlflow.py +6 -7
  37. datahub/ingestion/source/mode.py +2 -2
  38. datahub/ingestion/source/nifi.py +3 -3
  39. datahub/ingestion/source/openapi.py +3 -3
  40. datahub/ingestion/source/openapi_parser.py +8 -8
  41. datahub/ingestion/source/powerbi/config.py +1 -1
  42. datahub/ingestion/source/powerbi/powerbi.py +16 -3
  43. datahub/ingestion/source/redshift/profile.py +2 -2
  44. datahub/ingestion/source/sigma/sigma.py +6 -2
  45. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -1
  46. datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
  47. datahub/ingestion/source/sql/trino.py +4 -3
  48. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  49. datahub/ingestion/source/superset.py +108 -81
  50. datahub/ingestion/source/tableau/tableau.py +4 -4
  51. datahub/ingestion/source/tableau/tableau_common.py +2 -2
  52. datahub/ingestion/source/unity/source.py +1 -1
  53. datahub/ingestion/source/vertexai/vertexai.py +7 -7
  54. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  55. datahub/ingestion/transformer/add_dataset_ownership.py +1 -1
  56. datahub/ingestion/transformer/dataset_domain.py +1 -1
  57. datahub/lite/lite_util.py +2 -2
  58. datahub/metadata/_schema_classes.py +47 -2
  59. datahub/metadata/_urns/urn_defs.py +56 -0
  60. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  61. datahub/metadata/schema.avsc +121 -85
  62. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  63. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  64. datahub/metadata/schemas/FormInfo.avsc +5 -0
  65. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  66. datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  67. datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
  68. datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  69. datahub/metadata/schemas/QueryProperties.avsc +4 -2
  70. datahub/metadata/schemas/SystemMetadata.avsc +86 -0
  71. datahub/sdk/search_client.py +81 -8
  72. datahub/sdk/search_filters.py +73 -11
  73. datahub/testing/mcp_diff.py +1 -1
  74. datahub/utilities/file_backed_collections.py +6 -6
  75. datahub/utilities/hive_schema_to_avro.py +2 -2
  76. datahub/utilities/ingest_utils.py +2 -2
  77. datahub/utilities/threaded_iterator_executor.py +16 -3
  78. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  79. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/WHEEL +0 -0
  80. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/entry_points.txt +0 -0
  81. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/licenses/LICENSE +0 -0
  82. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/superset.py CHANGED
@@ -1,5 +1,6 @@
 import json
 import logging
+import os
 from dataclasses import dataclass, field
 from datetime import datetime
 from functools import lru_cache
@@ -100,6 +101,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
 from datahub.utilities import config_clean
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry
+from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor

 logger = logging.getLogger(__name__)

@@ -210,6 +212,11 @@ class SupersetConfig(
         default=10, description="Timeout of single API call to superset."
     )

+    max_threads: int = Field(
+        default_factory=lambda: os.cpu_count() or 40,
+        description="Max parallelism for API calls. Defaults to cpuCount or 40",
+    )
+
     # TODO: Check and remove this if no longer needed.
     # Config database_alias is removed from sql sources.
     database_alias: Dict[str, str] = Field(
@@ -339,6 +346,7 @@ class SupersetSource(StatefulIngestionSourceBase):

             if response.status_code != 200:
                 logger.warning(f"Failed to get {entity_type} data: {response.text}")
+                continue

             payload = response.json()
             # Update total_items with the actual count from the response
@@ -501,33 +509,41 @@ class SupersetSource(StatefulIngestionSourceBase):

         return dashboard_snapshot

-    def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
-        for dashboard_data in self.paginate_entity_api_results("dashboard/", PAGE_SIZE):
-            try:
-                dashboard_id = str(dashboard_data.get("id"))
-                dashboard_title = dashboard_data.get("dashboard_title", "")
-
-                if not self.config.dashboard_pattern.allowed(dashboard_title):
-                    self.report.report_dropped(
-                        f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
-                    )
-                    continue
-
-                dashboard_snapshot = self.construct_dashboard_from_api_data(
-                    dashboard_data
-                )
-            except Exception as e:
-                self.report.warning(
-                    f"Failed to construct dashboard snapshot. Dashboard name: {dashboard_data.get('dashboard_title')}. Error: \n{e}"
+    def _process_dashboard(self, dashboard_data: Any) -> Iterable[MetadataWorkUnit]:
+        dashboard_title = ""
+        try:
+            dashboard_id = str(dashboard_data.get("id"))
+            dashboard_title = dashboard_data.get("dashboard_title", "")
+            if not self.config.dashboard_pattern.allowed(dashboard_title):
+                self.report.report_dropped(
+                    f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
                 )
-                continue
-            # Emit the dashboard
-            mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
-            yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
-            yield from self._get_domain_wu(
-                title=dashboard_title,
-                entity_urn=dashboard_snapshot.urn,
+                return
+            dashboard_snapshot = self.construct_dashboard_from_api_data(dashboard_data)
+        except Exception as e:
+            self.report.warning(
+                f"Failed to construct dashboard snapshot. Dashboard name: {dashboard_data.get('dashboard_title')}. Error: \n{e}"
+            )
+            return
+        mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
+        yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
+        yield from self._get_domain_wu(
+            title=dashboard_title, entity_urn=dashboard_snapshot.urn
+        )
+
+    def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
+        dashboard_data_list = [
+            (dashboard_data,)
+            for dashboard_data in self.paginate_entity_api_results(
+                "dashboard/", PAGE_SIZE
             )
+        ]
+
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_dashboard,
+            args_list=dashboard_data_list,
+            max_workers=self.config.max_threads,
+        )

     def build_input_fields(
         self,
@@ -762,40 +778,46 @@ class SupersetSource(StatefulIngestionSourceBase):
             entity_urn=chart_urn,
         )

-    def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
-        for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE):
-            try:
-                chart_id = str(chart_data.get("id"))
-                chart_name = chart_data.get("slice_name", "")
-
-                if not self.config.chart_pattern.allowed(chart_name):
-                    self.report.report_dropped(
-                        f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
+    def _process_chart(self, chart_data: Any) -> Iterable[MetadataWorkUnit]:
+        chart_name = ""
+        try:
+            chart_id = str(chart_data.get("id"))
+            chart_name = chart_data.get("slice_name", "")
+            if not self.config.chart_pattern.allowed(chart_name):
+                self.report.report_dropped(
+                    f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
+                )
+                return
+            if self.config.dataset_pattern != AllowDenyPattern.allow_all():
+                datasource_id = chart_data.get("datasource_id")
+                if datasource_id:
+                    dataset_response = self.get_dataset_info(datasource_id)
+                    dataset_name = dataset_response.get("result", {}).get(
+                        "table_name", ""
                     )
-                    continue
-
-                # Emit a warning if charts use data from a dataset that will be filtered out
-                if self.config.dataset_pattern != AllowDenyPattern.allow_all():
-                    datasource_id = chart_data.get("datasource_id")
-                    if datasource_id:
-                        dataset_response = self.get_dataset_info(datasource_id)
-                        dataset_name = dataset_response.get("result", {}).get(
-                            "table_name", ""
+                    if dataset_name and not self.config.dataset_pattern.allowed(
+                        dataset_name
+                    ):
+                        self.report.warning(
+                            f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
                         )
+            yield from self.construct_chart_from_chart_data(chart_data)
+        except Exception as e:
+            self.report.warning(
+                f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
+            )
+            return

-                        if dataset_name and not self.config.dataset_pattern.allowed(
-                            dataset_name
-                        ):
-                            self.report.warning(
-                                f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
-                            )
-
-                yield from self.construct_chart_from_chart_data(chart_data)
-            except Exception as e:
-                self.report.warning(
-                    f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
-                )
-                continue
+    def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
+        chart_data_list = [
+            (chart_data,)
+            for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE)
+        ]
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_chart,
+            args_list=chart_data_list,
+            max_workers=self.config.max_threads,
+        )

     def gen_schema_fields(self, column_data: List[Dict[str, str]]) -> List[SchemaField]:
         schema_fields: List[SchemaField] = []
@@ -1023,33 +1045,38 @@ class SupersetSource(StatefulIngestionSourceBase):

         return dataset_snapshot

-    def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
-        for dataset_data in self.paginate_entity_api_results("dataset/", PAGE_SIZE):
-            try:
-                dataset_name = dataset_data.get("table_name", "")
-
-                # Check if dataset should be filtered by dataset name
-                if not self.config.dataset_pattern.allowed(dataset_name):
-                    self.report.report_dropped(
-                        f"Dataset '{dataset_name}' filtered by dataset_pattern"
-                    )
-                    continue
-
-                dataset_snapshot = self.construct_dataset_from_dataset_data(
-                    dataset_data
-                )
-                mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
-            except Exception as e:
-                self.report.warning(
-                    f"Failed to construct dataset snapshot. Dataset name: {dataset_data.get('table_name')}. Error: \n{e}"
+    def _process_dataset(self, dataset_data: Any) -> Iterable[MetadataWorkUnit]:
+        dataset_name = ""
+        try:
+            dataset_name = dataset_data.get("table_name", "")
+            if not self.config.dataset_pattern.allowed(dataset_name):
+                self.report.report_dropped(
+                    f"Dataset '{dataset_name}' filtered by dataset_pattern"
                 )
-                continue
-            # Emit the dataset
-            yield MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
-            yield from self._get_domain_wu(
-                title=dataset_data.get("table_name", ""),
-                entity_urn=dataset_snapshot.urn,
+                return
+            dataset_snapshot = self.construct_dataset_from_dataset_data(dataset_data)
+            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
+        except Exception as e:
+            self.report.warning(
+                f"Failed to construct dataset snapshot. Dataset name: {dataset_data.get('table_name')}. Error: \n{e}"
             )
+            return
+        yield MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
+        yield from self._get_domain_wu(
+            title=dataset_data.get("table_name", ""),
+            entity_urn=dataset_snapshot.urn,
+        )
+
+    def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
+        dataset_data_list = [
+            (dataset_data,)
+            for dataset_data in self.paginate_entity_api_results("dataset/", PAGE_SIZE)
+        ]
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_dataset,
+            args_list=dataset_data_list,
+            max_workers=self.config.max_threads,
+        )

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         if self.config.ingest_dashboards:
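The Superset refactor above replaces the three sequential emit_*_mces loops with small _process_dashboard / _process_chart / _process_dataset workers that are fanned out through ThreadedIteratorExecutor.process, bounded by the new max_threads setting. A minimal sketch of that pattern, with a stand-in worker (fetch_titles and its inputs are illustrative, not part of the package):

```python
from typing import Iterable

from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor


def fetch_titles(entity_id: int) -> Iterable[str]:
    # Stand-in for a per-entity worker such as SupersetSource._process_dashboard;
    # each invocation may yield zero or more results.
    yield f"title-{entity_id}"


# Each tuple in args_list is unpacked into the worker's positional arguments,
# mirroring the (dashboard_data,) tuples built in emit_dashboard_mces above.
args_list = [(i,) for i in range(10)]

for title in ThreadedIteratorExecutor.process(
    worker_func=fetch_titles,
    args_list=args_list,
    max_workers=4,  # the source passes self.config.max_threads here
):
    print(title)
```

Results are yielded as workers finish, so ordering across inputs is generally not guaranteed; each emitted work unit carries its own id/urn.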
datahub/ingestion/source/tableau/tableau.py CHANGED
@@ -1623,7 +1623,7 @@ class TableauSiteSource:
         # if multiple project has name C. Ideal solution is to use projectLuidWithin to avoid duplicate project,
         # however Tableau supports projectLuidWithin in Tableau Cloud June 2022 / Server 2022.3 and later.
         project_luid: Optional[str] = self._get_workbook_project_luid(workbook)
-        if project_luid not in self.tableau_project_registry.keys():
+        if project_luid not in self.tableau_project_registry:
             wrk_name: Optional[str] = workbook.get(c.NAME)
             wrk_id: Optional[str] = workbook.get(c.ID)
             prj_name: Optional[str] = workbook.get(c.PROJECT_NAME)
@@ -2253,7 +2253,7 @@
         # It is possible due to https://github.com/tableau/server-client-python/issues/1210
         if (
             ds.get(c.LUID)
-            and ds[c.LUID] not in self.datasource_project_map.keys()
+            and ds[c.LUID] not in self.datasource_project_map
             and self.report.get_all_datasources_query_failed
         ):
             logger.debug(
@@ -2265,7 +2265,7 @@

         if (
             ds.get(c.LUID)
-            and ds[c.LUID] in self.datasource_project_map.keys()
+            and ds[c.LUID] in self.datasource_project_map
             and self.datasource_project_map[ds[c.LUID]] in self.tableau_project_registry
         ):
             return self.datasource_project_map[ds[c.LUID]]
@@ -3252,7 +3252,7 @@

         parent_key = None
         project_luid: Optional[str] = self._get_workbook_project_luid(workbook)
-        if project_luid and project_luid in self.tableau_project_registry.keys():
+        if project_luid and project_luid in self.tableau_project_registry:
             parent_key = self.gen_project_key(project_luid)
         else:
             workbook_id: Optional[str] = workbook.get(c.ID)
datahub/ingestion/source/tableau/tableau_common.py CHANGED
@@ -774,7 +774,7 @@ def get_overridden_info(
     if (
         lineage_overrides is not None
        and lineage_overrides.platform_override_map is not None
-        and original_platform in lineage_overrides.platform_override_map.keys()
+        and original_platform in lineage_overrides.platform_override_map
     ):
         platform = lineage_overrides.platform_override_map[original_platform]

@@ -782,7 +782,7 @@
         lineage_overrides is not None
         and lineage_overrides.database_override_map is not None
         and upstream_db is not None
-        and upstream_db in lineage_overrides.database_override_map.keys()
+        and upstream_db in lineage_overrides.database_override_map
     ):
         upstream_db = lineage_overrides.database_override_map[upstream_db]

datahub/ingestion/source/unity/source.py CHANGED
@@ -1003,7 +1003,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             generate_usage_statistics=False,
             generate_operations=False,
         )
-        for dataset_name in self.view_definitions.keys():
+        for dataset_name in self.view_definitions:
             view_ref, view_definition = self.view_definitions[dataset_name]
             result = self._run_sql_parser(
                 view_ref,
datahub/ingestion/source/vertexai/vertexai.py CHANGED
@@ -22,7 +22,11 @@ from google.oauth2 import service_account

 import datahub.emitter.mce_builder as builder
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import ContainerKey, ProjectIdKey, gen_containers
+from datahub.emitter.mcp_builder import (
+    ExperimentKey,
+    ProjectIdKey,
+    gen_containers,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -96,10 +100,6 @@ class ModelMetadata:
     endpoints: Optional[List[Endpoint]] = None


-class ContainerKeyWithId(ContainerKey):
-    id: str
-
-
 @platform_name("Vertex AI", id="vertexai")
 @config_class(VertexAIConfig)
 @support_status(SupportStatus.TESTING)
@@ -173,7 +173,7 @@ class VertexAISource(Source):
     ) -> Iterable[MetadataWorkUnit]:
         yield from gen_containers(
             parent_container_key=self._get_project_container(),
-            container_key=ContainerKeyWithId(
+            container_key=ExperimentKey(
                 platform=self.platform,
                 id=self._make_vertexai_experiment_name(experiment.name),
             ),
@@ -309,7 +309,7 @@
     def _gen_experiment_run_mcps(
         self, experiment: Experiment, run: ExperimentRun
     ) -> Iterable[MetadataChangeProposalWrapper]:
-        experiment_key = ContainerKeyWithId(
+        experiment_key = ExperimentKey(
             platform=self.platform,
             id=self._make_vertexai_experiment_name(experiment.name),
         )
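The Vertex AI change drops the source-local ContainerKeyWithId in favour of the shared ExperimentKey now exported by datahub.emitter.mcp_builder (the +4 lines in mcp_builder.py listed above). A rough sketch of the call shape used in the diff; the literal platform, name, and sub_types values are illustrative only:

```python
from datahub.emitter.mcp_builder import ExperimentKey, gen_containers

# Mirrors the diff: the key is built from the source platform and a stable id
# derived from the experiment name.
experiment_key = ExperimentKey(platform="vertexai", id="my-experiment")

# gen_containers turns the key into container work units; name and sub_types
# here are placeholder values for the sketch.
workunits = list(
    gen_containers(
        container_key=experiment_key,
        name="my-experiment",
        sub_types=["Experiment"],
    )
)
```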
datahub/ingestion/transformer/add_dataset_dataproduct.py CHANGED
@@ -54,7 +54,7 @@ class AddDatasetDataProduct(DatasetDataproductTransformer):
         data_products_container: Dict[str, DataProductPatchBuilder] = {}
         logger.debug("Generating dataproducts")
         is_container = self.config.is_container
-        for entity_urn in self.entity_map.keys():
+        for entity_urn in self.entity_map:
             data_product_urn = self.config.get_data_product_to_add(entity_urn)
             if data_product_urn:
                 if data_product_urn not in data_products:
datahub/ingestion/transformer/add_dataset_ownership.py CHANGED
@@ -86,7 +86,7 @@ class AddDatasetOwnership(OwnershipTransformer):
         logger.debug("Generating Ownership for containers")
         ownership_container_mapping: Dict[str, List[OwnerClass]] = {}
         for entity_urn, data_ownerships in (
-            (urn, self.config.get_owners_to_add(urn)) for urn in self.entity_map.keys()
+            (urn, self.config.get_owners_to_add(urn)) for urn in self.entity_map
         ):
             if not data_ownerships:
                 continue
datahub/ingestion/transformer/dataset_domain.py CHANGED
@@ -125,7 +125,7 @@ class AddDatasetDomain(DatasetDomainTransformer):
             return domain_mcps

         for entity_urn, domain_to_add in (
-            (urn, self.config.get_domains_to_add(urn)) for urn in self.entity_map.keys()
+            (urn, self.config.get_domains_to_add(urn)) for urn in self.entity_map
         ):
             if not domain_to_add or not domain_to_add.domains:
                 continue
datahub/lite/lite_util.py CHANGED
@@ -99,7 +99,7 @@ def get_datahub_lite(config_dict: dict, read_only: bool = False) -> "DataHubLite
         lite_class = lite_registry.get(lite_type)
     except KeyError as e:
         raise Exception(
-            f"Failed to find a registered lite implementation for {lite_type}. Valid values are {[k for k in lite_registry.mapping.keys()]}"
+            f"Failed to find a registered lite implementation for {lite_type}. Valid values are {[k for k in lite_registry.mapping]}"
         ) from e

     lite_specific_config = lite_class.get_config_class().parse_obj(
@@ -127,7 +127,7 @@
             return lite
         else:
             raise Exception(
-                f"Failed to find a registered forwarding sink for type {lite_local_config.forward_to.type}. Valid values are {[k for k in sink_registry.mapping.keys()]}"
+                f"Failed to find a registered forwarding sink for type {lite_local_config.forward_to.type}. Valid values are {[k for k in sink_registry.mapping]}"
             )
     else:
         return lite
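The Tableau, Unity Catalog, transformer, and lite_util hunks above are the same mechanical cleanup: membership tests and iteration use the dict directly rather than going through .keys(). The behaviour is identical; the .keys() call is simply redundant:

```python
overrides = {"redshift": "postgres"}

# `key in d` and `key in d.keys()` are equivalent membership tests.
assert ("redshift" in overrides) == ("redshift" in overrides.keys())

# Iterating a dict already yields its keys, so `for urn in self.entity_map`
# behaves the same as `for urn in self.entity_map.keys()`.
assert list(overrides) == list(overrides.keys())
```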
datahub/metadata/_schema_classes.py CHANGED
@@ -15442,6 +15442,35 @@ class DataHubIngestionSourceKeyClass(_Aspect):
         self._inner_dict['id'] = value


+class DataHubOpenAPISchemaKeyClass(_Aspect):
+    """Key for a Query"""
+
+
+    ASPECT_NAME = 'dataHubOpenAPISchemaKey'
+    ASPECT_INFO = {'keyForEntity': 'dataHubOpenAPISchema', 'entityCategory': 'internal', 'entityAspects': ['systemMetadata'], 'entityDoc': 'Contains aspects which are used in OpenAPI requests/responses which are not otherwise present in the data model.'}
+    RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.DataHubOpenAPISchemaKey")
+
+    def __init__(self,
+        id: str,
+    ):
+        super().__init__()
+
+        self.id = id
+
+    def _restore_defaults(self) -> None:
+        self.id = str()
+
+
+    @property
+    def id(self) -> str:
+        """A unique id for the DataHub OpenAPI schema."""
+        return self._inner_dict.get('id') # type: ignore
+
+    @id.setter
+    def id(self, value: str) -> None:
+        self._inner_dict['id'] = value
+
+
 class DataHubPersonaKeyClass(_Aspect):
     """Key for a persona type"""

@@ -20128,10 +20157,14 @@ class PlatformEventHeaderClass(DictWrapper):
         self._inner_dict['timestampMillis'] = value


-class SystemMetadataClass(DictWrapper):
+class SystemMetadataClass(_Aspect):
     """Metadata associated with each metadata change that is processed by the system"""
-
+
+
+    ASPECT_NAME = 'systemMetadata'
+    ASPECT_INFO = {}
     RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.mxe.SystemMetadata")
+
     def __init__(self,
         lastObserved: Optional[Union[int, None]]=None,
         runId: Optional[Union[str, None]]=None,
@@ -21738,6 +21771,9 @@ class QueryLanguageClass(object):
     SQL = "SQL"
     """A SQL Query"""

+    UNKNOWN = "UNKNOWN"
+    """Unknown query language"""
+


 class QueryPropertiesClass(_Aspect):
@@ -26135,6 +26171,7 @@ __SCHEMA_TYPES = {
     'com.linkedin.pegasus2avro.metadata.key.DataHubActionKey': DataHubActionKeyClass,
     'com.linkedin.pegasus2avro.metadata.key.DataHubConnectionKey': DataHubConnectionKeyClass,
     'com.linkedin.pegasus2avro.metadata.key.DataHubIngestionSourceKey': DataHubIngestionSourceKeyClass,
+    'com.linkedin.pegasus2avro.metadata.key.DataHubOpenAPISchemaKey': DataHubOpenAPISchemaKeyClass,
     'com.linkedin.pegasus2avro.metadata.key.DataHubPersonaKey': DataHubPersonaKeyClass,
     'com.linkedin.pegasus2avro.metadata.key.DataHubPolicyKey': DataHubPolicyKeyClass,
     'com.linkedin.pegasus2avro.metadata.key.DataHubRetentionKey': DataHubRetentionKeyClass,
@@ -26620,6 +26657,7 @@ __SCHEMA_TYPES = {
     'DataHubActionKey': DataHubActionKeyClass,
     'DataHubConnectionKey': DataHubConnectionKeyClass,
     'DataHubIngestionSourceKey': DataHubIngestionSourceKeyClass,
+    'DataHubOpenAPISchemaKey': DataHubOpenAPISchemaKeyClass,
     'DataHubPersonaKey': DataHubPersonaKeyClass,
     'DataHubPolicyKey': DataHubPolicyKeyClass,
     'DataHubRetentionKey': DataHubRetentionKeyClass,
@@ -26879,6 +26917,7 @@ ASPECT_CLASSES: List[Type[_Aspect]] = [
     ContainerClass,
     ContainerPropertiesClass,
     EditableContainerPropertiesClass,
+    SystemMetadataClass,
     DataHubSecretValueClass,
     DataHubUpgradeRequestClass,
     DataHubUpgradeResultClass,
@@ -26935,6 +26974,7 @@
     MLModelKeyClass,
     NotebookKeyClass,
     RoleKeyClass,
+    DataHubOpenAPISchemaKeyClass,
     GlobalSettingsKeyClass,
     DatasetKeyClass,
     ChartKeyClass,
@@ -27102,6 +27142,7 @@ class AspectBag(TypedDict, total=False):
     container: ContainerClass
     containerProperties: ContainerPropertiesClass
     editableContainerProperties: EditableContainerPropertiesClass
+    systemMetadata: SystemMetadataClass
     dataHubSecretValue: DataHubSecretValueClass
     dataHubUpgradeRequest: DataHubUpgradeRequestClass
     dataHubUpgradeResult: DataHubUpgradeResultClass
@@ -27158,6 +27199,7 @@
     mlModelKey: MLModelKeyClass
     notebookKey: NotebookKeyClass
     roleKey: RoleKeyClass
+    dataHubOpenAPISchemaKey: DataHubOpenAPISchemaKeyClass
     globalSettingsKey: GlobalSettingsKeyClass
     datasetKey: DatasetKeyClass
     chartKey: ChartKeyClass
@@ -27292,6 +27334,7 @@ KEY_ASPECTS: Dict[str, Type[_Aspect]] = {
     'mlModel': MLModelKeyClass,
     'notebook': NotebookKeyClass,
     'role': RoleKeyClass,
+    'dataHubOpenAPISchema': DataHubOpenAPISchemaKeyClass,
     'globalSettings': GlobalSettingsKeyClass,
     'dataset': DatasetKeyClass,
     'chart': ChartKeyClass,
@@ -27352,6 +27395,7 @@ ENTITY_TYPE_NAMES: List[str] = [
     'mlModel',
     'notebook',
     'role',
+    'dataHubOpenAPISchema',
     'globalSettings',
     'dataset',
     'chart',
@@ -27411,6 +27455,7 @@ EntityTypeName = Literal[
     'mlModel',
     'notebook',
     'role',
+    'dataHubOpenAPISchema',
     'globalSettings',
     'dataset',
     'chart',
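The _schema_classes.py hunks register systemMetadata as a first-class aspect, add the dataHubOpenAPISchema entity key, and extend the query-language enum. A small hedged check of what those hunks imply about the generated module (every value below is taken from the diff itself):

```python
from datahub.metadata.schema_classes import (
    KEY_ASPECTS,
    DataHubOpenAPISchemaKeyClass,
    QueryLanguageClass,
    SystemMetadataClass,
)

# The new key aspect carries a single string id.
key = DataHubOpenAPISchemaKeyClass(id="example-id")
assert key.ASPECT_NAME == "dataHubOpenAPISchemaKey"

# SystemMetadataClass is now an _Aspect, addressable by aspect name.
assert SystemMetadataClass.ASPECT_NAME == "systemMetadata"

# The new entity type is wired into the key-aspect registry.
assert KEY_ASPECTS["dataHubOpenAPISchema"] is DataHubOpenAPISchemaKeyClass

# QueryLanguage gains an UNKNOWN member.
assert QueryLanguageClass.UNKNOWN == "UNKNOWN"
```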
datahub/metadata/_urns/urn_defs.py CHANGED
@@ -594,6 +594,62 @@ class RoleUrn(_SpecificUrn):
     def id(self) -> str:
         return self._entity_ids[0]

+if TYPE_CHECKING:
+    from datahub.metadata.schema_classes import DataHubOpenAPISchemaKeyClass
+
+class DataHubOpenAPISchemaUrn(_SpecificUrn):
+    ENTITY_TYPE: ClassVar[Literal["dataHubOpenAPISchema"]] = "dataHubOpenAPISchema"
+    _URN_PARTS: ClassVar[int] = 1
+
+    def __init__(self, id: Union["DataHubOpenAPISchemaUrn", str], *, _allow_coercion: bool = True) -> None:
+        if _allow_coercion:
+            # Field coercion logic (if any is required).
+            if isinstance(id, str):
+                if id.startswith('urn:li:'):
+                    try:
+                        id = DataHubOpenAPISchemaUrn.from_string(id)
+                    except InvalidUrnError:
+                        raise InvalidUrnError(f'Expecting a DataHubOpenAPISchemaUrn but got {id}')
+                else:
+                    id = UrnEncoder.encode_string(id)
+
+        # Validation logic.
+        if not id:
+            raise InvalidUrnError("DataHubOpenAPISchemaUrn id cannot be empty")
+        if isinstance(id, DataHubOpenAPISchemaUrn):
+            id = id.id
+        elif isinstance(id, Urn):
+            raise InvalidUrnError(f'Expecting a DataHubOpenAPISchemaUrn but got {id}')
+        if UrnEncoder.contains_reserved_char(id):
+            raise InvalidUrnError(f'DataHubOpenAPISchemaUrn id contains reserved characters')
+
+        super().__init__(self.ENTITY_TYPE, [id])
+
+    @classmethod
+    def _parse_ids(cls, entity_ids: List[str]) -> "DataHubOpenAPISchemaUrn":
+        if len(entity_ids) != cls._URN_PARTS:
+            raise InvalidUrnError(f"DataHubOpenAPISchemaUrn should have {cls._URN_PARTS} parts, got {len(entity_ids)}: {entity_ids}")
+        return cls(id=entity_ids[0], _allow_coercion=False)
+
+    @classmethod
+    def underlying_key_aspect_type(cls) -> Type["DataHubOpenAPISchemaKeyClass"]:
+        from datahub.metadata.schema_classes import DataHubOpenAPISchemaKeyClass
+
+        return DataHubOpenAPISchemaKeyClass
+
+    def to_key_aspect(self) -> "DataHubOpenAPISchemaKeyClass":
+        from datahub.metadata.schema_classes import DataHubOpenAPISchemaKeyClass
+
+        return DataHubOpenAPISchemaKeyClass(id=self.id)
+
+    @classmethod
+    def from_key_aspect(cls, key_aspect: "DataHubOpenAPISchemaKeyClass") -> "DataHubOpenAPISchemaUrn":
+        return cls(id=key_aspect.id)
+
+    @property
+    def id(self) -> str:
+        return self._entity_ids[0]
+
 if TYPE_CHECKING:
     from datahub.metadata.schema_classes import GlobalSettingsKeyClass

datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py CHANGED
@@ -19,6 +19,7 @@ from ......schema_classes import DataHubAccessTokenKeyClass
 from ......schema_classes import DataHubActionKeyClass
 from ......schema_classes import DataHubConnectionKeyClass
 from ......schema_classes import DataHubIngestionSourceKeyClass
+from ......schema_classes import DataHubOpenAPISchemaKeyClass
 from ......schema_classes import DataHubPersonaKeyClass
 from ......schema_classes import DataHubPolicyKeyClass
 from ......schema_classes import DataHubRetentionKeyClass
@@ -72,6 +73,7 @@ DataHubAccessTokenKey = DataHubAccessTokenKeyClass
 DataHubActionKey = DataHubActionKeyClass
 DataHubConnectionKey = DataHubConnectionKeyClass
 DataHubIngestionSourceKey = DataHubIngestionSourceKeyClass
+DataHubOpenAPISchemaKey = DataHubOpenAPISchemaKeyClass
 DataHubPersonaKey = DataHubPersonaKeyClass
 DataHubPolicyKey = DataHubPolicyKeyClass
 DataHubRetentionKey = DataHubRetentionKeyClass
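The urn_defs.py and key/__init__.py additions round out the new entity: DataHubOpenAPISchemaUrn is a single-part urn that validates its id and round-trips to the key aspect, and the pegasus2avro key module re-exports the key class under a DataHubOpenAPISchemaKey alias. A brief illustration, assuming the usual re-export of generated urns through datahub.metadata.urns:

```python
from datahub.metadata.com.linkedin.pegasus2avro.metadata.key import DataHubOpenAPISchemaKey
from datahub.metadata.schema_classes import DataHubOpenAPISchemaKeyClass
from datahub.metadata.urns import DataHubOpenAPISchemaUrn

# The alias added in key/__init__.py points at the generated key aspect class.
assert DataHubOpenAPISchemaKey is DataHubOpenAPISchemaKeyClass

# Single-part urn of the form urn:li:dataHubOpenAPISchema:<id>.
urn = DataHubOpenAPISchemaUrn("datahub-openapi-schema")
assert str(urn) == "urn:li:dataHubOpenAPISchema:datahub-openapi-schema"
assert urn.id == "datahub-openapi-schema"

# Round trip through the key aspect defined in _schema_classes.py.
key_aspect = urn.to_key_aspect()
assert str(DataHubOpenAPISchemaUrn.from_key_aspect(key_aspect)) == str(urn)
```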