acryl-datahub 1.0.0.1rc7__py3-none-any.whl → 1.0.0.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in that registry.

Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.

Files changed (76)
  1. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/METADATA +2561 -2561
  2. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/RECORD +75 -73
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/datajob/dataflow.py +15 -0
  5. datahub/api/entities/datajob/datajob.py +17 -0
  6. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  7. datahub/api/entities/dataset/dataset.py +2 -2
  8. datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
  9. datahub/cli/ingest_cli.py +4 -4
  10. datahub/cli/migrate.py +6 -6
  11. datahub/configuration/common.py +1 -1
  12. datahub/emitter/mcp_builder.py +4 -0
  13. datahub/ingestion/api/common.py +9 -0
  14. datahub/ingestion/api/source.py +4 -1
  15. datahub/ingestion/api/source_helpers.py +26 -1
  16. datahub/ingestion/graph/client.py +104 -0
  17. datahub/ingestion/run/pipeline.py +0 -6
  18. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  19. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  20. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  21. datahub/ingestion/source/fivetran/fivetran.py +1 -0
  22. datahub/ingestion/source/fivetran/fivetran_log_api.py +1 -1
  23. datahub/ingestion/source/hex/constants.py +5 -0
  24. datahub/ingestion/source/hex/hex.py +150 -22
  25. datahub/ingestion/source/hex/mapper.py +28 -2
  26. datahub/ingestion/source/hex/model.py +10 -2
  27. datahub/ingestion/source/hex/query_fetcher.py +300 -0
  28. datahub/ingestion/source/iceberg/iceberg.py +106 -18
  29. datahub/ingestion/source/kafka/kafka.py +1 -4
  30. datahub/ingestion/source/kafka_connect/sink_connectors.py +1 -1
  31. datahub/ingestion/source/kafka_connect/source_connectors.py +1 -1
  32. datahub/ingestion/source/looker/looker_source.py +2 -3
  33. datahub/ingestion/source/mlflow.py +6 -7
  34. datahub/ingestion/source/mode.py +2 -2
  35. datahub/ingestion/source/nifi.py +3 -3
  36. datahub/ingestion/source/openapi.py +3 -3
  37. datahub/ingestion/source/openapi_parser.py +8 -8
  38. datahub/ingestion/source/powerbi/config.py +1 -1
  39. datahub/ingestion/source/powerbi/powerbi.py +16 -3
  40. datahub/ingestion/source/redshift/profile.py +2 -2
  41. datahub/ingestion/source/sigma/sigma.py +6 -2
  42. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -1
  43. datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
  44. datahub/ingestion/source/sql/trino.py +4 -3
  45. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  46. datahub/ingestion/source/superset.py +108 -81
  47. datahub/ingestion/source/tableau/tableau.py +4 -4
  48. datahub/ingestion/source/tableau/tableau_common.py +2 -2
  49. datahub/ingestion/source/unity/source.py +1 -1
  50. datahub/ingestion/source/vertexai/vertexai.py +7 -7
  51. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  52. datahub/ingestion/transformer/add_dataset_ownership.py +1 -1
  53. datahub/ingestion/transformer/dataset_domain.py +1 -1
  54. datahub/lite/lite_util.py +2 -2
  55. datahub/metadata/_schema_classes.py +47 -2
  56. datahub/metadata/_urns/urn_defs.py +56 -0
  57. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  58. datahub/metadata/schema.avsc +121 -85
  59. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  60. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  61. datahub/metadata/schemas/FormInfo.avsc +5 -0
  62. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  63. datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  64. datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
  65. datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  66. datahub/metadata/schemas/QueryProperties.avsc +4 -2
  67. datahub/metadata/schemas/SystemMetadata.avsc +86 -0
  68. datahub/testing/mcp_diff.py +1 -1
  69. datahub/utilities/file_backed_collections.py +6 -6
  70. datahub/utilities/hive_schema_to_avro.py +2 -2
  71. datahub/utilities/ingest_utils.py +2 -2
  72. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  73. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/WHEEL +0 -0
  74. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/entry_points.txt +0 -0
  75. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/licenses/LICENSE +0 -0
  76. {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/mlflow.py

@@ -16,7 +16,7 @@ from datahub.api.entities.dataprocess.dataprocess_instance import (
 )
 from datahub.configuration.source_common import EnvConfigMixin
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import ContainerKey
+from datahub.emitter.mcp_builder import ExperimentKey
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -36,6 +36,7 @@ from datahub.ingestion.source.common.subtypes import MLAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
+    StatefulStaleMetadataRemovalConfig,
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
@@ -77,10 +78,6 @@ from datahub.sdk.dataset import Dataset
 T = TypeVar("T")


-class ContainerKeyWithId(ContainerKey):
-    id: str
-
-
 class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     tracking_uri: Optional[str] = Field(
         default=None,
@@ -123,6 +120,8 @@ class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
         default=None, description="Password for MLflow authentication"
     )

+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
+

 @dataclass
 class MLflowRegisteredModelStageInfo:
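
With the new `stateful_ingestion` field, the MLflow source accepts the same stale-entity-removal sub-config as other stateful sources. A minimal sketch of that sub-config, assuming the standard field names on StatefulStaleMetadataRemovalConfig (values are illustrative):

    from datahub.ingestion.source.state.stale_entity_removal_handler import (
        StatefulStaleMetadataRemovalConfig,
    )

    # Illustrative only: enable state tracking so entities that disappear from
    # MLflow between runs are soft-deleted on the next ingestion.
    state_config = StatefulStaleMetadataRemovalConfig(
        enabled=True,
        remove_stale_metadata=True,
    )
    print(state_config.fail_safe_threshold)  # bounded to [0.0, 100.0]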
@@ -252,7 +251,7 @@ class MLflowSource(StatefulIngestionSourceBase):
         self, experiment: Experiment
     ) -> Iterable[MetadataWorkUnit]:
         experiment_container = Container(
-            container_key=ContainerKeyWithId(
+            container_key=ExperimentKey(
                 platform=str(DataPlatformUrn(platform_name=self.platform)),
                 id=experiment.name,
             ),
@@ -470,7 +469,7 @@ class MLflowSource(StatefulIngestionSourceBase):
     def _get_run_workunits(
         self, experiment: Experiment, run: Run
     ) -> Iterable[MetadataWorkUnit]:
-        experiment_key = ContainerKeyWithId(
+        experiment_key = ExperimentKey(
             platform=str(DataPlatformUrn(self.platform)), id=experiment.name
         )

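For reference, container keys in datahub.emitter.mcp_builder hash their fields into a stable container URN, so swapping the local ContainerKeyWithId for the shared ExperimentKey keeps experiment containers deterministic. A small sketch, assuming ExperimentKey exposes the same platform/id fields used in the hunks above along with the usual ContainerKey.as_urn() helper:

    from datahub.emitter.mcp_builder import ExperimentKey
    from datahub.metadata.urns import DataPlatformUrn

    # Same construction as the experiment container hunk above; as_urn() derives
    # a deterministic urn:li:container:<guid> from the key's fields, so the same
    # experiment name always resolves to the same container.
    key = ExperimentKey(
        platform=str(DataPlatformUrn(platform_name="mlflow")),
        id="my-experiment",
    )
    print(key.as_urn())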

datahub/ingestion/source/mode.py

@@ -899,7 +899,7 @@ class ModeSource(StatefulIngestionSourceBase):
         for match in matches:
             definition = Template(source=match).render()
             parameters = yaml.safe_load(definition)
-            for key in parameters.keys():
+            for key in parameters:
                 jinja_params[key] = parameters[key].get("default", "")

         normalized_query = re.sub(
@@ -1601,7 +1601,7 @@ class ModeSource(StatefulIngestionSourceBase):

     def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
         # Space/collection -> report -> query -> Chart
-        for space_token in self.space_tokens.keys():
+        for space_token in self.space_tokens:
             reports = self._get_reports(space_token)
             for report in reports:
                 report_token = report.get("token", "")
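
Many of the hunks in this release (mode, nifi, openapi, powerbi, redshift, snowflake, tableau) make the same mechanical change: membership tests and iteration go against the dict itself rather than `dict.keys()`. The behaviour is identical; the direct form is simply the idiomatic spelling and avoids building a keys view. A tiny illustration with made-up data:

    parameters = {"limit": {"default": 100}, "offset": {"default": 0}}

    # Equivalent membership checks; the first is the idiomatic one.
    assert "limit" in parameters
    assert "limit" in parameters.keys()

    # Iterating a dict yields its keys, so this matches `for key in parameters.keys()`.
    for key in parameters:
        print(key, parameters[key].get("default", ""))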

datahub/ingestion/source/nifi.py

@@ -703,7 +703,7 @@ class NifiSource(StatefulIngestionSourceBase):
             if (
                 component.nifi_type is NifiType.PROCESSOR
                 and component.type
-                not in NifiProcessorProvenanceEventAnalyzer.KNOWN_INGRESS_EGRESS_PROCESORS.keys()
+                not in NifiProcessorProvenanceEventAnalyzer.KNOWN_INGRESS_EGRESS_PROCESORS
             ) or component.nifi_type not in [
                 NifiType.PROCESSOR,
                 NifiType.REMOTE_INPUT_PORT,
@@ -977,7 +977,7 @@ class NifiSource(StatefulIngestionSourceBase):
             )

             for incoming_from in incoming:
-                if incoming_from in self.nifi_flow.remotely_accessible_ports.keys():
+                if incoming_from in self.nifi_flow.remotely_accessible_ports:
                     dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[incoming_from].name}"
                     dataset_urn = builder.make_dataset_urn(
                         NIFI, dataset_name, self.config.env
@@ -994,7 +994,7 @@ class NifiSource(StatefulIngestionSourceBase):
             )

             for outgoing_to in outgoing:
-                if outgoing_to in self.nifi_flow.remotely_accessible_ports.keys():
+                if outgoing_to in self.nifi_flow.remotely_accessible_ports:
                     dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[outgoing_to].name}"
                     dataset_urn = builder.make_dataset_urn(
                         NIFI, dataset_name, self.config.env

datahub/ingestion/source/openapi.py

@@ -102,7 +102,7 @@ class OpenApiConfig(ConfigModel):
             # details there once, and then use that session for all requests.
             self.token = f"Bearer {self.bearer_token}"
         else:
-            assert "url_complement" in self.get_token.keys(), (
+            assert "url_complement" in self.get_token, (
                 "When 'request_type' is set to 'get', an url_complement is needed for the request."
             )
             if self.get_token["request_type"] == "get":
@@ -317,7 +317,7 @@ class APISource(Source, ABC):
                 yield wu

             # Handle schema metadata if available
-            if "data" in endpoint_dets.keys():
+            if "data" in endpoint_dets:
                 # we are lucky! data is defined in the swagger for this endpoint
                 schema_metadata = set_metadata(dataset_name, endpoint_dets["data"])
                 wu = MetadataWorkUnit(
@@ -371,7 +371,7 @@ class APISource(Source, ABC):
                 else:
                     self.report_bad_responses(response.status_code, type=endpoint_k)
             else:
-                if endpoint_k not in config.forced_examples.keys():
+                if endpoint_k not in config.forced_examples:
                     # start guessing...
                     url_guess = try_guessing(endpoint_k, root_dataset_samples)
                     tot_url = clean_url(config.url + self.url_basepath + url_guess)

datahub/ingestion/source/openapi_parser.py

@@ -128,18 +128,18 @@ def get_endpoints(sw_dict: dict) -> dict:

     for p_k, p_o in sw_dict["paths"].items():
         method = list(p_o)[0]
-        if "200" in p_o[method]["responses"].keys():
+        if "200" in p_o[method]["responses"]:
             base_res = p_o[method]["responses"]["200"]
-        elif 200 in p_o[method]["responses"].keys():
+        elif 200 in p_o[method]["responses"]:
             # if you read a plain yml file the 200 will be an integer
             base_res = p_o[method]["responses"][200]
         else:
             # the endpoint does not have a 200 response
             continue

-        if "description" in p_o[method].keys():
+        if "description" in p_o[method]:
             desc = p_o[method]["description"]
-        elif "summary" in p_o[method].keys():
+        elif "summary" in p_o[method]:
             desc = p_o[method]["summary"]
         else:  # still testing
             desc = ""
@@ -156,7 +156,7 @@ def get_endpoints(sw_dict: dict) -> dict:
             url_details[p_k]["data"] = example_data

         # checking whether there are defined parameters to execute the call...
-        if "parameters" in p_o[method].keys():
+        if "parameters" in p_o[method]:
             url_details[p_k]["parameters"] = p_o[method]["parameters"]

     return dict(sorted(url_details.items()))
@@ -169,7 +169,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
     data = {}
     if "content" in base_res:
         res_cont = base_res["content"]
-        if "application/json" in res_cont.keys():
+        if "application/json" in res_cont:
             ex_field = None
             if "example" in res_cont["application/json"]:
                 ex_field = "example"
@@ -186,7 +186,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
                 logger.warning(
                     f"Field in swagger file does not give consistent data --- {key}"
                 )
-        elif "text/csv" in res_cont.keys():
+        elif "text/csv" in res_cont:
             data = res_cont["text/csv"]["schema"]
     elif "examples" in base_res:
         data = base_res["examples"]["application/json"]
@@ -239,7 +239,7 @@ def guessing_url_name(url: str, examples: dict) -> str:

     # substituting the parameter's name w the value
     for name, clean_name in zip(needed_n, cleaned_needed_n):
-        if clean_name in examples[ex2use].keys():
+        if clean_name in examples[ex2use]:
             guessed_url = re.sub(name, str(examples[ex2use][clean_name]), guessed_url)

     return guessed_url

datahub/ingestion/source/powerbi/config.py

@@ -555,7 +555,7 @@ class PowerBiDashboardSourceConfig(
     def map_data_platform(cls, value):
         # For backward compatibility convert input PostgreSql to PostgreSQL
         # PostgreSQL is name of the data-platform in M-Query
-        if "PostgreSql" in value.keys():
+        if "PostgreSql" in value:
             platform_name = value["PostgreSql"]
             del value["PostgreSql"]
             value["PostgreSQL"] = platform_name

datahub/ingestion/source/powerbi/powerbi.py

@@ -94,7 +94,7 @@ from datahub.metadata.schema_classes import (
     UpstreamLineageClass,
     ViewPropertiesClass,
 )
-from datahub.metadata.urns import ChartUrn
+from datahub.metadata.urns import ChartUrn, DatasetUrn
 from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo
 from datahub.utilities.dedup_list import deduplicate_list
 from datahub.utilities.urns.urn_iter import lowercase_dataset_urn
@@ -263,7 +263,7 @@ class Mapper:
             for upstream_dpt in lineage.upstreams:
                 if (
                     upstream_dpt.data_platform_pair.powerbi_data_platform_name
-                    not in self.__config.dataset_type_mapping.keys()
+                    not in self.__config.dataset_type_mapping
                 ):
                     logger.debug(
                         f"Skipping upstream table for {ds_urn}. The platform {upstream_dpt.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping",
@@ -1083,6 +1083,7 @@ class Mapper:
         report: powerbi_data_classes.Report,
         chart_mcps: List[MetadataChangeProposalWrapper],
         user_mcps: List[MetadataChangeProposalWrapper],
+        dataset_edges: List[EdgeClass],
     ) -> List[MetadataChangeProposalWrapper]:
         """
         Map PowerBi report to Datahub dashboard
@@ -1104,6 +1105,7 @@ class Mapper:
             charts=chart_urn_list,
             lastModified=ChangeAuditStamps(),
             dashboardUrl=report.webUrl,
+            datasetEdges=dataset_edges,
         )

         info_mcp = self.new_mcp(
@@ -1197,12 +1199,23 @@ class Mapper:
         ds_mcps = self.to_datahub_dataset(report.dataset, workspace)
         chart_mcps = self.pages_to_chart(report.pages, workspace, ds_mcps)

+        # collect all upstream datasets; using a set to retain unique urns
+        dataset_urns = {
+            dataset.entityUrn
+            for dataset in ds_mcps
+            if dataset.entityType == DatasetUrn.ENTITY_TYPE and dataset.entityUrn
+        }
+        dataset_edges = [
+            EdgeClass(destinationUrn=dataset_urn) for dataset_urn in dataset_urns
+        ]
+
         # Let's convert report to datahub dashboard
         report_mcps = self.report_to_dashboard(
             workspace=workspace,
             report=report,
             chart_mcps=chart_mcps,
             user_mcps=user_mcps,
+            dataset_edges=dataset_edges,
         )

         # Now add MCPs in sequence
@@ -1340,7 +1353,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
             for data_platform in SupportedDataPlatform
         ]

-        for key in self.source_config.dataset_type_mapping.keys():
+        for key in self.source_config.dataset_type_mapping:
             if key not in powerbi_data_platforms:
                 raise ValueError(f"PowerBI DataPlatform {key} is not supported")

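The new `dataset_edges` argument ends up on the dashboard's DashboardInfo aspect as `datasetEdges`, linking the PowerBI report to the datasets it reads. A hypothetical, minimal example of the resulting aspect (the urn and title are invented for illustration):

    from datahub.metadata.schema_classes import (
        ChangeAuditStampsClass,
        DashboardInfoClass,
        EdgeClass,
    )

    # Illustrative only: a dashboard aspect pointing at one upstream dataset.
    info = DashboardInfoClass(
        title="Sales report",
        description="",
        charts=[],
        lastModified=ChangeAuditStampsClass(),
        datasetEdges=[
            EdgeClass(
                destinationUrn="urn:li:dataset:(urn:li:dataPlatform:powerbi,sales_model,PROD)"
            )
        ],
    )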

datahub/ingestion/source/redshift/profile.py

@@ -42,9 +42,9 @@ class RedshiftProfiler(GenericProfiler):
                 "max_overflow", self.config.profiling.max_workers
             )

-        for db in tables.keys():
+        for db in tables:
             profile_requests = []
-            for schema in tables.get(db, {}).keys():
+            for schema in tables.get(db, {}):
                 if not self.config.schema_pattern.allowed(schema):
                     continue
                 for table in tables[db].get(schema, {}):

datahub/ingestion/source/sigma/sigma.py

@@ -170,7 +170,9 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
             if self.config.workspace_pattern.allowed(workspace.name):
                 allowed_workspaces.append(workspace)
             else:
-                self.reporter.workspaces.dropped(workspace.workspaceId)
+                self.reporter.workspaces.dropped(
+                    f"{workspace.name} ({workspace.workspaceId})"
+                )
         logger.info(f"Number of allowed workspaces = {len(allowed_workspaces)}")

         return allowed_workspaces
@@ -661,7 +663,9 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
             yield from self._gen_workbook_workunit(workbook)

         for workspace in self._get_allowed_workspaces():
-            self.reporter.workspaces.processed(workspace.workspaceId)
+            self.reporter.workspaces.processed(
+                f"{workspace.name} ({workspace.workspaceId})"
+            )
             yield from self._gen_workspace_workunit(workspace)
         yield from self._gen_sigma_dataset_upstream_lineage_workunit()


datahub/ingestion/source/snowflake/snowflake_utils.py

@@ -77,7 +77,7 @@ class SnowsightUrlBuilder:
         region: str,
     ) -> Tuple[str, str]:
         cloud: str
-        if region in SNOWFLAKE_REGION_CLOUD_REGION_MAPPING.keys():
+        if region in SNOWFLAKE_REGION_CLOUD_REGION_MAPPING:
             cloud, cloud_region_id = SNOWFLAKE_REGION_CLOUD_REGION_MAPPING[region]
         elif region.startswith(("aws_", "gcp_", "azure_")):
             # e.g. aws_us_west_2, gcp_us_central1, azure_northeurope

datahub/ingestion/source/sql/stored_procedures/base.py

@@ -26,6 +26,7 @@ from datahub.metadata.schema_classes import (
     DataPlatformInstanceClass,
     DataTransformClass,
     DataTransformLogicClass,
+    QueryLanguageClass,
     QueryStatementClass,
     SubTypesClass,
 )
@@ -176,7 +177,17 @@ def _generate_job_workunits(
             DataTransformClass(
                 queryStatement=QueryStatementClass(
                     value=procedure.procedure_definition,
-                    language=procedure.language,
+                    language=(
+                        QueryLanguageClass.SQL
+                        if procedure.language == "SQL"
+                        # The language field uses a pretty limited enum.
+                        # The "UNKNOWN" enum value is pretty new, so we don't want to
+                        # emit it until it has broader server-side support. As a
+                        # short-term solution, we map all languages to "SQL".
+                        # TODO: Once we've released server 1.1.0, we should change
+                        # this to be "UNKNOWN" for all languages except "SQL".
+                        else QueryLanguageClass.SQL
+                    ),
                 ),
             )
         ]
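
For orientation, the aspect being assembled here is DataTransformLogic, which wraps the procedure body in a QueryStatement. A rough sketch of the shape, assuming the generated aspect classes carry the field names used above (the `transforms` field name is an assumption on my part):

    from datahub.metadata.schema_classes import (
        DataTransformClass,
        DataTransformLogicClass,
        QueryLanguageClass,
        QueryStatementClass,
    )

    # Rough sketch only; every procedure body is currently tagged as SQL,
    # per the TODO in the hunk above.
    logic = DataTransformLogicClass(
        transforms=[
            DataTransformClass(
                queryStatement=QueryStatementClass(
                    value="CREATE PROCEDURE my_proc AS ...",
                    language=QueryLanguageClass.SQL,
                )
            )
        ]
    )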

datahub/ingestion/source/sql/trino.py

@@ -128,9 +128,10 @@ def get_table_comment(self, connection, table_name: str, schema: str = None, **k
     if catalog_name is None:
         raise exc.NoSuchTableError("catalog is required in connection")
     connector_name = get_catalog_connector_name(connection.engine, catalog_name)
-    if connector_name is None:
-        return {}
-    if connector_name in PROPERTIES_TABLE_SUPPORTED_CONNECTORS:
+    if (
+        connector_name is not None
+        and connector_name in PROPERTIES_TABLE_SUPPORTED_CONNECTORS
+    ):
         properties_table = self._get_full_table(f"{table_name}$properties", schema)
         query = f"SELECT * FROM {properties_table}"
         row = connection.execute(sql.text(query)).fetchone()

datahub/ingestion/source/state/stale_entity_removal_handler.py

@@ -45,7 +45,6 @@ class StatefulStaleMetadataRemovalConfig(StatefulIngestionConfig):
         description="Prevents large amount of soft deletes & the state from committing from accidental changes to the source configuration if the relative change percent in entities compared to the previous state is above the 'fail_safe_threshold'.",
         le=100.0,
         ge=0.0,
-        hidden_from_docs=True,
     )


datahub/ingestion/source/superset.py

@@ -1,5 +1,6 @@
 import json
 import logging
+import os
 from dataclasses import dataclass, field
 from datetime import datetime
 from functools import lru_cache
@@ -100,6 +101,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
 from datahub.utilities import config_clean
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry
+from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor

 logger = logging.getLogger(__name__)

@@ -210,6 +212,11 @@ class SupersetConfig(
         default=10, description="Timeout of single API call to superset."
     )

+    max_threads: int = Field(
+        default_factory=lambda: os.cpu_count() or 40,
+        description="Max parallelism for API calls. Defaults to cpuCount or 40",
+    )
+
     # TODO: Check and remove this if no longer needed.
     # Config database_alias is removed from sql sources.
     database_alias: Dict[str, str] = Field(
@@ -339,6 +346,7 @@ class SupersetSource(StatefulIngestionSourceBase):

             if response.status_code != 200:
                 logger.warning(f"Failed to get {entity_type} data: {response.text}")
+                continue

             payload = response.json()
             # Update total_items with the actual count from the response
@@ -501,33 +509,41 @@ class SupersetSource(StatefulIngestionSourceBase):

         return dashboard_snapshot

-    def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
-        for dashboard_data in self.paginate_entity_api_results("dashboard/", PAGE_SIZE):
-            try:
-                dashboard_id = str(dashboard_data.get("id"))
-                dashboard_title = dashboard_data.get("dashboard_title", "")
-
-                if not self.config.dashboard_pattern.allowed(dashboard_title):
-                    self.report.report_dropped(
-                        f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
-                    )
-                    continue
-
-                dashboard_snapshot = self.construct_dashboard_from_api_data(
-                    dashboard_data
-                )
-            except Exception as e:
-                self.report.warning(
-                    f"Failed to construct dashboard snapshot. Dashboard name: {dashboard_data.get('dashboard_title')}. Error: \n{e}"
+    def _process_dashboard(self, dashboard_data: Any) -> Iterable[MetadataWorkUnit]:
+        dashboard_title = ""
+        try:
+            dashboard_id = str(dashboard_data.get("id"))
+            dashboard_title = dashboard_data.get("dashboard_title", "")
+            if not self.config.dashboard_pattern.allowed(dashboard_title):
+                self.report.report_dropped(
+                    f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
                 )
-                continue
-            # Emit the dashboard
-            mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
-            yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
-            yield from self._get_domain_wu(
-                title=dashboard_title,
-                entity_urn=dashboard_snapshot.urn,
+                return
+            dashboard_snapshot = self.construct_dashboard_from_api_data(dashboard_data)
+        except Exception as e:
+            self.report.warning(
+                f"Failed to construct dashboard snapshot. Dashboard name: {dashboard_data.get('dashboard_title')}. Error: \n{e}"
+            )
+            return
+        mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
+        yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
+        yield from self._get_domain_wu(
+            title=dashboard_title, entity_urn=dashboard_snapshot.urn
+        )
+
+    def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
+        dashboard_data_list = [
+            (dashboard_data,)
+            for dashboard_data in self.paginate_entity_api_results(
+                "dashboard/", PAGE_SIZE
             )
+        ]
+
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_dashboard,
+            args_list=dashboard_data_list,
+            max_workers=self.config.max_threads,
+        )

     def build_input_fields(
         self,
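
The refactor replaces the serial per-dashboard loop with ThreadedIteratorExecutor, which fans a generator-style worker out over a thread pool and merges whatever the workers yield; `max_workers` comes from the new `max_threads` setting (note that `os.cpu_count() or 40` only falls back to 40 when the CPU count cannot be determined). A self-contained sketch of the same pattern with a dummy worker:

    from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor

    def _double(n: int):
        # Workers are generators; everything they yield is merged into the
        # single iterator returned by process().
        yield n * 2

    results = sorted(
        ThreadedIteratorExecutor.process(
            worker_func=_double,
            args_list=[(i,) for i in range(5)],
            max_workers=2,
        )
    )
    print(results)  # [0, 2, 4, 6, 8]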
@@ -762,40 +778,46 @@ class SupersetSource(StatefulIngestionSourceBase):
                 entity_urn=chart_urn,
             )

-    def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
-        for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE):
-            try:
-                chart_id = str(chart_data.get("id"))
-                chart_name = chart_data.get("slice_name", "")
-
-                if not self.config.chart_pattern.allowed(chart_name):
-                    self.report.report_dropped(
-                        f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
+    def _process_chart(self, chart_data: Any) -> Iterable[MetadataWorkUnit]:
+        chart_name = ""
+        try:
+            chart_id = str(chart_data.get("id"))
+            chart_name = chart_data.get("slice_name", "")
+            if not self.config.chart_pattern.allowed(chart_name):
+                self.report.report_dropped(
+                    f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
+                )
+                return
+            if self.config.dataset_pattern != AllowDenyPattern.allow_all():
+                datasource_id = chart_data.get("datasource_id")
+                if datasource_id:
+                    dataset_response = self.get_dataset_info(datasource_id)
+                    dataset_name = dataset_response.get("result", {}).get(
+                        "table_name", ""
                     )
-                    continue
-
-                # Emit a warning if charts use data from a dataset that will be filtered out
-                if self.config.dataset_pattern != AllowDenyPattern.allow_all():
-                    datasource_id = chart_data.get("datasource_id")
-                    if datasource_id:
-                        dataset_response = self.get_dataset_info(datasource_id)
-                        dataset_name = dataset_response.get("result", {}).get(
-                            "table_name", ""
+                    if dataset_name and not self.config.dataset_pattern.allowed(
+                        dataset_name
+                    ):
+                        self.report.warning(
+                            f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
                         )
+            yield from self.construct_chart_from_chart_data(chart_data)
+        except Exception as e:
+            self.report.warning(
+                f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
+            )
+            return

-                        if dataset_name and not self.config.dataset_pattern.allowed(
-                            dataset_name
-                        ):
-                            self.report.warning(
-                                f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
-                            )
-
-                yield from self.construct_chart_from_chart_data(chart_data)
-            except Exception as e:
-                self.report.warning(
-                    f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
-                )
-                continue
+    def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
+        chart_data_list = [
+            (chart_data,)
+            for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE)
+        ]
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_chart,
+            args_list=chart_data_list,
+            max_workers=self.config.max_threads,
+        )

     def gen_schema_fields(self, column_data: List[Dict[str, str]]) -> List[SchemaField]:
         schema_fields: List[SchemaField] = []
@@ -1023,33 +1045,38 @@ class SupersetSource(StatefulIngestionSourceBase):

         return dataset_snapshot

-    def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
-        for dataset_data in self.paginate_entity_api_results("dataset/", PAGE_SIZE):
-            try:
-                dataset_name = dataset_data.get("table_name", "")
-
-                # Check if dataset should be filtered by dataset name
-                if not self.config.dataset_pattern.allowed(dataset_name):
-                    self.report.report_dropped(
-                        f"Dataset '{dataset_name}' filtered by dataset_pattern"
-                    )
-                    continue
-
-                dataset_snapshot = self.construct_dataset_from_dataset_data(
-                    dataset_data
-                )
-                mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
-            except Exception as e:
-                self.report.warning(
-                    f"Failed to construct dataset snapshot. Dataset name: {dataset_data.get('table_name')}. Error: \n{e}"
+    def _process_dataset(self, dataset_data: Any) -> Iterable[MetadataWorkUnit]:
+        dataset_name = ""
+        try:
+            dataset_name = dataset_data.get("table_name", "")
+            if not self.config.dataset_pattern.allowed(dataset_name):
+                self.report.report_dropped(
+                    f"Dataset '{dataset_name}' filtered by dataset_pattern"
                 )
-                continue
-            # Emit the dataset
-            yield MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
-            yield from self._get_domain_wu(
-                title=dataset_data.get("table_name", ""),
-                entity_urn=dataset_snapshot.urn,
+                return
+            dataset_snapshot = self.construct_dataset_from_dataset_data(dataset_data)
+            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
+        except Exception as e:
+            self.report.warning(
+                f"Failed to construct dataset snapshot. Dataset name: {dataset_data.get('table_name')}. Error: \n{e}"
             )
+            return
+        yield MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
+        yield from self._get_domain_wu(
+            title=dataset_data.get("table_name", ""),
+            entity_urn=dataset_snapshot.urn,
+        )
+
+    def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
+        dataset_data_list = [
+            (dataset_data,)
+            for dataset_data in self.paginate_entity_api_results("dataset/", PAGE_SIZE)
+        ]
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_dataset,
+            args_list=dataset_data_list,
+            max_workers=self.config.max_threads,
+        )

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         if self.config.ingest_dashboards:

datahub/ingestion/source/tableau/tableau.py

@@ -1623,7 +1623,7 @@ class TableauSiteSource:
         # if multiple project has name C. Ideal solution is to use projectLuidWithin to avoid duplicate project,
         # however Tableau supports projectLuidWithin in Tableau Cloud June 2022 / Server 2022.3 and later.
         project_luid: Optional[str] = self._get_workbook_project_luid(workbook)
-        if project_luid not in self.tableau_project_registry.keys():
+        if project_luid not in self.tableau_project_registry:
             wrk_name: Optional[str] = workbook.get(c.NAME)
             wrk_id: Optional[str] = workbook.get(c.ID)
             prj_name: Optional[str] = workbook.get(c.PROJECT_NAME)
@@ -2253,7 +2253,7 @@ class TableauSiteSource:
         # It is possible due to https://github.com/tableau/server-client-python/issues/1210
         if (
             ds.get(c.LUID)
-            and ds[c.LUID] not in self.datasource_project_map.keys()
+            and ds[c.LUID] not in self.datasource_project_map
             and self.report.get_all_datasources_query_failed
         ):
             logger.debug(
@@ -2265,7 +2265,7 @@ class TableauSiteSource:

         if (
             ds.get(c.LUID)
-            and ds[c.LUID] in self.datasource_project_map.keys()
+            and ds[c.LUID] in self.datasource_project_map
             and self.datasource_project_map[ds[c.LUID]] in self.tableau_project_registry
         ):
             return self.datasource_project_map[ds[c.LUID]]
@@ -3252,7 +3252,7 @@ class TableauSiteSource:

         parent_key = None
         project_luid: Optional[str] = self._get_workbook_project_luid(workbook)
-        if project_luid and project_luid in self.tableau_project_registry.keys():
+        if project_luid and project_luid in self.tableau_project_registry:
             parent_key = self.gen_project_key(project_luid)
         else:
             workbook_id: Optional[str] = workbook.get(c.ID)