acryl-datahub 0.14.1.13rc9__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (133)
  1. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2348 -2298
  2. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +130 -125
  3. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/common/subtypes.py +2 -0
  35. datahub/ingestion/source/csv_enricher.py +1 -1
  36. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  37. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  38. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  39. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  40. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  41. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  42. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  43. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  44. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  45. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  46. datahub/ingestion/source/elastic_search.py +1 -1
  47. datahub/ingestion/source/feast.py +97 -6
  48. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  49. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  50. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  51. datahub/ingestion/source/ge_data_profiler.py +23 -1
  52. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  53. datahub/ingestion/source/kafka/kafka.py +39 -19
  54. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  55. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  56. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  57. datahub/ingestion/source/looker/view_upstream.py +65 -30
  58. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  59. datahub/ingestion/source/mode.py +0 -23
  60. datahub/ingestion/source/neo4j/__init__.py +0 -0
  61. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  62. datahub/ingestion/source/powerbi/__init__.py +0 -1
  63. datahub/ingestion/source/powerbi/config.py +3 -3
  64. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  65. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  66. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  67. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  68. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  69. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  70. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  71. datahub/ingestion/source/preset.py +1 -0
  72. datahub/ingestion/source/pulsar.py +21 -2
  73. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  74. datahub/ingestion/source/redash.py +13 -63
  75. datahub/ingestion/source/redshift/config.py +1 -0
  76. datahub/ingestion/source/redshift/redshift.py +3 -0
  77. datahub/ingestion/source/s3/source.py +2 -3
  78. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  79. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  80. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  81. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  82. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  83. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  84. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  85. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  86. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  87. datahub/ingestion/source/sql/athena.py +46 -22
  88. datahub/ingestion/source/sql/mssql/source.py +0 -2
  89. datahub/ingestion/source/sql/sql_common.py +34 -21
  90. datahub/ingestion/source/sql/sql_report.py +1 -0
  91. datahub/ingestion/source/sql/sql_types.py +85 -8
  92. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  93. datahub/ingestion/source/superset.py +215 -65
  94. datahub/ingestion/source/tableau/tableau.py +237 -76
  95. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  96. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  97. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  98. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  99. datahub/ingestion/source/unity/proxy_types.py +1 -0
  100. datahub/ingestion/source/unity/source.py +4 -0
  101. datahub/ingestion/source/unity/usage.py +20 -11
  102. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  103. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  104. datahub/integrations/assertion/common.py +1 -1
  105. datahub/lite/duckdb_lite.py +12 -17
  106. datahub/metadata/_schema_classes.py +512 -392
  107. datahub/metadata/_urns/urn_defs.py +1355 -1355
  108. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  109. datahub/metadata/schema.avsc +17222 -17499
  110. datahub/metadata/schemas/FormInfo.avsc +4 -0
  111. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  112. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  113. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  114. datahub/specific/chart.py +0 -39
  115. datahub/specific/dashboard.py +0 -39
  116. datahub/specific/datajob.py +7 -57
  117. datahub/sql_parsing/schema_resolver.py +23 -0
  118. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  119. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  120. datahub/sql_parsing/sqlglot_utils.py +8 -2
  121. datahub/telemetry/telemetry.py +23 -9
  122. datahub/testing/compare_metadata_json.py +1 -1
  123. datahub/testing/doctest.py +12 -0
  124. datahub/utilities/file_backed_collections.py +35 -2
  125. datahub/utilities/partition_executor.py +1 -1
  126. datahub/utilities/urn_encoder.py +2 -1
  127. datahub/utilities/urns/_urn_base.py +1 -1
  128. datahub/utilities/urns/structured_properties_urn.py +1 -1
  129. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  130. datahub/utilities/sql_parser.py +0 -94
  131. datahub/utilities/sql_parser_base.py +0 -21
  132. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  133. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,12 @@
1
1
  import json
2
2
  import logging
3
+ from datetime import datetime
3
4
  from functools import lru_cache
4
- from typing import Dict, Iterable, List, Optional
5
+ from typing import Any, Dict, Iterable, List, Optional
5
6
 
6
7
  import dateutil.parser as dp
7
8
  import requests
9
+ from pydantic import BaseModel
8
10
  from pydantic.class_validators import root_validator, validator
9
11
  from pydantic.fields import Field
10
12
 
@@ -16,7 +18,9 @@ from datahub.configuration.source_common import (
16
18
  from datahub.emitter.mce_builder import (
17
19
  make_chart_urn,
18
20
  make_dashboard_urn,
21
+ make_data_platform_urn,
19
22
  make_dataset_urn,
23
+ make_dataset_urn_with_platform_instance,
20
24
  make_domain_urn,
21
25
  )
22
26
  from datahub.emitter.mcp_builder import add_domain_to_entity_wu
@@ -31,6 +35,7 @@ from datahub.ingestion.api.decorators import (
31
35
  )
32
36
  from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
33
37
  from datahub.ingestion.api.workunit import MetadataWorkUnit
38
+ from datahub.ingestion.source.sql.sql_types import resolve_sql_type
34
39
  from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
35
40
  get_platform_from_sqlalchemy_uri,
36
41
  )
@@ -47,16 +52,26 @@ from datahub.metadata.com.linkedin.pegasus2avro.common import (
47
52
  AuditStamp,
48
53
  ChangeAuditStamps,
49
54
  Status,
55
+ TimeStamp,
50
56
  )
51
57
  from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
52
58
  ChartSnapshot,
53
59
  DashboardSnapshot,
60
+ DatasetSnapshot,
54
61
  )
55
62
  from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
63
+ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
64
+ MySqlDDL,
65
+ NullType,
66
+ SchemaField,
67
+ SchemaFieldDataType,
68
+ SchemaMetadata,
69
+ )
56
70
  from datahub.metadata.schema_classes import (
57
71
  ChartInfoClass,
58
72
  ChartTypeClass,
59
73
  DashboardInfoClass,
74
+ DatasetPropertiesClass,
60
75
  )
61
76
  from datahub.utilities import config_clean
62
77
  from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -82,9 +97,29 @@ chart_type_from_viz_type = {
82
97
  "box_plot": ChartTypeClass.BAR,
83
98
  }
84
99
 
100
+
85
101
  platform_without_databases = ["druid"]
86
102
 
87
103
 
104
+ class SupersetDataset(BaseModel):
105
+ id: int
106
+ table_name: str
107
+ changed_on_utc: Optional[str] = None
108
+ explore_url: Optional[str] = ""
109
+
110
+ @property
111
+ def modified_dt(self) -> Optional[datetime]:
112
+ if self.changed_on_utc:
113
+ return dp.parse(self.changed_on_utc)
114
+ return None
115
+
116
+ @property
117
+ def modified_ts(self) -> Optional[int]:
118
+ if self.modified_dt:
119
+ return int(self.modified_dt.timestamp() * 1000)
120
+ return None
121
+
122
+
88
123
  class SupersetConfig(
89
124
  StatefulIngestionConfigBase, EnvConfigMixin, PlatformInstanceConfigMixin
90
125
  ):
@@ -103,15 +138,17 @@ class SupersetConfig(
103
138
  )
104
139
  username: Optional[str] = Field(default=None, description="Superset username.")
105
140
  password: Optional[str] = Field(default=None, description="Superset password.")
106
- api_key: Optional[str] = Field(default=None, description="Preset.io API key.")
107
- api_secret: Optional[str] = Field(default=None, description="Preset.io API secret.")
108
- manager_uri: str = Field(
109
- default="https://api.app.preset.io", description="Preset.io API URL"
110
- )
111
141
  # Configuration for stateful ingestion
112
142
  stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
113
143
  default=None, description="Superset Stateful Ingestion Config."
114
144
  )
145
+ ingest_dashboards: bool = Field(
146
+ default=True, description="Enable to ingest dashboards."
147
+ )
148
+ ingest_charts: bool = Field(default=True, description="Enable to ingest charts.")
149
+ ingest_datasets: bool = Field(
150
+ default=False, description="Enable to ingest datasets."
151
+ )
115
152
 
116
153
  provider: str = Field(default="db", description="Superset provider.")
117
154
  options: Dict = Field(default={}, description="")
@@ -123,6 +160,10 @@ class SupersetConfig(
123
160
  description="Can be used to change mapping for database names in superset to what you have in datahub",
124
161
  )
125
162
 
163
+ class Config:
164
+ # This is required to allow preset configs to get parsed
165
+ extra = "allow"
166
+
126
167
  @validator("connect_uri", "display_uri")
127
168
  def remove_trailing_slash(cls, v):
128
169
  return config_clean.remove_trailing_slashes(v)
@@ -229,6 +270,28 @@ class SupersetSource(StatefulIngestionSourceBase):
229
270
  config = SupersetConfig.parse_obj(config_dict)
230
271
  return cls(ctx, config)
231
272
 
273
+ def paginate_entity_api_results(self, entity_type, page_size=100):
274
+ current_page = 0
275
+ total_items = page_size
276
+
277
+ while current_page * page_size < total_items:
278
+ response = self.session.get(
279
+ f"{self.config.connect_uri}/api/v1/{entity_type}/",
280
+ params={"q": f"(page:{current_page},page_size:{page_size})"},
281
+ )
282
+
283
+ if response.status_code != 200:
284
+ logger.warning(f"Failed to get {entity_type} data: {response.text}")
285
+
286
+ payload = response.json()
287
+ # Update total_items with the actual count from the response
288
+ total_items = payload.get("count", total_items)
289
+ # Yield each item in the result, this gets passed into the construct functions
290
+ for item in payload.get("result", []):
291
+ yield item
292
+
293
+ current_page += 1
294
+
232
295
  @lru_cache(maxsize=None)
233
296
  def get_platform_from_database_id(self, database_id):
234
297
  database_response = self.session.get(
@@ -250,11 +313,18 @@ class SupersetSource(StatefulIngestionSourceBase):
250
313
  return platform_name
251
314
 
252
315
  @lru_cache(maxsize=None)
253
- def get_datasource_urn_from_id(self, datasource_id):
316
+ def get_dataset_info(self, dataset_id: int) -> dict:
254
317
  dataset_response = self.session.get(
255
- f"{self.config.connect_uri}/api/v1/dataset/{datasource_id}"
256
- ).json()
257
-
318
+ f"{self.config.connect_uri}/api/v1/dataset/{dataset_id}",
319
+ )
320
+ if dataset_response.status_code != 200:
321
+ logger.warning(f"Failed to get dataset info: {dataset_response.text}")
322
+ dataset_response.raise_for_status()
323
+ return dataset_response.json()
324
+
325
+ def get_datasource_urn_from_id(
326
+ self, dataset_response: dict, platform_instance: str
327
+ ) -> str:
258
328
  schema_name = dataset_response.get("result", {}).get("schema")
259
329
  table_name = dataset_response.get("result", {}).get("table_name")
260
330
  database_id = dataset_response.get("result", {}).get("database", {}).get("id")
@@ -283,9 +353,11 @@ class SupersetSource(StatefulIngestionSourceBase):
283
353
  ),
284
354
  env=self.config.env,
285
355
  )
286
- return None
356
+ raise ValueError("Could not construct dataset URN")
287
357
 
288
- def construct_dashboard_from_api_data(self, dashboard_data):
358
+ def construct_dashboard_from_api_data(
359
+ self, dashboard_data: dict
360
+ ) -> DashboardSnapshot:
289
361
  dashboard_urn = make_dashboard_urn(
290
362
  platform=self.platform,
291
363
  name=dashboard_data["id"],
@@ -340,7 +412,7 @@ class SupersetSource(StatefulIngestionSourceBase):
340
412
  }
341
413
 
342
414
  if dashboard_data.get("certified_by"):
343
- custom_properties["CertifiedBy"] = dashboard_data.get("certified_by")
415
+ custom_properties["CertifiedBy"] = dashboard_data.get("certified_by", "")
344
416
  custom_properties["CertificationDetails"] = str(
345
417
  dashboard_data.get("certification_details")
346
418
  )
@@ -358,38 +430,25 @@ class SupersetSource(StatefulIngestionSourceBase):
358
430
  return dashboard_snapshot
359
431
 
360
432
  def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
361
- current_dashboard_page = 0
362
- # we will set total dashboards to the actual number after we get the response
363
- total_dashboards = PAGE_SIZE
364
-
365
- while current_dashboard_page * PAGE_SIZE <= total_dashboards:
366
- dashboard_response = self.session.get(
367
- f"{self.config.connect_uri}/api/v1/dashboard/",
368
- params=f"q=(page:{current_dashboard_page},page_size:{PAGE_SIZE})",
369
- )
370
- if dashboard_response.status_code != 200:
371
- logger.warning(
372
- f"Failed to get dashboard data: {dashboard_response.text}"
373
- )
374
- dashboard_response.raise_for_status()
375
-
376
- payload = dashboard_response.json()
377
- total_dashboards = payload.get("count") or 0
378
-
379
- current_dashboard_page += 1
380
-
381
- for dashboard_data in payload["result"]:
433
+ for dashboard_data in self.paginate_entity_api_results("dashboard", PAGE_SIZE):
434
+ try:
382
435
  dashboard_snapshot = self.construct_dashboard_from_api_data(
383
436
  dashboard_data
384
437
  )
385
- mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
386
- yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
387
- yield from self._get_domain_wu(
388
- title=dashboard_data.get("dashboard_title", ""),
389
- entity_urn=dashboard_snapshot.urn,
438
+ except Exception as e:
439
+ self.report.warning(
440
+ f"Failed to construct dashboard snapshot. Dashboard name: {dashboard_data.get('dashboard_title')}. Error: \n{e}"
390
441
  )
442
+ continue
443
+ # Emit the dashboard
444
+ mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
445
+ yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
446
+ yield from self._get_domain_wu(
447
+ title=dashboard_data.get("dashboard_title", ""),
448
+ entity_urn=dashboard_snapshot.urn,
449
+ )
391
450
 
392
- def construct_chart_from_chart_data(self, chart_data):
451
+ def construct_chart_from_chart_data(self, chart_data: dict) -> ChartSnapshot:
393
452
  chart_urn = make_chart_urn(
394
453
  platform=self.platform,
395
454
  name=chart_data["id"],
@@ -415,9 +474,12 @@ class SupersetSource(StatefulIngestionSourceBase):
415
474
  chart_url = f"{self.config.display_uri}{chart_data.get('url', '')}"
416
475
 
417
476
  datasource_id = chart_data.get("datasource_id")
418
- datasource_urn = self.get_datasource_urn_from_id(datasource_id)
477
+ dataset_response = self.get_dataset_info(datasource_id)
478
+ datasource_urn = self.get_datasource_urn_from_id(
479
+ dataset_response, self.platform
480
+ )
419
481
 
420
- params = json.loads(chart_data.get("params"))
482
+ params = json.loads(chart_data.get("params", "{}"))
421
483
  metrics = [
422
484
  get_metric_name(metric)
423
485
  for metric in (params.get("metrics", []) or [params.get("metric")])
@@ -467,36 +529,124 @@ class SupersetSource(StatefulIngestionSourceBase):
467
529
  return chart_snapshot
468
530
 
469
531
  def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
470
- current_chart_page = 0
471
- # we will set total charts to the actual number after we get the response
472
- total_charts = PAGE_SIZE
473
-
474
- while current_chart_page * PAGE_SIZE <= total_charts:
475
- chart_response = self.session.get(
476
- f"{self.config.connect_uri}/api/v1/chart/",
477
- params=f"q=(page:{current_chart_page},page_size:{PAGE_SIZE})",
532
+ for chart_data in self.paginate_entity_api_results("chart", PAGE_SIZE):
533
+ try:
534
+ chart_snapshot = self.construct_chart_from_chart_data(chart_data)
535
+
536
+ mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
537
+ except Exception as e:
538
+ self.report.warning(
539
+ f"Failed to construct chart snapshot. Chart name: {chart_data.get('table_name')}. Error: \n{e}"
540
+ )
541
+ continue
542
+ # Emit the chart
543
+ yield MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
544
+ yield from self._get_domain_wu(
545
+ title=chart_data.get("slice_name", ""),
546
+ entity_urn=chart_snapshot.urn,
478
547
  )
479
- if chart_response.status_code != 200:
480
- logger.warning(f"Failed to get chart data: {chart_response.text}")
481
- chart_response.raise_for_status()
482
548
 
483
- current_chart_page += 1
549
+ def gen_schema_fields(self, column_data: List[Dict[str, str]]) -> List[SchemaField]:
550
+ schema_fields: List[SchemaField] = []
551
+ for col in column_data:
552
+ col_type = (col.get("type") or "").lower()
553
+ data_type = resolve_sql_type(col_type)
554
+ if data_type is None:
555
+ data_type = NullType()
556
+
557
+ field = SchemaField(
558
+ fieldPath=col.get("column_name", ""),
559
+ type=SchemaFieldDataType(data_type),
560
+ nativeDataType="",
561
+ description=col.get("column_name", ""),
562
+ nullable=True,
563
+ )
564
+ schema_fields.append(field)
565
+ return schema_fields
566
+
567
+ def gen_schema_metadata(
568
+ self,
569
+ dataset_response: dict,
570
+ ) -> SchemaMetadata:
571
+ dataset_response = dataset_response.get("result", {})
572
+ column_data = dataset_response.get("columns", [])
573
+ schema_metadata = SchemaMetadata(
574
+ schemaName=dataset_response.get("table_name", ""),
575
+ platform=make_data_platform_urn(self.platform),
576
+ version=0,
577
+ hash="",
578
+ platformSchema=MySqlDDL(tableSchema=""),
579
+ fields=self.gen_schema_fields(column_data),
580
+ )
581
+ return schema_metadata
484
582
 
485
- payload = chart_response.json()
486
- total_charts = payload["count"]
487
- for chart_data in payload["result"]:
488
- chart_snapshot = self.construct_chart_from_chart_data(chart_data)
583
+ def gen_dataset_urn(self, datahub_dataset_name: str) -> str:
584
+ return make_dataset_urn_with_platform_instance(
585
+ platform=self.platform,
586
+ name=datahub_dataset_name,
587
+ platform_instance=self.config.platform_instance,
588
+ env=self.config.env,
589
+ )
489
590
 
490
- mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
491
- yield MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
492
- yield from self._get_domain_wu(
493
- title=chart_data.get("slice_name", ""),
494
- entity_urn=chart_snapshot.urn,
591
+ def construct_dataset_from_dataset_data(
592
+ self, dataset_data: dict
593
+ ) -> DatasetSnapshot:
594
+ dataset_response = self.get_dataset_info(dataset_data.get("id"))
595
+ dataset = SupersetDataset(**dataset_response["result"])
596
+ datasource_urn = self.get_datasource_urn_from_id(
597
+ dataset_response, self.platform
598
+ )
599
+
600
+ dataset_url = f"{self.config.display_uri}{dataset.explore_url or ''}"
601
+
602
+ dataset_info = DatasetPropertiesClass(
603
+ name=dataset.table_name,
604
+ description="",
605
+ lastModified=TimeStamp(time=dataset.modified_ts)
606
+ if dataset.modified_ts
607
+ else None,
608
+ externalUrl=dataset_url,
609
+ )
610
+ aspects_items: List[Any] = []
611
+ aspects_items.extend(
612
+ [
613
+ self.gen_schema_metadata(dataset_response),
614
+ dataset_info,
615
+ ]
616
+ )
617
+
618
+ dataset_snapshot = DatasetSnapshot(
619
+ urn=datasource_urn,
620
+ aspects=aspects_items,
621
+ )
622
+ return dataset_snapshot
623
+
624
+ def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
625
+ for dataset_data in self.paginate_entity_api_results("dataset", PAGE_SIZE):
626
+ try:
627
+ dataset_snapshot = self.construct_dataset_from_dataset_data(
628
+ dataset_data
495
629
  )
630
+ mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
631
+ except Exception as e:
632
+ self.report.warning(
633
+ f"Failed to construct dataset snapshot. Dataset name: {dataset_data.get('table_name')}. Error: \n{e}"
634
+ )
635
+ continue
636
+ # Emit the dataset
637
+ yield MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
638
+ yield from self._get_domain_wu(
639
+ title=dataset_data.get("table_name", ""),
640
+ entity_urn=dataset_snapshot.urn,
641
+ )
496
642
 
497
643
  def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
498
- yield from self.emit_dashboard_mces()
499
- yield from self.emit_chart_mces()
644
+ if self.config.ingest_dashboards:
645
+ yield from self.emit_dashboard_mces()
646
+ if self.config.ingest_charts:
647
+ yield from self.emit_chart_mces()
648
+ if self.config.ingest_datasets:
649
+ yield from self.emit_dataset_mces()
500
650
 
501
651
  def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
502
652
  return [