acryl-datahub 0.14.1.13rc8__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (139)
  1. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2506 -2456
  2. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +136 -131
  3. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +0 -1
  35. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +0 -21
  36. datahub/ingestion/source/bigquery_v2/profiler.py +0 -6
  37. datahub/ingestion/source/common/subtypes.py +2 -0
  38. datahub/ingestion/source/csv_enricher.py +1 -1
  39. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  40. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  41. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  42. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  43. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  44. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  45. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  46. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  47. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  48. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  49. datahub/ingestion/source/elastic_search.py +1 -1
  50. datahub/ingestion/source/feast.py +97 -6
  51. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  52. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  53. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  54. datahub/ingestion/source/ge_data_profiler.py +46 -9
  55. datahub/ingestion/source/ge_profiling_config.py +5 -0
  56. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  57. datahub/ingestion/source/kafka/kafka.py +39 -19
  58. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  59. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  60. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  61. datahub/ingestion/source/looker/view_upstream.py +65 -30
  62. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  63. datahub/ingestion/source/mode.py +0 -23
  64. datahub/ingestion/source/neo4j/__init__.py +0 -0
  65. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  66. datahub/ingestion/source/powerbi/__init__.py +0 -1
  67. datahub/ingestion/source/powerbi/config.py +3 -3
  68. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  69. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  70. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  71. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  72. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  73. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  74. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  75. datahub/ingestion/source/preset.py +1 -0
  76. datahub/ingestion/source/pulsar.py +21 -2
  77. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  78. datahub/ingestion/source/redash.py +13 -63
  79. datahub/ingestion/source/redshift/config.py +1 -0
  80. datahub/ingestion/source/redshift/redshift.py +3 -0
  81. datahub/ingestion/source/s3/source.py +2 -3
  82. datahub/ingestion/source/sigma/data_classes.py +1 -0
  83. datahub/ingestion/source/sigma/sigma.py +101 -43
  84. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  85. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  86. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  87. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  88. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  89. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  90. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  91. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  92. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  93. datahub/ingestion/source/sql/athena.py +46 -22
  94. datahub/ingestion/source/sql/mssql/source.py +18 -6
  95. datahub/ingestion/source/sql/sql_common.py +34 -21
  96. datahub/ingestion/source/sql/sql_report.py +1 -0
  97. datahub/ingestion/source/sql/sql_types.py +85 -8
  98. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  99. datahub/ingestion/source/superset.py +215 -65
  100. datahub/ingestion/source/tableau/tableau.py +237 -76
  101. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  102. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  103. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  104. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  105. datahub/ingestion/source/unity/proxy_types.py +1 -0
  106. datahub/ingestion/source/unity/source.py +4 -0
  107. datahub/ingestion/source/unity/usage.py +20 -11
  108. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  109. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  110. datahub/integrations/assertion/common.py +1 -1
  111. datahub/lite/duckdb_lite.py +12 -17
  112. datahub/metadata/_schema_classes.py +512 -392
  113. datahub/metadata/_urns/urn_defs.py +1355 -1355
  114. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  115. datahub/metadata/schema.avsc +17222 -17499
  116. datahub/metadata/schemas/FormInfo.avsc +4 -0
  117. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  118. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  119. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  120. datahub/specific/chart.py +0 -39
  121. datahub/specific/dashboard.py +0 -39
  122. datahub/specific/datajob.py +7 -57
  123. datahub/sql_parsing/schema_resolver.py +23 -0
  124. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  125. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  126. datahub/sql_parsing/sqlglot_utils.py +8 -2
  127. datahub/telemetry/telemetry.py +23 -9
  128. datahub/testing/compare_metadata_json.py +1 -1
  129. datahub/testing/doctest.py +12 -0
  130. datahub/utilities/file_backed_collections.py +35 -2
  131. datahub/utilities/partition_executor.py +1 -1
  132. datahub/utilities/urn_encoder.py +2 -1
  133. datahub/utilities/urns/_urn_base.py +1 -1
  134. datahub/utilities/urns/structured_properties_urn.py +1 -1
  135. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  136. datahub/utilities/sql_parser.py +0 -94
  137. datahub/utilities/sql_parser_base.py +0 -21
  138. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  139. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/sql_common.py

@@ -582,6 +582,8 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             generate_operations=False,
         )
         for dataset_name in self._view_definition_cache.keys():
+            # TODO: Ensure that the lineage generated from the view definition
+            # matches the dataset_name.
             view_definition = self._view_definition_cache[dataset_name]
             result = self._run_sql_parser(
                 dataset_name,
@@ -1059,6 +1061,20 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                 exc=e,
             )
 
+    def _get_view_definition(self, inspector: Inspector, schema: str, view: str) -> str:
+        try:
+            view_definition = inspector.get_view_definition(view, schema)
+            if view_definition is None:
+                view_definition = ""
+            else:
+                # Some dialects return a TextClause instead of a raw string,
+                # so we need to convert them to a string.
+                view_definition = str(view_definition)
+        except NotImplementedError:
+            view_definition = ""
+
+        return view_definition
+
     def _process_view(
         self,
         dataset_name: str,
@@ -1077,7 +1093,10 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             columns = inspector.get_columns(view, schema)
         except KeyError:
             # For certain types of views, we are unable to fetch the list of columns.
-            self.warn(logger, dataset_name, "unable to get schema for this view")
+            self.report.warning(
+                message="Unable to get schema for a view",
+                context=f"{dataset_name}",
+            )
             schema_metadata = None
         else:
             schema_fields = self.get_schema_fields(dataset_name, columns, inspector)
@@ -1091,19 +1110,12 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         if self._save_schema_to_resolver():
             self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
             self.discovered_datasets.add(dataset_name)
+
         description, properties, _ = self.get_table_properties(inspector, schema, view)
-        try:
-            view_definition = inspector.get_view_definition(view, schema)
-            if view_definition is None:
-                view_definition = ""
-            else:
-                # Some dialects return a TextClause instead of a raw string,
-                # so we need to convert them to a string.
-                view_definition = str(view_definition)
-        except NotImplementedError:
-            view_definition = ""
-        properties["view_definition"] = view_definition
         properties["is_view"] = "True"
+
+        view_definition = self._get_view_definition(inspector, schema, view)
+        properties["view_definition"] = view_definition
         if view_definition and self.config.include_view_lineage:
             self._view_definition_cache[dataset_name] = view_definition
 
@@ -1135,15 +1147,14 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             entityUrn=dataset_urn,
             aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW]),
         ).as_workunit()
-        if "view_definition" in properties:
-            view_definition_string = properties["view_definition"]
-            view_properties_aspect = ViewPropertiesClass(
-                materialized=False, viewLanguage="SQL", viewLogic=view_definition_string
-            )
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=view_properties_aspect,
-            ).as_workunit()
+
+        view_properties_aspect = ViewPropertiesClass(
+            materialized=False, viewLanguage="SQL", viewLogic=view_definition
+        )
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dataset_urn,
+            aspect=view_properties_aspect,
+        ).as_workunit()
 
         if self.config.domain and self.domain_registry:
             yield from get_domain_wu(
@@ -1197,6 +1208,8 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             )
         else:
             self.report.num_view_definitions_parsed += 1
+            if raw_lineage.out_tables != [view_urn]:
+                self.report.num_view_definitions_view_urn_mismatch += 1
         return view_definition_lineage_helper(raw_lineage, view_urn)
 
     def get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]:
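Note (editor's sketch, not part of the diff): the view-definition handling above is now centralized in `_get_view_definition`. The standalone snippet below mirrors that normalization against a plain SQLAlchemy engine so the dialect quirks are visible in isolation; the sqlite URL, table, and view names are illustrative, and the `schema` argument is relaxed to `Optional[str]` for the demo.

from typing import Optional

from sqlalchemy import create_engine, inspect, text
from sqlalchemy.engine.reflection import Inspector


def get_view_definition_safe(inspector: Inspector, schema: Optional[str], view: str) -> str:
    try:
        view_definition = inspector.get_view_definition(view, schema)
        # Some dialects return None or a TextClause; normalize to a plain string.
        return "" if view_definition is None else str(view_definition)
    except NotImplementedError:
        # Dialects without view reflection simply yield an empty definition.
        return ""


engine = create_engine("sqlite:///:memory:")
with engine.begin() as conn:
    conn.execute(text("CREATE TABLE t (id INTEGER)"))
    conn.execute(text("CREATE VIEW v AS SELECT id FROM t"))

print(get_view_definition_safe(inspect(engine), None, "v"))  # prints the stored CREATE VIEW statement

Since `properties["view_definition"]` is now always set (possibly to an empty string), the old `"view_definition" in properties` guard before emitting `ViewPropertiesClass` became redundant and was dropped in the hunk above.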
datahub/ingestion/source/sql/sql_report.py

@@ -48,6 +48,7 @@ class SQLSourceReport(
     query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None
 
     num_view_definitions_parsed: int = 0
+    num_view_definitions_view_urn_mismatch: int = 0
    num_view_definitions_failed_parsing: int = 0
     num_view_definitions_failed_column_parsing: int = 0
     view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList)
datahub/ingestion/source/sql/sql_types.py

@@ -1,5 +1,5 @@
 import re
-from typing import Any, Dict, ValuesView
+from typing import Any, Dict, Optional, Type, Union, ValuesView
 
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     ArrayType,
@@ -16,14 +16,28 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     UnionType,
 )
 
-# these can be obtained by running `select format_type(oid, null),* from pg_type;`
-# we've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.)
-# (run `\copy (select format_type(oid, null),* from pg_type) to 'pg_type.csv' csv header;` to get a CSV)
+DATAHUB_FIELD_TYPE = Union[
+    ArrayType,
+    BooleanType,
+    BytesType,
+    DateType,
+    EnumType,
+    MapType,
+    NullType,
+    NumberType,
+    RecordType,
+    StringType,
+    TimeType,
+    UnionType,
+]
 
-# we map from format_type since this is what dbt uses
-# see https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22
 
-# see https://www.npgsql.org/dev/types.html for helpful type annotations
+# These can be obtained by running `select format_type(oid, null),* from pg_type;`
+# We've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.)
+# (run `\copy (select format_type(oid, null),* from pg_type) to 'pg_type.csv' csv header;` to get a CSV)
+# We map from format_type since this is what dbt uses.
+# See https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22
+# See https://www.npgsql.org/dev/types.html for helpful type annotations
 POSTGRES_TYPES_MAP: Dict[str, Any] = {
     "boolean": BooleanType,
     "bytea": BytesType,
@@ -262,7 +276,6 @@ def resolve_vertica_modified_type(type_string: str) -> Any:
     return VERTICA_SQL_TYPES_MAP[type_string]
 
 
-# see https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html
 SNOWFLAKE_TYPES_MAP: Dict[str, Any] = {
     "NUMBER": NumberType,
     "DECIMAL": NumberType,
@@ -298,6 +311,18 @@ SNOWFLAKE_TYPES_MAP: Dict[str, Any] = {
     "GEOGRAPHY": None,
 }
 
+
+def resolve_snowflake_modified_type(type_string: str) -> Any:
+    # Match types with precision and scale, e.g., 'DECIMAL(38,0)'
+    match = re.match(r"([a-zA-Z_]+)\(\d+,\s\d+\)", type_string)
+    if match:
+        modified_type_base = match.group(1)  # Extract the base type
+        return SNOWFLAKE_TYPES_MAP.get(modified_type_base, None)
+
+    # Fallback for types without precision/scale
+    return SNOWFLAKE_TYPES_MAP.get(type_string, None)
+
+
 # see https://github.com/googleapis/python-bigquery-sqlalchemy/blob/main/sqlalchemy_bigquery/_types.py#L32
 BIGQUERY_TYPES_MAP: Dict[str, Any] = {
     "STRING": StringType,
@@ -366,6 +391,7 @@ TRINO_SQL_TYPES_MAP: Dict[str, Any] = {
     "row": RecordType,
     "map": MapType,
     "array": ArrayType,
+    "json": RecordType,
 }
 
 # https://docs.aws.amazon.com/athena/latest/ug/data-types.html
@@ -430,3 +456,54 @@ VERTICA_SQL_TYPES_MAP: Dict[str, Any] = {
     "geography": None,
     "uuid": StringType,
 }
+
+
+_merged_mapping = {
+    "boolean": BooleanType,
+    "date": DateType,
+    "time": TimeType,
+    "numeric": NumberType,
+    "text": StringType,
+    "timestamp with time zone": DateType,
+    "timestamp without time zone": DateType,
+    "integer": NumberType,
+    "float8": NumberType,
+    "struct": RecordType,
+    **POSTGRES_TYPES_MAP,
+    **SNOWFLAKE_TYPES_MAP,
+    **BIGQUERY_TYPES_MAP,
+    **SPARK_SQL_TYPES_MAP,
+    **TRINO_SQL_TYPES_MAP,
+    **ATHENA_SQL_TYPES_MAP,
+    **VERTICA_SQL_TYPES_MAP,
+}
+
+
+def resolve_sql_type(
+    column_type: Optional[str],
+    platform: Optional[str] = None,
+) -> Optional[DATAHUB_FIELD_TYPE]:
+    # In theory, we should use the platform-specific mapping where available.
+    # However, the types don't ever conflict, so the merged mapping is fine.
+    TypeClass: Optional[Type[DATAHUB_FIELD_TYPE]] = (
+        _merged_mapping.get(column_type) if column_type else None
+    )
+
+    if TypeClass is None and column_type:
+        # resolve a modified type
+        if platform == "trino":
+            TypeClass = resolve_trino_modified_type(column_type)
+        elif platform == "athena":
+            TypeClass = resolve_athena_modified_type(column_type)
+        elif platform == "postgres" or platform == "redshift":
+            # Redshift uses a variant of Postgres, so we can use the same logic.
+            TypeClass = resolve_postgres_modified_type(column_type)
+        elif platform == "vertica":
+            TypeClass = resolve_vertica_modified_type(column_type)
+        elif platform == "snowflake":
+            # Snowflake types are uppercase, so we check that.
+            TypeClass = resolve_snowflake_modified_type(column_type.upper())
+
+    if TypeClass:
+        return TypeClass()
+    return None
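Note (editor's sketch, not part of the diff): `resolve_sql_type` is the new public entry point over the merged type mapping, and the reworked Superset source further down imports it. A minimal usage sketch with illustrative inputs; the exact class returned for a platform-specific "modified" type depends on the per-platform resolver, so that check is deliberately loose.

from datahub.ingestion.source.sql.sql_types import resolve_sql_type
from datahub.metadata.com.linkedin.pegasus2avro.schema import BooleanType, NumberType

# An exact match against the merged mapping returns an instantiated DataHub type.
assert isinstance(resolve_sql_type("boolean"), BooleanType)

# Types with precision/scale fall through to the platform-specific resolver;
# depending on its pattern this yields a NumberType instance or None.
resolved = resolve_sql_type("decimal(38, 0)", platform="snowflake")
assert resolved is None or isinstance(resolved, NumberType)

# Unknown types resolve to None instead of raising.
assert resolve_sql_type("some_custom_type") is None

The merged mapping keeps exact-match lookups platform-agnostic; the `platform` argument only matters for the parameterized "modified" forms.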
datahub/ingestion/source/state/redundant_run_skip_handler.py

@@ -69,7 +69,7 @@ class RedundantRunSkipHandler(
         platform: Optional[str] = None
         source_class = type(self.source)
         if hasattr(source_class, "get_platform_name"):
-            platform = source_class.get_platform_name()  # type: ignore
+            platform = source_class.get_platform_name()
 
         # Default name for everything else
         job_name_suffix = self.get_job_name_suffix()
datahub/ingestion/source/superset.py

@@ -1,10 +1,12 @@
 import json
 import logging
+from datetime import datetime
 from functools import lru_cache
-from typing import Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional
 
 import dateutil.parser as dp
 import requests
+from pydantic import BaseModel
 from pydantic.class_validators import root_validator, validator
 from pydantic.fields import Field
 
@@ -16,7 +18,9 @@ from datahub.configuration.source_common import (
 from datahub.emitter.mce_builder import (
     make_chart_urn,
     make_dashboard_urn,
+    make_data_platform_urn,
     make_dataset_urn,
+    make_dataset_urn_with_platform_instance,
     make_domain_urn,
 )
 from datahub.emitter.mcp_builder import add_domain_to_entity_wu
@@ -31,6 +35,7 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.sql.sql_types import resolve_sql_type
 from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
     get_platform_from_sqlalchemy_uri,
 )
@@ -47,16 +52,26 @@ from datahub.metadata.com.linkedin.pegasus2avro.common import (
     AuditStamp,
     ChangeAuditStamps,
     Status,
+    TimeStamp,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
     ChartSnapshot,
     DashboardSnapshot,
+    DatasetSnapshot,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
+from datahub.metadata.com.linkedin.pegasus2avro.schema import (
+    MySqlDDL,
+    NullType,
+    SchemaField,
+    SchemaFieldDataType,
+    SchemaMetadata,
+)
 from datahub.metadata.schema_classes import (
     ChartInfoClass,
     ChartTypeClass,
     DashboardInfoClass,
+    DatasetPropertiesClass,
 )
 from datahub.utilities import config_clean
 from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -82,9 +97,29 @@ chart_type_from_viz_type = {
     "box_plot": ChartTypeClass.BAR,
 }
 
+
 platform_without_databases = ["druid"]
 
 
+class SupersetDataset(BaseModel):
+    id: int
+    table_name: str
+    changed_on_utc: Optional[str] = None
+    explore_url: Optional[str] = ""
+
+    @property
+    def modified_dt(self) -> Optional[datetime]:
+        if self.changed_on_utc:
+            return dp.parse(self.changed_on_utc)
+        return None
+
+    @property
+    def modified_ts(self) -> Optional[int]:
+        if self.modified_dt:
+            return int(self.modified_dt.timestamp() * 1000)
+        return None
+
+
 class SupersetConfig(
     StatefulIngestionConfigBase, EnvConfigMixin, PlatformInstanceConfigMixin
 ):
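Note (editor's sketch, not part of the diff): `SupersetDataset` is a small pydantic helper over the `/api/v1/dataset/<id>` payload, and its `modified_ts` is what later feeds `DatasetPropertiesClass.lastModified`. The payload values below are illustrative.

from datahub.ingestion.source.superset import SupersetDataset

dataset = SupersetDataset(
    id=42,
    table_name="orders",
    changed_on_utc="2024-11-01 12:30:00.000000+00:00",
    explore_url="/explore/?datasource_type=table&datasource_id=42",
)

print(dataset.modified_dt)  # datetime parsed with dateutil
print(dataset.modified_ts)  # epoch milliseconds, or None when changed_on_utc is absent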
@@ -103,15 +138,17 @@ class SupersetConfig(
     )
     username: Optional[str] = Field(default=None, description="Superset username.")
     password: Optional[str] = Field(default=None, description="Superset password.")
-    api_key: Optional[str] = Field(default=None, description="Preset.io API key.")
-    api_secret: Optional[str] = Field(default=None, description="Preset.io API secret.")
-    manager_uri: str = Field(
-        default="https://api.app.preset.io", description="Preset.io API URL"
-    )
     # Configuration for stateful ingestion
     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
         default=None, description="Superset Stateful Ingestion Config."
     )
+    ingest_dashboards: bool = Field(
+        default=True, description="Enable to ingest dashboards."
+    )
+    ingest_charts: bool = Field(default=True, description="Enable to ingest charts.")
+    ingest_datasets: bool = Field(
+        default=False, description="Enable to ingest datasets."
+    )
 
     provider: str = Field(default="db", description="Superset provider.")
     options: Dict = Field(default={}, description="")
@@ -123,6 +160,10 @@ class SupersetConfig(
         description="Can be used to change mapping for database names in superset to what you have in datahub",
     )
 
+    class Config:
+        # This is required to allow preset configs to get parsed
+        extra = "allow"
+
     @validator("connect_uri", "display_uri")
     def remove_trailing_slash(cls, v):
         return config_clean.remove_trailing_slashes(v)
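Note (editor's sketch, not part of the diff): the Preset-specific fields (`api_key`, `api_secret`, `manager_uri`) were removed from `SupersetConfig`, and `extra = "allow"` keeps recipes that still carry such keys parseable. The config values below are placeholders.

from datahub.ingestion.source.superset import SupersetConfig

config = SupersetConfig.parse_obj(
    {
        "connect_uri": "http://localhost:8088",
        "display_uri": "http://localhost:8088",
        "username": "admin",
        "password": "admin",
        "ingest_datasets": True,  # new flag; datasets are off by default
        "api_key": "preset-key",  # unknown keys are tolerated thanks to extra = "allow"
    }
)
assert config.ingest_dashboards and config.ingest_charts and config.ingest_datasets

With `ingest_datasets` defaulting to `False`, existing recipes keep their previous behavior unless they opt in.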
@@ -229,6 +270,28 @@ class SupersetSource(StatefulIngestionSourceBase):
         config = SupersetConfig.parse_obj(config_dict)
         return cls(ctx, config)
 
+    def paginate_entity_api_results(self, entity_type, page_size=100):
+        current_page = 0
+        total_items = page_size
+
+        while current_page * page_size < total_items:
+            response = self.session.get(
+                f"{self.config.connect_uri}/api/v1/{entity_type}/",
+                params={"q": f"(page:{current_page},page_size:{page_size})"},
+            )
+
+            if response.status_code != 200:
+                logger.warning(f"Failed to get {entity_type} data: {response.text}")
+
+            payload = response.json()
+            # Update total_items with the actual count from the response
+            total_items = payload.get("count", total_items)
+            # Yield each item in the result, this gets passed into the construct functions
+            for item in payload.get("result", []):
+                yield item
+
+            current_page += 1
+
     @lru_cache(maxsize=None)
     def get_platform_from_database_id(self, database_id):
         database_response = self.session.get(
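Note (editor's sketch, not part of the diff): `paginate_entity_api_results` drives all three emit_* methods further down. The standalone generator below reproduces the same count-driven pagination against Superset's REST API using `requests` directly; `base_url` is a placeholder.

import requests


def paginate(base_url: str, entity_type: str, page_size: int = 100):
    session = requests.Session()
    current_page, total_items = 0, page_size  # assume at least one page up front
    while current_page * page_size < total_items:
        response = session.get(
            f"{base_url}/api/v1/{entity_type}/",
            params={"q": f"(page:{current_page},page_size:{page_size})"},
        )
        payload = response.json()
        # Tighten the loop bound once the API reports the real count.
        total_items = payload.get("count", total_items)
        yield from payload.get("result", [])
        current_page += 1

Because it is a generator, consumers such as `emit_dashboard_mces` stream work units page by page instead of materializing every result first.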
@@ -250,11 +313,18 @@ class SupersetSource(StatefulIngestionSourceBase):
         return platform_name
 
     @lru_cache(maxsize=None)
-    def get_datasource_urn_from_id(self, datasource_id):
+    def get_dataset_info(self, dataset_id: int) -> dict:
         dataset_response = self.session.get(
-            f"{self.config.connect_uri}/api/v1/dataset/{datasource_id}"
-        ).json()
-
+            f"{self.config.connect_uri}/api/v1/dataset/{dataset_id}",
+        )
+        if dataset_response.status_code != 200:
+            logger.warning(f"Failed to get dataset info: {dataset_response.text}")
+            dataset_response.raise_for_status()
+        return dataset_response.json()
+
+    def get_datasource_urn_from_id(
+        self, dataset_response: dict, platform_instance: str
+    ) -> str:
         schema_name = dataset_response.get("result", {}).get("schema")
         table_name = dataset_response.get("result", {}).get("table_name")
         database_id = dataset_response.get("result", {}).get("database", {}).get("id")
@@ -283,9 +353,11 @@ class SupersetSource(StatefulIngestionSourceBase):
             ),
             env=self.config.env,
         )
-        return None
+        raise ValueError("Could not construct dataset URN")
 
-    def construct_dashboard_from_api_data(self, dashboard_data):
+    def construct_dashboard_from_api_data(
+        self, dashboard_data: dict
+    ) -> DashboardSnapshot:
         dashboard_urn = make_dashboard_urn(
             platform=self.platform,
             name=dashboard_data["id"],
@@ -340,7 +412,7 @@ class SupersetSource(StatefulIngestionSourceBase):
         }
 
         if dashboard_data.get("certified_by"):
-            custom_properties["CertifiedBy"] = dashboard_data.get("certified_by")
+            custom_properties["CertifiedBy"] = dashboard_data.get("certified_by", "")
             custom_properties["CertificationDetails"] = str(
                 dashboard_data.get("certification_details")
             )
@@ -358,38 +430,25 @@ class SupersetSource(StatefulIngestionSourceBase):
         return dashboard_snapshot
 
     def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
-        current_dashboard_page = 0
-        # we will set total dashboards to the actual number after we get the response
-        total_dashboards = PAGE_SIZE
-
-        while current_dashboard_page * PAGE_SIZE <= total_dashboards:
-            dashboard_response = self.session.get(
-                f"{self.config.connect_uri}/api/v1/dashboard/",
-                params=f"q=(page:{current_dashboard_page},page_size:{PAGE_SIZE})",
-            )
-            if dashboard_response.status_code != 200:
-                logger.warning(
-                    f"Failed to get dashboard data: {dashboard_response.text}"
-                )
-                dashboard_response.raise_for_status()
-
-            payload = dashboard_response.json()
-            total_dashboards = payload.get("count") or 0
-
-            current_dashboard_page += 1
-
-            for dashboard_data in payload["result"]:
+        for dashboard_data in self.paginate_entity_api_results("dashboard", PAGE_SIZE):
+            try:
                 dashboard_snapshot = self.construct_dashboard_from_api_data(
                     dashboard_data
                 )
-                mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
-                yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
-                yield from self._get_domain_wu(
-                    title=dashboard_data.get("dashboard_title", ""),
-                    entity_urn=dashboard_snapshot.urn,
+            except Exception as e:
+                self.report.warning(
+                    f"Failed to construct dashboard snapshot. Dashboard name: {dashboard_data.get('dashboard_title')}. Error: \n{e}"
                 )
+                continue
+            # Emit the dashboard
+            mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
+            yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
+            yield from self._get_domain_wu(
+                title=dashboard_data.get("dashboard_title", ""),
+                entity_urn=dashboard_snapshot.urn,
+            )
 
-    def construct_chart_from_chart_data(self, chart_data):
+    def construct_chart_from_chart_data(self, chart_data: dict) -> ChartSnapshot:
         chart_urn = make_chart_urn(
             platform=self.platform,
             name=chart_data["id"],
@@ -415,9 +474,12 @@ class SupersetSource(StatefulIngestionSourceBase):
         chart_url = f"{self.config.display_uri}{chart_data.get('url', '')}"
 
         datasource_id = chart_data.get("datasource_id")
-        datasource_urn = self.get_datasource_urn_from_id(datasource_id)
+        dataset_response = self.get_dataset_info(datasource_id)
+        datasource_urn = self.get_datasource_urn_from_id(
+            dataset_response, self.platform
+        )
 
-        params = json.loads(chart_data.get("params"))
+        params = json.loads(chart_data.get("params", "{}"))
         metrics = [
             get_metric_name(metric)
             for metric in (params.get("metrics", []) or [params.get("metric")])
@@ -467,36 +529,124 @@ class SupersetSource(StatefulIngestionSourceBase):
         return chart_snapshot
 
     def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
-        current_chart_page = 0
-        # we will set total charts to the actual number after we get the response
-        total_charts = PAGE_SIZE
-
-        while current_chart_page * PAGE_SIZE <= total_charts:
-            chart_response = self.session.get(
-                f"{self.config.connect_uri}/api/v1/chart/",
-                params=f"q=(page:{current_chart_page},page_size:{PAGE_SIZE})",
+        for chart_data in self.paginate_entity_api_results("chart", PAGE_SIZE):
+            try:
+                chart_snapshot = self.construct_chart_from_chart_data(chart_data)
+
+                mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
+            except Exception as e:
+                self.report.warning(
+                    f"Failed to construct chart snapshot. Chart name: {chart_data.get('table_name')}. Error: \n{e}"
+                )
+                continue
+            # Emit the chart
+            yield MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
+            yield from self._get_domain_wu(
+                title=chart_data.get("slice_name", ""),
+                entity_urn=chart_snapshot.urn,
             )
-            if chart_response.status_code != 200:
-                logger.warning(f"Failed to get chart data: {chart_response.text}")
-                chart_response.raise_for_status()
 
-            current_chart_page += 1
+    def gen_schema_fields(self, column_data: List[Dict[str, str]]) -> List[SchemaField]:
+        schema_fields: List[SchemaField] = []
+        for col in column_data:
+            col_type = (col.get("type") or "").lower()
+            data_type = resolve_sql_type(col_type)
+            if data_type is None:
+                data_type = NullType()
+
+            field = SchemaField(
+                fieldPath=col.get("column_name", ""),
+                type=SchemaFieldDataType(data_type),
+                nativeDataType="",
+                description=col.get("column_name", ""),
+                nullable=True,
+            )
+            schema_fields.append(field)
+        return schema_fields
+
+    def gen_schema_metadata(
+        self,
+        dataset_response: dict,
+    ) -> SchemaMetadata:
+        dataset_response = dataset_response.get("result", {})
+        column_data = dataset_response.get("columns", [])
+        schema_metadata = SchemaMetadata(
+            schemaName=dataset_response.get("table_name", ""),
+            platform=make_data_platform_urn(self.platform),
+            version=0,
+            hash="",
+            platformSchema=MySqlDDL(tableSchema=""),
+            fields=self.gen_schema_fields(column_data),
+        )
+        return schema_metadata
 
-        payload = chart_response.json()
-        total_charts = payload["count"]
-        for chart_data in payload["result"]:
-            chart_snapshot = self.construct_chart_from_chart_data(chart_data)
+    def gen_dataset_urn(self, datahub_dataset_name: str) -> str:
+        return make_dataset_urn_with_platform_instance(
+            platform=self.platform,
+            name=datahub_dataset_name,
+            platform_instance=self.config.platform_instance,
+            env=self.config.env,
+        )
 
-            mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
-            yield MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
-            yield from self._get_domain_wu(
-                title=chart_data.get("slice_name", ""),
-                entity_urn=chart_snapshot.urn,
+    def construct_dataset_from_dataset_data(
+        self, dataset_data: dict
+    ) -> DatasetSnapshot:
+        dataset_response = self.get_dataset_info(dataset_data.get("id"))
+        dataset = SupersetDataset(**dataset_response["result"])
+        datasource_urn = self.get_datasource_urn_from_id(
+            dataset_response, self.platform
+        )
+
+        dataset_url = f"{self.config.display_uri}{dataset.explore_url or ''}"
+
+        dataset_info = DatasetPropertiesClass(
+            name=dataset.table_name,
+            description="",
+            lastModified=TimeStamp(time=dataset.modified_ts)
+            if dataset.modified_ts
+            else None,
+            externalUrl=dataset_url,
+        )
+        aspects_items: List[Any] = []
+        aspects_items.extend(
+            [
+                self.gen_schema_metadata(dataset_response),
+                dataset_info,
+            ]
+        )
+
+        dataset_snapshot = DatasetSnapshot(
+            urn=datasource_urn,
+            aspects=aspects_items,
+        )
+        return dataset_snapshot
+
+    def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
+        for dataset_data in self.paginate_entity_api_results("dataset", PAGE_SIZE):
+            try:
+                dataset_snapshot = self.construct_dataset_from_dataset_data(
+                    dataset_data
                 )
+                mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
+            except Exception as e:
+                self.report.warning(
+                    f"Failed to construct dataset snapshot. Dataset name: {dataset_data.get('table_name')}. Error: \n{e}"
+                )
+                continue
+            # Emit the dataset
+            yield MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
+            yield from self._get_domain_wu(
+                title=dataset_data.get("table_name", ""),
+                entity_urn=dataset_snapshot.urn,
+            )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        yield from self.emit_dashboard_mces()
-        yield from self.emit_chart_mces()
+        if self.config.ingest_dashboards:
+            yield from self.emit_dashboard_mces()
+        if self.config.ingest_charts:
+            yield from self.emit_chart_mces()
+        if self.config.ingest_datasets:
+            yield from self.emit_dataset_mces()
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
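Note (editor's sketch, not part of the diff): `gen_schema_fields` is where the new `resolve_sql_type` helper and the dataset ingestion path meet. The snippet below shows that mapping on an illustrative column list, including the `NullType` fallback for unmapped types.

from datahub.ingestion.source.sql.sql_types import resolve_sql_type
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
    NullType,
    SchemaField,
    SchemaFieldDataType,
)

columns = [
    {"column_name": "is_active", "type": "BOOLEAN"},            # maps to BooleanType
    {"column_name": "raw_payload", "type": "some_custom_type"}, # unmapped -> NullType fallback
]

fields = [
    SchemaField(
        fieldPath=col.get("column_name", ""),
        type=SchemaFieldDataType(
            resolve_sql_type((col.get("type") or "").lower()) or NullType()
        ),
        nativeDataType="",  # the new source code leaves this blank
        nullable=True,
    )
    for col in columns
]
assert isinstance(fields[1].type.type, NullType)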