acryl-datahub 1.0.0rc7__py3-none-any.whl → 1.0.0rc9__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.


Files changed (88)
  1. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/METADATA +2487 -2487
  2. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/RECORD +88 -84
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/dataset/dataset.py +731 -42
  5. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  6. datahub/cli/specific/dataset_cli.py +128 -14
  7. datahub/configuration/git.py +1 -3
  8. datahub/ingestion/glossary/classification_mixin.py +1 -1
  9. datahub/ingestion/graph/client.py +16 -12
  10. datahub/ingestion/graph/filters.py +64 -37
  11. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  12. datahub/ingestion/source/abs/config.py +2 -4
  13. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  14. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +1 -1
  15. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  16. datahub/ingestion/source/csv_enricher.py +1 -1
  17. datahub/ingestion/source/dbt/dbt_common.py +1 -1
  18. datahub/ingestion/source/file.py +5 -2
  19. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  20. datahub/ingestion/source/ge_data_profiler.py +11 -14
  21. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  22. datahub/ingestion/source/iceberg/iceberg_common.py +31 -20
  23. datahub/ingestion/source/identity/okta.py +1 -3
  24. datahub/ingestion/source/kafka_connect/source_connectors.py +4 -7
  25. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  26. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  27. datahub/ingestion/source/looker/lookml_source.py +2 -1
  28. datahub/ingestion/source/metadata/lineage.py +2 -2
  29. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  30. datahub/ingestion/source/nifi.py +6 -3
  31. datahub/ingestion/source/openapi_parser.py +2 -2
  32. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  33. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  34. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  35. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  36. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  37. datahub/ingestion/source/preset.py +7 -4
  38. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  39. datahub/ingestion/source/redash.py +2 -1
  40. datahub/ingestion/source/s3/config.py +2 -4
  41. datahub/ingestion/source/s3/source.py +20 -41
  42. datahub/ingestion/source/salesforce.py +1 -1
  43. datahub/ingestion/source/schema_inference/object.py +1 -1
  44. datahub/ingestion/source/snowflake/snowflake_connection.py +1 -1
  45. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  46. datahub/ingestion/source/sql/athena.py +2 -2
  47. datahub/ingestion/source/sql/sql_common.py +2 -2
  48. datahub/ingestion/source/sql/sql_types.py +2 -2
  49. datahub/ingestion/source/sql/teradata.py +4 -2
  50. datahub/ingestion/source/sql/trino.py +2 -2
  51. datahub/ingestion/source/superset.py +218 -56
  52. datahub/ingestion/source/tableau/tableau.py +1 -5
  53. datahub/lite/duckdb_lite.py +3 -9
  54. datahub/metadata/_schema_classes.py +157 -14
  55. datahub/metadata/_urns/urn_defs.py +58 -58
  56. datahub/metadata/schema.avsc +23 -10
  57. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  58. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  59. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  60. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  61. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  62. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  63. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  64. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  65. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
  66. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  67. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  68. datahub/metadata/schemas/PostKey.avsc +2 -1
  69. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  70. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  71. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  72. datahub/pydantic/__init__.py +0 -0
  73. datahub/pydantic/compat.py +58 -0
  74. datahub/sdk/__init__.py +1 -0
  75. datahub/sdk/_all_entities.py +1 -1
  76. datahub/sdk/_shared.py +88 -3
  77. datahub/sdk/container.py +7 -1
  78. datahub/sdk/dataset.py +10 -4
  79. datahub/sdk/{_entity.py → entity.py} +4 -0
  80. datahub/sdk/entity_client.py +1 -1
  81. datahub/sdk/main_client.py +7 -1
  82. datahub/sdk/resolver_client.py +17 -29
  83. datahub/sdk/search_client.py +50 -0
  84. datahub/sdk/search_filters.py +374 -0
  85. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/LICENSE +0 -0
  86. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/WHEEL +0 -0
  87. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/entry_points.txt +0 -0
  88. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/top_level.txt +0 -0
@@ -204,7 +204,7 @@ def get_column_type(
     """
 
     TypeClass: Optional[Type] = None
-    for sql_type in _field_type_mapping.keys():
+    for sql_type in _field_type_mapping:
         if isinstance(column_type, sql_type):
             TypeClass = _field_type_mapping[sql_type]
             break
@@ -973,7 +973,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                         inspector=inspector,
                     )
                 ),
                 description=column.get("comment", None),
-                description=column.get("comment", None),
+                description=column.get("comment"),
                 nullable=column["nullable"],
                 recursive=False,
                 globalTags=gtc,
@@ -317,10 +317,10 @@ def resolve_snowflake_modified_type(type_string: str) -> Any:
     match = re.match(r"([a-zA-Z_]+)\(\d+,\s\d+\)", type_string)
     if match:
         modified_type_base = match.group(1)  # Extract the base type
-        return SNOWFLAKE_TYPES_MAP.get(modified_type_base, None)
+        return SNOWFLAKE_TYPES_MAP.get(modified_type_base)
 
     # Fallback for types without precision/scale
-    return SNOWFLAKE_TYPES_MAP.get(type_string, None)
+    return SNOWFLAKE_TYPES_MAP.get(type_string)
 
 
 # see https://github.com/googleapis/python-bigquery-sqlalchemy/blob/main/sqlalchemy_bigquery/_types.py#L32
@@ -180,10 +180,11 @@ def optimized_get_columns(
     connection: Connection,
     table_name: str,
     schema: Optional[str] = None,
-    tables_cache: MutableMapping[str, List[TeradataTable]] = {},
+    tables_cache: Optional[MutableMapping[str, List[TeradataTable]]] = None,
     use_qvci: bool = False,
     **kw: Dict[str, Any],
 ) -> List[Dict]:
+    tables_cache = tables_cache or {}
     if schema is None:
         schema = self.default_schema_name
 
@@ -314,9 +315,10 @@ def optimized_get_view_definition(
     connection: Connection,
     view_name: str,
     schema: Optional[str] = None,
-    tables_cache: MutableMapping[str, List[TeradataTable]] = {},
+    tables_cache: Optional[MutableMapping[str, List[TeradataTable]]] = None,
     **kw: Dict[str, Any],
 ) -> Optional[str]:
+    tables_cache = tables_cache or {}
     if schema is None:
         schema = self.default_schema_name
 
@@ -142,7 +142,7 @@ def get_table_comment(self, connection, table_name: str, schema: str = None, **k
                 if col_value is not None:
                     properties[col_name] = col_value
 
-                return {"text": properties.get("comment", None), "properties": properties}
+                return {"text": properties.get("comment"), "properties": properties}
             else:
                 return self.get_table_comment_default(connection, table_name, schema)
         except Exception:
@@ -483,7 +483,7 @@ def _parse_struct_fields(parts):
 
 
 def _parse_basic_datatype(s):
-    for sql_type in _all_atomic_types.keys():
+    for sql_type in _all_atomic_types:
         if isinstance(s, sql_type):
             return {
                 "type": _all_atomic_types[sql_type],
@@ -1,5 +1,6 @@
 import json
 import logging
+from dataclasses import dataclass, field
 from datetime import datetime
 from functools import lru_cache
 from typing import Any, Dict, Iterable, List, Optional
@@ -22,6 +23,7 @@ from datahub.emitter.mce_builder import (
     make_dataset_urn,
     make_dataset_urn_with_platform_instance,
     make_domain_urn,
+    make_user_urn,
 )
 from datahub.emitter.mcp_builder import add_domain_to_entity_wu
 from datahub.ingestion.api.common import PipelineContext
@@ -36,9 +38,6 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.sql.sql_types import resolve_sql_type
-from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
-    get_platform_from_sqlalchemy_uri,
-)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -49,7 +48,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
-    AuditStamp,
     ChangeAuditStamps,
     Status,
     TimeStamp,
@@ -68,12 +66,22 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaMetadata,
 )
 from datahub.metadata.schema_classes import (
+    AuditStampClass,
     ChartInfoClass,
     ChartTypeClass,
     DashboardInfoClass,
+    DatasetLineageTypeClass,
     DatasetPropertiesClass,
+    GlobalTagsClass,
+    OwnerClass,
+    OwnershipClass,
+    OwnershipTypeClass,
+    TagAssociationClass,
+    UpstreamClass,
+    UpstreamLineageClass,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry
 
 logger = logging.getLogger(__name__)
@@ -101,6 +109,14 @@ chart_type_from_viz_type = {
 platform_without_databases = ["druid"]
 
 
+@dataclass
+class SupersetSourceReport(StaleEntityRemovalSourceReport):
+    filtered: LossyList[str] = field(default_factory=LossyList)
+
+    def report_dropped(self, name: str) -> None:
+        self.filtered.append(name)
+
+
 class SupersetDataset(BaseModel):
     id: int
     table_name: str
@@ -136,6 +152,18 @@ class SupersetConfig(
         default=dict(),
         description="regex patterns for tables to filter to assign domain_key. ",
     )
+    dataset_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for dataset to filter in ingestion.",
+    )
+    chart_pattern: AllowDenyPattern = Field(
+        AllowDenyPattern.allow_all(),
+        description="Patterns for selecting chart names that are to be included",
+    )
+    dashboard_pattern: AllowDenyPattern = Field(
+        AllowDenyPattern.allow_all(),
+        description="Patterns for selecting dashboard names that are to be included",
+    )
     username: Optional[str] = Field(default=None, description="Superset username.")
     password: Optional[str] = Field(default=None, description="Superset password.")
     # Configuration for stateful ingestion
@@ -216,7 +244,7 @@ class SupersetSource(StatefulIngestionSourceBase):
     """
 
     config: SupersetConfig
-    report: StaleEntityRemovalSourceReport
+    report: SupersetSourceReport
     platform = "superset"
 
     def __hash__(self):
@@ -225,13 +253,14 @@ class SupersetSource(StatefulIngestionSourceBase):
     def __init__(self, ctx: PipelineContext, config: SupersetConfig):
         super().__init__(config, ctx)
         self.config = config
-        self.report = StaleEntityRemovalSourceReport()
+        self.report = SupersetSourceReport()
         if self.config.domain:
             self.domain_registry = DomainRegistry(
                 cached_domains=[domain_id for domain_id in self.config.domain],
                 graph=self.ctx.graph,
             )
         self.session = self.login()
+        self.owner_info = self.parse_owner_info()
 
     def login(self) -> requests.Session:
         login_response = requests.post(
@@ -271,7 +300,7 @@ class SupersetSource(StatefulIngestionSourceBase):
 
         while current_page * page_size < total_items:
             response = self.session.get(
-                f"{self.config.connect_uri}/api/v1/{entity_type}/",
+                f"{self.config.connect_uri}/api/v1/{entity_type}",
                 params={"q": f"(page:{current_page},page_size:{page_size})"},
             )
 
@@ -287,25 +316,24 @@ class SupersetSource(StatefulIngestionSourceBase):
 
         current_page += 1
 
-    @lru_cache(maxsize=None)
-    def get_platform_from_database_id(self, database_id):
-        database_response = self.session.get(
-            f"{self.config.connect_uri}/api/v1/database/{database_id}"
-        ).json()
-        sqlalchemy_uri = database_response.get("result", {}).get("sqlalchemy_uri")
-        if sqlalchemy_uri is None:
-            platform_name = database_response.get("result", {}).get(
-                "backend", "external"
-            )
-        else:
-            platform_name = get_platform_from_sqlalchemy_uri(sqlalchemy_uri)
-        if platform_name == "awsathena":
-            return "athena"
-        if platform_name == "clickhousedb":
-            return "clickhouse"
-        if platform_name == "postgresql":
-            return "postgres"
-        return platform_name
+    def parse_owner_info(self) -> Dict[str, Any]:
+        entity_types = ["dataset", "dashboard", "chart"]
+        owners_info = {}
+
+        for entity in entity_types:
+            for owner in self.paginate_entity_api_results(f"{entity}/related/owners"):
+                owner_id = owner.get("value")
+                if owner_id:
+                    owners_info[owner_id] = owner.get("extra", {}).get("email", "")
+
+        return owners_info
+
+    def build_owner_urn(self, data: Dict[str, Any]) -> List[str]:
+        return [
+            make_user_urn(self.owner_info.get(owner.get("id"), ""))
+            for owner in data.get("owners", [])
+            if owner.get("id")
+        ]
 
     @lru_cache(maxsize=None)
     def get_dataset_info(self, dataset_id: int) -> dict:
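The new `parse_owner_info` helper resolves Superset owner ids to the email addresses returned by the `/related/owners` endpoints, and `build_owner_urn` turns those emails into corpuser URNs via `make_user_urn`. A small sketch of the resulting shape, with made-up ids and emails:

    from datahub.emitter.mce_builder import make_user_urn

    # Hypothetical owner map as parse_owner_info would build it: Superset owner id -> email.
    owner_info = {7: "ada@example.com", 12: "grace@example.com"}

    # Trimmed-down stand-in for the `owners` array in a dashboard/chart/dataset payload.
    dashboard_data = {"owners": [{"id": 7}, {"id": 12}]}

    owner_urns = [
        make_user_urn(owner_info.get(owner["id"], ""))
        for owner in dashboard_data["owners"]
        if owner.get("id")
    ]
    print(owner_urns)
    # ['urn:li:corpuser:ada@example.com', 'urn:li:corpuser:grace@example.com']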
@@ -323,8 +351,6 @@ class SupersetSource(StatefulIngestionSourceBase):
         schema_name = dataset_response.get("result", {}).get("schema")
         table_name = dataset_response.get("result", {}).get("table_name")
         database_id = dataset_response.get("result", {}).get("database", {}).get("id")
-        platform = self.get_platform_from_database_id(database_id)
-
         database_name = (
             dataset_response.get("result", {}).get("database", {}).get("database_name")
         )
@@ -333,21 +359,24 @@ class SupersetSource(StatefulIngestionSourceBase):
         # Druid do not have a database concept and has a limited schema concept, but they are nonetheless reported
         # from superset. There is only one database per platform instance, and one schema named druid, so it would be
         # redundant to systemically store them both in the URN.
-        if platform in platform_without_databases:
+        if platform_instance in platform_without_databases:
             database_name = None
 
-        if platform == "druid" and schema_name == "druid":
+        if platform_instance == "druid" and schema_name == "druid":
             # Follow DataHub's druid source convention.
             schema_name = None
 
-        if database_id and table_name:
+        # If the information about the datasource is already contained in the dataset response,
+        # can just return the urn directly
+        if table_name and database_id:
             return make_dataset_urn(
-                platform=platform,
+                platform=platform_instance,
                 name=".".join(
                     name for name in [database_name, schema_name, table_name] if name
                 ),
                 env=self.config.env,
             )
+
         raise ValueError("Could not construct dataset URN")
 
     def construct_dashboard_from_api_data(
@@ -363,15 +392,16 @@ class SupersetSource(StatefulIngestionSourceBase):
             aspects=[Status(removed=False)],
         )
 
-        modified_actor = f"urn:li:corpuser:{(dashboard_data.get('changed_by') or {}).get('username', 'unknown')}"
+        modified_actor = f"urn:li:corpuser:{self.owner_info.get((dashboard_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
         modified_ts = int(
             dp.parse(dashboard_data.get("changed_on_utc", "now")).timestamp() * 1000
         )
         title = dashboard_data.get("dashboard_title", "")
         # note: the API does not currently supply created_by usernames due to a bug
-        last_modified = ChangeAuditStamps(
-            created=None,
-            lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
+        last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)
+
+        change_audit_stamps = ChangeAuditStamps(
+            created=None, lastModified=last_modified
         )
         dashboard_url = f"{self.config.display_uri}{dashboard_data.get('url', '')}"
 
@@ -397,7 +427,7 @@ class SupersetSource(StatefulIngestionSourceBase):
             "IsPublished": str(dashboard_data.get("published", False)).lower(),
             "Owners": ", ".join(
                 map(
-                    lambda owner: owner.get("username", "unknown"),
+                    lambda owner: self.owner_info.get(owner.get("id", -1), "unknown"),
                     dashboard_data.get("owners", []),
                 )
             ),
@@ -417,16 +447,39 @@ class SupersetSource(StatefulIngestionSourceBase):
             description="",
             title=title,
             charts=chart_urns,
-            lastModified=last_modified,
             dashboardUrl=dashboard_url,
             customProperties=custom_properties,
+            lastModified=change_audit_stamps,
         )
         dashboard_snapshot.aspects.append(dashboard_info)
+
+        dashboard_owners_list = self.build_owner_urn(dashboard_data)
+        owners_info = OwnershipClass(
+            owners=[
+                OwnerClass(
+                    owner=urn,
+                    type=OwnershipTypeClass.TECHNICAL_OWNER,
+                )
+                for urn in (dashboard_owners_list or [])
+            ],
+            lastModified=last_modified,
+        )
+        dashboard_snapshot.aspects.append(owners_info)
+
         return dashboard_snapshot
 
     def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
-        for dashboard_data in self.paginate_entity_api_results("dashboard", PAGE_SIZE):
+        for dashboard_data in self.paginate_entity_api_results("dashboard/", PAGE_SIZE):
             try:
+                dashboard_id = str(dashboard_data.get("id"))
+                dashboard_title = dashboard_data.get("dashboard_title", "")
+
+                if not self.config.dashboard_pattern.allowed(dashboard_title):
+                    self.report.report_dropped(
+                        f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
+                    )
+                    continue
+
                 dashboard_snapshot = self.construct_dashboard_from_api_data(
                     dashboard_data
                 )
@@ -439,7 +492,7 @@ class SupersetSource(StatefulIngestionSourceBase):
                 mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
                 yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
                 yield from self._get_domain_wu(
-                    title=dashboard_data.get("dashboard_title", ""),
+                    title=dashboard_title,
                     entity_urn=dashboard_snapshot.urn,
                 )
 
@@ -454,25 +507,33 @@ class SupersetSource(StatefulIngestionSourceBase):
             aspects=[Status(removed=False)],
         )
 
-        modified_actor = f"urn:li:corpuser:{(chart_data.get('changed_by') or {}).get('username', 'unknown')}"
+        modified_actor = f"urn:li:corpuser:{self.owner_info.get((chart_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
         modified_ts = int(
             dp.parse(chart_data.get("changed_on_utc", "now")).timestamp() * 1000
         )
         title = chart_data.get("slice_name", "")
 
         # note: the API does not currently supply created_by usernames due to a bug
-        last_modified = ChangeAuditStamps(
-            created=None,
-            lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
+        last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)
+
+        change_audit_stamps = ChangeAuditStamps(
+            created=None, lastModified=last_modified
         )
+
         chart_type = chart_type_from_viz_type.get(chart_data.get("viz_type", ""))
         chart_url = f"{self.config.display_uri}{chart_data.get('url', '')}"
 
         datasource_id = chart_data.get("datasource_id")
-        dataset_response = self.get_dataset_info(datasource_id)
-        datasource_urn = self.get_datasource_urn_from_id(
-            dataset_response, self.platform
-        )
+        if not datasource_id:
+            logger.debug(
+                f"chart {chart_data['id']} has no datasource_id, skipping fetching dataset info"
+            )
+            datasource_urn = None
+        else:
+            dataset_response = self.get_dataset_info(datasource_id)
+            datasource_urn = self.get_datasource_urn_from_id(
+                dataset_response, self.platform
+            )
 
         params = json.loads(chart_data.get("params", "{}"))
         metrics = [
@@ -515,23 +576,61 @@ class SupersetSource(StatefulIngestionSourceBase):
             type=chart_type,
             description="",
             title=title,
-            lastModified=last_modified,
             chartUrl=chart_url,
             inputs=[datasource_urn] if datasource_urn else None,
             customProperties=custom_properties,
+            lastModified=change_audit_stamps,
         )
         chart_snapshot.aspects.append(chart_info)
+
+        chart_owners_list = self.build_owner_urn(chart_data)
+        owners_info = OwnershipClass(
+            owners=[
+                OwnerClass(
+                    owner=urn,
+                    type=OwnershipTypeClass.TECHNICAL_OWNER,
+                )
+                for urn in (chart_owners_list or [])
+            ],
+            lastModified=last_modified,
+        )
+        chart_snapshot.aspects.append(owners_info)
         return chart_snapshot
 
     def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
-        for chart_data in self.paginate_entity_api_results("chart", PAGE_SIZE):
+        for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE):
             try:
+                chart_id = str(chart_data.get("id"))
+                chart_name = chart_data.get("slice_name", "")
+
+                if not self.config.chart_pattern.allowed(chart_name):
+                    self.report.report_dropped(
+                        f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
+                    )
+                    continue
+
+                # Emit a warning if charts use data from a dataset that will be filtered out
+                if self.config.dataset_pattern != AllowDenyPattern.allow_all():
+                    datasource_id = chart_data.get("datasource_id")
+                    if datasource_id:
+                        dataset_response = self.get_dataset_info(datasource_id)
+                        dataset_name = dataset_response.get("result", {}).get(
+                            "table_name", ""
+                        )
+
+                        if dataset_name and not self.config.dataset_pattern.allowed(
+                            dataset_name
+                        ):
+                            self.report.warning(
+                                f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
+                            )
+
                 chart_snapshot = self.construct_chart_from_chart_data(chart_data)
 
                 mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
             except Exception as e:
                 self.report.warning(
-                    f"Failed to construct chart snapshot. Chart name: {chart_data.get('table_name')}. Error: \n{e}"
+                    f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
                 )
                 continue
             # Emit the chart
@@ -588,25 +687,65 @@ class SupersetSource(StatefulIngestionSourceBase):
     ) -> DatasetSnapshot:
         dataset_response = self.get_dataset_info(dataset_data.get("id"))
         dataset = SupersetDataset(**dataset_response["result"])
+
         datasource_urn = self.get_datasource_urn_from_id(
             dataset_response, self.platform
         )
+        dataset_url = f"{self.config.display_uri}{dataset_response.get('result', {}).get('url', '')}"
+
+        modified_actor = f"urn:li:corpuser:{self.owner_info.get((dataset_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+        modified_ts = int(
+            dp.parse(dataset_data.get("changed_on_utc", "now")).timestamp() * 1000
+        )
+        last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)
+
+        upstream_warehouse_platform = (
+            dataset_response.get("result", {}).get("database", {}).get("backend")
+        )
 
-        dataset_url = f"{self.config.display_uri}{dataset.explore_url or ''}"
+        # Preset has a way of naming their platforms differently than
+        # how datahub names them, so map the platform name to the correct naming
+        warehouse_naming = {
+            "awsathena": "athena",
+            "clickhousedb": "clickhouse",
+            "postgresql": "postgres",
+        }
+
+        if upstream_warehouse_platform in warehouse_naming:
+            upstream_warehouse_platform = warehouse_naming[upstream_warehouse_platform]
+
+        # TODO: Categorize physical vs virtual upstream dataset
+        # mark all upstream dataset as physical for now, in the future we would ideally like
+        # to differentiate physical vs virtual upstream datasets
+        tag_urn = f"urn:li:tag:{self.platform}:physical"
+        upstream_dataset = self.get_datasource_urn_from_id(
+            dataset_response, upstream_warehouse_platform
+        )
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(
+                    type=DatasetLineageTypeClass.TRANSFORMED,
+                    dataset=upstream_dataset,
+                    properties={"externalUrl": dataset_url},
+                )
+            ]
+        )
 
         dataset_info = DatasetPropertiesClass(
             name=dataset.table_name,
             description="",
-            lastModified=TimeStamp(time=dataset.modified_ts)
-            if dataset.modified_ts
-            else None,
             externalUrl=dataset_url,
+            lastModified=TimeStamp(time=modified_ts),
         )
+        global_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
+
         aspects_items: List[Any] = []
         aspects_items.extend(
             [
                 self.gen_schema_metadata(dataset_response),
                 dataset_info,
+                upstream_lineage,
+                global_tags,
             ]
         )
 
@@ -614,11 +753,34 @@ class SupersetSource(StatefulIngestionSourceBase):
             urn=datasource_urn,
             aspects=aspects_items,
         )
+
+        dataset_owners_list = self.build_owner_urn(dataset_data)
+        owners_info = OwnershipClass(
+            owners=[
+                OwnerClass(
+                    owner=urn,
+                    type=OwnershipTypeClass.TECHNICAL_OWNER,
+                )
+                for urn in (dataset_owners_list or [])
+            ],
+            lastModified=last_modified,
+        )
+        aspects_items.append(owners_info)
+
         return dataset_snapshot
 
     def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
-        for dataset_data in self.paginate_entity_api_results("dataset", PAGE_SIZE):
+        for dataset_data in self.paginate_entity_api_results("dataset/", PAGE_SIZE):
             try:
+                dataset_name = dataset_data.get("table_name", "")
+
+                # Check if dataset should be filtered by dataset name
+                if not self.config.dataset_pattern.allowed(dataset_name):
+                    self.report.report_dropped(
+                        f"Dataset '{dataset_name}' filtered by dataset_pattern"
+                    )
+                    continue
+
                 dataset_snapshot = self.construct_dataset_from_dataset_data(
                     dataset_data
                 )
@@ -1911,11 +1911,7 @@ class TableauSiteSource:
                 if upstream_col.get(c.TABLE)
                 else None
             )
-            if (
-                name
-                and upstream_table_id
-                and upstream_table_id in table_id_to_urn.keys()
-            ):
+            if name and upstream_table_id and upstream_table_id in table_id_to_urn:
                 parent_dataset_urn = table_id_to_urn[upstream_table_id]
                 if (
                     self.is_snowflake_urn(parent_dataset_urn)
@@ -760,15 +760,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
                 entity_id=[str(data_platform_urn), data_platform_instance],
             )
             self._create_edges_from_data_platform_instance(data_platform_instance_urn)
-        elif isinstance(aspect, ChartInfoClass):
-            urn = Urn.from_string(entity_urn)
-            self.add_edge(
-                entity_urn,
-                "name",
-                aspect.title + f" ({urn.get_entity_id()[-1]})",
-                remove_existing=True,
-            )
-        elif isinstance(aspect, DashboardInfoClass):
+        elif isinstance(aspect, ChartInfoClass) or isinstance(
+            aspect, DashboardInfoClass
+        ):
             urn = Urn.from_string(entity_urn)
             self.add_edge(
                 entity_urn,
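The final hunk collapses the separate `ChartInfoClass` and `DashboardInfoClass` branches into one `elif`. For reference, the same check can also be written with the tuple form of `isinstance`, the usual idiom for matching several types at once; the classes below are stand-ins, not the DataHub aspect classes:

    class ChartInfoClass: ...
    class DashboardInfoClass: ...

    aspect = DashboardInfoClass()
    # Equivalent to `isinstance(aspect, ChartInfoClass) or isinstance(aspect, DashboardInfoClass)`.
    print(isinstance(aspect, (ChartInfoClass, DashboardInfoClass)))  # True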