acryl-datahub 1.0.0rc7__py3-none-any.whl → 1.0.0rc8__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (52)
  1. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc8.dist-info}/METADATA +2405 -2405
  2. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc8.dist-info}/RECORD +52 -52
  3. datahub/_version.py +1 -1
  4. datahub/configuration/git.py +1 -3
  5. datahub/ingestion/glossary/classification_mixin.py +1 -1
  6. datahub/ingestion/graph/client.py +1 -1
  7. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  8. datahub/ingestion/source/abs/config.py +2 -4
  9. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  10. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +1 -1
  11. datahub/ingestion/source/csv_enricher.py +1 -1
  12. datahub/ingestion/source/dbt/dbt_common.py +1 -1
  13. datahub/ingestion/source/file.py +5 -2
  14. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  15. datahub/ingestion/source/ge_data_profiler.py +11 -14
  16. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  17. datahub/ingestion/source/iceberg/iceberg_common.py +31 -20
  18. datahub/ingestion/source/identity/okta.py +1 -3
  19. datahub/ingestion/source/kafka_connect/source_connectors.py +4 -7
  20. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  21. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  22. datahub/ingestion/source/looker/lookml_source.py +2 -1
  23. datahub/ingestion/source/metadata/lineage.py +2 -2
  24. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  25. datahub/ingestion/source/nifi.py +6 -3
  26. datahub/ingestion/source/openapi_parser.py +2 -2
  27. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  28. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  29. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  30. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  31. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  32. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  33. datahub/ingestion/source/redash.py +2 -1
  34. datahub/ingestion/source/s3/config.py +2 -4
  35. datahub/ingestion/source/s3/source.py +20 -41
  36. datahub/ingestion/source/salesforce.py +1 -1
  37. datahub/ingestion/source/schema_inference/object.py +1 -1
  38. datahub/ingestion/source/snowflake/snowflake_connection.py +1 -1
  39. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  40. datahub/ingestion/source/sql/athena.py +2 -2
  41. datahub/ingestion/source/sql/sql_common.py +2 -2
  42. datahub/ingestion/source/sql/sql_types.py +2 -2
  43. datahub/ingestion/source/sql/teradata.py +4 -2
  44. datahub/ingestion/source/sql/trino.py +2 -2
  45. datahub/ingestion/source/superset.py +65 -37
  46. datahub/ingestion/source/tableau/tableau.py +1 -5
  47. datahub/lite/duckdb_lite.py +3 -9
  48. datahub/sdk/dataset.py +3 -3
  49. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc8.dist-info}/LICENSE +0 -0
  50. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc8.dist-info}/WHEEL +0 -0
  51. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc8.dist-info}/entry_points.txt +0 -0
  52. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc8.dist-info}/top_level.txt +0 -0
@@ -180,10 +180,11 @@ def optimized_get_columns(
     connection: Connection,
     table_name: str,
     schema: Optional[str] = None,
-    tables_cache: MutableMapping[str, List[TeradataTable]] = {},
+    tables_cache: Optional[MutableMapping[str, List[TeradataTable]]] = None,
     use_qvci: bool = False,
     **kw: Dict[str, Any],
 ) -> List[Dict]:
+    tables_cache = tables_cache or {}
     if schema is None:
         schema = self.default_schema_name
 
@@ -314,9 +315,10 @@ def optimized_get_view_definition(
     connection: Connection,
     view_name: str,
     schema: Optional[str] = None,
-    tables_cache: MutableMapping[str, List[TeradataTable]] = {},
+    tables_cache: Optional[MutableMapping[str, List[TeradataTable]]] = None,
     **kw: Dict[str, Any],
 ) -> Optional[str]:
+    tables_cache = tables_cache or {}
     if schema is None:
         schema = self.default_schema_name
 
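The two Teradata hunks above share one fix: a mutable `{}` default argument is created once at function definition time and silently shared across calls, so replacing it with `None` plus a per-call fallback avoids cross-call state leaks. A minimal standalone sketch of the pitfall and the fix (function and variable names here are illustrative, not from the package):

    from typing import Dict, List, Optional


    def lookup_buggy(key: str, cache: Dict[str, List[str]] = {}) -> List[str]:
        # The same dict object is reused by every call that omits `cache`,
        # so values appended in one call leak into the next.
        cache.setdefault(key, []).append("hit")
        return cache[key]


    def lookup_fixed(key: str, cache: Optional[Dict[str, List[str]]] = None) -> List[str]:
        # Use None as the sentinel and build a fresh dict per call,
        # mirroring the `tables_cache = tables_cache or {}` lines above.
        cache = cache or {}
        cache.setdefault(key, []).append("hit")
        return cache[key]


    print(lookup_buggy("t1"))  # ['hit']
    print(lookup_buggy("t1"))  # ['hit', 'hit']  state carried over from the first call
    print(lookup_fixed("t1"))  # ['hit']
    print(lookup_fixed("t1"))  # ['hit']

Note that `cache or {}` also replaces an explicitly passed empty dict with a new one; when a caller's dict must be preserved even if empty, `if cache is None: cache = {}` is the stricter form.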
@@ -142,7 +142,7 @@ def get_table_comment(self, connection, table_name: str, schema: str = None, **k
                 if col_value is not None:
                     properties[col_name] = col_value
 
-            return {"text": properties.get("comment", None), "properties": properties}
+            return {"text": properties.get("comment"), "properties": properties}
         else:
             return self.get_table_comment_default(connection, table_name, schema)
     except Exception:
@@ -483,7 +483,7 @@ def _parse_struct_fields(parts):
 
 
 def _parse_basic_datatype(s):
-    for sql_type in _all_atomic_types.keys():
+    for sql_type in _all_atomic_types:
        if isinstance(s, sql_type):
            return {
                "type": _all_atomic_types[sql_type],
@@ -36,9 +36,6 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.sql.sql_types import resolve_sql_type
-from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
-    get_platform_from_sqlalchemy_uri,
-)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -71,7 +68,12 @@ from datahub.metadata.schema_classes import (
     ChartInfoClass,
     ChartTypeClass,
     DashboardInfoClass,
+    DatasetLineageTypeClass,
     DatasetPropertiesClass,
+    GlobalTagsClass,
+    TagAssociationClass,
+    UpstreamClass,
+    UpstreamLineageClass,
 )
 from datahub.utilities import config_clean
 from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -287,26 +289,6 @@ class SupersetSource(StatefulIngestionSourceBase):
 
             current_page += 1
 
-    @lru_cache(maxsize=None)
-    def get_platform_from_database_id(self, database_id):
-        database_response = self.session.get(
-            f"{self.config.connect_uri}/api/v1/database/{database_id}"
-        ).json()
-        sqlalchemy_uri = database_response.get("result", {}).get("sqlalchemy_uri")
-        if sqlalchemy_uri is None:
-            platform_name = database_response.get("result", {}).get(
-                "backend", "external"
-            )
-        else:
-            platform_name = get_platform_from_sqlalchemy_uri(sqlalchemy_uri)
-        if platform_name == "awsathena":
-            return "athena"
-        if platform_name == "clickhousedb":
-            return "clickhouse"
-        if platform_name == "postgresql":
-            return "postgres"
-        return platform_name
-
     @lru_cache(maxsize=None)
     def get_dataset_info(self, dataset_id: int) -> dict:
         dataset_response = self.session.get(
@@ -323,8 +305,6 @@ class SupersetSource(StatefulIngestionSourceBase):
         schema_name = dataset_response.get("result", {}).get("schema")
         table_name = dataset_response.get("result", {}).get("table_name")
         database_id = dataset_response.get("result", {}).get("database", {}).get("id")
-        platform = self.get_platform_from_database_id(database_id)
-
         database_name = (
             dataset_response.get("result", {}).get("database", {}).get("database_name")
         )
@@ -333,21 +313,24 @@ class SupersetSource(StatefulIngestionSourceBase):
         # Druid do not have a database concept and has a limited schema concept, but they are nonetheless reported
         # from superset. There is only one database per platform instance, and one schema named druid, so it would be
         # redundant to systemically store them both in the URN.
-        if platform in platform_without_databases:
+        if platform_instance in platform_without_databases:
             database_name = None
 
-        if platform == "druid" and schema_name == "druid":
+        if platform_instance == "druid" and schema_name == "druid":
             # Follow DataHub's druid source convention.
             schema_name = None
 
-        if database_id and table_name:
+        # If the information about the datasource is already contained in the dataset response,
+        # can just return the urn directly
+        if table_name and database_id:
             return make_dataset_urn(
-                platform=platform,
+                platform=platform_instance,
                 name=".".join(
                     name for name in [database_name, schema_name, table_name] if name
                 ),
                 env=self.config.env,
             )
+
         raise ValueError("Could not construct dataset URN")
 
     def construct_dashboard_from_api_data(
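For reference, the URN construction above filters out `None` or empty name components before joining, so a missing database or schema does not leave stray dots in the dataset name. A rough sketch of the same call in isolation, assuming the `make_dataset_urn` helper from `datahub.emitter.mce_builder` and placeholder names:

    from datahub.emitter.mce_builder import make_dataset_urn

    database_name = None      # e.g. dropped for platforms without a database concept
    schema_name = "public"
    table_name = "events"

    # Empty/None parts are skipped, so the joined name is "public.events", not ".public.events".
    name = ".".join(part for part in [database_name, schema_name, table_name] if part)
    urn = make_dataset_urn(platform="postgres", name=name, env="PROD")
    print(urn)  # urn:li:dataset:(urn:li:dataPlatform:postgres,public.events,PROD)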
@@ -469,10 +452,16 @@ class SupersetSource(StatefulIngestionSourceBase):
         chart_url = f"{self.config.display_uri}{chart_data.get('url', '')}"
 
         datasource_id = chart_data.get("datasource_id")
-        dataset_response = self.get_dataset_info(datasource_id)
-        datasource_urn = self.get_datasource_urn_from_id(
-            dataset_response, self.platform
-        )
+        if not datasource_id:
+            logger.debug(
+                f"chart {chart_data['id']} has no datasource_id, skipping fetching dataset info"
+            )
+            datasource_urn = None
+        else:
+            dataset_response = self.get_dataset_info(datasource_id)
+            datasource_urn = self.get_datasource_urn_from_id(
+                dataset_response, self.platform
+            )
 
         params = json.loads(chart_data.get("params", "{}"))
         metrics = [
@@ -588,25 +577,61 @@ class SupersetSource(StatefulIngestionSourceBase):
     ) -> DatasetSnapshot:
         dataset_response = self.get_dataset_info(dataset_data.get("id"))
         dataset = SupersetDataset(**dataset_response["result"])
+
         datasource_urn = self.get_datasource_urn_from_id(
             dataset_response, self.platform
         )
+        dataset_url = f"{self.config.display_uri}{dataset_response.get('result', {}).get('url', '')}"
 
-        dataset_url = f"{self.config.display_uri}{dataset.explore_url or ''}"
+        upstream_warehouse_platform = (
+            dataset_response.get("result", {}).get("database", {}).get("backend")
+        )
+
+        # Preset has a way of naming their platforms differently than
+        # how datahub names them, so map the platform name to the correct naming
+        warehouse_naming = {
+            "awsathena": "athena",
+            "clickhousedb": "clickhouse",
+            "postgresql": "postgres",
+        }
+
+        if upstream_warehouse_platform in warehouse_naming:
+            upstream_warehouse_platform = warehouse_naming[upstream_warehouse_platform]
+
+        # TODO: Categorize physical vs virtual upstream dataset
+        # mark all upstream dataset as physical for now, in the future we would ideally like
+        # to differentiate physical vs virtual upstream datasets
+        tag_urn = f"urn:li:tag:{self.platform}:physical"
+        upstream_dataset = self.get_datasource_urn_from_id(
+            dataset_response, upstream_warehouse_platform
+        )
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(
+                    type=DatasetLineageTypeClass.TRANSFORMED,
+                    dataset=upstream_dataset,
+                    properties={"externalUrl": dataset_url},
+                )
+            ]
+        )
 
         dataset_info = DatasetPropertiesClass(
             name=dataset.table_name,
             description="",
-            lastModified=TimeStamp(time=dataset.modified_ts)
-            if dataset.modified_ts
-            else None,
+            lastModified=(
+                TimeStamp(time=dataset.modified_ts) if dataset.modified_ts else None
+            ),
             externalUrl=dataset_url,
         )
+        global_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
+
         aspects_items: List[Any] = []
         aspects_items.extend(
             [
                 self.gen_schema_metadata(dataset_response),
                 dataset_info,
+                upstream_lineage,
+                global_tags,
             ]
         )
 
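To make the new lineage and tagging aspects above easier to follow, here is a minimal sketch that builds the same two aspects in isolation; it uses the schema classes imported earlier in this diff and placeholder URNs/URLs rather than values from a live Superset instance:

    from datahub.metadata.schema_classes import (
        DatasetLineageTypeClass,
        GlobalTagsClass,
        TagAssociationClass,
        UpstreamClass,
        UpstreamLineageClass,
    )

    upstream_urn = "urn:li:dataset:(urn:li:dataPlatform:postgres,analytics.public.events,PROD)"
    dataset_url = "https://superset.example.com/tablemodelview/edit/1"  # placeholder

    # One upstream edge from the Superset dataset to its warehouse table,
    # typed TRANSFORMED and carrying the Superset URL as a property.
    upstream_lineage = UpstreamLineageClass(
        upstreams=[
            UpstreamClass(
                type=DatasetLineageTypeClass.TRANSFORMED,
                dataset=upstream_urn,
                properties={"externalUrl": dataset_url},
            )
        ]
    )

    # All upstream datasets are tagged "physical" for now, per the TODO above.
    global_tags = GlobalTagsClass(tags=[TagAssociationClass(tag="urn:li:tag:superset:physical")])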
@@ -614,6 +639,9 @@ class SupersetSource(StatefulIngestionSourceBase):
             urn=datasource_urn,
             aspects=aspects_items,
         )
+
+        logger.info(f"Constructed dataset {datasource_urn}")
+
         return dataset_snapshot
 
     def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
@@ -1911,11 +1911,7 @@ class TableauSiteSource:
                 if upstream_col.get(c.TABLE)
                 else None
             )
-            if (
-                name
-                and upstream_table_id
-                and upstream_table_id in table_id_to_urn.keys()
-            ):
+            if name and upstream_table_id and upstream_table_id in table_id_to_urn:
                 parent_dataset_urn = table_id_to_urn[upstream_table_id]
                 if (
                     self.is_snowflake_urn(parent_dataset_urn)
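Several small cleanups in this release lean on standard dict semantics: the `properties.get("comment")` change, the `for sql_type in _all_atomic_types` loop, and the membership test just above. `dict.get(k)` already defaults to `None`, iterating a dict yields its keys, and `k in d` checks keys directly, so the explicit `None` default and the `.keys()` calls were redundant. A short illustrative snippet (not from the package):

    d = {"comment": "a table comment", "owner": "etl"}

    # .get() returns None for a missing key, so passing None explicitly changes nothing.
    assert d.get("missing") is None and d.get("missing", None) is None

    # Iterating a dict yields its keys, and `in` tests keys, so .keys() is redundant in both spots.
    assert list(d) == list(d.keys())
    assert "comment" in d and "comment" in d.keys()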
@@ -760,15 +760,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
                 entity_id=[str(data_platform_urn), data_platform_instance],
             )
             self._create_edges_from_data_platform_instance(data_platform_instance_urn)
-        elif isinstance(aspect, ChartInfoClass):
-            urn = Urn.from_string(entity_urn)
-            self.add_edge(
-                entity_urn,
-                "name",
-                aspect.title + f" ({urn.get_entity_id()[-1]})",
-                remove_existing=True,
-            )
-        elif isinstance(aspect, DashboardInfoClass):
+        elif isinstance(aspect, ChartInfoClass) or isinstance(
+            aspect, DashboardInfoClass
+        ):
             urn = Urn.from_string(entity_urn)
             self.add_edge(
                 entity_urn,
datahub/sdk/dataset.py CHANGED
@@ -72,9 +72,9 @@ UpstreamLineageInputType: TypeAlias = Union[
 def _parse_upstream_input(
     upstream_input: UpstreamInputType,
 ) -> Union[models.UpstreamClass, models.FineGrainedLineageClass]:
-    if isinstance(upstream_input, models.UpstreamClass):
-        return upstream_input
-    elif isinstance(upstream_input, models.FineGrainedLineageClass):
+    if isinstance(upstream_input, models.UpstreamClass) or isinstance(
+        upstream_input, models.FineGrainedLineageClass
+    ):
         return upstream_input
     elif isinstance(upstream_input, (str, DatasetUrn)):
         return models.UpstreamClass(
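Both the duckdb_lite and sdk/dataset.py changes above merge two `isinstance` branches into one `or` expression. Python also accepts a tuple of types in a single `isinstance` call, which expresses the same check more compactly; a tiny sketch with stand-in classes:

    class ChartInfo: ...
    class DashboardInfo: ...

    aspect = DashboardInfo()

    # The `or` of two isinstance calls and the tuple form are equivalent.
    assert (isinstance(aspect, ChartInfo) or isinstance(aspect, DashboardInfo)) == isinstance(
        aspect, (ChartInfo, DashboardInfo)
    )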