acryl-datahub 1.0.0rc6__py3-none-any.whl → 1.0.0rc8__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub may be problematic.

Files changed (74)
  1. {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/METADATA +2490 -2490
  2. {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/RECORD +74 -74
  3. {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/cli/docker_cli.py +1 -1
  6. datahub/cli/iceberg_cli.py +1 -1
  7. datahub/cli/lite_cli.py +4 -2
  8. datahub/cli/specific/dataproduct_cli.py +1 -1
  9. datahub/configuration/git.py +1 -3
  10. datahub/configuration/kafka.py +1 -1
  11. datahub/ingestion/fs/s3_fs.py +2 -2
  12. datahub/ingestion/glossary/classification_mixin.py +1 -1
  13. datahub/ingestion/graph/client.py +16 -7
  14. datahub/ingestion/graph/entity_versioning.py +3 -3
  15. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  16. datahub/ingestion/source/abs/config.py +2 -4
  17. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  18. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +1 -1
  19. datahub/ingestion/source/cassandra/cassandra_api.py +2 -1
  20. datahub/ingestion/source/csv_enricher.py +3 -3
  21. datahub/ingestion/source/dbt/dbt_common.py +1 -1
  22. datahub/ingestion/source/dremio/dremio_api.py +3 -3
  23. datahub/ingestion/source/dremio/dremio_aspects.py +2 -1
  24. datahub/ingestion/source/file.py +5 -2
  25. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  26. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  27. datahub/ingestion/source/ge_data_profiler.py +11 -14
  28. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  29. datahub/ingestion/source/iceberg/iceberg_common.py +31 -20
  30. datahub/ingestion/source/identity/okta.py +1 -3
  31. datahub/ingestion/source/kafka/kafka.py +1 -1
  32. datahub/ingestion/source/kafka_connect/source_connectors.py +4 -7
  33. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  34. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  35. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  36. datahub/ingestion/source/looker/lookml_source.py +3 -2
  37. datahub/ingestion/source/metabase.py +54 -32
  38. datahub/ingestion/source/metadata/lineage.py +2 -2
  39. datahub/ingestion/source/mode.py +1 -1
  40. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  41. datahub/ingestion/source/nifi.py +6 -3
  42. datahub/ingestion/source/openapi_parser.py +2 -2
  43. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  44. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  45. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  46. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  47. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  48. datahub/ingestion/source/pulsar.py +2 -2
  49. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  50. datahub/ingestion/source/redash.py +2 -1
  51. datahub/ingestion/source/s3/config.py +2 -4
  52. datahub/ingestion/source/s3/source.py +20 -41
  53. datahub/ingestion/source/salesforce.py +1 -1
  54. datahub/ingestion/source/schema_inference/object.py +1 -1
  55. datahub/ingestion/source/sigma/sigma.py +1 -1
  56. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  57. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  58. datahub/ingestion/source/sql/athena.py +2 -2
  59. datahub/ingestion/source/sql/druid.py +1 -5
  60. datahub/ingestion/source/sql/sql_common.py +2 -2
  61. datahub/ingestion/source/sql/sql_types.py +2 -2
  62. datahub/ingestion/source/sql/teradata.py +4 -2
  63. datahub/ingestion/source/sql/trino.py +2 -2
  64. datahub/ingestion/source/superset.py +65 -37
  65. datahub/ingestion/source/tableau/tableau.py +3 -6
  66. datahub/ingestion/source/tableau/tableau_common.py +2 -1
  67. datahub/lite/duckdb_lite.py +5 -10
  68. datahub/lite/lite_local.py +1 -1
  69. datahub/lite/lite_util.py +4 -3
  70. datahub/sdk/dataset.py +3 -3
  71. datahub/utilities/memory_footprint.py +3 -2
  72. {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/LICENSE +0 -0
  73. {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/entry_points.txt +0 -0
  74. {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/superset.py CHANGED
@@ -36,9 +36,6 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.sql.sql_types import resolve_sql_type
-from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
-    get_platform_from_sqlalchemy_uri,
-)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -71,7 +68,12 @@ from datahub.metadata.schema_classes import (
     ChartInfoClass,
     ChartTypeClass,
     DashboardInfoClass,
+    DatasetLineageTypeClass,
     DatasetPropertiesClass,
+    GlobalTagsClass,
+    TagAssociationClass,
+    UpstreamClass,
+    UpstreamLineageClass,
 )
 from datahub.utilities import config_clean
 from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -287,26 +289,6 @@ class SupersetSource(StatefulIngestionSourceBase):
 
             current_page += 1
 
-    @lru_cache(maxsize=None)
-    def get_platform_from_database_id(self, database_id):
-        database_response = self.session.get(
-            f"{self.config.connect_uri}/api/v1/database/{database_id}"
-        ).json()
-        sqlalchemy_uri = database_response.get("result", {}).get("sqlalchemy_uri")
-        if sqlalchemy_uri is None:
-            platform_name = database_response.get("result", {}).get(
-                "backend", "external"
-            )
-        else:
-            platform_name = get_platform_from_sqlalchemy_uri(sqlalchemy_uri)
-        if platform_name == "awsathena":
-            return "athena"
-        if platform_name == "clickhousedb":
-            return "clickhouse"
-        if platform_name == "postgresql":
-            return "postgres"
-        return platform_name
-
     @lru_cache(maxsize=None)
     def get_dataset_info(self, dataset_id: int) -> dict:
         dataset_response = self.session.get(
@@ -323,8 +305,6 @@ class SupersetSource(StatefulIngestionSourceBase):
         schema_name = dataset_response.get("result", {}).get("schema")
         table_name = dataset_response.get("result", {}).get("table_name")
         database_id = dataset_response.get("result", {}).get("database", {}).get("id")
-        platform = self.get_platform_from_database_id(database_id)
-
         database_name = (
             dataset_response.get("result", {}).get("database", {}).get("database_name")
         )
@@ -333,21 +313,24 @@ class SupersetSource(StatefulIngestionSourceBase):
         # Druid do not have a database concept and has a limited schema concept, but they are nonetheless reported
         # from superset. There is only one database per platform instance, and one schema named druid, so it would be
         # redundant to systemically store them both in the URN.
-        if platform in platform_without_databases:
+        if platform_instance in platform_without_databases:
             database_name = None
 
-        if platform == "druid" and schema_name == "druid":
+        if platform_instance == "druid" and schema_name == "druid":
             # Follow DataHub's druid source convention.
             schema_name = None
 
-        if database_id and table_name:
+        # If the information about the datasource is already contained in the dataset response,
+        # can just return the urn directly
+        if table_name and database_id:
             return make_dataset_urn(
-                platform=platform,
+                platform=platform_instance,
                 name=".".join(
                     name for name in [database_name, schema_name, table_name] if name
                 ),
                 env=self.config.env,
             )
+
         raise ValueError("Could not construct dataset URN")
 
     def construct_dashboard_from_api_data(
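For reference, make_dataset_urn (from datahub.emitter.mce_builder) produces the URN that the code above returns; a minimal sketch with hypothetical database/schema/table values, not taken from this release:

from datahub.emitter.mce_builder import make_dataset_urn

# Hypothetical values for illustration; the source assembles them from the
# Superset dataset API response.
urn = make_dataset_urn(
    platform="postgres",
    name=".".join(n for n in ["analytics_db", "public", "orders"] if n),
    env="PROD",
)
# -> urn:li:dataset:(urn:li:dataPlatform:postgres,analytics_db.public.orders,PROD)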
@@ -469,10 +452,16 @@ class SupersetSource(StatefulIngestionSourceBase):
         chart_url = f"{self.config.display_uri}{chart_data.get('url', '')}"
 
         datasource_id = chart_data.get("datasource_id")
-        dataset_response = self.get_dataset_info(datasource_id)
-        datasource_urn = self.get_datasource_urn_from_id(
-            dataset_response, self.platform
-        )
+        if not datasource_id:
+            logger.debug(
+                f"chart {chart_data['id']} has no datasource_id, skipping fetching dataset info"
+            )
+            datasource_urn = None
+        else:
+            dataset_response = self.get_dataset_info(datasource_id)
+            datasource_urn = self.get_datasource_urn_from_id(
+                dataset_response, self.platform
+            )
 
         params = json.loads(chart_data.get("params", "{}"))
         metrics = [
@@ -588,25 +577,61 @@ class SupersetSource(StatefulIngestionSourceBase):
     ) -> DatasetSnapshot:
         dataset_response = self.get_dataset_info(dataset_data.get("id"))
         dataset = SupersetDataset(**dataset_response["result"])
+
         datasource_urn = self.get_datasource_urn_from_id(
             dataset_response, self.platform
         )
+        dataset_url = f"{self.config.display_uri}{dataset_response.get('result', {}).get('url', '')}"
 
-        dataset_url = f"{self.config.display_uri}{dataset.explore_url or ''}"
+        upstream_warehouse_platform = (
+            dataset_response.get("result", {}).get("database", {}).get("backend")
+        )
+
+        # Preset has a way of naming their platforms differently than
+        # how datahub names them, so map the platform name to the correct naming
+        warehouse_naming = {
+            "awsathena": "athena",
+            "clickhousedb": "clickhouse",
+            "postgresql": "postgres",
+        }
+
+        if upstream_warehouse_platform in warehouse_naming:
+            upstream_warehouse_platform = warehouse_naming[upstream_warehouse_platform]
+
+        # TODO: Categorize physical vs virtual upstream dataset
+        # mark all upstream dataset as physical for now, in the future we would ideally like
+        # to differentiate physical vs virtual upstream datasets
+        tag_urn = f"urn:li:tag:{self.platform}:physical"
+        upstream_dataset = self.get_datasource_urn_from_id(
+            dataset_response, upstream_warehouse_platform
+        )
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(
+                    type=DatasetLineageTypeClass.TRANSFORMED,
+                    dataset=upstream_dataset,
+                    properties={"externalUrl": dataset_url},
+                )
+            ]
+        )
 
         dataset_info = DatasetPropertiesClass(
             name=dataset.table_name,
             description="",
-            lastModified=TimeStamp(time=dataset.modified_ts)
-            if dataset.modified_ts
-            else None,
+            lastModified=(
+                TimeStamp(time=dataset.modified_ts) if dataset.modified_ts else None
+            ),
             externalUrl=dataset_url,
         )
+        global_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
+
         aspects_items: List[Any] = []
         aspects_items.extend(
             [
                 self.gen_schema_metadata(dataset_response),
                 dataset_info,
+                upstream_lineage,
+                global_tags,
             ]
         )
 
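The lineage and tag aspects added above use the standard constructors from datahub.metadata.schema_classes; a minimal standalone sketch with a hypothetical upstream URN and tag, shown only to illustrate the shape of the objects:

from datahub.metadata.schema_classes import (
    DatasetLineageTypeClass,
    GlobalTagsClass,
    TagAssociationClass,
    UpstreamClass,
    UpstreamLineageClass,
)

# Hypothetical upstream warehouse table; the source resolves the real one from
# the "backend" reported by Superset, remapped via warehouse_naming above.
upstream_urn = "urn:li:dataset:(urn:li:dataPlatform:postgres,analytics_db.public.orders,PROD)"

upstream_lineage = UpstreamLineageClass(
    upstreams=[
        UpstreamClass(
            type=DatasetLineageTypeClass.TRANSFORMED,
            dataset=upstream_urn,
        )
    ]
)
global_tags = GlobalTagsClass(
    tags=[TagAssociationClass(tag="urn:li:tag:superset:physical")]
)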
@@ -614,6 +639,9 @@ class SupersetSource(StatefulIngestionSourceBase):
             urn=datasource_urn,
             aspects=aspects_items,
         )
+
+        logger.info(f"Constructed dataset {datasource_urn}")
+
         return dataset_snapshot
 
     def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
datahub/ingestion/source/tableau/tableau.py CHANGED
@@ -1562,8 +1562,9 @@ class TableauSiteSource:
         query: str,
         connection_type: str,
         page_size: int,
-        query_filter: dict = {},
+        query_filter: Optional[dict] = None,
     ) -> Iterable[dict]:
+        query_filter = query_filter or {}
         query_filter = optimize_query_filter(query_filter)
 
         # Calls the get_connection_object_page function to get the objects,
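This Optional-plus-fallback pattern recurs throughout the release (tableau_common, duckdb_lite, lite_local, lite_util, and memory_footprint below); it avoids Python's shared mutable default arguments. A minimal standalone sketch with hypothetical helper functions:

from typing import List, Optional

def append_shared(item: int, acc: List[int] = []) -> List[int]:
    # The [] default is created once at function definition time, so every
    # call without an explicit acc mutates the same list.
    acc.append(item)
    return acc

def append_fresh(item: int, acc: Optional[List[int]] = None) -> List[int]:
    acc = acc or []  # new list per call when none is supplied
    acc.append(item)
    return acc

print(append_shared(1), append_shared(2))  # [1, 2] [1, 2] -- same object twice
print(append_fresh(1), append_fresh(2))    # [1] [2]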
@@ -1910,11 +1911,7 @@ class TableauSiteSource:
                 if upstream_col.get(c.TABLE)
                 else None
             )
-            if (
-                name
-                and upstream_table_id
-                and upstream_table_id in table_id_to_urn.keys()
-            ):
+            if name and upstream_table_id and upstream_table_id in table_id_to_urn:
                 parent_dataset_urn = table_id_to_urn[upstream_table_id]
                 if (
                     self.is_snowflake_urn(parent_dataset_urn)
datahub/ingestion/source/tableau/tableau_common.py CHANGED
@@ -514,7 +514,8 @@ FIELD_TYPE_MAPPING = {
 }
 
 
-def get_tags_from_params(params: List[str] = []) -> GlobalTagsClass:
+def get_tags_from_params(params: Optional[List[str]] = None) -> GlobalTagsClass:
+    params = params or []
     tags = [
         TagAssociationClass(tag=builder.make_tag_urn(tag.upper()))
         for tag in params
datahub/lite/duckdb_lite.py CHANGED
@@ -284,9 +284,10 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
         self,
         query: str,
         flavor: SearchFlavor,
-        aspects: List[str] = [],
+        aspects: Optional[List[str]] = None,
         snippet: bool = True,
     ) -> Iterable[Searchable]:
+        aspects = aspects or []
         if flavor == SearchFlavor.FREE_TEXT:
             base_query = f"SELECT distinct(urn), 'urn', NULL from metadata_aspect_v2 where urn ILIKE '%{query}%' UNION SELECT urn, aspect_name, metadata from metadata_aspect_v2 where metadata->>'$.name' ILIKE '%{query}%'"
             for r in self.duckdb_client.execute(base_query).fetchall():
@@ -759,15 +760,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
                 entity_id=[str(data_platform_urn), data_platform_instance],
             )
             self._create_edges_from_data_platform_instance(data_platform_instance_urn)
-        elif isinstance(aspect, ChartInfoClass):
-            urn = Urn.from_string(entity_urn)
-            self.add_edge(
-                entity_urn,
-                "name",
-                aspect.title + f" ({urn.get_entity_id()[-1]})",
-                remove_existing=True,
-            )
-        elif isinstance(aspect, DashboardInfoClass):
+        elif isinstance(aspect, ChartInfoClass) or isinstance(
+            aspect, DashboardInfoClass
+        ):
             urn = Urn.from_string(entity_urn)
             self.add_edge(
                 entity_urn,
datahub/lite/lite_local.py CHANGED
@@ -90,7 +90,7 @@ class DataHubLiteLocal(Generic[LiteConfig], Closeable, metaclass=ABCMeta):
         self,
         query: str,
         flavor: SearchFlavor,
-        aspects: List[str] = [],
+        aspects: Optional[List[str]] = None,
         snippet: bool = True,
     ) -> Iterable[Searchable]:
         pass
datahub/lite/lite_util.py CHANGED
@@ -70,9 +70,10 @@ class DataHubLiteWrapper(DataHubLiteLocal):
         self,
         query: str,
         flavor: SearchFlavor,
-        aspects: List[str] = [],
+        aspects: Optional[List[str]] = None,
         snippet: bool = True,
     ) -> Iterable[Searchable]:
+        aspects = aspects or []
         yield from self.lite.search(query, flavor, aspects, snippet)
 
     def ls(self, path: str) -> List[Browseable]:
@@ -96,10 +97,10 @@ def get_datahub_lite(config_dict: dict, read_only: bool = False) -> "DataHubLite
     lite_type = lite_local_config.type
     try:
         lite_class = lite_registry.get(lite_type)
-    except KeyError:
+    except KeyError as e:
         raise Exception(
             f"Failed to find a registered lite implementation for {lite_type}. Valid values are {[k for k in lite_registry.mapping.keys()]}"
-        )
+        ) from e
 
     lite_specific_config = lite_class.get_config_class().parse_obj(
         lite_local_config.config
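The `from e` added above chains the original KeyError as the new exception's __cause__, so the failed lookup stays visible in the traceback; a minimal sketch with a hypothetical registry:

registry = {"duckdb": object()}

def get_lite_impl(name: str):
    try:
        return registry[name]
    except KeyError as e:
        # "from e" records the KeyError as the direct cause instead of the
        # implicit "During handling of the above exception" context.
        raise Exception(f"No lite implementation registered for {name!r}") from e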
datahub/sdk/dataset.py CHANGED
@@ -72,9 +72,9 @@ UpstreamLineageInputType: TypeAlias = Union[
 def _parse_upstream_input(
     upstream_input: UpstreamInputType,
 ) -> Union[models.UpstreamClass, models.FineGrainedLineageClass]:
-    if isinstance(upstream_input, models.UpstreamClass):
-        return upstream_input
-    elif isinstance(upstream_input, models.FineGrainedLineageClass):
+    if isinstance(upstream_input, models.UpstreamClass) or isinstance(
+        upstream_input, models.FineGrainedLineageClass
+    ):
         return upstream_input
     elif isinstance(upstream_input, (str, DatasetUrn)):
         return models.UpstreamClass(
datahub/utilities/memory_footprint.py CHANGED
@@ -1,10 +1,10 @@
 from collections import deque
 from itertools import chain
 from sys import getsizeof
-from typing import Any, Iterator
+from typing import Any, Iterator, Optional
 
 
-def total_size(o: Any, handlers: Any = {}) -> int:
+def total_size(o: Any, handlers: Optional[Any] = None) -> int:
     """Returns the approximate memory footprint an object and all of its contents.
     Automatically finds the contents of the following builtin containers and
     their subclasses: tuple, list, deque, dict, set and frozenset.
@@ -14,6 +14,7 @@ def total_size(o: Any, handlers: Any = {}) -> int:
 
     Based on https://github.com/ActiveState/recipe-577504-compute-mem-footprint/blob/master/recipe.py
     """
+    handlers = handlers or {}
 
     def dict_handler(d: dict) -> Iterator[Any]:
         return chain.from_iterable(d.items())
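For reference, total_size follows builtin containers on its own and, per the recipe it is based on, accepts a handlers mapping from a type to a function that yields an object's contents; a small usage sketch (the Node class and its handler are hypothetical):

from datahub.utilities.memory_footprint import total_size

class Node:
    def __init__(self, children):
        self.children = children

tree = Node([Node([]), Node([])])

print(total_size({"a": [1, 2, 3]}))  # builtin containers handled automatically
print(total_size(tree, handlers={Node: lambda n: n.children}))  # follow children too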