acryl-datahub 1.0.0rc6__py3-none-any.whl → 1.0.0rc8__py3-none-any.whl
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/METADATA +2490 -2490
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/RECORD +74 -74
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/cli/docker_cli.py +1 -1
- datahub/cli/iceberg_cli.py +1 -1
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -1
- datahub/ingestion/graph/client.py +16 -7
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_api.py +2 -1
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/dbt/dbt_common.py +1 -1
- datahub/ingestion/source/dremio/dremio_api.py +3 -3
- datahub/ingestion/source/dremio/dremio_aspects.py +2 -1
- datahub/ingestion/source/file.py +5 -2
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +11 -14
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +31 -20
- datahub/ingestion/source/identity/okta.py +1 -3
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/source_connectors.py +4 -7
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +54 -32
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mode.py +1 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +6 -3
- datahub/ingestion/source/openapi_parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/pulsar.py +2 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +2 -1
- datahub/ingestion/source/s3/config.py +2 -4
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +1 -1
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +2 -2
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/sql_common.py +2 -2
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +4 -2
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/superset.py +65 -37
- datahub/ingestion/source/tableau/tableau.py +3 -6
- datahub/ingestion/source/tableau/tableau_common.py +2 -1
- datahub/lite/duckdb_lite.py +5 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/sdk/dataset.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0rc6.dist-info → acryl_datahub-1.0.0rc8.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/superset.py
CHANGED

@@ -36,9 +36,6 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.sql.sql_types import resolve_sql_type
-from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
-    get_platform_from_sqlalchemy_uri,
-)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -71,7 +68,12 @@ from datahub.metadata.schema_classes import (
     ChartInfoClass,
     ChartTypeClass,
     DashboardInfoClass,
+    DatasetLineageTypeClass,
     DatasetPropertiesClass,
+    GlobalTagsClass,
+    TagAssociationClass,
+    UpstreamClass,
+    UpstreamLineageClass,
 )
 from datahub.utilities import config_clean
 from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -287,26 +289,6 @@ class SupersetSource(StatefulIngestionSourceBase):
 
             current_page += 1
 
-    @lru_cache(maxsize=None)
-    def get_platform_from_database_id(self, database_id):
-        database_response = self.session.get(
-            f"{self.config.connect_uri}/api/v1/database/{database_id}"
-        ).json()
-        sqlalchemy_uri = database_response.get("result", {}).get("sqlalchemy_uri")
-        if sqlalchemy_uri is None:
-            platform_name = database_response.get("result", {}).get(
-                "backend", "external"
-            )
-        else:
-            platform_name = get_platform_from_sqlalchemy_uri(sqlalchemy_uri)
-        if platform_name == "awsathena":
-            return "athena"
-        if platform_name == "clickhousedb":
-            return "clickhouse"
-        if platform_name == "postgresql":
-            return "postgres"
-        return platform_name
-
     @lru_cache(maxsize=None)
     def get_dataset_info(self, dataset_id: int) -> dict:
         dataset_response = self.session.get(
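Reviewer note: the removed get_platform_from_database_id and the retained get_dataset_info both apply @lru_cache(maxsize=None) to instance methods. That pattern caches on (self, argument), so the cache holds a strong reference to the source instance for as long as it lives. A minimal sketch of the behavior (the Source class and its get method here are hypothetical, not DataHub code):

from functools import lru_cache


class Source:
    @lru_cache(maxsize=None)  # the cache key includes `self`
    def get(self, dataset_id: int) -> str:
        return f"dataset-{dataset_id}"


s = Source()
s.get(1)  # miss: computed and stored
s.get(1)  # hit: served from the cache
# The cache keeps a strong reference to `s`, so the instance is not
# garbage-collected while the cache is alive.
print(Source.get.cache_info())  # CacheInfo(hits=1, misses=1, maxsize=None, currsize=1)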
@@ -323,8 +305,6 @@ class SupersetSource(StatefulIngestionSourceBase):
         schema_name = dataset_response.get("result", {}).get("schema")
         table_name = dataset_response.get("result", {}).get("table_name")
         database_id = dataset_response.get("result", {}).get("database", {}).get("id")
-        platform = self.get_platform_from_database_id(database_id)
-
         database_name = (
             dataset_response.get("result", {}).get("database", {}).get("database_name")
         )
@@ -333,21 +313,24 @@ class SupersetSource(StatefulIngestionSourceBase):
         # Druid do not have a database concept and has a limited schema concept, but they are nonetheless reported
         # from superset. There is only one database per platform instance, and one schema named druid, so it would be
         # redundant to systemically store them both in the URN.
-        if platform in platform_without_databases:
+        if platform_instance in platform_without_databases:
             database_name = None
 
-        if platform == "druid" and schema_name == "druid":
+        if platform_instance == "druid" and schema_name == "druid":
             # Follow DataHub's druid source convention.
             schema_name = None
 
-
+        # If the information about the datasource is already contained in the dataset response,
+        # can just return the urn directly
+        if table_name and database_id:
             return make_dataset_urn(
-                platform=platform,
+                platform=platform_instance,
                 name=".".join(
                     name for name in [database_name, schema_name, table_name] if name
                 ),
                 env=self.config.env,
             )
+
         raise ValueError("Could not construct dataset URN")
 
     def construct_dashboard_from_api_data(
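For context on the name=".".join(...) expression in the hunk above: filtering out falsy parts lets one expression build one-, two-, or three-part dataset names, which is how the Druid special case collapses cleanly. A quick illustration with made-up values:

# Hypothetical name parts; None entries are dropped by the filter.
parts_full = ["examples_db", "public", "orders"]
parts_druid = [None, None, "wikipedia"]  # druid: no database, no schema

print(".".join(name for name in parts_full if name))   # examples_db.public.orders
print(".".join(name for name in parts_druid if name))  # wikipedia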
@@ -469,10 +452,16 @@ class SupersetSource(StatefulIngestionSourceBase):
         chart_url = f"{self.config.display_uri}{chart_data.get('url', '')}"
 
         datasource_id = chart_data.get("datasource_id")
-        dataset_response = self.get_dataset_info(datasource_id)
-        datasource_urn = self.get_datasource_urn_from_id(
-            dataset_response, self.platform
-        )
+        if not datasource_id:
+            logger.debug(
+                f"chart {chart_data['id']} has no datasource_id, skipping fetching dataset info"
+            )
+            datasource_urn = None
+        else:
+            dataset_response = self.get_dataset_info(datasource_id)
+            datasource_urn = self.get_datasource_urn_from_id(
+                dataset_response, self.platform
+            )
 
         params = json.loads(chart_data.get("params", "{}"))
         metrics = [
@@ -588,25 +577,61 @@ class SupersetSource(StatefulIngestionSourceBase):
     ) -> DatasetSnapshot:
         dataset_response = self.get_dataset_info(dataset_data.get("id"))
         dataset = SupersetDataset(**dataset_response["result"])
+
         datasource_urn = self.get_datasource_urn_from_id(
             dataset_response, self.platform
         )
+        dataset_url = f"{self.config.display_uri}{dataset_response.get('result', {}).get('url', '')}"
 
-        dataset_url = f"{self.config.display_uri}{dataset_response.get('result', {}).get('url', '')}"
+        upstream_warehouse_platform = (
+            dataset_response.get("result", {}).get("database", {}).get("backend")
+        )
+
+        # Preset has a way of naming their platforms differently than
+        # how datahub names them, so map the platform name to the correct naming
+        warehouse_naming = {
+            "awsathena": "athena",
+            "clickhousedb": "clickhouse",
+            "postgresql": "postgres",
+        }
+
+        if upstream_warehouse_platform in warehouse_naming:
+            upstream_warehouse_platform = warehouse_naming[upstream_warehouse_platform]
+
+        # TODO: Categorize physical vs virtual upstream dataset
+        # mark all upstream dataset as physical for now, in the future we would ideally like
+        # to differentiate physical vs virtual upstream datasets
+        tag_urn = f"urn:li:tag:{self.platform}:physical"
+        upstream_dataset = self.get_datasource_urn_from_id(
+            dataset_response, upstream_warehouse_platform
+        )
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(
+                    type=DatasetLineageTypeClass.TRANSFORMED,
+                    dataset=upstream_dataset,
+                    properties={"externalUrl": dataset_url},
+                )
+            ]
+        )
 
         dataset_info = DatasetPropertiesClass(
             name=dataset.table_name,
             description="",
-            lastModified=TimeStamp(time=dataset.modified_ts)
-            if dataset.modified_ts
-            else None,
+            lastModified=(
+                TimeStamp(time=dataset.modified_ts) if dataset.modified_ts else None
+            ),
             externalUrl=dataset_url,
         )
+        global_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
+
         aspects_items: List[Any] = []
         aspects_items.extend(
             [
                 self.gen_schema_metadata(dataset_response),
                 dataset_info,
+                upstream_lineage,
+                global_tags,
             ]
         )
 
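A side note on the warehouse_naming mapping introduced above: the `if key in mapping` guard is behavior-equivalent to a dict.get lookup that falls back to the original value, which some reviewers may prefer for symmetry. A small sketch (normalize_platform is a hypothetical helper, not the shipped code):

warehouse_naming = {
    "awsathena": "athena",
    "clickhousedb": "clickhouse",
    "postgresql": "postgres",
}


def normalize_platform(backend: str) -> str:
    # Fall back to the raw backend name when no alias is defined.
    return warehouse_naming.get(backend, backend)


assert normalize_platform("awsathena") == "athena"
assert normalize_platform("snowflake") == "snowflake"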
@@ -614,6 +639,9 @@ class SupersetSource(StatefulIngestionSourceBase):
             urn=datasource_urn,
             aspects=aspects_items,
         )
+
+        logger.info(f"Constructed dataset {datasource_urn}")
+
         return dataset_snapshot
 
     def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
datahub/ingestion/source/tableau/tableau.py
CHANGED

@@ -1562,8 +1562,9 @@ class TableauSiteSource:
         query: str,
         connection_type: str,
         page_size: int,
-        query_filter: dict = {},
+        query_filter: Optional[dict] = None,
     ) -> Iterable[dict]:
+        query_filter = query_filter or {}
         query_filter = optimize_query_filter(query_filter)
 
         # Calls the get_connection_object_page function to get the objects,
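This hunk, and the matching ones in tableau_common.py, duckdb_lite.py, lite_util.py, and memory_footprint.py below, all replace a mutable default argument with None plus an `x = x or ...` reset. The motivation is the standard Python pitfall: a default value is evaluated once, at function definition time, and is then shared across every call. A minimal demonstration (broken and fixed are hypothetical names):

from typing import List, Optional


def broken(item: int, acc: List[int] = []) -> List[int]:
    acc.append(item)  # mutates the single shared default list
    return acc


def fixed(item: int, acc: Optional[List[int]] = None) -> List[int]:
    acc = acc or []  # fresh list per call, same idiom as the diff
    acc.append(item)
    return acc


print(broken(1), broken(2))  # [1, 2] [1, 2]  <- state leaked across calls
print(fixed(1), fixed(2))    # [1] [2]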
@@ -1910,11 +1911,7 @@ class TableauSiteSource:
                 if upstream_col.get(c.TABLE)
                 else None
             )
-            if (
-                name
-                and upstream_table_id
-                and upstream_table_id in table_id_to_urn.keys()
-            ):
+            if name and upstream_table_id and upstream_table_id in table_id_to_urn:
                 parent_dataset_urn = table_id_to_urn[upstream_table_id]
                 if (
                     self.is_snowflake_urn(parent_dataset_urn)
datahub/ingestion/source/tableau/tableau_common.py
CHANGED

@@ -514,7 +514,8 @@ FIELD_TYPE_MAPPING = {
 }
 
 
-def get_tags_from_params(params: List[str] = []) -> GlobalTagsClass:
+def get_tags_from_params(params: Optional[List[str]] = None) -> GlobalTagsClass:
+    params = params or []
     tags = [
         TagAssociationClass(tag=builder.make_tag_urn(tag.upper()))
         for tag in params
datahub/lite/duckdb_lite.py
CHANGED
@@ -284,9 +284,10 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
         self,
         query: str,
         flavor: SearchFlavor,
-        aspects: List[str] = [],
+        aspects: Optional[List[str]] = None,
         snippet: bool = True,
     ) -> Iterable[Searchable]:
+        aspects = aspects or []
         if flavor == SearchFlavor.FREE_TEXT:
             base_query = f"SELECT distinct(urn), 'urn', NULL from metadata_aspect_v2 where urn ILIKE '%{query}%' UNION SELECT urn, aspect_name, metadata from metadata_aspect_v2 where metadata->>'$.name' ILIKE '%{query}%'"
             for r in self.duckdb_client.execute(base_query).fetchall():
@@ -759,15 +760,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
             entity_id=[str(data_platform_urn), data_platform_instance],
         )
         self._create_edges_from_data_platform_instance(data_platform_instance_urn)
-        elif isinstance(aspect, ChartInfoClass):
-            urn = Urn.from_string(entity_urn)
-            self.add_edge(
-                entity_urn,
-                "name",
-                aspect.title + f" ({urn.get_entity_id()[-1]})",
-                remove_existing=True,
-            )
-        elif isinstance(aspect, DashboardInfoClass):
+        elif isinstance(aspect, ChartInfoClass) or isinstance(
+            aspect, DashboardInfoClass
+        ):
             urn = Urn.from_string(entity_urn)
             self.add_edge(
                 entity_urn,
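As an aside, isinstance accepts a tuple of types, so the merged branch above (and the similar condition in datahub/sdk/dataset.py below) could be written as a single call; the two forms are equivalent. A small sketch with hypothetical stand-in classes:

class ChartInfo:
    pass


class DashboardInfo:
    pass


aspect = DashboardInfo()

# Two-call form, as in the diff:
if isinstance(aspect, ChartInfo) or isinstance(aspect, DashboardInfo):
    print("chart or dashboard")

# Equivalent single call with a tuple of types:
if isinstance(aspect, (ChartInfo, DashboardInfo)):
    print("chart or dashboard")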
datahub/lite/lite_local.py
CHANGED
datahub/lite/lite_util.py
CHANGED
@@ -70,9 +70,10 @@ class DataHubLiteWrapper(DataHubLiteLocal):
         self,
         query: str,
         flavor: SearchFlavor,
-        aspects: List[str] = [],
+        aspects: Optional[List[str]] = None,
         snippet: bool = True,
     ) -> Iterable[Searchable]:
+        aspects = aspects or []
         yield from self.lite.search(query, flavor, aspects, snippet)
 
     def ls(self, path: str) -> List[Browseable]:
@@ -96,10 +97,10 @@ def get_datahub_lite(config_dict: dict, read_only: bool = False) -> "DataHubLite
     lite_type = lite_local_config.type
     try:
         lite_class = lite_registry.get(lite_type)
-    except KeyError:
+    except KeyError as e:
         raise Exception(
             f"Failed to find a registered lite implementation for {lite_type}. Valid values are {[k for k in lite_registry.mapping.keys()]}"
-        )
+        ) from e
 
     lite_specific_config = lite_class.get_config_class().parse_obj(
         lite_local_config.config
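The `raise ... from e` change above enables explicit exception chaining: the original KeyError is preserved as __cause__, so tracebacks show both the registry lookup failure and the friendlier message. A minimal reproduction (the registry contents and get_impl are made up):

registry = {"duckdb": object}


def get_impl(name: str):
    try:
        return registry[name]
    except KeyError as e:
        raise Exception(
            f"Failed to find a registered lite implementation for {name}. "
            f"Valid values are {list(registry)}"
        ) from e  # the traceback now shows the KeyError as the direct cause


try:
    get_impl("sqlite")
except Exception as exc:
    assert isinstance(exc.__cause__, KeyError)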
datahub/sdk/dataset.py
CHANGED
@@ -72,9 +72,9 @@ UpstreamLineageInputType: TypeAlias = Union[
 def _parse_upstream_input(
     upstream_input: UpstreamInputType,
 ) -> Union[models.UpstreamClass, models.FineGrainedLineageClass]:
-    if isinstance(upstream_input, models.UpstreamClass)
-
-
+    if isinstance(upstream_input, models.UpstreamClass) or isinstance(
+        upstream_input, models.FineGrainedLineageClass
+    ):
         return upstream_input
     elif isinstance(upstream_input, (str, DatasetUrn)):
         return models.UpstreamClass(
datahub/utilities/memory_footprint.py
CHANGED

@@ -1,10 +1,10 @@
 from collections import deque
 from itertools import chain
 from sys import getsizeof
-from typing import Any, Iterator
+from typing import Any, Iterator, Optional
 
 
-def total_size(o: Any, handlers: Any = {}) -> int:
+def total_size(o: Any, handlers: Optional[Any] = None) -> int:
     """Returns the approximate memory footprint an object and all of its contents.
     Automatically finds the contents of the following builtin containers and
     their subclasses: tuple, list, deque, dict, set and frozenset.
@@ -14,6 +14,7 @@ def total_size(o: Any, handlers: Any = {}) -> int:
 
     Based on https://github.com/ActiveState/recipe-577504-compute-mem-footprint/blob/master/recipe.py
     """
+    handlers = handlers or {}
 
     def dict_handler(d: dict) -> Iterator[Any]:
         return chain.from_iterable(d.items())
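For reviewers unfamiliar with the recipe total_size is based on: it walks containers recursively, summing sys.getsizeof for each object seen exactly once, with `handlers` mapping container types to functions that yield their children. A condensed sketch of the technique, not the module's exact code (approx_total_size is a hypothetical name, and deque handling is omitted for brevity):

from itertools import chain
from sys import getsizeof


def approx_total_size(o, handlers=None):
    handlers = handlers or {}
    # Default handlers: how to iterate each container's contents.
    all_handlers = {
        dict: lambda d: chain.from_iterable(d.items()),
        list: iter,
        tuple: iter,
        set: iter,
        frozenset: iter,
    }
    all_handlers.update(handlers)
    seen = set()  # track object ids to avoid double-counting shared objects

    def sizeof(obj):
        if id(obj) in seen:
            return 0
        seen.add(id(obj))
        size = getsizeof(obj)
        for typ, handler in all_handlers.items():
            if isinstance(obj, typ):
                size += sum(map(sizeof, handler(obj)))
                break
        return size

    return sizeof(o)


print(approx_total_size({"a": [1, 2, 3], "b": (4, 5)}))  # a few hundred bytes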