acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.0.0.1rc2__py3-none-any.whl
This diff represents the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/METADATA +2486 -2487
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/RECORD +64 -49
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/entry_points.txt +2 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -28
- datahub/emitter/request_helper.py +19 -14
- datahub/ingestion/api/source.py +6 -2
- datahub/ingestion/api/source_helpers.py +6 -2
- datahub/ingestion/extractor/schema_util.py +1 -0
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
- datahub/ingestion/source/common/subtypes.py +15 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
- datahub/ingestion/source/dbt/dbt_common.py +6 -4
- datahub/ingestion/source/dbt/dbt_core.py +4 -6
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_source.py +96 -117
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +394 -0
- datahub/ingestion/source/hex/constants.py +3 -0
- datahub/ingestion/source/hex/hex.py +167 -0
- datahub/ingestion/source/hex/mapper.py +372 -0
- datahub/ingestion/source/hex/model.py +68 -0
- datahub/ingestion/source/iceberg/iceberg.py +62 -66
- datahub/ingestion/source/mlflow.py +198 -7
- datahub/ingestion/source/mode.py +11 -1
- datahub/ingestion/source/openapi.py +69 -34
- datahub/ingestion/source/powerbi/powerbi.py +29 -23
- datahub/ingestion/source/s3/source.py +11 -0
- datahub/ingestion/source/slack/slack.py +399 -82
- datahub/ingestion/source/superset.py +138 -22
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1055 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
- datahub/metadata/_schema_classes.py +472 -1
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/schema.avsc +311 -2
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +30 -0
- datahub/metadata/schemas/QueryProperties.avsc +20 -0
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/sdk/dataset.py +122 -0
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +27 -3
- datahub/sdk/main_client.py +22 -0
- datahub/sdk/search_filters.py +4 -4
- datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
- datahub/sql_parsing/tool_meta_extractor.py +27 -2
- datahub/testing/mcp_diff.py +1 -18
- datahub/ingestion/source/vertexai.py +0 -697
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/superset.py

@@ -23,6 +23,7 @@ from datahub.emitter.mce_builder import (
     make_dataset_urn,
     make_dataset_urn_with_platform_instance,
     make_domain_urn,
+    make_schema_field_urn,
     make_user_urn,
 )
 from datahub.emitter.mcp_builder import add_domain_to_entity_wu
@@ -72,6 +73,9 @@ from datahub.metadata.schema_classes import (
     DashboardInfoClass,
     DatasetLineageTypeClass,
     DatasetPropertiesClass,
+    FineGrainedLineageClass,
+    FineGrainedLineageDownstreamTypeClass,
+    FineGrainedLineageUpstreamTypeClass,
     GlobalTagsClass,
     OwnerClass,
     OwnershipClass,
@@ -80,6 +84,10 @@ from datahub.metadata.schema_classes import (
     UpstreamClass,
     UpstreamLineageClass,
 )
+from datahub.sql_parsing.sqlglot_lineage import (
+    SqlParsingResult,
+    create_lineage_sql_parsed_result,
+)
 from datahub.utilities import config_clean
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -181,6 +189,10 @@ class SupersetConfig(
     provider: str = Field(default="db", description="Superset provider.")
     options: Dict = Field(default={}, description="")

+    timeout: int = Field(
+        default=10, description="Timeout of single API call to superset."
+    )
+
     # TODO: Check and remove this if no longer needed.
     # Config database_alias is removed from sql sources.
     database_alias: Dict[str, str] = Field(
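The new timeout field rides on the pydantic-based SupersetConfig like any other option. A minimal sketch of setting it (the connection values below are hypothetical placeholders, not taken from this diff):

    from datahub.ingestion.source.superset import SupersetConfig

    # Hypothetical recipe values; "timeout" is the newly added field.
    config = SupersetConfig.parse_obj(
        {
            "connect_uri": "http://localhost:8088",
            "username": "admin",
            "password": "admin",
            "timeout": 30,  # seconds per Superset API call (default: 10)
        }
    )
    assert config.timeout == 30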
@@ -285,13 +297,16 @@ class SupersetSource(StatefulIngestionSourceBase):
             }
         )

-        # Test the connection
         test_response = requests_session.get(
-            f"{self.config.connect_uri}/api/v1/dashboard/"
+            f"{self.config.connect_uri}/api/v1/dashboard/",
+            timeout=self.config.timeout,
         )
-        if test_response.status_code
-
-        #
+        if test_response.status_code != 200:
+            # throw an error and terminate ingestion,
+            # cannot proceed without access token
+            logger.error(
+                f"Failed to log in to Superset with status: {test_response.status_code}"
+            )
         return requests_session

     def paginate_entity_api_results(self, entity_type, page_size=100):
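The practical effect of threading timeout through these calls: without it, requests can block indefinitely on an unresponsive Superset host; with it, the call fails fast. A generic illustration (the URL is a placeholder):

    import requests

    try:
        requests.get("http://localhost:8088/api/v1/dashboard/", timeout=10)
    except requests.exceptions.Timeout:
        # raised after ~10s instead of hanging the ingestion run
        pass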
@@ -302,6 +317,7 @@ class SupersetSource(StatefulIngestionSourceBase):
             response = self.session.get(
                 f"{self.config.connect_uri}/api/v1/{entity_type}",
                 params={"q": f"(page:{current_page},page_size:{page_size})"},
+                timeout=self.config.timeout,
             )

             if response.status_code != 200:
@@ -339,10 +355,11 @@ class SupersetSource(StatefulIngestionSourceBase):
     def get_dataset_info(self, dataset_id: int) -> dict:
         dataset_response = self.session.get(
             f"{self.config.connect_uri}/api/v1/dataset/{dataset_id}",
+            timeout=self.config.timeout,
         )
         if dataset_response.status_code != 200:
             logger.warning(f"Failed to get dataset info: {dataset_response.text}")
-
+            return {}
         return dataset_response.json()

     def get_datasource_urn_from_id(
@@ -393,8 +410,9 @@ class SupersetSource(StatefulIngestionSourceBase):
         )

         modified_actor = f"urn:li:corpuser:{self.owner_info.get((dashboard_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+        now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
         modified_ts = int(
-            dp.parse(dashboard_data.get("changed_on_utc",
+            dp.parse(dashboard_data.get("changed_on_utc", now)).timestamp() * 1000
         )
         title = dashboard_data.get("dashboard_title", "")
         # note: the API does not currently supply created_by usernames due to a bug
@@ -506,8 +524,9 @@ class SupersetSource(StatefulIngestionSourceBase):
         )

         modified_actor = f"urn:li:corpuser:{self.owner_info.get((chart_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+        now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
         modified_ts = int(
-            dp.parse(chart_data.get("changed_on_utc",
+            dp.parse(chart_data.get("changed_on_utc", now)).timestamp() * 1000
         )
         title = chart_data.get("slice_name", "")

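Both hunks above fall back to the current time when "changed_on_utc" is missing. dateutil can parse the chosen format back because "on" is one of its skip words; a standalone sketch:

    from datetime import datetime

    import dateutil.parser as dp

    # Mirrors the fallback string, e.g. "04:05PM on March 12, 2025".
    now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
    modified_ts = int(dp.parse(now).timestamp() * 1000)
    print(modified_ts)  # epoch milliseconds, at minute precision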
@@ -680,6 +699,88 @@ class SupersetSource(StatefulIngestionSourceBase):
             env=self.config.env,
         )

+    def generate_virtual_dataset_lineage(
+        self,
+        parsed_query_object: SqlParsingResult,
+        datasource_urn: str,
+    ) -> UpstreamLineageClass:
+        cll = (
+            parsed_query_object.column_lineage
+            if parsed_query_object.column_lineage is not None
+            else []
+        )
+
+        fine_grained_lineages: List[FineGrainedLineageClass] = []
+
+        for cll_info in cll:
+            downstream = (
+                [make_schema_field_urn(datasource_urn, cll_info.downstream.column)]
+                if cll_info.downstream and cll_info.downstream.column
+                else []
+            )
+            upstreams = [
+                make_schema_field_urn(column_ref.table, column_ref.column)
+                for column_ref in cll_info.upstreams
+            ]
+            fine_grained_lineages.append(
+                FineGrainedLineageClass(
+                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                    downstreams=downstream,
+                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                    upstreams=upstreams,
+                )
+            )
+
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(
+                    type=DatasetLineageTypeClass.TRANSFORMED,
+                    dataset=input_table_urn,
+                )
+                for input_table_urn in parsed_query_object.in_tables
+            ],
+            fineGrainedLineages=fine_grained_lineages,
+        )
+        return upstream_lineage
+
+    def generate_physical_dataset_lineage(
+        self,
+        dataset_response: dict,
+        upstream_dataset: str,
+        datasource_urn: str,
+    ) -> UpstreamLineageClass:
+        # To generate column level lineage, we can manually decode the metadata
+        # to produce the ColumnLineageInfo
+        columns = dataset_response.get("result", {}).get("columns", [])
+        fine_grained_lineages: List[FineGrainedLineageClass] = []
+
+        for column in columns:
+            column_name = column.get("column_name", "")
+            if not column_name:
+                continue
+
+            downstream = [make_schema_field_urn(datasource_urn, column_name)]
+            upstreams = [make_schema_field_urn(upstream_dataset, column_name)]
+            fine_grained_lineages.append(
+                FineGrainedLineageClass(
+                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                    downstreams=downstream,
+                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                    upstreams=upstreams,
+                )
+            )
+
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(
+                    type=DatasetLineageTypeClass.TRANSFORMED,
+                    dataset=upstream_dataset,
+                )
+            ],
+            fineGrainedLineages=fine_grained_lineages,
+        )
+        return upstream_lineage
+
     def construct_dataset_from_dataset_data(
         self, dataset_data: dict
     ) -> DatasetSnapshot:
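Both new helpers build DataHub's column-level lineage aspect the same way. A sketch of a single fine-grained edge as they shape it (both URNs are hypothetical):

    from datahub.emitter.mce_builder import make_schema_field_urn
    from datahub.metadata.schema_classes import (
        FineGrainedLineageClass,
        FineGrainedLineageDownstreamTypeClass,
        FineGrainedLineageUpstreamTypeClass,
    )

    # Hypothetical URNs: one Superset dataset column fed by one warehouse column.
    datasource_urn = "urn:li:dataset:(urn:li:dataPlatform:superset,sales_summary,PROD)"
    upstream_urn = "urn:li:dataset:(urn:li:dataPlatform:postgres,public.sales,PROD)"

    edge = FineGrainedLineageClass(
        downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
        downstreams=[make_schema_field_urn(datasource_urn, "revenue")],
        upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
        upstreams=[make_schema_field_urn(upstream_urn, "revenue")],
    )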
@@ -692,14 +793,23 @@ class SupersetSource(StatefulIngestionSourceBase):
         dataset_url = f"{self.config.display_uri}{dataset_response.get('result', {}).get('url', '')}"

         modified_actor = f"urn:li:corpuser:{self.owner_info.get((dataset_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+        now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
         modified_ts = int(
-            dp.parse(dataset_data.get("changed_on_utc",
+            dp.parse(dataset_data.get("changed_on_utc", now)).timestamp() * 1000
         )
         last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)

         upstream_warehouse_platform = (
             dataset_response.get("result", {}).get("database", {}).get("backend")
         )
+        upstream_warehouse_db_name = (
+            dataset_response.get("result", {}).get("database", {}).get("database_name")
+        )
+
+        # if we have rendered sql, we always use that and defualt back to regular sql
+        sql = dataset_response.get("result", {}).get(
+            "rendered_sql"
+        ) or dataset_response.get("result", {}).get("sql")

         # Preset has a way of naming their platforms differently than
         # how datahub names them, so map the platform name to the correct naming
@@ -712,22 +822,28 @@ class SupersetSource(StatefulIngestionSourceBase):
         if upstream_warehouse_platform in warehouse_naming:
             upstream_warehouse_platform = warehouse_naming[upstream_warehouse_platform]

-        # TODO: Categorize physical vs virtual upstream dataset
-        # mark all upstream dataset as physical for now, in the future we would ideally like
-        # to differentiate physical vs virtual upstream datasets
-        tag_urn = f"urn:li:tag:{self.platform}:physical"
         upstream_dataset = self.get_datasource_urn_from_id(
             dataset_response, upstream_warehouse_platform
         )
-
-
-
-
-
-
-
-
-
+
+        # Sometimes the field will be null instead of not existing
+        if sql == "null" or not sql:
+            tag_urn = f"urn:li:tag:{self.platform}:physical"
+            upstream_lineage = self.generate_physical_dataset_lineage(
+                dataset_response, upstream_dataset, datasource_urn
+            )
+        else:
+            tag_urn = f"urn:li:tag:{self.platform}:virtual"
+            parsed_query_object = create_lineage_sql_parsed_result(
+                query=sql,
+                default_db=upstream_warehouse_db_name,
+                platform=upstream_warehouse_platform,
+                platform_instance=None,
+                env=self.config.env,
+            )
+            upstream_lineage = self.generate_virtual_dataset_lineage(
+                parsed_query_object, datasource_urn
+            )

         dataset_info = DatasetPropertiesClass(
             name=dataset.table_name,
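For context on the virtual branch above: create_lineage_sql_parsed_result runs DataHub's sqlglot-based parser over the dataset's SQL and returns the SqlParsingResult consumed by generate_virtual_dataset_lineage. A standalone sketch (query and database names are hypothetical):

    from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

    result = create_lineage_sql_parsed_result(
        query="SELECT id, revenue FROM public.sales WHERE revenue > 0",
        default_db="analytics",
        platform="postgres",
        platform_instance=None,
        env="PROD",
    )
    print(result.in_tables)       # upstream table URNs found by the parser
    print(result.column_lineage)  # per-column lineage, when extraction succeeds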