acryl-datahub 1.0.0rc17__py3-none-any.whl → 1.0.0.1rc1__py3-none-any.whl
This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1rc1.dist-info}/METADATA +2476 -2478
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1rc1.dist-info}/RECORD +14 -14
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1rc1.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +16 -13
- datahub/ingestion/source/superset.py +124 -17
- datahub/ingestion/source/vertexai.py +10 -12
- datahub/metadata/schema.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/testing/mcp_diff.py +1 -18
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1rc1.dist-info}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1rc1.dist-info}/top_level.txt +0 -0

{acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1rc1.dist-info}/RECORD
CHANGED

@@ -1,6 +1,6 @@
 datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
-datahub/_version.py,sha256=
+datahub/_version.py,sha256=4sjCSS8UbsxYv_VuzZNtNxudBmSHVgJLBmaqBIjHobs,323
 datahub/entrypoints.py,sha256=2TYgHhs3sCxJlojIHjqfxzt3_ImPwPzq4vBtsUuMqu4,8885
 datahub/errors.py,sha256=w6h8b27j9XlmPbTwqpu7-wgiTrXlHzcnUOnJ_iOrwzo,520
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -216,8 +216,8 @@ datahub/ingestion/source/redash.py,sha256=YxjSad-X_wPmxYH8dJmFz_VCFhiLTCTSlK99Wd
 datahub/ingestion/source/salesforce.py,sha256=CQtDFv1OsbC1vyzNbKOc6GxhFQ5GdYj45hgAF0-oIcw,40487
 datahub/ingestion/source/source_registry.py,sha256=a2mLjJPLkSI-gYCTb_7U7Jo4D8jGknNQ_yScPIihXFk,1208
 datahub/ingestion/source/sql_queries.py,sha256=Ip7UZub7fgMh7P5jL_zJPY7lSkc9GGTy8GJ8lqZrcsE,9502
-datahub/ingestion/source/superset.py,sha256=
-datahub/ingestion/source/vertexai.py,sha256=
+datahub/ingestion/source/superset.py,sha256=HW3oiS6EDdVkSFopqg48e76NkElc8U4I8lEMjIvxrPo,34997
+datahub/ingestion/source/vertexai.py,sha256=_dpT4RSvd1IzucH-grDkD80sdGXume_qA-OLkHex_GM,27600
 datahub/ingestion/source/abs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/abs/config.py,sha256=mBQe0JTaP-Rcv4HnMUUySoYbSr4r3jDEMioxaXHnxXU,6709
 datahub/ingestion/source/abs/datalake_profiler_config.py,sha256=Rkf64evufyVGPiE4VK8QAjzBiJFu85tOGMmJ0lJZ2Og,3600
@@ -447,7 +447,7 @@ datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=pEw2O9xoTSIWDi
 datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
 datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=FBmiONx4EGHWV8RNJT6zHZyntKinPFFyd2oKbTUIbhE,21319
 datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
-datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=
+datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=gX9E1Z_CemAZsuTDmtvqrxY7vBL2da75j7X8Xwhaf8Y,28441
 datahub/ingestion/source/snowflake/snowflake_query.py,sha256=qz_rhRMNCXxHd23bePbb3YxhFgN7eRpV4s6g58hQ5bU,39678
 datahub/ingestion/source/snowflake/snowflake_report.py,sha256=ahea-bwpW6T0iDehGo0Qq_J7wKxPkV61aYHm8bGwDqo,6651
 datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=qkGgk6WdKSPThFjexXHrxUPYiVtzDk2MbGX3b281A4c,26044
@@ -580,7 +580,7 @@ datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1
 datahub/lite/lite_util.py,sha256=Cm6trMTeo0X1fv4nSsW9lC0jqce7Jt-05GhOtIGzsVc,4559
 datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
 datahub/metadata/_schema_classes.py,sha256=WMINRH1eF7TmnGXSrUCVw5mxplZf5wXGy8QCAm4pxTk,994687
-datahub/metadata/schema.avsc,sha256=
+datahub/metadata/schema.avsc,sha256=iIXHOzZeBr7dSUzP2Bl8H3janQ7diKYXZejt5crFneU,743361
 datahub/metadata/schema_classes.py,sha256=X5Jl5EaSxyHdXOQv14pJ5WkQALun4MRpJ4q12wVFE18,1299
 datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
 datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
@@ -726,9 +726,9 @@ datahub/metadata/schemas/DataPlatformInstanceKey.avsc,sha256=nHFRKlg98lfqYyHZEAX
 datahub/metadata/schemas/DataPlatformInstanceProperties.avsc,sha256=4-UrBTtVAR0rKQ4OPt4MVZeFtolXzIajGtyh3KC8-MQ,1623
 datahub/metadata/schemas/DataPlatformKey.avsc,sha256=5Z2adruXKzSucmgCba768UXdsGsYBH9t9DvFF9L9mxo,461
 datahub/metadata/schemas/DataProcessInfo.avsc,sha256=n4Zuk4kpHrHI2BdINhG-OucdCefb2GEsDv5mXQtSWIw,1558
-datahub/metadata/schemas/DataProcessInstanceInput.avsc,sha256=
+datahub/metadata/schemas/DataProcessInstanceInput.avsc,sha256=DmPchn1kjjXKJ8e4EU6I8Bg8iuQkn43UJkTi1O-Zn-8,6328
 datahub/metadata/schemas/DataProcessInstanceKey.avsc,sha256=YSEVtSWql1IZ9AG37HmJZ4118pgi8kVCygI_GqFf3YA,945
-datahub/metadata/schemas/DataProcessInstanceOutput.avsc,sha256=
+datahub/metadata/schemas/DataProcessInstanceOutput.avsc,sha256=xyGBUf3vFHrMLtmZjs6H7FEs-wqmB8Apx5iZubCbkEo,6383
 datahub/metadata/schemas/DataProcessInstanceProperties.avsc,sha256=2qsDFeSA2-ag5IVetgD8mW2k--F6CwmYXM3KOE6edU8,3836
 datahub/metadata/schemas/DataProcessInstanceRelationships.avsc,sha256=VhBpnyGGvO06WEnM6zy4PmjiT0nivRQfkSdJCUgIavw,2358
 datahub/metadata/schemas/DataProcessInstanceRunEvent.avsc,sha256=zwTYULEnpMbqwkLN8NbXW9PQWFG4X6TZkZwTQ1Wb53Y,6713
@@ -932,7 +932,7 @@ datahub/testing/check_str_enum.py,sha256=yqk0XXHOGteN-IGqCp5JHy0Kca13BnI09ZqKc4N
 datahub/testing/compare_metadata_json.py,sha256=mTU5evu7KLS3cx8OLOC1fFxj0eY1J1CGV2PEQZmapos,5361
 datahub/testing/docker_utils.py,sha256=g169iy_jNR_mg0p8X31cChZqjOryutAIHUYLq3xqueY,2415
 datahub/testing/doctest.py,sha256=1_8WEhHZ2eRQtw8vsXKzr9L5zzvs0Tcr6q4mnkyyvtw,295
-datahub/testing/mcp_diff.py,sha256
+datahub/testing/mcp_diff.py,sha256=-4Q1GYvv1zXMEBXGCIxAwCwrOttNV5QNZvxFFz0WNuE,10205
 datahub/testing/pytest_hooks.py,sha256=eifmj0M68AIfjTn_-0vtaBkKl75vNKMjsbYX-pJqmGY,1417
 datahub/upgrade/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/upgrade/upgrade.py,sha256=lf60_dCu51twObAL5E8NqdrW3_2lsnUJUaB9MSEVXwI,16638
@@ -1025,9 +1025,9 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-1.0.
-acryl_datahub-1.0.
-acryl_datahub-1.0.
-acryl_datahub-1.0.
-acryl_datahub-1.0.
-acryl_datahub-1.0.
+acryl_datahub-1.0.0.1rc1.dist-info/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+acryl_datahub-1.0.0.1rc1.dist-info/METADATA,sha256=eJIEEXWE89GHGpg0sV7xwmAzxQ9fN-E1bNQvv0egVpo,176827
+acryl_datahub-1.0.0.1rc1.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
+acryl_datahub-1.0.0.1rc1.dist-info/entry_points.txt,sha256=7-eDilp0OACUtlmmZ-LF6H9MF_SWD_bWHKNG7Dvhhos,9652
+acryl_datahub-1.0.0.1rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-1.0.0.1rc1.dist-info/RECORD,,
datahub/_version.py
CHANGED

datahub/ingestion/source/snowflake/snowflake_queries.py
CHANGED

@@ -403,6 +403,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             res["session_id"],
             res["query_start_time"],
             object_modified_by_ddl,
+            res["query_type"],
         )
         if known_ddl_entry:
             return known_ddl_entry
@@ -537,40 +538,42 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         session_id: str,
         timestamp: datetime,
         object_modified_by_ddl: dict,
+        query_type: str,
     ) -> Optional[Union[TableRename, TableSwap]]:
         timestamp = timestamp.astimezone(timezone.utc)
-        if
-            "operationType"
-
-
+        if (
+            object_modified_by_ddl["operationType"] == "ALTER"
+            and query_type == "RENAME_TABLE"
+            and object_modified_by_ddl["properties"].get("objectName")
+        ):
+            original_un = self.identifiers.gen_dataset_urn(
                 self.identifiers.get_dataset_identifier_from_qualified_name(
                     object_modified_by_ddl["objectName"]
                 )
            )

-
+            new_urn = self.identifiers.gen_dataset_urn(
                 self.identifiers.get_dataset_identifier_from_qualified_name(
-                    object_modified_by_ddl["properties"]["
+                    object_modified_by_ddl["properties"]["objectName"]["value"]
                 )
             )
-
-            return TableSwap(urn1, urn2, query, session_id, timestamp)
+            return TableRename(original_un, new_urn, query, session_id, timestamp)
         elif object_modified_by_ddl[
             "operationType"
-        ] == "
-
+        ] == "ALTER" and object_modified_by_ddl["properties"].get("swapTargetName"):
+            urn1 = self.identifiers.gen_dataset_urn(
                 self.identifiers.get_dataset_identifier_from_qualified_name(
                     object_modified_by_ddl["objectName"]
                 )
             )

-
+            urn2 = self.identifiers.gen_dataset_urn(
                 self.identifiers.get_dataset_identifier_from_qualified_name(
-                    object_modified_by_ddl["properties"]["
+                    object_modified_by_ddl["properties"]["swapTargetName"]["value"]
                 )
             )

-            return
+            return TableSwap(urn1, urn2, query, session_id, timestamp)
         else:
             self.report.num_ddl_queries_dropped += 1
             return None
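The change above passes the query's QUERY_TYPE alongside the OBJECT_MODIFIED_BY_DDL payload so an ALTER statement can be classified as a rename rather than a swap. A standalone sketch of that branching, assuming nothing beyond what the diff shows (the classify_ddl helper and the sample payloads are illustrative, not package code):

# Illustrative sketch of the rename/swap branching introduced above.
def classify_ddl(object_modified_by_ddl: dict, query_type: str) -> str:
    props = object_modified_by_ddl.get("properties", {})
    if (
        object_modified_by_ddl.get("operationType") == "ALTER"
        and query_type == "RENAME_TABLE"
        and props.get("objectName")
    ):
        return "rename"  # becomes a TableRename(original_urn, new_urn, ...)
    if object_modified_by_ddl.get("operationType") == "ALTER" and props.get(
        "swapTargetName"
    ):
        return "swap"  # becomes a TableSwap(urn1, urn2, ...)
    return "dropped"  # counted via report.num_ddl_queries_dropped


# Assumed sample audit records for illustration only
assert classify_ddl(
    {"operationType": "ALTER", "properties": {"objectName": {"value": "DB.S.T2"}}},
    "RENAME_TABLE",
) == "rename"
assert classify_ddl(
    {"operationType": "ALTER", "properties": {"swapTargetName": {"value": "DB.S.T2"}}},
    "ALTER_TABLE",
) == "swap"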

datahub/ingestion/source/superset.py
CHANGED

@@ -23,6 +23,7 @@ from datahub.emitter.mce_builder import (
     make_dataset_urn,
     make_dataset_urn_with_platform_instance,
     make_domain_urn,
+    make_schema_field_urn,
     make_user_urn,
 )
 from datahub.emitter.mcp_builder import add_domain_to_entity_wu
@@ -72,6 +73,9 @@ from datahub.metadata.schema_classes import (
     DashboardInfoClass,
     DatasetLineageTypeClass,
     DatasetPropertiesClass,
+    FineGrainedLineageClass,
+    FineGrainedLineageDownstreamTypeClass,
+    FineGrainedLineageUpstreamTypeClass,
     GlobalTagsClass,
     OwnerClass,
     OwnershipClass,
@@ -80,6 +84,10 @@ from datahub.metadata.schema_classes import (
     UpstreamClass,
     UpstreamLineageClass,
 )
+from datahub.sql_parsing.sqlglot_lineage import (
+    SqlParsingResult,
+    create_lineage_sql_parsed_result,
+)
 from datahub.utilities import config_clean
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -342,7 +350,7 @@ class SupersetSource(StatefulIngestionSourceBase):
         )
         if dataset_response.status_code != 200:
             logger.warning(f"Failed to get dataset info: {dataset_response.text}")
-
+            return {}
         return dataset_response.json()

     def get_datasource_urn_from_id(
@@ -393,8 +401,9 @@ class SupersetSource(StatefulIngestionSourceBase):
         )

         modified_actor = f"urn:li:corpuser:{self.owner_info.get((dashboard_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+        now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
         modified_ts = int(
-            dp.parse(dashboard_data.get("changed_on_utc",
+            dp.parse(dashboard_data.get("changed_on_utc", now)).timestamp() * 1000
         )
         title = dashboard_data.get("dashboard_title", "")
         # note: the API does not currently supply created_by usernames due to a bug
@@ -506,8 +515,9 @@ class SupersetSource(StatefulIngestionSourceBase):
         )

         modified_actor = f"urn:li:corpuser:{self.owner_info.get((chart_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+        now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
         modified_ts = int(
-            dp.parse(chart_data.get("changed_on_utc",
+            dp.parse(chart_data.get("changed_on_utc", now)).timestamp() * 1000
         )
         title = chart_data.get("slice_name", "")

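Both timestamp changes above add a formatted "now" fallback for payloads that omit changed_on_utc. A minimal sketch of the behavior, assuming dp is dateutil.parser as elsewhere in this module (dateutil treats "on" as a skippable token, so the fallback string should parse):

from datetime import datetime

import dateutil.parser as dp  # assumed alias, matching dp.parse in the diff

chart_data: dict = {}  # assumed: API payload missing "changed_on_utc"
now = datetime.now().strftime("%I:%M%p on %B %d, %Y")  # e.g. "04:30PM on March 12, 2025"
# Falls back to the formatted current time instead of failing on a missing key
modified_ts = int(dp.parse(chart_data.get("changed_on_utc", now)).timestamp() * 1000)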
@@ -680,6 +690,88 @@ class SupersetSource(StatefulIngestionSourceBase):
             env=self.config.env,
         )

+    def generate_virtual_dataset_lineage(
+        self,
+        parsed_query_object: SqlParsingResult,
+        datasource_urn: str,
+    ) -> UpstreamLineageClass:
+        cll = (
+            parsed_query_object.column_lineage
+            if parsed_query_object.column_lineage is not None
+            else []
+        )
+
+        fine_grained_lineages: List[FineGrainedLineageClass] = []
+
+        for cll_info in cll:
+            downstream = (
+                [make_schema_field_urn(datasource_urn, cll_info.downstream.column)]
+                if cll_info.downstream and cll_info.downstream.column
+                else []
+            )
+            upstreams = [
+                make_schema_field_urn(column_ref.table, column_ref.column)
+                for column_ref in cll_info.upstreams
+            ]
+            fine_grained_lineages.append(
+                FineGrainedLineageClass(
+                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                    downstreams=downstream,
+                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                    upstreams=upstreams,
+                )
+            )
+
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(
+                    type=DatasetLineageTypeClass.TRANSFORMED,
+                    dataset=input_table_urn,
+                )
+                for input_table_urn in parsed_query_object.in_tables
+            ],
+            fineGrainedLineages=fine_grained_lineages,
+        )
+        return upstream_lineage
+
+    def generate_physical_dataset_lineage(
+        self,
+        dataset_response: dict,
+        upstream_dataset: str,
+        datasource_urn: str,
+    ) -> UpstreamLineageClass:
+        # To generate column level lineage, we can manually decode the metadata
+        # to produce the ColumnLineageInfo
+        columns = dataset_response.get("result", {}).get("columns", [])
+        fine_grained_lineages: List[FineGrainedLineageClass] = []
+
+        for column in columns:
+            column_name = column.get("column_name", "")
+            if not column_name:
+                continue
+
+            downstream = [make_schema_field_urn(datasource_urn, column_name)]
+            upstreams = [make_schema_field_urn(upstream_dataset, column_name)]
+            fine_grained_lineages.append(
+                FineGrainedLineageClass(
+                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                    downstreams=downstream,
+                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                    upstreams=upstreams,
+                )
+            )
+
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(
+                    type=DatasetLineageTypeClass.TRANSFORMED,
+                    dataset=upstream_dataset,
+                )
+            ],
+            fineGrainedLineages=fine_grained_lineages,
+        )
+        return upstream_lineage
+
     def construct_dataset_from_dataset_data(
         self, dataset_data: dict
     ) -> DatasetSnapshot:
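The virtual path above is fed by DataHub's sqlglot-based parser. A hedged usage sketch of create_lineage_sql_parsed_result, using the same keyword arguments as the call later in this diff (the query, database, and platform values are placeholder assumptions):

from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

parsed = create_lineage_sql_parsed_result(
    query="SELECT o.id, o.amount FROM analytics.orders o",  # placeholder SQL
    default_db="analytics",       # placeholder database name
    platform="snowflake",         # placeholder warehouse platform
    platform_instance=None,
    env="PROD",
)
# parsed.in_tables: upstream dataset URNs, turned into UpstreamClass entries
# parsed.column_lineage: per-column mappings, turned into FineGrainedLineageClass entries
print(parsed.in_tables, parsed.column_lineage)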
@@ -692,14 +784,23 @@ class SupersetSource(StatefulIngestionSourceBase):
         dataset_url = f"{self.config.display_uri}{dataset_response.get('result', {}).get('url', '')}"

         modified_actor = f"urn:li:corpuser:{self.owner_info.get((dataset_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+        now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
         modified_ts = int(
-            dp.parse(dataset_data.get("changed_on_utc",
+            dp.parse(dataset_data.get("changed_on_utc", now)).timestamp() * 1000
         )
         last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)

         upstream_warehouse_platform = (
             dataset_response.get("result", {}).get("database", {}).get("backend")
         )
+        upstream_warehouse_db_name = (
+            dataset_response.get("result", {}).get("database", {}).get("database_name")
+        )
+
+        # if we have rendered sql, we always use that and defualt back to regular sql
+        sql = dataset_response.get("result", {}).get(
+            "rendered_sql"
+        ) or dataset_response.get("result", {}).get("sql")

         # Preset has a way of naming their platforms differently than
         # how datahub names them, so map the platform name to the correct naming
@@ -712,22 +813,28 @@ class SupersetSource(StatefulIngestionSourceBase):
         if upstream_warehouse_platform in warehouse_naming:
             upstream_warehouse_platform = warehouse_naming[upstream_warehouse_platform]

-        # TODO: Categorize physical vs virtual upstream dataset
-        # mark all upstream dataset as physical for now, in the future we would ideally like
-        # to differentiate physical vs virtual upstream datasets
-        tag_urn = f"urn:li:tag:{self.platform}:physical"
         upstream_dataset = self.get_datasource_urn_from_id(
             dataset_response, upstream_warehouse_platform
         )
-
-
-
-
-
-
-
-
-
+
+        # Sometimes the field will be null instead of not existing
+        if sql == "null" or not sql:
+            tag_urn = f"urn:li:tag:{self.platform}:physical"
+            upstream_lineage = self.generate_physical_dataset_lineage(
+                dataset_response, upstream_dataset, datasource_urn
+            )
+        else:
+            tag_urn = f"urn:li:tag:{self.platform}:virtual"
+            parsed_query_object = create_lineage_sql_parsed_result(
+                query=sql,
+                default_db=upstream_warehouse_db_name,
+                platform=upstream_warehouse_platform,
+                platform_instance=None,
+                env=self.config.env,
+            )
+            upstream_lineage = self.generate_virtual_dataset_lineage(
+                parsed_query_object, datasource_urn
+            )

         dataset_info = DatasetPropertiesClass(
             name=dataset.table_name,
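Both lineage helpers emit the same aspect shape. A minimal sketch of a single fine-grained edge as the physical path builds it, one column mapped 1:1 to the same-named upstream column (the URNs are assumed examples):

from datahub.emitter.mce_builder import make_schema_field_urn
from datahub.metadata.schema_classes import (
    FineGrainedLineageClass,
    FineGrainedLineageDownstreamTypeClass,
    FineGrainedLineageUpstreamTypeClass,
)

# Assumed example URNs
upstream = "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.orders,PROD)"
datasource = "urn:li:dataset:(urn:li:dataPlatform:superset,orders,PROD)"

edge = FineGrainedLineageClass(
    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
    downstreams=[make_schema_field_urn(datasource, "amount")],
    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
    upstreams=[make_schema_field_urn(upstream, "amount")],
)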

datahub/ingestion/source/vertexai.py
CHANGED

@@ -358,12 +358,15 @@ class VertexAISource(Source):
         return ProjectIdKey(project_id=self.config.project_id, platform=self.platform)

     def _is_automl_job(self, job: VertexAiResourceNoun) -> bool:
-        return (
-
-
-
-
-
+        return isinstance(
+            job,
+            (
+                AutoMLTabularTrainingJob,
+                AutoMLTextTrainingJob,
+                AutoMLImageTrainingJob,
+                AutoMLVideoTrainingJob,
+                AutoMLForecastingTrainingJob,
+            ),
         )

     def _search_model_version(
@@ -618,12 +621,7 @@ class VertexAISource(Source):
             endpoint_dict[resource.model].append(endpoint)
         self.endpoints = endpoint_dict

-        endpoints
-            self.endpoints[model.resource_name]
-            if model.resource_name in self.endpoints
-            else []
-        )
-        return endpoints
+        return self.endpoints.get(model.resource_name, [])

     def _make_ml_model_urn(self, model_version: VersionInfo, model_name: str) -> str:
         urn = builder.make_ml_model_urn(
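Both vertexai.py changes are behavior-preserving simplifications: an or-chain of type checks collapsed into isinstance with a tuple of types, and a conditional lookup collapsed into dict.get with a default. A toy sketch with stand-in classes (the real AutoML job classes come from google.cloud.aiplatform):

class AutoMLTabularTrainingJob: ...  # stand-in for the aiplatform class
class AutoMLTextTrainingJob: ...     # stand-in for the aiplatform class

def is_automl_job(job: object) -> bool:
    # isinstance accepts a tuple of types, replacing a chain of or-ed checks
    return isinstance(job, (AutoMLTabularTrainingJob, AutoMLTextTrainingJob))

endpoints = {"projects/p/models/m": ["endpoint-1"]}  # assumed mapping
# dict.get with a default replaces the "if key in dict ... else []" pattern
assert endpoints.get("projects/p/models/m", []) == ["endpoint-1"]
assert endpoints.get("missing-model", []) == []
assert is_automl_job(AutoMLTabularTrainingJob()) and not is_automl_job(object())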
datahub/metadata/schema.avsc
CHANGED

@@ -16777,7 +16777,8 @@
       "createdOn": "outputEdges/*/created/time",
       "entityTypes": [
         "dataset",
-        "mlModel"
+        "mlModel",
+        "dataProcessInstance"
       ],
       "isLineage": true,
       "isUpstream": false,
@@ -17031,7 +17032,8 @@
       "createdOn": "inputEdges/*/created/time",
       "entityTypes": [
         "dataset",
-        "mlModel"
+        "mlModel",
+        "dataProcessInstance"
       ],
       "isLineage": true,
       "name": "DataProcessInstanceConsumes",
datahub/testing/mcp_diff.py
CHANGED

@@ -8,7 +8,6 @@ import deepdiff.serialization
 import yaml
 from deepdiff import DeepDiff
 from deepdiff.model import DiffLevel
-from deepdiff.operator import BaseOperator
 from typing_extensions import Literal

 ReportType = Literal[
@@ -59,27 +58,12 @@ class AspectForDiff:

 @dataclasses.dataclass
 class DeltaInfo:
-    """Information about an MCP used to construct a diff delta.
-
-    In a separate class so it can be ignored by DeepDiff via MCPDeltaInfoOperator.
-    """
+    """Information about an MCP used to construct a diff delta."""

     idx: int  # Location in list of MCEs in golden file
     original: Dict[str, Any]  # Original json-serialized MCP


-class DeltaInfoOperator(BaseOperator):
-    """Warning: Doesn't seem to be working right now.
-    Ignored via an ignore path as an extra layer of defense.
-    """
-
-    def __init__(self):
-        super().__init__(types=[DeltaInfo])
-
-    def give_up_diffing(self, *args: Any, **kwargs: Any) -> bool:
-        return True
-
-
 AspectsByUrn = Dict[str, Dict[str, List[AspectForDiff]]]


@@ -176,7 +160,6 @@ class MCPDiff:
             t2=t2,
             exclude_regex_paths=ignore_paths,
             ignore_order=True,
-            custom_operators=[DeltaInfoOperator()],
         )
         if diff:
             aspect_changes[urn][aspect_name] = MCPAspectDiff.create(diff)
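With DeltaInfoOperator gone, the comparison relies only on exclude_regex_paths and ignore_order. A minimal sketch of the remaining DeepDiff call, with assumed sample aspects standing in for t1/t2:

from deepdiff import DeepDiff

t1 = {"aspect": {"value": 1, "lastUpdated": 100}}  # assumed sample aspect
t2 = {"aspect": {"value": 2, "lastUpdated": 200}}  # assumed sample aspect

diff = DeepDiff(
    t1,
    t2,
    exclude_regex_paths=[r"root\['aspect'\]\['lastUpdated'\]"],  # like ignore_paths
    ignore_order=True,
)
print(bool(diff))  # truthy only when non-excluded fields differ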
{acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1rc1.dist-info}/LICENSE
File without changes

{acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1rc1.dist-info}/entry_points.txt
File without changes

{acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1rc1.dist-info}/top_level.txt
File without changes