acryl-datahub 1.0.0rc17__py3-none-any.whl → 1.0.0.1rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub may be problematic.

acryl_datahub-1.0.0rc17.dist-info/RECORD → acryl_datahub-1.0.0.1rc1.dist-info/RECORD RENAMED

@@ -1,6 +1,6 @@
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
- datahub/_version.py,sha256=Pm6pv1SEZW2WH_7EUSAIoR-6UVTQyPmr1utervGvb6M,322
+ datahub/_version.py,sha256=4sjCSS8UbsxYv_VuzZNtNxudBmSHVgJLBmaqBIjHobs,323
  datahub/entrypoints.py,sha256=2TYgHhs3sCxJlojIHjqfxzt3_ImPwPzq4vBtsUuMqu4,8885
  datahub/errors.py,sha256=w6h8b27j9XlmPbTwqpu7-wgiTrXlHzcnUOnJ_iOrwzo,520
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -216,8 +216,8 @@ datahub/ingestion/source/redash.py,sha256=YxjSad-X_wPmxYH8dJmFz_VCFhiLTCTSlK99Wd
  datahub/ingestion/source/salesforce.py,sha256=CQtDFv1OsbC1vyzNbKOc6GxhFQ5GdYj45hgAF0-oIcw,40487
  datahub/ingestion/source/source_registry.py,sha256=a2mLjJPLkSI-gYCTb_7U7Jo4D8jGknNQ_yScPIihXFk,1208
  datahub/ingestion/source/sql_queries.py,sha256=Ip7UZub7fgMh7P5jL_zJPY7lSkc9GGTy8GJ8lqZrcsE,9502
- datahub/ingestion/source/superset.py,sha256=WrpCiZEC17cmFGcfUTTqUdnKASq7ZpT0ih-4xqB9qt4,30976
- datahub/ingestion/source/vertexai.py,sha256=uOtIgHwsH--hkAFqspXGoNN-jHip16s6m5lyvwi-jrg,27735
+ datahub/ingestion/source/superset.py,sha256=HW3oiS6EDdVkSFopqg48e76NkElc8U4I8lEMjIvxrPo,34997
+ datahub/ingestion/source/vertexai.py,sha256=_dpT4RSvd1IzucH-grDkD80sdGXume_qA-OLkHex_GM,27600
  datahub/ingestion/source/abs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/abs/config.py,sha256=mBQe0JTaP-Rcv4HnMUUySoYbSr4r3jDEMioxaXHnxXU,6709
  datahub/ingestion/source/abs/datalake_profiler_config.py,sha256=Rkf64evufyVGPiE4VK8QAjzBiJFu85tOGMmJ0lJZ2Og,3600
@@ -447,7 +447,7 @@ datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=pEw2O9xoTSIWDi
  datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
  datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=FBmiONx4EGHWV8RNJT6zHZyntKinPFFyd2oKbTUIbhE,21319
  datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
- datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=R3QxWtdR8T_8YV_3aqt3rJdto1gAij_mEHlSYKqdCfA,28326
+ datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=gX9E1Z_CemAZsuTDmtvqrxY7vBL2da75j7X8Xwhaf8Y,28441
  datahub/ingestion/source/snowflake/snowflake_query.py,sha256=qz_rhRMNCXxHd23bePbb3YxhFgN7eRpV4s6g58hQ5bU,39678
  datahub/ingestion/source/snowflake/snowflake_report.py,sha256=ahea-bwpW6T0iDehGo0Qq_J7wKxPkV61aYHm8bGwDqo,6651
  datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=qkGgk6WdKSPThFjexXHrxUPYiVtzDk2MbGX3b281A4c,26044
@@ -580,7 +580,7 @@ datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1
  datahub/lite/lite_util.py,sha256=Cm6trMTeo0X1fv4nSsW9lC0jqce7Jt-05GhOtIGzsVc,4559
  datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
  datahub/metadata/_schema_classes.py,sha256=WMINRH1eF7TmnGXSrUCVw5mxplZf5wXGy8QCAm4pxTk,994687
- datahub/metadata/schema.avsc,sha256=88IHgp2lvp9_uY4XY4xH2LmgNReNfUOnbi6bofQzjs0,743287
+ datahub/metadata/schema.avsc,sha256=iIXHOzZeBr7dSUzP2Bl8H3janQ7diKYXZejt5crFneU,743361
  datahub/metadata/schema_classes.py,sha256=X5Jl5EaSxyHdXOQv14pJ5WkQALun4MRpJ4q12wVFE18,1299
  datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
  datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
@@ -726,9 +726,9 @@ datahub/metadata/schemas/DataPlatformInstanceKey.avsc,sha256=nHFRKlg98lfqYyHZEAX
  datahub/metadata/schemas/DataPlatformInstanceProperties.avsc,sha256=4-UrBTtVAR0rKQ4OPt4MVZeFtolXzIajGtyh3KC8-MQ,1623
  datahub/metadata/schemas/DataPlatformKey.avsc,sha256=5Z2adruXKzSucmgCba768UXdsGsYBH9t9DvFF9L9mxo,461
  datahub/metadata/schemas/DataProcessInfo.avsc,sha256=n4Zuk4kpHrHI2BdINhG-OucdCefb2GEsDv5mXQtSWIw,1558
- datahub/metadata/schemas/DataProcessInstanceInput.avsc,sha256=qyo5BGB7s2HLcc9crHuIxB0yo5budfrF58zh3Uk6Yrw,6293
+ datahub/metadata/schemas/DataProcessInstanceInput.avsc,sha256=DmPchn1kjjXKJ8e4EU6I8Bg8iuQkn43UJkTi1O-Zn-8,6328
  datahub/metadata/schemas/DataProcessInstanceKey.avsc,sha256=YSEVtSWql1IZ9AG37HmJZ4118pgi8kVCygI_GqFf3YA,945
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc,sha256=O33dSMbj_l8SmtCC-MRT1Edl3xIIl4yB1KdVxhJ6Yi0,6348
+ datahub/metadata/schemas/DataProcessInstanceOutput.avsc,sha256=xyGBUf3vFHrMLtmZjs6H7FEs-wqmB8Apx5iZubCbkEo,6383
  datahub/metadata/schemas/DataProcessInstanceProperties.avsc,sha256=2qsDFeSA2-ag5IVetgD8mW2k--F6CwmYXM3KOE6edU8,3836
  datahub/metadata/schemas/DataProcessInstanceRelationships.avsc,sha256=VhBpnyGGvO06WEnM6zy4PmjiT0nivRQfkSdJCUgIavw,2358
  datahub/metadata/schemas/DataProcessInstanceRunEvent.avsc,sha256=zwTYULEnpMbqwkLN8NbXW9PQWFG4X6TZkZwTQ1Wb53Y,6713
@@ -932,7 +932,7 @@ datahub/testing/check_str_enum.py,sha256=yqk0XXHOGteN-IGqCp5JHy0Kca13BnI09ZqKc4N
  datahub/testing/compare_metadata_json.py,sha256=mTU5evu7KLS3cx8OLOC1fFxj0eY1J1CGV2PEQZmapos,5361
  datahub/testing/docker_utils.py,sha256=g169iy_jNR_mg0p8X31cChZqjOryutAIHUYLq3xqueY,2415
  datahub/testing/doctest.py,sha256=1_8WEhHZ2eRQtw8vsXKzr9L5zzvs0Tcr6q4mnkyyvtw,295
- datahub/testing/mcp_diff.py,sha256=Dxde5uZHqZf1EjOkHm405OHY5PPJp03agZJM9SyR4yE,10717
+ datahub/testing/mcp_diff.py,sha256=-4Q1GYvv1zXMEBXGCIxAwCwrOttNV5QNZvxFFz0WNuE,10205
  datahub/testing/pytest_hooks.py,sha256=eifmj0M68AIfjTn_-0vtaBkKl75vNKMjsbYX-pJqmGY,1417
  datahub/upgrade/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/upgrade/upgrade.py,sha256=lf60_dCu51twObAL5E8NqdrW3_2lsnUJUaB9MSEVXwI,16638
@@ -1025,9 +1025,9 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-1.0.0rc17.dist-info/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
- acryl_datahub-1.0.0rc17.dist-info/METADATA,sha256=MDcGSuVfOxVWXtrGFaLcnCPABOS1ZCnehvPDHCWJtrk,176898
- acryl_datahub-1.0.0rc17.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
- acryl_datahub-1.0.0rc17.dist-info/entry_points.txt,sha256=7-eDilp0OACUtlmmZ-LF6H9MF_SWD_bWHKNG7Dvhhos,9652
- acryl_datahub-1.0.0rc17.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-1.0.0rc17.dist-info/RECORD,,
+ acryl_datahub-1.0.0.1rc1.dist-info/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+ acryl_datahub-1.0.0.1rc1.dist-info/METADATA,sha256=eJIEEXWE89GHGpg0sV7xwmAzxQ9fN-E1bNQvv0egVpo,176827
+ acryl_datahub-1.0.0.1rc1.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
+ acryl_datahub-1.0.0.1rc1.dist-info/entry_points.txt,sha256=7-eDilp0OACUtlmmZ-LF6H9MF_SWD_bWHKNG7Dvhhos,9652
+ acryl_datahub-1.0.0.1rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-1.0.0.1rc1.dist-info/RECORD,,
acryl_datahub-1.0.0rc17.dist-info/WHEEL → acryl_datahub-1.0.0.1rc1.dist-info/WHEEL RENAMED

@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (76.0.0)
+ Generator: setuptools (76.1.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "1.0.0rc17"
+ __version__ = "1.0.0.1rc1"
 
 
  def is_dev_mode() -> bool:
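
The version string moves from `1.0.0rc17` to the four-component `1.0.0.1rc1`. If the ordering of these PEP 440 tags is unclear, a quick check (not part of the diff) with the `packaging` library confirms the new tag sorts after both the old release candidate and the 1.0.0 final:

```python
from packaging.version import Version

# 1.0.0rc17 is a pre-release of 1.0.0; 1.0.0.1rc1 pre-releases the later 1.0.0.1.
assert Version("1.0.0rc17") < Version("1.0.0")
assert Version("1.0.0") < Version("1.0.0.1rc1")
assert Version("1.0.0rc17") < Version("1.0.0.1rc1")  # new tag supersedes the old
```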
datahub/ingestion/source/snowflake/snowflake_queries.py CHANGED

@@ -403,6 +403,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
  res["session_id"],
  res["query_start_time"],
  object_modified_by_ddl,
+ res["query_type"],
  )
  if known_ddl_entry:
  return known_ddl_entry
@@ -537,40 +538,42 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
  session_id: str,
  timestamp: datetime,
  object_modified_by_ddl: dict,
+ query_type: str,
  ) -> Optional[Union[TableRename, TableSwap]]:
  timestamp = timestamp.astimezone(timezone.utc)
- if object_modified_by_ddl[
- "operationType"
- ] == "ALTER" and object_modified_by_ddl["properties"].get("swapTargetName"):
- urn1 = self.identifiers.gen_dataset_urn(
+ if (
+ object_modified_by_ddl["operationType"] == "ALTER"
+ and query_type == "RENAME_TABLE"
+ and object_modified_by_ddl["properties"].get("objectName")
+ ):
+ original_un = self.identifiers.gen_dataset_urn(
  self.identifiers.get_dataset_identifier_from_qualified_name(
  object_modified_by_ddl["objectName"]
  )
  )
 
- urn2 = self.identifiers.gen_dataset_urn(
+ new_urn = self.identifiers.gen_dataset_urn(
  self.identifiers.get_dataset_identifier_from_qualified_name(
- object_modified_by_ddl["properties"]["swapTargetName"]["value"]
+ object_modified_by_ddl["properties"]["objectName"]["value"]
  )
  )
-
- return TableSwap(urn1, urn2, query, session_id, timestamp)
+ return TableRename(original_un, new_urn, query, session_id, timestamp)
  elif object_modified_by_ddl[
  "operationType"
- ] == "RENAME_TABLE" and object_modified_by_ddl["properties"].get("objectName"):
- original_un = self.identifiers.gen_dataset_urn(
+ ] == "ALTER" and object_modified_by_ddl["properties"].get("swapTargetName"):
+ urn1 = self.identifiers.gen_dataset_urn(
  self.identifiers.get_dataset_identifier_from_qualified_name(
  object_modified_by_ddl["objectName"]
  )
  )
 
- new_urn = self.identifiers.gen_dataset_urn(
+ urn2 = self.identifiers.gen_dataset_urn(
  self.identifiers.get_dataset_identifier_from_qualified_name(
- object_modified_by_ddl["properties"]["objectName"]["value"]
+ object_modified_by_ddl["properties"]["swapTargetName"]["value"]
  )
  )
 
- return TableRename(original_un, new_urn, query, session_id, timestamp)
+ return TableSwap(urn1, urn2, query, session_id, timestamp)
  else:
  self.report.num_ddl_queries_dropped += 1
  return None
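
This change threads Snowflake's QUERY_TYPE column into the DDL handler and flips the branch order: renames are now matched first, on `query_type == "RENAME_TABLE"` plus an `objectName` property, while swaps continue to key off `swapTargetName`. A minimal sketch of the new dispatch (a hypothetical standalone helper, not DataHub's API):

```python
from typing import Optional

def classify_ddl(object_modified_by_ddl: dict, query_type: str) -> Optional[str]:
    # Renames are recognized by Snowflake's QUERY_TYPE rather than by
    # operationType alone, which both ALTER ... SWAP and RENAME share.
    props = object_modified_by_ddl.get("properties", {})
    if (
        object_modified_by_ddl.get("operationType") == "ALTER"
        and query_type == "RENAME_TABLE"
        and props.get("objectName")
    ):
        return "rename"  # becomes a TableRename
    elif object_modified_by_ddl.get("operationType") == "ALTER" and props.get(
        "swapTargetName"
    ):
        return "swap"  # becomes a TableSwap
    return None  # dropped and counted in num_ddl_queries_dropped
```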
datahub/ingestion/source/superset.py CHANGED

@@ -23,6 +23,7 @@ from datahub.emitter.mce_builder import (
  make_dataset_urn,
  make_dataset_urn_with_platform_instance,
  make_domain_urn,
+ make_schema_field_urn,
  make_user_urn,
  )
  from datahub.emitter.mcp_builder import add_domain_to_entity_wu
@@ -72,6 +73,9 @@ from datahub.metadata.schema_classes import (
  DashboardInfoClass,
  DatasetLineageTypeClass,
  DatasetPropertiesClass,
+ FineGrainedLineageClass,
+ FineGrainedLineageDownstreamTypeClass,
+ FineGrainedLineageUpstreamTypeClass,
  GlobalTagsClass,
  OwnerClass,
  OwnershipClass,
@@ -80,6 +84,10 @@ from datahub.metadata.schema_classes import (
  UpstreamClass,
  UpstreamLineageClass,
  )
+ from datahub.sql_parsing.sqlglot_lineage import (
+ SqlParsingResult,
+ create_lineage_sql_parsed_result,
+ )
  from datahub.utilities import config_clean
  from datahub.utilities.lossy_collections import LossyList
  from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -342,7 +350,7 @@ class SupersetSource(StatefulIngestionSourceBase):
  )
  if dataset_response.status_code != 200:
  logger.warning(f"Failed to get dataset info: {dataset_response.text}")
- dataset_response.raise_for_status()
+ return {}
  return dataset_response.json()
 
  def get_datasource_urn_from_id(
@@ -393,8 +401,9 @@ class SupersetSource(StatefulIngestionSourceBase):
  )
 
  modified_actor = f"urn:li:corpuser:{self.owner_info.get((dashboard_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+ now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
  modified_ts = int(
- dp.parse(dashboard_data.get("changed_on_utc", "now")).timestamp() * 1000
+ dp.parse(dashboard_data.get("changed_on_utc", now)).timestamp() * 1000
  )
  title = dashboard_data.get("dashboard_title", "")
  # note: the API does not currently supply created_by usernames due to a bug
@@ -506,8 +515,9 @@ class SupersetSource(StatefulIngestionSourceBase):
  )
 
  modified_actor = f"urn:li:corpuser:{self.owner_info.get((chart_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+ now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
  modified_ts = int(
- dp.parse(chart_data.get("changed_on_utc", "now")).timestamp() * 1000
+ dp.parse(chart_data.get("changed_on_utc", now)).timestamp() * 1000
  )
  title = chart_data.get("slice_name", "")
 
@@ -680,6 +690,88 @@ class SupersetSource(StatefulIngestionSourceBase):
  env=self.config.env,
  )
 
+ def generate_virtual_dataset_lineage(
+ self,
+ parsed_query_object: SqlParsingResult,
+ datasource_urn: str,
+ ) -> UpstreamLineageClass:
+ cll = (
+ parsed_query_object.column_lineage
+ if parsed_query_object.column_lineage is not None
+ else []
+ )
+
+ fine_grained_lineages: List[FineGrainedLineageClass] = []
+
+ for cll_info in cll:
+ downstream = (
+ [make_schema_field_urn(datasource_urn, cll_info.downstream.column)]
+ if cll_info.downstream and cll_info.downstream.column
+ else []
+ )
+ upstreams = [
+ make_schema_field_urn(column_ref.table, column_ref.column)
+ for column_ref in cll_info.upstreams
+ ]
+ fine_grained_lineages.append(
+ FineGrainedLineageClass(
+ downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+ downstreams=downstream,
+ upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+ upstreams=upstreams,
+ )
+ )
+
+ upstream_lineage = UpstreamLineageClass(
+ upstreams=[
+ UpstreamClass(
+ type=DatasetLineageTypeClass.TRANSFORMED,
+ dataset=input_table_urn,
+ )
+ for input_table_urn in parsed_query_object.in_tables
+ ],
+ fineGrainedLineages=fine_grained_lineages,
+ )
+ return upstream_lineage
+
+ def generate_physical_dataset_lineage(
+ self,
+ dataset_response: dict,
+ upstream_dataset: str,
+ datasource_urn: str,
+ ) -> UpstreamLineageClass:
+ # To generate column level lineage, we can manually decode the metadata
+ # to produce the ColumnLineageInfo
+ columns = dataset_response.get("result", {}).get("columns", [])
+ fine_grained_lineages: List[FineGrainedLineageClass] = []
+
+ for column in columns:
+ column_name = column.get("column_name", "")
+ if not column_name:
+ continue
+
+ downstream = [make_schema_field_urn(datasource_urn, column_name)]
+ upstreams = [make_schema_field_urn(upstream_dataset, column_name)]
+ fine_grained_lineages.append(
+ FineGrainedLineageClass(
+ downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+ downstreams=downstream,
+ upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+ upstreams=upstreams,
+ )
+ )
+
+ upstream_lineage = UpstreamLineageClass(
+ upstreams=[
+ UpstreamClass(
+ type=DatasetLineageTypeClass.TRANSFORMED,
+ dataset=upstream_dataset,
+ )
+ ],
+ fineGrainedLineages=fine_grained_lineages,
+ )
+ return upstream_lineage
+
  def construct_dataset_from_dataset_data(
  self, dataset_data: dict
  ) -> DatasetSnapshot:
@@ -692,14 +784,23 @@ class SupersetSource(StatefulIngestionSourceBase):
  dataset_url = f"{self.config.display_uri}{dataset_response.get('result', {}).get('url', '')}"
 
  modified_actor = f"urn:li:corpuser:{self.owner_info.get((dataset_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+ now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
  modified_ts = int(
- dp.parse(dataset_data.get("changed_on_utc", "now")).timestamp() * 1000
+ dp.parse(dataset_data.get("changed_on_utc", now)).timestamp() * 1000
  )
  last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)
 
  upstream_warehouse_platform = (
  dataset_response.get("result", {}).get("database", {}).get("backend")
  )
+ upstream_warehouse_db_name = (
+ dataset_response.get("result", {}).get("database", {}).get("database_name")
+ )
+
+ # if we have rendered sql, we always use that and defualt back to regular sql
+ sql = dataset_response.get("result", {}).get(
+ "rendered_sql"
+ ) or dataset_response.get("result", {}).get("sql")
 
  # Preset has a way of naming their platforms differently than
  # how datahub names them, so map the platform name to the correct naming
@@ -712,22 +813,28 @@ class SupersetSource(StatefulIngestionSourceBase):
  if upstream_warehouse_platform in warehouse_naming:
  upstream_warehouse_platform = warehouse_naming[upstream_warehouse_platform]
 
- # TODO: Categorize physical vs virtual upstream dataset
- # mark all upstream dataset as physical for now, in the future we would ideally like
- # to differentiate physical vs virtual upstream datasets
- tag_urn = f"urn:li:tag:{self.platform}:physical"
  upstream_dataset = self.get_datasource_urn_from_id(
  dataset_response, upstream_warehouse_platform
  )
- upstream_lineage = UpstreamLineageClass(
- upstreams=[
- UpstreamClass(
- type=DatasetLineageTypeClass.TRANSFORMED,
- dataset=upstream_dataset,
- properties={"externalUrl": dataset_url},
- )
- ]
- )
+
+ # Sometimes the field will be null instead of not existing
+ if sql == "null" or not sql:
+ tag_urn = f"urn:li:tag:{self.platform}:physical"
+ upstream_lineage = self.generate_physical_dataset_lineage(
+ dataset_response, upstream_dataset, datasource_urn
+ )
+ else:
+ tag_urn = f"urn:li:tag:{self.platform}:virtual"
+ parsed_query_object = create_lineage_sql_parsed_result(
+ query=sql,
+ default_db=upstream_warehouse_db_name,
+ platform=upstream_warehouse_platform,
+ platform_instance=None,
+ env=self.config.env,
+ )
+ upstream_lineage = self.generate_virtual_dataset_lineage(
+ parsed_query_object, datasource_urn
+ )
 
  dataset_info = DatasetPropertiesClass(
  name=dataset.table_name,
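
Three behavioral changes stand out in superset.py: a failed dataset fetch now returns an empty dict instead of raising via `raise_for_status()`; datasets defined by SQL are split off as virtual, with `create_lineage_sql_parsed_result` supplying table- and column-level lineage, while physical datasets map columns one-to-one to the upstream table; and the literal `"now"` fallback for `changed_on_utc` is replaced with a formatted current timestamp, since dateutil's parser cannot parse the string "now". A minimal standalone sketch of that last fix:

```python
from datetime import datetime
import dateutil.parser as dp

# dp.parse("now") raises ParserError, so the old default broke whenever
# changed_on_utc was absent; a concrete formatted timestamp parses cleanly.
now = datetime.now().strftime("%I:%M%p on %B %d, %Y")  # e.g. "09:15AM on March 20, 2025"
data: dict = {}  # simulates an API payload missing changed_on_utc

modified_ts = int(dp.parse(data.get("changed_on_utc", now)).timestamp() * 1000)
print(modified_ts)  # milliseconds since the epoch
```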
datahub/ingestion/source/vertexai.py CHANGED

@@ -358,12 +358,15 @@ class VertexAISource(Source):
  return ProjectIdKey(project_id=self.config.project_id, platform=self.platform)
 
  def _is_automl_job(self, job: VertexAiResourceNoun) -> bool:
- return (
- isinstance(job, AutoMLTabularTrainingJob)
- or isinstance(job, AutoMLTextTrainingJob)
- or isinstance(job, AutoMLImageTrainingJob)
- or isinstance(job, AutoMLVideoTrainingJob)
- or isinstance(job, AutoMLForecastingTrainingJob)
+ return isinstance(
+ job,
+ (
+ AutoMLTabularTrainingJob,
+ AutoMLTextTrainingJob,
+ AutoMLImageTrainingJob,
+ AutoMLVideoTrainingJob,
+ AutoMLForecastingTrainingJob,
+ ),
  )
 
  def _search_model_version(
@@ -618,12 +621,7 @@ class VertexAISource(Source):
  endpoint_dict[resource.model].append(endpoint)
  self.endpoints = endpoint_dict
 
- endpoints = (
- self.endpoints[model.resource_name]
- if model.resource_name in self.endpoints
- else []
- )
- return endpoints
+ return self.endpoints.get(model.resource_name, [])
 
  def _make_ml_model_urn(self, model_version: VersionInfo, model_name: str) -> str:
  urn = builder.make_ml_model_urn(
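
Both vertexai.py hunks are behavior-preserving cleanups: `isinstance` accepts a tuple of types, and `dict.get` collapses the manual membership check. A tiny illustration (hypothetical classes, not the Vertex AI types):

```python
class A: ...
class B: ...

job = A()
# A tuple second argument is equivalent to or-ing individual isinstance checks.
assert isinstance(job, (A, B)) == (isinstance(job, A) or isinstance(job, B))

endpoints: dict = {}
assert endpoints.get("missing-model", []) == []  # no KeyError, same default
```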
datahub/metadata/schema.avsc CHANGED

@@ -16777,7 +16777,8 @@
  "createdOn": "outputEdges/*/created/time",
  "entityTypes": [
  "dataset",
- "mlModel"
+ "mlModel",
+ "dataProcessInstance"
  ],
  "isLineage": true,
  "isUpstream": false,
@@ -17031,7 +17032,8 @@
  "createdOn": "inputEdges/*/created/time",
  "entityTypes": [
  "dataset",
- "mlModel"
+ "mlModel",
+ "dataProcessInstance"
  ],
  "isLineage": true,
  "name": "DataProcessInstanceConsumes",
datahub/metadata/schemas/DataProcessInstanceInput.avsc CHANGED

@@ -41,7 +41,8 @@
  "createdOn": "inputEdges/*/created/time",
  "entityTypes": [
  "dataset",
- "mlModel"
+ "mlModel",
+ "dataProcessInstance"
  ],
  "isLineage": true,
  "name": "DataProcessInstanceConsumes",
datahub/metadata/schemas/DataProcessInstanceOutput.avsc CHANGED

@@ -41,7 +41,8 @@
  "createdOn": "outputEdges/*/created/time",
  "entityTypes": [
  "dataset",
- "mlModel"
+ "mlModel",
+ "dataProcessInstance"
  ],
  "isLineage": true,
  "isUpstream": false,
datahub/testing/mcp_diff.py CHANGED

@@ -8,7 +8,6 @@ import deepdiff.serialization
  import yaml
  from deepdiff import DeepDiff
  from deepdiff.model import DiffLevel
- from deepdiff.operator import BaseOperator
  from typing_extensions import Literal
 
  ReportType = Literal[
@@ -59,27 +58,12 @@ class AspectForDiff:
 
  @dataclasses.dataclass
  class DeltaInfo:
- """Information about an MCP used to construct a diff delta.
-
- In a separate class so it can be ignored by DeepDiff via MCPDeltaInfoOperator.
- """
+ """Information about an MCP used to construct a diff delta."""
 
  idx: int  # Location in list of MCEs in golden file
  original: Dict[str, Any]  # Original json-serialized MCP
 
 
- class DeltaInfoOperator(BaseOperator):
- """Warning: Doesn't seem to be working right now.
- Ignored via an ignore path as an extra layer of defense.
- """
-
- def __init__(self):
- super().__init__(types=[DeltaInfo])
-
- def give_up_diffing(self, *args: Any, **kwargs: Any) -> bool:
- return True
-
-
  AspectsByUrn = Dict[str, Dict[str, List[AspectForDiff]]]
 
 
@@ -176,7 +160,6 @@ class MCPDiff:
  t2=t2,
  exclude_regex_paths=ignore_paths,
  ignore_order=True,
- custom_operators=[DeltaInfoOperator()],
  )
  if diff:
  aspect_changes[urn][aspect_name] = MCPAspectDiff.create(diff)
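
The mcp_diff.py change deletes the dead `DeltaInfoOperator`: by its own docstring its `give_up_diffing` hook never fired, and the `DeltaInfo` fields were already excluded through `exclude_regex_paths`. A minimal sketch of the surviving mechanism (illustrative data and paths, not DataHub's actual ignore list):

```python
from deepdiff import DeepDiff

t1 = {"aspect": {"value": 1}, "delta_info": {"idx": 0}}
t2 = {"aspect": {"value": 2}, "delta_info": {"idx": 5}}

# Regex-excluded paths are never compared, so the delta_info bookkeeping
# cannot pollute the diff even without a custom operator.
diff = DeepDiff(t1, t2, exclude_regex_paths=[r"root\['delta_info'\]"], ignore_order=True)
print(diff)  # reports only the aspect value change
```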