acryl-datahub 1.0.0.3rc1__py3-none-any.whl → 1.0.0.3rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (27)
  1. {acryl_datahub-1.0.0.3rc1.dist-info → acryl_datahub-1.0.0.3rc4.dist-info}/METADATA +2285 -2283
  2. {acryl_datahub-1.0.0.3rc1.dist-info → acryl_datahub-1.0.0.3rc4.dist-info}/RECORD +27 -26
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/datacontract/datacontract.py +35 -3
  5. datahub/api/entities/datajob/datajob.py +7 -4
  6. datahub/emitter/mcp.py +5 -1
  7. datahub/ingestion/run/pipeline.py +6 -4
  8. datahub/ingestion/source/common/subtypes.py +3 -0
  9. datahub/ingestion/source/mlflow.py +19 -6
  10. datahub/ingestion/source/powerbi/config.py +12 -0
  11. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  12. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  13. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
  14. datahub/ingestion/source/sigma/config.py +2 -3
  15. datahub/ingestion/source/snowflake/snowflake_config.py +1 -0
  16. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  17. datahub/ingestion/source/tableau/tableau.py +31 -6
  18. datahub/ingestion/source/vertexai/vertexai.py +316 -4
  19. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +23 -2
  20. datahub/metadata/_schema_classes.py +1 -1
  21. datahub/metadata/schema.avsc +2 -1
  22. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  23. datahub/testing/mcp_diff.py +15 -2
  24. {acryl_datahub-1.0.0.3rc1.dist-info → acryl_datahub-1.0.0.3rc4.dist-info}/WHEEL +0 -0
  25. {acryl_datahub-1.0.0.3rc1.dist-info → acryl_datahub-1.0.0.3rc4.dist-info}/entry_points.txt +0 -0
  26. {acryl_datahub-1.0.0.3rc1.dist-info → acryl_datahub-1.0.0.3rc4.dist-info}/licenses/LICENSE +0 -0
  27. {acryl_datahub-1.0.0.3rc1.dist-info → acryl_datahub-1.0.0.3rc4.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi/m_query/pattern_handler.py
@@ -29,6 +29,12 @@ from datahub.ingestion.source.powerbi.m_query.data_classes import (
     Lineage,
     ReferencedTable,
 )
+from datahub.ingestion.source.powerbi.m_query.odbc import (
+    extract_dsn,
+    extract_platform,
+    extract_server,
+    normalize_platform_name,
+)
 from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table
 from datahub.metadata.schema_classes import SchemaFieldDataTypeClass
 from datahub.sql_parsing.sqlglot_lineage import (
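The new helper module datahub/ingestion/source/powerbi/m_query/odbc.py (+185 lines, file 12 above) is not shown in this diff, so the exact behavior of extract_dsn, extract_platform, extract_server, and normalize_platform_name is an assumption here. A minimal sketch of what such helpers plausibly do, assuming standard semicolon-delimited Key=Value ODBC connect strings (the *_sketch names are hypothetical, not the real API):

from typing import Dict, Optional

def parse_connect_string(connect_string: str) -> Dict[str, str]:
    # ODBC connect strings are semicolon-delimited Key=Value pairs, e.g.
    # "Driver={Snowflake};Server=acme.snowflakecomputing.com;Database=SALES"
    pairs: Dict[str, str] = {}
    for part in connect_string.split(";"):
        if "=" in part:
            key, _, value = part.partition("=")
            pairs[key.strip().lower()] = value.strip().strip("{}")
    return pairs

def extract_server_sketch(connect_string: str) -> Optional[str]:
    return parse_connect_string(connect_string).get("server")

def extract_dsn_sketch(connect_string: str) -> Optional[str]:
    # A connect string may reference a pre-configured data source by name
    # instead of naming a driver, e.g. "DSN=finance_warehouse".
    return parse_connect_string(connect_string).get("dsn")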
@@ -155,6 +161,7 @@ class AbstractLineage(ABC):
             tree_function.token_values(arg_list)
         ),
     )
+    logger.debug(f"DB Details: {arguments}")
 
     if len(arguments) < 2:
         logger.debug(f"Expected minimum 2 arguments, but got {len(arguments)}")
@@ -940,6 +947,147 @@ class NativeQueryLineage(AbstractLineage):
         )
 
 
+class OdbcLineage(AbstractLineage):
+    def create_lineage(
+        self, data_access_func_detail: DataAccessFunctionDetail
+    ) -> Lineage:
+        logger.debug(
+            f"Processing {self.get_platform_pair().powerbi_data_platform_name} "
+            f"data-access function detail {data_access_func_detail}"
+        )
+
+        connect_string, _ = self.get_db_detail_from_argument(
+            data_access_func_detail.arg_list
+        )
+
+        if not connect_string:
+            self.reporter.warning(
+                title="Can not extract ODBC connect string",
+                message="Can not extract ODBC connect string from data access function. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, data-access-func-detail={data_access_func_detail}",
+            )
+            return Lineage.empty()
+
+        logger.debug(f"ODBC connect string: {connect_string}")
+        data_platform, powerbi_platform = extract_platform(connect_string)
+        server_name = extract_server(connect_string)
+
+        if not data_platform:
+            dsn = extract_dsn(connect_string)
+            if dsn:
+                logger.debug(f"Extracted DSN: {dsn}")
+                server_name = dsn
+            if dsn and self.config.dsn_to_platform_name:
+                logger.debug(f"Attempting to map DSN {dsn} to platform")
+                name = self.config.dsn_to_platform_name.get(dsn)
+                if name:
+                    logger.debug(f"Found DSN {dsn} mapped to platform {name}")
+                    data_platform, powerbi_platform = normalize_platform_name(name)
+
+        if not data_platform or not powerbi_platform:
+            self.reporter.warning(
+                title="Can not determine ODBC platform",
+                message="Can not determine platform from ODBC connect string. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, connect-string={connect_string}",
+            )
+            return Lineage.empty()
+
+        platform_pair: DataPlatformPair = self.create_platform_pair(
+            data_platform, powerbi_platform
+        )
+
+        if not server_name and self.config.server_to_platform_instance:
+            self.reporter.warning(
+                title="Can not determine ODBC server name",
+                message="Can not determine server name with server_to_platform_instance mapping. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}",
+            )
+            return Lineage.empty()
+        elif not server_name:
+            server_name = "unknown"
+
+        database_name = None
+        schema_name = None
+        table_name = None
+        qualified_table_name = None
+
+        temp_accessor: Optional[IdentifierAccessor] = (
+            data_access_func_detail.identifier_accessor
+        )
+
+        while temp_accessor:
+            logger.debug(
+                f"identifier = {temp_accessor.identifier} items = {temp_accessor.items}"
+            )
+            if temp_accessor.items.get("Kind") == "Database":
+                database_name = temp_accessor.items["Name"]
+
+            if temp_accessor.items.get("Kind") == "Schema":
+                schema_name = temp_accessor.items["Name"]
+
+            if temp_accessor.items.get("Kind") == "Table":
+                table_name = temp_accessor.items["Name"]
+
+            if temp_accessor.next is not None:
+                temp_accessor = temp_accessor.next
+            else:
+                break
+
+        if (
+            database_name is not None
+            and schema_name is not None
+            and table_name is not None
+        ):
+            qualified_table_name = f"{database_name}.{schema_name}.{table_name}"
+        elif database_name is not None and table_name is not None:
+            qualified_table_name = f"{database_name}.{table_name}"
+
+        if not qualified_table_name:
+            self.reporter.warning(
+                title="Can not determine qualified table name",
+                message="Can not determine qualified table name for ODBC data source. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, data-platform={data_platform}",
+            )
+            logger.warning(
+                f"Can not determine qualified table name for ODBC data source {data_platform} "
+                f"table {self.table.full_name}."
+            )
+            return Lineage.empty()
+
+        logger.debug(
+            f"ODBC Platform {data_platform} found qualified table name {qualified_table_name}"
+        )
+
+        urn = make_urn(
+            config=self.config,
+            platform_instance_resolver=self.platform_instance_resolver,
+            data_platform_pair=platform_pair,
+            server=server_name,
+            qualified_table_name=qualified_table_name,
+        )
+
+        column_lineage = self.create_table_column_lineage(urn)
+
+        return Lineage(
+            upstreams=[
+                DataPlatformTable(
+                    data_platform_pair=platform_pair,
+                    urn=urn,
+                )
+            ],
+            column_lineage=column_lineage,
+        )
+
+    @staticmethod
+    def create_platform_pair(
+        data_platform: str, powerbi_platform: str
+    ) -> DataPlatformPair:
+        return DataPlatformPair(data_platform, powerbi_platform)
+
+    def get_platform_pair(self) -> DataPlatformPair:
+        return SupportedDataPlatform.ODBC.value
+
+
 class SupportedPattern(Enum):
     DATABRICKS_QUERY = (
         DatabricksLineage,
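The core of OdbcLineage.create_lineage in the hunk above is the walk over the IdentifierAccessor linked list, collecting names by their "Kind" and joining them into a qualified table name. A standalone illustration of that walk, using a simplified stand-in type (Accessor here is hypothetical; the real class is IdentifierAccessor from the m_query data classes):

from dataclasses import dataclass
from typing import Dict, Optional

@dataclass
class Accessor:
    items: Dict[str, str]
    next: Optional["Accessor"] = None

def qualified_name(accessor: Optional[Accessor]) -> Optional[str]:
    # Collect Database/Schema/Table names from the accessor chain.
    names: Dict[str, str] = {}
    while accessor:
        kind = accessor.items.get("Kind")
        if kind in ("Database", "Schema", "Table"):
            names[kind] = accessor.items["Name"]
        accessor = accessor.next
    if all(k in names for k in ("Database", "Schema", "Table")):
        return f"{names['Database']}.{names['Schema']}.{names['Table']}"
    if all(k in names for k in ("Database", "Table")):
        # Schema-less sources still yield a two-part name.
        return f"{names['Database']}.{names['Table']}"
    return None

chain = Accessor(
    {"Kind": "Database", "Name": "SALES"},
    Accessor({"Kind": "Schema", "Name": "PUBLIC"},
             Accessor({"Kind": "Table", "Name": "ORDERS"})),
)
assert qualified_name(chain) == "SALES.PUBLIC.ORDERS"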
@@ -991,6 +1139,11 @@ class SupportedPattern(Enum):
         FunctionName.NATIVE_QUERY,
     )
 
+    ODBC = (
+        OdbcLineage,
+        FunctionName.ODBC_DATA_ACCESS,
+    )
+
     def handler(self) -> Type[AbstractLineage]:
         return self.value[0]
 
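Each SupportedPattern member pairs a lineage handler class with the M function name it recognizes; handler() returns the first tuple element as shown above, and FunctionName.ODBC_DATA_ACCESS is presumably the one-line addition to m_query/data_classes.py (file 11 above). A self-contained sketch of this enum dispatch pattern, with stand-in names and an illustrative M function string:

from enum import Enum

class OdbcHandlerStub:
    """Stand-in for a lineage handler such as OdbcLineage."""

class PatternSketch(Enum):
    # (handler class, M function name the pattern matches — illustrative)
    ODBC = (OdbcHandlerStub, "Odbc.DataSource")

    def handler(self) -> type:
        return self.value[0]

assert PatternSketch.ODBC.handler() is OdbcHandlerStub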
datahub/ingestion/source/sigma/config.py
@@ -18,7 +18,6 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
-from datahub.utilities.lossy_collections import LossyDict
 
 logger = logging.getLogger(__name__)
 
@@ -81,8 +80,8 @@ class WorkspaceCounts(BaseModel):
 class SigmaWorkspaceEntityFilterReport(EntityFilterReport):
     type: str = "workspace"
 
-    workspace_counts: LossyDict[str, WorkspaceCounts] = Field(
-        default_factory=LossyDict,
+    workspace_counts: Dict[str, WorkspaceCounts] = Field(
+        default_factory=dict,
         description="Counts of workbooks, datasets, elements and pages in each workspace.",
     )
 
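This change swaps a LossyDict for a plain Dict, so the report retains counts for every workspace rather than a size-capped sample. A hypothetical sketch of why a bounded mapping drops entries (BoundedDict is illustrative, not the real datahub.utilities.lossy_collections.LossyDict):

class BoundedDict(dict):
    def __init__(self, max_elements: int = 10) -> None:
        super().__init__()
        self.max_elements = max_elements

    def __setitem__(self, key, value) -> None:
        if key in self or len(self) < self.max_elements:
            super().__setitem__(key, value)
        # else: drop silently — a report built on this would undercount

counts = BoundedDict(max_elements=2)
for ws in ["sales", "marketing", "finance"]:
    counts[ws] = {"workbooks": 1}
assert "finance" not in counts  # the third workspace was dropped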
datahub/ingestion/source/snowflake/snowflake_config.py
@@ -301,6 +301,7 @@ class SnowflakeV2Config(
         default=AllowDenyPattern.allow_all(),
         description=(
             "List of regex patterns for structured properties to include in ingestion."
+            " Applied to tags with form `<database>.<schema>.<tag_name>`."
             " Only used if `extract_tags` and `extract_tags_as_structured_properties` are enabled."
         ),
     )
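The added sentence pins down what the regexes are matched against. For example, using the standard AllowDenyPattern config type referenced above (the specific patterns below are illustrative):

from datahub.configuration.common import AllowDenyPattern

pattern = AllowDenyPattern(
    allow=[r"ANALYTICS\.GOVERNANCE\..*"],  # only tags in this database.schema
    deny=[r".*\.TEMP_.*"],                 # skip temporary tags anywhere
)

assert pattern.allowed("ANALYTICS.GOVERNANCE.PII_LEVEL")
assert not pattern.allowed("ANALYTICS.GOVERNANCE.TEMP_SCRATCH")
assert not pattern.allowed("RAW.STAGING.PII_LEVEL")  # not in the allow list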
datahub/ingestion/source/snowflake/snowflake_tag.py
@@ -23,6 +23,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp
 from datahub.metadata.com.linkedin.pegasus2avro.structured import (
     StructuredPropertyDefinition,
 )
+from datahub.metadata.schema_classes import ChangeTypeClass
 from datahub.metadata.urns import (
     ContainerUrn,
     DatasetUrn,
@@ -81,7 +82,7 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
     def create_structured_property_templates(self) -> Iterable[MetadataWorkUnit]:
         for tag in self.data_dictionary.get_all_tags():
             if not self.config.structured_property_pattern.allowed(
-                tag.tag_identifier()
+                tag._id_prefix_as_str()
             ):
                 continue
             if self.config.extract_tags_as_structured_properties:
@@ -111,6 +112,8 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
                 yield MetadataChangeProposalWrapper(
                     entityUrn=urn,
                     aspect=aspect,
+                    changeType=ChangeTypeClass.CREATE,
+                    headers={"If-None-Match": "*"},
                 ).as_workunit()
 
     def _get_tags_on_object_with_propagation(
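The two added arguments turn the emission into a conditional create: the CREATE change type plus an HTTP-style "If-None-Match": "*" header ask the backend to write the aspect only if no version exists yet, so re-running ingestion does not overwrite a structured-property template that was edited afterwards (the +5 in datahub/emitter/mcp.py above is presumably the headers plumbing). Mirroring the hunk with an illustrative URN and aspect:

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import ChangeTypeClass, StatusClass

mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)",
    aspect=StatusClass(removed=False),  # illustrative aspect
    changeType=ChangeTypeClass.CREATE,  # default is UPSERT, which overwrites
    headers={"If-None-Match": "*"},     # create only when nothing exists yet
)
workunit = mcp.as_workunit()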
datahub/ingestion/source/tableau/tableau.py
@@ -12,6 +12,7 @@ from typing import (
     Dict,
     Iterable,
     List,
+    Literal,
     Optional,
     Set,
     Tuple,
@@ -612,10 +613,14 @@ class TableauConfig(
         description="Configuration settings for ingesting Tableau groups and their capabilities as custom properties.",
     )
 
-    ingest_hidden_assets: bool = Field(
-        True,
-        description="When enabled, hidden views and dashboards are ingested into Datahub. "
-        "If a dashboard or view is hidden in Tableau the luid is blank. Default of this config field is True.",
+    ingest_hidden_assets: Union[List[Literal["worksheet", "dashboard"]], bool] = Field(
+        default=["worksheet", "dashboard"],
+        description=(
+            "When enabled, hidden worksheets and dashboards are ingested into Datahub."
+            " If a dashboard or worksheet is hidden in Tableau the luid is blank."
+            " A list of asset types can also be specified, to only ingest those hidden assets."
+            " Current options supported are 'worksheet' and 'dashboard'."
+        ),
     )
 
     tags_for_hidden_assets: List[str] = Field(
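The field now accepts either a bool or a list restricted to "worksheet" and "dashboard", with the list default preserving the old behavior of ingesting both. A minimal pydantic sketch of how the Union type validates (Cfg is a stand-in, not the real TableauConfig):

from typing import List, Literal, Union
from pydantic import BaseModel, Field

class Cfg(BaseModel):
    ingest_hidden_assets: Union[List[Literal["worksheet", "dashboard"]], bool] = Field(
        default=["worksheet", "dashboard"]
    )

assert Cfg().ingest_hidden_assets == ["worksheet", "dashboard"]      # default: both
assert Cfg(ingest_hidden_assets=True).ingest_hidden_assets is True   # all hidden assets
assert Cfg(ingest_hidden_assets=["worksheet"]).ingest_hidden_assets == ["worksheet"]
# Cfg(ingest_hidden_assets=["workbook"]) would raise a ValidationError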
@@ -1348,6 +1353,26 @@ class TableauSiteSource:
         # More info here: https://help.tableau.com/current/api/metadata_api/en-us/reference/view.doc.html
         return not dashboard_or_view.get(c.LUID)
 
+    def _should_ingest_worksheet(self, worksheet: Dict) -> bool:
+        return (
+            self.config.ingest_hidden_assets is True
+            or (
+                isinstance(self.config.ingest_hidden_assets, list)
+                and "worksheet" in self.config.ingest_hidden_assets
+            )
+            or not self._is_hidden_view(worksheet)
+        )
+
+    def _should_ingest_dashboard(self, dashboard: Dict) -> bool:
+        return (
+            self.config.ingest_hidden_assets is True
+            or (
+                isinstance(self.config.ingest_hidden_assets, list)
+                and "dashboard" in self.config.ingest_hidden_assets
+            )
+            or not self._is_hidden_view(dashboard)
+        )
+
     def get_connection_object_page(
         self,
         query: str,
@@ -3059,7 +3084,7 @@ class TableauSiteSource:
             query_filter=sheets_filter,
             page_size=self.config.effective_sheet_page_size,
         ):
-            if self.config.ingest_hidden_assets or not self._is_hidden_view(sheet):
+            if self._should_ingest_worksheet(sheet):
                 yield from self.emit_sheets_as_charts(sheet, sheet.get(c.WORKBOOK))
             else:
                 self.report.num_hidden_assets_skipped += 1
@@ -3380,7 +3405,7 @@ class TableauSiteSource:
             query_filter=dashboards_filter,
             page_size=self.config.effective_dashboard_page_size,
         ):
-            if self.config.ingest_hidden_assets or not self._is_hidden_view(dashboard):
+            if self._should_ingest_dashboard(dashboard):
                 yield from self.emit_dashboard(dashboard, dashboard.get(c.WORKBOOK))
             else:
                 self.report.num_hidden_assets_skipped += 1