acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release — this version of acryl-datahub might be problematic.

Files changed (32)
  1. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc1.dist-info}/METADATA +2425 -2425
  2. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc1.dist-info}/RECORD +31 -27
  3. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc1.dist-info}/entry_points.txt +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
  6. datahub/configuration/source_common.py +13 -0
  7. datahub/ingestion/source/iceberg/iceberg.py +27 -1
  8. datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
  9. datahub/ingestion/source/kafka_connect/__init__.py +0 -0
  10. datahub/ingestion/source/kafka_connect/common.py +202 -0
  11. datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
  12. datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
  13. datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
  14. datahub/ingestion/source/looker/looker_common.py +54 -2
  15. datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
  16. datahub/ingestion/source/looker/looker_source.py +12 -1
  17. datahub/ingestion/source/mlflow.py +30 -5
  18. datahub/ingestion/source/powerbi/config.py +1 -14
  19. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
  20. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
  21. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -0
  22. datahub/ingestion/source/sql/mssql/job_models.py +30 -1
  23. datahub/ingestion/source/sql/mssql/source.py +14 -0
  24. datahub/ingestion/source/tableau/tableau.py +4 -5
  25. datahub/ingestion/source/tableau/tableau_constant.py +3 -1
  26. datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
  27. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  28. datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
  29. datahub/sql_parsing/tool_meta_extractor.py +116 -5
  30. datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
  31. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc1.dist-info}/WHEEL +0 -0
  32. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/looker/looker_source.py
@@ -145,7 +145,9 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         self.source_config: LookerDashboardSourceConfig = config
         self.reporter: LookerDashboardSourceReport = LookerDashboardSourceReport()
         self.looker_api: LookerAPI = LookerAPI(self.source_config)
-        self.user_registry: LookerUserRegistry = LookerUserRegistry(self.looker_api)
+        self.user_registry: LookerUserRegistry = LookerUserRegistry(
+            self.looker_api, self.reporter
+        )
         self.explore_registry: LookerExploreRegistry = LookerExploreRegistry(
             self.looker_api, self.reporter, self.source_config
         )
@@ -1673,5 +1675,14 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
                 yield usage_mcp.as_workunit()
             self.reporter.report_stage_end("usage_extraction")

+        # Dump looker user resource mappings.
+        logger.info("Ingesting looker user resource mapping workunits")
+        self.reporter.report_stage_start("user_resource_extraction")
+        yield from auto_workunit(
+            self.user_registry.to_platform_resource(
+                self.source_config.platform_instance
+            )
+        )
+
     def get_report(self) -> SourceReport:
         return self.reporter
datahub/ingestion/source/mlflow.py
@@ -38,16 +38,30 @@ T = TypeVar("T")
 class MLflowConfig(EnvConfigMixin):
     tracking_uri: Optional[str] = Field(
         default=None,
-        description="Tracking server URI. If not set, an MLflow default tracking_uri is used (local `mlruns/` directory or `MLFLOW_TRACKING_URI` environment variable)",
+        description=(
+            "Tracking server URI. If not set, an MLflow default tracking_uri is used"
+            " (local `mlruns/` directory or `MLFLOW_TRACKING_URI` environment variable)"
+        ),
     )
     registry_uri: Optional[str] = Field(
         default=None,
-        description="Registry server URI. If not set, an MLflow default registry_uri is used (value of tracking_uri or `MLFLOW_REGISTRY_URI` environment variable)",
+        description=(
+            "Registry server URI. If not set, an MLflow default registry_uri is used"
+            " (value of tracking_uri or `MLFLOW_REGISTRY_URI` environment variable)"
+        ),
     )
     model_name_separator: str = Field(
         default="_",
         description="A string which separates model name from its version (e.g. model_1 or model-1)",
     )
+    base_external_url: Optional[str] = Field(
+        default=None,
+        description=(
+            "Base URL to use when constructing external URLs to MLflow."
+            " If not set, tracking_uri is used if it's an HTTP URL."
+            " If neither is set, external URLs are not generated."
+        ),
+    )
 
 
 @dataclass
@@ -279,12 +293,23 @@ class MLflowSource(Source):
         )
         return urn
 
-    def _make_external_url(self, model_version: ModelVersion) -> Union[None, str]:
+    def _get_base_external_url_from_tracking_uri(self) -> Optional[str]:
+        if isinstance(
+            self.client.tracking_uri, str
+        ) and self.client.tracking_uri.startswith("http"):
+            return self.client.tracking_uri
+        else:
+            return None
+
+    def _make_external_url(self, model_version: ModelVersion) -> Optional[str]:
         """
         Generate URL for a Model Version to MLflow UI.
         """
-        base_uri = self.client.tracking_uri
-        if base_uri.startswith("http"):
+        base_uri = (
+            self.config.base_external_url
+            or self._get_base_external_url_from_tracking_uri()
+        )
+        if base_uri:
             return f"{base_uri.rstrip('/')}/#/models/{model_version.name}/versions/{model_version.version}"
         else:
             return None
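The mlflow.py hunks add a `base_external_url` config option and make external-URL generation prefer it over an HTTP(S) `tracking_uri`. A minimal standalone sketch of that fallback order (illustrative values, not code from the package):

from typing import Optional


def resolve_mlflow_base_url(
    base_external_url: Optional[str], tracking_uri: Optional[str]
) -> Optional[str]:
    # Explicit override wins; otherwise fall back to an HTTP(S) tracking URI;
    # otherwise no external URL is generated.
    if base_external_url:
        return base_external_url
    if isinstance(tracking_uri, str) and tracking_uri.startswith("http"):
        return tracking_uri
    return None


# A non-HTTP tracking URI alone yields no link; an explicit base URL does.
print(resolve_mlflow_base_url(None, "sqlite:///mlflow.db"))  # None
base = resolve_mlflow_base_url("https://mlflow.example.internal", "sqlite:///mlflow.db")
if base:
    # Same URL shape as _make_external_url; model name and version are invented.
    print(f"{base.rstrip('/')}/#/models/my_model/versions/3")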
datahub/ingestion/source/powerbi/config.py
@@ -9,7 +9,7 @@ from pydantic.class_validators import root_validator
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
-from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
 from datahub.ingestion.source.common.subtypes import BIAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
@@ -232,19 +232,6 @@ def default_for_dataset_type_mapping() -> Dict[str, str]:
     return dict_
 
 
-class PlatformDetail(ConfigModel):
-    platform_instance: Optional[str] = pydantic.Field(
-        default=None,
-        description="DataHub platform instance name. To generate correct urn for upstream dataset, this should match "
-        "with platform instance name used in ingestion "
-        "recipe of other datahub sources.",
-    )
-    env: str = pydantic.Field(
-        default=builder.DEFAULT_ENV,
-        description="The environment that all assets produced by DataHub platform ingestion source belong to",
-    )
-
-
 class DataBricksPlatformDetail(PlatformDetail):
     """
     metastore is an additional field used in Databricks connector to generate the dataset urn
datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py
@@ -2,8 +2,8 @@ import logging
 from abc import ABC, abstractmethod
 from typing import Union
 
+from datahub.configuration.source_common import PlatformDetail
 from datahub.ingestion.source.powerbi.config import (
-    PlatformDetail,
     PowerBiDashboardSourceConfig,
     PowerBIPlatformDetail,
 )
datahub/ingestion/source/powerbi/m_query/pattern_handler.py
@@ -5,13 +5,13 @@ from typing import Dict, List, Optional, Tuple, Type, cast
 
 from lark import Tree
 
+from datahub.configuration.source_common import PlatformDetail
 from datahub.emitter import mce_builder as builder
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.source.powerbi.config import (
     Constant,
     DataBricksPlatformDetail,
     DataPlatformPair,
-    PlatformDetail,
     PowerBiDashboardSourceConfig,
     PowerBiDashboardSourceReport,
     PowerBIPlatformDetail,
datahub/ingestion/source/snowflake/snowflake_v2.py
@@ -540,6 +540,7 @@ class SnowflakeV2Source(
             identifiers=self.identifiers,
             schema_resolver=schema_resolver,
             discovered_tables=discovered_datasets,
+            graph=self.ctx.graph,
         )
 
         # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
datahub/ingestion/source/sql/mssql/job_models.py
@@ -1,11 +1,17 @@
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union
 
-from datahub.emitter.mce_builder import make_data_flow_urn, make_data_job_urn
+from datahub.emitter.mce_builder import (
+    make_data_flow_urn,
+    make_data_job_urn,
+    make_data_platform_urn,
+    make_dataplatform_instance_urn,
+)
 from datahub.metadata.schema_classes import (
     DataFlowInfoClass,
     DataJobInfoClass,
     DataJobInputOutputClass,
+    DataPlatformInstanceClass,
 )
 
 
@@ -204,6 +210,18 @@ class MSSQLDataJob:
             status=self.status,
         )
 
+    @property
+    def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]:
+        if self.entity.flow.platform_instance:
+            return DataPlatformInstanceClass(
+                platform=make_data_platform_urn(self.entity.flow.orchestrator),
+                instance=make_dataplatform_instance_urn(
+                    platform=self.entity.flow.orchestrator,
+                    instance=self.entity.flow.platform_instance,
+                ),
+            )
+        return None
+
 
 @dataclass
 class MSSQLDataFlow:
@@ -238,3 +256,14 @@ class MSSQLDataFlow:
             customProperties=self.flow_properties,
             externalUrl=self.external_url,
         )
+
+    @property
+    def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]:
+        if self.entity.platform_instance:
+            return DataPlatformInstanceClass(
+                platform=make_data_platform_urn(self.entity.orchestrator),
+                instance=make_dataplatform_instance_urn(
+                    self.entity.orchestrator, self.entity.platform_instance
+                ),
+            )
+        return None
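The job_models.py hunks introduce `as_maybe_platform_instance_aspect` properties that emit a `DataPlatformInstanceClass` aspect only when a `platform_instance` is configured. A hedged sketch of the aspect those properties build, using an invented instance name (the expected URN shapes are shown in comments):

from datahub.emitter.mce_builder import (
    make_data_platform_urn,
    make_dataplatform_instance_urn,
)
from datahub.metadata.schema_classes import DataPlatformInstanceClass

# "my_mssql_instance" is an illustrative value; the source uses the flow's
# orchestrator and the configured platform_instance.
aspect = DataPlatformInstanceClass(
    platform=make_data_platform_urn("mssql"),
    instance=make_dataplatform_instance_urn("mssql", "my_mssql_instance"),
)
print(aspect.platform)  # e.g. urn:li:dataPlatform:mssql
print(aspect.instance)  # e.g. urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my_mssql_instance)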
datahub/ingestion/source/sql/mssql/source.py
@@ -639,6 +639,13 @@ class SQLServerSource(SQLAlchemySource):
             aspect=data_job.as_datajob_info_aspect,
         ).as_workunit()
 
+        data_platform_instance_aspect = data_job.as_maybe_platform_instance_aspect
+        if data_platform_instance_aspect:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=data_job.urn,
+                aspect=data_platform_instance_aspect,
+            ).as_workunit()
+
         if include_lineage:
             yield MetadataChangeProposalWrapper(
                 entityUrn=data_job.urn,
@@ -654,6 +661,13 @@
             entityUrn=data_flow.urn,
             aspect=data_flow.as_dataflow_info_aspect,
         ).as_workunit()
+
+        data_platform_instance_aspect = data_flow.as_maybe_platform_instance_aspect
+        if data_platform_instance_aspect:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=data_flow.urn,
+                aspect=data_platform_instance_aspect,
+            ).as_workunit()
         # TODO: Add SubType when it appear
 
     def get_inspectors(self) -> Iterable[Inspector]:
datahub/ingestion/source/tableau/tableau.py
@@ -645,7 +645,7 @@ def report_user_role(report: TableauSourceReport, server: Server) -> None:
     # the site-role might be different on another site
     logged_in_user: UserInfo = UserInfo.from_server(server=server)
 
-    if not logged_in_user.is_site_administrator_explorer():
+    if not logged_in_user.has_site_administrator_explorer_privileges():
         report.warning(
             title=title,
             message=message,
@@ -896,10 +896,9 @@ class TableauSiteSource:
         return f"/{self.config.env.lower()}{self.no_env_browse_prefix}"
 
     def _re_authenticate(self):
-        tableau_auth: Union[
-            TableauAuth, PersonalAccessTokenAuth
-        ] = self.config.get_tableau_auth(self.site_id)
-        self.server.auth.sign_in(tableau_auth)
+        # Sign-in again may not be enough because Tableau sometimes caches invalid sessions
+        # so we need to recreate the Tableau Server object
+        self.server = self.config.make_tableau_client(self.site_id)
 
     @property
     def site_content_url(self) -> Optional[str]:
datahub/ingestion/source/tableau/tableau_constant.py
@@ -82,4 +82,6 @@ PROJECT = "Project"
 SITE = "Site"
 IS_UNSUPPORTED_CUSTOM_SQL = "isUnsupportedCustomSql"
 SITE_PERMISSION = "sitePermission"
-SITE_ROLE = "SiteAdministratorExplorer"
+ROLE_SITE_ADMIN_EXPLORER = "SiteAdministratorExplorer"
+ROLE_SITE_ADMIN_CREATOR = "SiteAdministratorCreator"
+ROLE_SERVER_ADMIN = "ServerAdministrator"
datahub/ingestion/source/tableau/tableau_server_wrapper.py
@@ -11,8 +11,12 @@ class UserInfo:
     site_role: str
     site_id: str
 
-    def is_site_administrator_explorer(self):
-        return self.site_role == c.SITE_ROLE
+    def has_site_administrator_explorer_privileges(self):
+        return self.site_role in [
+            c.ROLE_SITE_ADMIN_EXPLORER,
+            c.ROLE_SITE_ADMIN_CREATOR,
+            c.ROLE_SERVER_ADMIN,
+        ]
 
     @staticmethod
     def from_server(server: Server) -> "UserInfo":
datahub/ingestion/source/tableau/tableau_validation.py
@@ -28,7 +28,7 @@ def check_user_role(
 
     try:
         # TODO: Add check for `Enable Derived Permissions`
-        if not logged_in_user.is_site_administrator_explorer():
+        if not logged_in_user.has_site_administrator_explorer_privileges():
             capability_dict[c.SITE_PERMISSION] = CapabilityReport(
                 capable=False,
                 failure_reason=f"{failure_reason} Their current role is {logged_in_user.site_role}.",
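The Tableau hunks above replace the exact `SiteAdministratorExplorer` comparison with a membership test that also accepts `SiteAdministratorCreator` and `ServerAdministrator`. A trivial standalone illustration of that check (not the package's own code):

# Role names mirror the constants added in tableau_constant.py.
ACCEPTED_ROLES = {
    "SiteAdministratorExplorer",
    "SiteAdministratorCreator",
    "ServerAdministrator",
}


def has_site_administrator_explorer_privileges(site_role: str) -> bool:
    return site_role in ACCEPTED_ROLES


assert has_site_administrator_explorer_privileges("ServerAdministrator")
assert not has_site_administrator_explorer_privileges("Explorer")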
datahub/sql_parsing/sql_parsing_aggregator.py
@@ -490,7 +490,7 @@ class SqlParsingAggregator(Closeable):
         self._exit_stack.push(self._query_usage_counts)
 
         # Tool Extractor
-        self._tool_meta_extractor = ToolMetaExtractor()
+        self._tool_meta_extractor = ToolMetaExtractor.create(graph)
         self.report.tool_meta_report = self._tool_meta_extractor.report
 
     def close(self) -> None:
datahub/sql_parsing/tool_meta_extractor.py
@@ -1,3 +1,4 @@
+import contextlib
 import json
 import logging
 from dataclasses import dataclass, field
@@ -5,8 +6,15 @@ from typing import Callable, Dict, List, Optional, Tuple, Union
 
 from typing_extensions import Protocol
 
+from datahub.api.entities.platformresource.platform_resource import (
+    ElasticPlatformResourceQuery,
+    PlatformResource,
+    PlatformResourceSearchFields,
+)
 from datahub.ingestion.api.report import Report
+from datahub.ingestion.graph.client import DataHubGraph
 from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn
+from datahub.utilities.search_utils import LogicalOperator
 from datahub.utilities.stats_collections import int_top_k_dict
 
 UrnStr = str
@@ -31,6 +39,7 @@ def _get_last_line(query: str) -> str:
 @dataclass
 class ToolMetaExtractorReport(Report):
     num_queries_meta_extracted: Dict[str, int] = field(default_factory=int_top_k_dict)
+    failures: List[str] = field(default_factory=list)
 
 
 class ToolMetaExtractor:
@@ -42,14 +51,81 @@ class ToolMetaExtractor:
     by warehouse query logs.
     """
 
-    def __init__(self) -> None:
-        self.report = ToolMetaExtractorReport()
+    def __init__(
+        self,
+        report: ToolMetaExtractorReport,
+        looker_user_mapping: Optional[Dict[str, str]] = None,
+    ) -> None:
+        self.report = report
         self.known_tool_extractors: List[Tuple[str, Callable[[QueryLog], bool]]] = [
             (
                 "mode",
                 self._extract_mode_query,
-            )
+            ),
+            (
+                "looker",
+                self._extract_looker_query,
+            ),
         ]
+        # maps user id (as string) to email address
+        self.looker_user_mapping = looker_user_mapping
+
+    @classmethod
+    def create(
+        cls,
+        graph: Optional[DataHubGraph] = None,
+    ) -> "ToolMetaExtractor":
+        report = ToolMetaExtractorReport()
+        looker_user_mapping = None
+        if graph:
+            try:
+                looker_user_mapping = cls.extract_looker_user_mapping_from_graph(
+                    graph, report
+                )
+            except Exception as e:
+                report.failures.append(
+                    f"Unexpected error during Looker user metadata extraction: {str(e)}"
+                )
+
+        return cls(report, looker_user_mapping)
+
+    @classmethod
+    def extract_looker_user_mapping_from_graph(
+        cls, graph: DataHubGraph, report: ToolMetaExtractorReport
+    ) -> Optional[Dict[str, str]]:
+        looker_user_mapping = None
+        query = (
+            ElasticPlatformResourceQuery.create_from()
+            .group(LogicalOperator.AND)
+            .add_field_match(PlatformResourceSearchFields.PLATFORM, "looker")
+            .add_field_match(
+                PlatformResourceSearchFields.RESOURCE_TYPE,
+                "USER_ID_MAPPING",
+            )
+            .end()
+        )
+        platform_resources = list(
+            PlatformResource.search_by_filters(query=query, graph_client=graph)
+        )
+
+        if len(platform_resources) > 1:
+            report.failures.append(
+                "Looker user metadata extraction failed. Found more than one looker user id mappings."
+            )
+        else:
+            platform_resource = platform_resources[0]
+
+            if (
+                platform_resource
+                and platform_resource.resource_info
+                and platform_resource.resource_info.value
+            ):
+                with contextlib.suppress(ValueError, AssertionError):
+                    value = platform_resource.resource_info.value.as_raw_json()
+                    if value:
+                        looker_user_mapping = value
+
+        return looker_user_mapping
 
     def _extract_mode_query(self, entry: QueryLog) -> bool:
         """
@@ -78,14 +154,49 @@
 
         return True
 
+    def _extract_looker_query(self, entry: QueryLog) -> bool:
+        """
+        Returns:
+            bool: whether QueryLog entry is that of looker and looker user info
+            is extracted into entry.
+        """
+        if not self.looker_user_mapping:
+            return False
+
+        last_line = _get_last_line(entry.query_text)
+
+        if not (last_line.startswith("--") and "Looker Query Context" in last_line):
+            return False
+
+        start_quote_idx = last_line.index("'")
+        end_quote_idx = last_line.rindex("'")
+        if start_quote_idx == -1 or end_quote_idx == -1:
+            return False
+
+        looker_json_raw = last_line[start_quote_idx + 1 : end_quote_idx]
+        looker_json = json.loads(looker_json_raw)
+
+        user_id = str(looker_json["user_id"])
+        email = self.looker_user_mapping.get(user_id)
+        if not email:
+            return False
+
+        original_user = entry.user
+
+        entry.user = email_to_user_urn(email)
+        entry.extra_info = entry.extra_info or {}
+        entry.extra_info["user_via"] = original_user
+
+        return True
+
     def extract_bi_metadata(self, entry: QueryLog) -> bool:
         for tool, meta_extractor in self.known_tool_extractors:
             try:
                 if meta_extractor(entry):
                     self.report.num_queries_meta_extracted[tool] += 1
                     return True
-            except Exception:
-                logger.debug("Tool metadata extraction failed with error : {e}")
+            except Exception as e:
+                logger.debug(f"Tool metadata extraction failed with error : {e}")
         return False
 
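The new `_extract_looker_query` handler attributes a warehouse query to a Looker user when the query's last line is a SQL comment carrying a `Looker Query Context` JSON payload. An invented example of that shape, and the `user_id` extraction it performs (real payloads are written by Looker; the field values here are made up):

import json

query_text = """SELECT order_id, SUM(amount) FROM orders GROUP BY 1
-- Looker Query Context '{"user_id": 42, "history_slug": "abc123"}'"""

# Mirror the handler: the last line must be a comment mentioning "Looker Query Context",
# with a single-quoted JSON payload whose user_id is looked up in the user-id-to-email mapping.
last_line = query_text.splitlines()[-1]
assert last_line.startswith("--") and "Looker Query Context" in last_line

payload = last_line[last_line.index("'") + 1 : last_line.rindex("'")]
user_id = str(json.loads(payload)["user_id"])
print(user_id)  # "42" -> resolved to an email via the looker user mapping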