acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (39)
  1. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/METADATA +2470 -2470
  2. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/RECORD +38 -33
  3. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/entry_points.txt +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
  6. datahub/configuration/source_common.py +13 -0
  7. datahub/emitter/rest_emitter.py +16 -1
  8. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +96 -0
  9. datahub/ingestion/source/iceberg/iceberg.py +27 -1
  10. datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
  11. datahub/ingestion/source/kafka_connect/__init__.py +0 -0
  12. datahub/ingestion/source/kafka_connect/common.py +202 -0
  13. datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
  14. datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
  15. datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
  16. datahub/ingestion/source/looker/looker_common.py +54 -2
  17. datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
  18. datahub/ingestion/source/looker/looker_source.py +12 -1
  19. datahub/ingestion/source/mlflow.py +30 -5
  20. datahub/ingestion/source/powerbi/config.py +1 -14
  21. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
  22. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
  23. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -51
  24. datahub/ingestion/source/snowflake/snowflake_queries.py +0 -3
  25. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +52 -2
  26. datahub/ingestion/source/snowflake/snowflake_v2.py +24 -28
  27. datahub/ingestion/source/sql/mssql/job_models.py +30 -1
  28. datahub/ingestion/source/sql/mssql/source.py +14 -0
  29. datahub/ingestion/source/tableau/tableau.py +4 -5
  30. datahub/ingestion/source/tableau/tableau_constant.py +3 -1
  31. datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
  32. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  33. datahub/ingestion/source/unity/source.py +4 -0
  34. datahub/ingestion/source_report/ingestion_stage.py +1 -0
  35. datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
  36. datahub/sql_parsing/tool_meta_extractor.py +116 -5
  37. datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
  38. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/WHEEL +0 -0
  39. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,4 @@
1
+ import contextlib
1
2
  import json
2
3
  import logging
3
4
  from dataclasses import dataclass, field
@@ -5,8 +6,15 @@ from typing import Callable, Dict, List, Optional, Tuple, Union
5
6
 
6
7
  from typing_extensions import Protocol
7
8
 
9
+ from datahub.api.entities.platformresource.platform_resource import (
10
+ ElasticPlatformResourceQuery,
11
+ PlatformResource,
12
+ PlatformResourceSearchFields,
13
+ )
8
14
  from datahub.ingestion.api.report import Report
15
+ from datahub.ingestion.graph.client import DataHubGraph
9
16
  from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn
17
+ from datahub.utilities.search_utils import LogicalOperator
10
18
  from datahub.utilities.stats_collections import int_top_k_dict
11
19
 
12
20
  UrnStr = str
@@ -31,6 +39,7 @@ def _get_last_line(query: str) -> str:
31
39
  @dataclass
32
40
  class ToolMetaExtractorReport(Report):
33
41
  num_queries_meta_extracted: Dict[str, int] = field(default_factory=int_top_k_dict)
42
+ failures: List[str] = field(default_factory=list)
34
43
 
35
44
 
36
45
  class ToolMetaExtractor:
@@ -42,14 +51,81 @@ class ToolMetaExtractor:
42
51
  by warehouse query logs.
43
52
  """
44
53
 
45
- def __init__(self) -> None:
46
- self.report = ToolMetaExtractorReport()
54
+ def __init__(
55
+ self,
56
+ report: ToolMetaExtractorReport,
57
+ looker_user_mapping: Optional[Dict[str, str]] = None,
58
+ ) -> None:
59
+ self.report = report
47
60
  self.known_tool_extractors: List[Tuple[str, Callable[[QueryLog], bool]]] = [
48
61
  (
49
62
  "mode",
50
63
  self._extract_mode_query,
51
- )
64
+ ),
65
+ (
66
+ "looker",
67
+ self._extract_looker_query,
68
+ ),
52
69
  ]
70
+ # maps user id (as string) to email address
71
+ self.looker_user_mapping = looker_user_mapping
72
+
73
+ @classmethod
74
+ def create(
75
+ cls,
76
+ graph: Optional[DataHubGraph] = None,
77
+ ) -> "ToolMetaExtractor":
78
+ report = ToolMetaExtractorReport()
79
+ looker_user_mapping = None
80
+ if graph:
81
+ try:
82
+ looker_user_mapping = cls.extract_looker_user_mapping_from_graph(
83
+ graph, report
84
+ )
85
+ except Exception as e:
86
+ report.failures.append(
87
+ f"Unexpected error during Looker user metadata extraction: {str(e)}"
88
+ )
89
+
90
+ return cls(report, looker_user_mapping)
91
+
92
+ @classmethod
93
+ def extract_looker_user_mapping_from_graph(
94
+ cls, graph: DataHubGraph, report: ToolMetaExtractorReport
95
+ ) -> Optional[Dict[str, str]]:
96
+ looker_user_mapping = None
97
+ query = (
98
+ ElasticPlatformResourceQuery.create_from()
99
+ .group(LogicalOperator.AND)
100
+ .add_field_match(PlatformResourceSearchFields.PLATFORM, "looker")
101
+ .add_field_match(
102
+ PlatformResourceSearchFields.RESOURCE_TYPE,
103
+ "USER_ID_MAPPING",
104
+ )
105
+ .end()
106
+ )
107
+ platform_resources = list(
108
+ PlatformResource.search_by_filters(query=query, graph_client=graph)
109
+ )
110
+
111
+ if len(platform_resources) > 1:
112
+ report.failures.append(
113
+ "Looker user metadata extraction failed. Found more than one looker user id mappings."
114
+ )
115
+ else:
116
+ platform_resource = platform_resources[0]
117
+
118
+ if (
119
+ platform_resource
120
+ and platform_resource.resource_info
121
+ and platform_resource.resource_info.value
122
+ ):
123
+ with contextlib.suppress(ValueError, AssertionError):
124
+ value = platform_resource.resource_info.value.as_raw_json()
125
+ if value:
126
+ looker_user_mapping = value
127
+
128
+ return looker_user_mapping
53
129
 
54
130
  def _extract_mode_query(self, entry: QueryLog) -> bool:
55
131
  """
@@ -78,14 +154,49 @@ class ToolMetaExtractor:
78
154
 
79
155
  return True
80
156
 
157
+ def _extract_looker_query(self, entry: QueryLog) -> bool:
158
+ """
159
+ Returns:
160
+ bool: whether QueryLog entry is that of looker and looker user info
161
+ is extracted into entry.
162
+ """
163
+ if not self.looker_user_mapping:
164
+ return False
165
+
166
+ last_line = _get_last_line(entry.query_text)
167
+
168
+ if not (last_line.startswith("--") and "Looker Query Context" in last_line):
169
+ return False
170
+
171
+ start_quote_idx = last_line.index("'")
172
+ end_quote_idx = last_line.rindex("'")
173
+ if start_quote_idx == -1 or end_quote_idx == -1:
174
+ return False
175
+
176
+ looker_json_raw = last_line[start_quote_idx + 1 : end_quote_idx]
177
+ looker_json = json.loads(looker_json_raw)
178
+
179
+ user_id = str(looker_json["user_id"])
180
+ email = self.looker_user_mapping.get(user_id)
181
+ if not email:
182
+ return False
183
+
184
+ original_user = entry.user
185
+
186
+ entry.user = email_to_user_urn(email)
187
+ entry.extra_info = entry.extra_info or {}
188
+ entry.extra_info["user_via"] = original_user
189
+
190
+ return True
191
+
81
192
  def extract_bi_metadata(self, entry: QueryLog) -> bool:
82
193
  for tool, meta_extractor in self.known_tool_extractors:
83
194
  try:
84
195
  if meta_extractor(entry):
85
196
  self.report.num_queries_meta_extracted[tool] += 1
86
197
  return True
87
- except Exception:
88
- logger.debug("Tool metadata extraction failed with error : {e}")
198
+ except Exception as e:
199
+ logger.debug(f"Tool metadata extraction failed with error : {e}")
89
200
  return False
90
201
 
91
202