acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic; consult the registry's advisory page for details.
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/METADATA +2470 -2470
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/RECORD +38 -33
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/entry_points.txt +1 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
- datahub/configuration/source_common.py +13 -0
- datahub/emitter/rest_emitter.py +16 -1
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +96 -0
- datahub/ingestion/source/iceberg/iceberg.py +27 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
- datahub/ingestion/source/kafka_connect/__init__.py +0 -0
- datahub/ingestion/source/kafka_connect/common.py +202 -0
- datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
- datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
- datahub/ingestion/source/looker/looker_common.py +54 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
- datahub/ingestion/source/looker/looker_source.py +12 -1
- datahub/ingestion/source/mlflow.py +30 -5
- datahub/ingestion/source/powerbi/config.py +1 -14
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -51
- datahub/ingestion/source/snowflake/snowflake_queries.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +52 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +24 -28
- datahub/ingestion/source/sql/mssql/job_models.py +30 -1
- datahub/ingestion/source/sql/mssql/source.py +14 -0
- datahub/ingestion/source/tableau/tableau.py +4 -5
- datahub/ingestion/source/tableau/tableau_constant.py +3 -1
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/source.py +4 -0
- datahub/ingestion/source_report/ingestion_stage.py +1 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
- datahub/sql_parsing/tool_meta_extractor.py +116 -5
- datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/top_level.txt +0 -0
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import contextlib
|
|
1
2
|
import json
|
|
2
3
|
import logging
|
|
3
4
|
from dataclasses import dataclass, field
|
|
@@ -5,8 +6,15 @@ from typing import Callable, Dict, List, Optional, Tuple, Union
|
|
|
5
6
|
|
|
6
7
|
from typing_extensions import Protocol
|
|
7
8
|
|
|
9
|
+
from datahub.api.entities.platformresource.platform_resource import (
|
|
10
|
+
ElasticPlatformResourceQuery,
|
|
11
|
+
PlatformResource,
|
|
12
|
+
PlatformResourceSearchFields,
|
|
13
|
+
)
|
|
8
14
|
from datahub.ingestion.api.report import Report
|
|
15
|
+
from datahub.ingestion.graph.client import DataHubGraph
|
|
9
16
|
from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn
|
|
17
|
+
from datahub.utilities.search_utils import LogicalOperator
|
|
10
18
|
from datahub.utilities.stats_collections import int_top_k_dict
|
|
11
19
|
|
|
12
20
|
UrnStr = str
|
|
@@ -31,6 +39,7 @@ def _get_last_line(query: str) -> str:
|
|
|
31
39
|
@dataclass
class ToolMetaExtractorReport(Report):
    # Per-tool count of query-log entries whose BI-tool metadata was
    # successfully extracted; int_top_k_dict bounds memory by keeping only
    # the top-k keys.
    num_queries_meta_extracted: Dict[str, int] = field(default_factory=int_top_k_dict)
    # Human-readable descriptions of non-fatal errors hit during extraction.
    failures: List[str] = field(default_factory=list)
|
34
43
|
|
|
35
44
|
|
|
36
45
|
class ToolMetaExtractor:
|
|
@@ -42,14 +51,81 @@ class ToolMetaExtractor:
|
|
|
42
51
|
by warehouse query logs.
|
|
43
52
|
"""
|
|
44
53
|
|
|
45
|
-
def __init__(
|
|
46
|
-
self
|
|
54
|
+
def __init__(
|
|
55
|
+
self,
|
|
56
|
+
report: ToolMetaExtractorReport,
|
|
57
|
+
looker_user_mapping: Optional[Dict[str, str]] = None,
|
|
58
|
+
) -> None:
|
|
59
|
+
self.report = report
|
|
47
60
|
self.known_tool_extractors: List[Tuple[str, Callable[[QueryLog], bool]]] = [
|
|
48
61
|
(
|
|
49
62
|
"mode",
|
|
50
63
|
self._extract_mode_query,
|
|
51
|
-
)
|
|
64
|
+
),
|
|
65
|
+
(
|
|
66
|
+
"looker",
|
|
67
|
+
self._extract_looker_query,
|
|
68
|
+
),
|
|
52
69
|
]
|
|
70
|
+
# maps user id (as string) to email address
|
|
71
|
+
self.looker_user_mapping = looker_user_mapping
|
|
72
|
+
|
|
73
|
+
@classmethod
|
|
74
|
+
def create(
|
|
75
|
+
cls,
|
|
76
|
+
graph: Optional[DataHubGraph] = None,
|
|
77
|
+
) -> "ToolMetaExtractor":
|
|
78
|
+
report = ToolMetaExtractorReport()
|
|
79
|
+
looker_user_mapping = None
|
|
80
|
+
if graph:
|
|
81
|
+
try:
|
|
82
|
+
looker_user_mapping = cls.extract_looker_user_mapping_from_graph(
|
|
83
|
+
graph, report
|
|
84
|
+
)
|
|
85
|
+
except Exception as e:
|
|
86
|
+
report.failures.append(
|
|
87
|
+
f"Unexpected error during Looker user metadata extraction: {str(e)}"
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
return cls(report, looker_user_mapping)
|
|
91
|
+
|
|
92
|
+
@classmethod
|
|
93
|
+
def extract_looker_user_mapping_from_graph(
|
|
94
|
+
cls, graph: DataHubGraph, report: ToolMetaExtractorReport
|
|
95
|
+
) -> Optional[Dict[str, str]]:
|
|
96
|
+
looker_user_mapping = None
|
|
97
|
+
query = (
|
|
98
|
+
ElasticPlatformResourceQuery.create_from()
|
|
99
|
+
.group(LogicalOperator.AND)
|
|
100
|
+
.add_field_match(PlatformResourceSearchFields.PLATFORM, "looker")
|
|
101
|
+
.add_field_match(
|
|
102
|
+
PlatformResourceSearchFields.RESOURCE_TYPE,
|
|
103
|
+
"USER_ID_MAPPING",
|
|
104
|
+
)
|
|
105
|
+
.end()
|
|
106
|
+
)
|
|
107
|
+
platform_resources = list(
|
|
108
|
+
PlatformResource.search_by_filters(query=query, graph_client=graph)
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
if len(platform_resources) > 1:
|
|
112
|
+
report.failures.append(
|
|
113
|
+
"Looker user metadata extraction failed. Found more than one looker user id mappings."
|
|
114
|
+
)
|
|
115
|
+
else:
|
|
116
|
+
platform_resource = platform_resources[0]
|
|
117
|
+
|
|
118
|
+
if (
|
|
119
|
+
platform_resource
|
|
120
|
+
and platform_resource.resource_info
|
|
121
|
+
and platform_resource.resource_info.value
|
|
122
|
+
):
|
|
123
|
+
with contextlib.suppress(ValueError, AssertionError):
|
|
124
|
+
value = platform_resource.resource_info.value.as_raw_json()
|
|
125
|
+
if value:
|
|
126
|
+
looker_user_mapping = value
|
|
127
|
+
|
|
128
|
+
return looker_user_mapping
|
|
53
129
|
|
|
54
130
|
def _extract_mode_query(self, entry: QueryLog) -> bool:
|
|
55
131
|
"""
|
|
@@ -78,14 +154,49 @@ class ToolMetaExtractor:
|
|
|
78
154
|
|
|
79
155
|
return True
|
|
80
156
|
|
|
157
|
+
def _extract_looker_query(self, entry: QueryLog) -> bool:
|
|
158
|
+
"""
|
|
159
|
+
Returns:
|
|
160
|
+
bool: whether QueryLog entry is that of looker and looker user info
|
|
161
|
+
is extracted into entry.
|
|
162
|
+
"""
|
|
163
|
+
if not self.looker_user_mapping:
|
|
164
|
+
return False
|
|
165
|
+
|
|
166
|
+
last_line = _get_last_line(entry.query_text)
|
|
167
|
+
|
|
168
|
+
if not (last_line.startswith("--") and "Looker Query Context" in last_line):
|
|
169
|
+
return False
|
|
170
|
+
|
|
171
|
+
start_quote_idx = last_line.index("'")
|
|
172
|
+
end_quote_idx = last_line.rindex("'")
|
|
173
|
+
if start_quote_idx == -1 or end_quote_idx == -1:
|
|
174
|
+
return False
|
|
175
|
+
|
|
176
|
+
looker_json_raw = last_line[start_quote_idx + 1 : end_quote_idx]
|
|
177
|
+
looker_json = json.loads(looker_json_raw)
|
|
178
|
+
|
|
179
|
+
user_id = str(looker_json["user_id"])
|
|
180
|
+
email = self.looker_user_mapping.get(user_id)
|
|
181
|
+
if not email:
|
|
182
|
+
return False
|
|
183
|
+
|
|
184
|
+
original_user = entry.user
|
|
185
|
+
|
|
186
|
+
entry.user = email_to_user_urn(email)
|
|
187
|
+
entry.extra_info = entry.extra_info or {}
|
|
188
|
+
entry.extra_info["user_via"] = original_user
|
|
189
|
+
|
|
190
|
+
return True
|
|
191
|
+
|
|
81
192
|
def extract_bi_metadata(self, entry: QueryLog) -> bool:
|
|
82
193
|
for tool, meta_extractor in self.known_tool_extractors:
|
|
83
194
|
try:
|
|
84
195
|
if meta_extractor(entry):
|
|
85
196
|
self.report.num_queries_meta_extracted[tool] += 1
|
|
86
197
|
return True
|
|
87
|
-
except Exception:
|
|
88
|
-
logger.debug("Tool metadata extraction failed with error : {e}")
|
|
198
|
+
except Exception as e:
|
|
199
|
+
logger.debug(f"Tool metadata extraction failed with error : {e}")
|
|
89
200
|
return False
|
|
90
201
|
|
|
91
202
|
|