acryl-datahub 1.0.0rc17__py3-none-any.whl → 1.0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/METADATA +2426 -2427
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/RECORD +106 -89
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/entry_points.txt +2 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -28
- datahub/cli/specific/dataset_cli.py +26 -10
- datahub/emitter/mce_builder.py +1 -3
- datahub/emitter/mcp_builder.py +8 -0
- datahub/emitter/request_helper.py +19 -14
- datahub/emitter/response_helper.py +25 -18
- datahub/emitter/rest_emitter.py +23 -7
- datahub/errors.py +8 -0
- datahub/ingestion/api/source.py +7 -2
- datahub/ingestion/api/source_helpers.py +14 -2
- datahub/ingestion/extractor/schema_util.py +1 -0
- datahub/ingestion/graph/client.py +26 -20
- datahub/ingestion/graph/filters.py +62 -17
- datahub/ingestion/sink/datahub_rest.py +2 -2
- datahub/ingestion/source/cassandra/cassandra.py +1 -10
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
- datahub/ingestion/source/common/subtypes.py +17 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
- datahub/ingestion/source/dbt/dbt_common.py +6 -4
- datahub/ingestion/source/dbt/dbt_core.py +4 -6
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_source.py +96 -117
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/ge_data_profiler.py +11 -1
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +394 -0
- datahub/ingestion/source/hex/constants.py +3 -0
- datahub/ingestion/source/hex/hex.py +167 -0
- datahub/ingestion/source/hex/mapper.py +372 -0
- datahub/ingestion/source/hex/model.py +68 -0
- datahub/ingestion/source/iceberg/iceberg.py +193 -140
- datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
- datahub/ingestion/source/mlflow.py +217 -8
- datahub/ingestion/source/mode.py +11 -1
- datahub/ingestion/source/openapi.py +69 -34
- datahub/ingestion/source/powerbi/config.py +31 -4
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +41 -24
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
- datahub/ingestion/source/redshift/lineage_v2.py +9 -1
- datahub/ingestion/source/redshift/query.py +1 -1
- datahub/ingestion/source/s3/source.py +11 -0
- datahub/ingestion/source/sigma/config.py +3 -4
- datahub/ingestion/source/sigma/sigma.py +10 -6
- datahub/ingestion/source/slack/slack.py +399 -82
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +16 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
- datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
- datahub/ingestion/source/sql/mssql/job_models.py +15 -1
- datahub/ingestion/source/sql/mssql/source.py +8 -4
- datahub/ingestion/source/sql/oracle.py +51 -4
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
- datahub/ingestion/source/superset.py +291 -35
- datahub/ingestion/source/usage/usage_common.py +0 -65
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1055 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
- datahub/metadata/_schema_classes.py +472 -1
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/schema.avsc +313 -2
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
- datahub/metadata/schemas/QueryProperties.avsc +20 -0
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/dataset.py +122 -0
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +27 -3
- datahub/sdk/main_client.py +24 -1
- datahub/sdk/search_client.py +81 -8
- datahub/sdk/search_filters.py +94 -37
- datahub/sql_parsing/split_statements.py +17 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
- datahub/sql_parsing/tool_meta_extractor.py +27 -2
- datahub/testing/mcp_diff.py +1 -18
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/ingestion/source/vertexai.py +0 -697
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/top_level.txt +0 -0
|
@@ -1,14 +1,16 @@
|
|
|
1
|
+
import json
|
|
1
2
|
import logging
|
|
2
3
|
import textwrap
|
|
3
4
|
from dataclasses import dataclass
|
|
4
|
-
from typing import Iterable, List, Optional, Tuple
|
|
5
|
+
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
|
5
6
|
|
|
6
|
-
from pydantic import Field, SecretStr
|
|
7
|
+
from pydantic import BaseModel, Field, SecretStr
|
|
7
8
|
from slack_sdk import WebClient
|
|
8
9
|
from tenacity import retry, wait_exponential
|
|
9
10
|
from tenacity.before_sleep import before_sleep_log
|
|
10
11
|
|
|
11
12
|
import datahub.emitter.mce_builder as builder
|
|
13
|
+
from datahub.emitter.mce_builder import datahub_guid, make_dataplatform_instance_urn
|
|
12
14
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
13
15
|
from datahub.ingestion.api.common import PipelineContext
|
|
14
16
|
from datahub.ingestion.api.decorators import (
|
|
@@ -32,16 +34,153 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
|
32
34
|
)
|
|
33
35
|
from datahub.metadata.schema_classes import (
|
|
34
36
|
CorpUserEditableInfoClass,
|
|
37
|
+
CorpUserSettingsClass,
|
|
38
|
+
DataPlatformInstanceClass,
|
|
39
|
+
DataPlatformInstancePropertiesClass,
|
|
35
40
|
DatasetPropertiesClass,
|
|
36
41
|
DeprecationClass,
|
|
42
|
+
NotificationSettingsClass,
|
|
43
|
+
PlatformResourceInfoClass,
|
|
44
|
+
SerializedValueClass,
|
|
45
|
+
SerializedValueContentTypeClass,
|
|
46
|
+
SerializedValueSchemaTypeClass,
|
|
47
|
+
SlackNotificationSettingsClass,
|
|
48
|
+
SlackUserInfoClass as SlackUserInfo,
|
|
49
|
+
StatusClass,
|
|
37
50
|
SubTypesClass,
|
|
51
|
+
_Aspect,
|
|
38
52
|
)
|
|
39
53
|
from datahub.utilities.ratelimiter import RateLimiter
|
|
54
|
+
from datahub.utilities.str_enum import StrEnum
|
|
40
55
|
from datahub.utilities.urns.urn import Urn
|
|
41
56
|
|
|
42
57
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
43
58
|
|
|
44
59
|
|
|
60
|
+
# TODO: Relocate this function to a utility module
|
|
61
|
+
def is_picture_default_or_missing(picture_link: Optional[str]) -> bool:
    """
    Report whether a profile picture link is unset or is the default avatar.

    Returns True when ``picture_link`` is None or empty, or when it ends with
    "default_avatar.png"; otherwise False.
    """
    return not picture_link or picture_link.endswith("default_avatar.png")
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def is_slack_image(picture_link: Optional[str]) -> bool:
    """
    Guess whether a picture link refers to an image served by Slack.

    A missing or empty link is never a Slack image. Any other link counts as
    one when the text "slack-edge.com" occurs anywhere inside it (this is a
    plain substring test, not a hostname comparison).
    """
    return bool(picture_link) and "slack-edge.com" in picture_link
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class ResourceType(StrEnum):
    """String identifiers used as the ``resourceType`` of Slack platform resources."""

    # Used by SlackUserDetails.to_mcps for per-user platform resources.
    USER_INFO = "user-info"
    # Presumably the counterpart for channels; not referenced in the visible code.
    CHANNEL_INFO = "channel-info"
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class SlackInstance(BaseModel):
    """
    A Slack workspace (team) represented as a DataHub data platform instance.

    ``id`` is the Slack team id; the remaining fields are optional descriptive
    metadata that ``with_slack_team_info`` fills in from a Slack team payload.
    """

    id: str
    name: Optional[str] = None
    description: Optional[str] = None
    external_url: Optional[str] = None
    custom_properties: Optional[Dict[str, str]] = None

    def to_platform_instance_urn(self) -> str:
        """Return the data platform instance URN built from the Slack platform and ``id``."""
        return make_dataplatform_instance_urn(
            platform=DATA_PLATFORM_SLACK_URN, instance=self.id
        )

    def with_slack_team_info(self, team_info: dict) -> "SlackInstance":
        """
        Copy descriptive fields from a Slack team payload onto this instance.

        ``team_info`` looks like this (abbreviated):
        {'id': 'T22BUCL1LKW', 'name': 'DataHub', 'url': 'https://datahubspace.slack.com/',
         'domain': 'datahub', 'icon': {'image_default': False,
         'image_102': 'https://avatars.slack-edge.com/.../..._102.png', ...},
         'enterprise_id': 'E06TPM5T1G9', 'enterprise_name': 'DataHub',
         'enterprise_domain': 'datahubspace', ...}

        The instance is mutated in place and returned, so calls can be chained.
        Keys that are missing from the payload are simply left out of
        ``custom_properties``.
        """
        self.name = team_info.get("name")
        # The description is populated from the team name as well.
        self.description = team_info.get("name")
        self.external_url = team_info.get("url")

        # "icon" may be absent or explicitly null in the payload; treat both as
        # "no icon" instead of failing on None.get(...).
        icon = team_info.get("icon") or {}
        candidate_properties = {
            "domain": team_info.get("domain"),
            "enterprise_id": team_info.get("enterprise_id"),
            "enterprise_name": team_info.get("enterprise_name"),
            "enterprise_domain": team_info.get("enterprise_domain"),
            "icon": icon.get("image_102"),
        }
        # Only keep the properties that the payload actually provided.
        self.custom_properties = {
            key: value
            for key, value in candidate_properties.items()
            if value is not None
        }
        return self

    def to_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
        """
        Build the metadata change proposals describing this workspace.

        Returns a single-element list holding the DataPlatformInstanceProperties
        aspect; ``id`` is used as the name when no team name is known.
        """
        return [
            MetadataChangeProposalWrapper(
                entityUrn=self.to_platform_instance_urn(),
                aspect=DataPlatformInstancePropertiesClass(
                    name=self.name or self.id,
                    description=self.description,
                    externalUrl=self.external_url or None,
                    customProperties=self.custom_properties or {},
                ),
            )
        ]
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def to_serialized_value(value: _Aspect) -> SerializedValueClass:
    """
    Wrap an aspect in a SerializedValue.

    The aspect's ``to_obj()`` form is JSON-encoded as UTF-8 bytes for the blob,
    the content type is JSON, the schema type is PEGASUS, and the schema
    reference is the aspect's record schema full name.
    """
    # HACK: strip the ".pegasus2avro" segment from the record name so that the
    # schema reference points at the original pdl type.
    pdl_schema_ref = value.RECORD_SCHEMA.fullname.replace(".pegasus2avro", "")
    json_payload = json.dumps(value.to_obj())
    return SerializedValueClass(
        blob=json_payload.encode("utf-8"),
        contentType=SerializedValueContentTypeClass.JSON,
        schemaType=SerializedValueSchemaTypeClass.PEGASUS,
        schemaRef=pdl_schema_ref,
    )
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
class SlackUserDetails:
    """Pairs a SlackUserInfo aspect with the logic that emits it as a platform resource."""

    def __init__(self, slack_user_info: SlackUserInfo):
        self.slack_user_info = slack_user_info

    def to_guid(self) -> str:
        """
        Compute the guid for this user.

        A slack user is uniquely identified by the combination of their id and
        teamId, so both go into the guid.
        """
        info = self.slack_user_info
        guid_fields = {"id": info.id, "dpi": info.teamId}
        return datahub_guid(guid_fields)

    def get_resource_urn(self) -> str:
        """Return the platformResource URN derived from ``to_guid``."""
        guid = self.to_guid()
        return f"urn:li:platformResource:{guid}"

    def to_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
        """
        Yield the change proposals for this user's platform resource.

        Three aspects are attached to the resource URN: the data platform
        instance, the platform resource info (with the serialized SlackUserInfo
        as its value), and a status whose ``removed`` flag mirrors ``isDeleted``.
        """
        resource_urn = self.get_resource_urn()
        info = self.slack_user_info

        platform_instance = DataPlatformInstanceClass(
            platform=DATA_PLATFORM_SLACK_URN,
            instance=info.slackInstance,
        )

        # The email, when known, is the only secondary lookup key.
        email_keys = [info.email] if info.email else []
        resource_info = PlatformResourceInfoClass(
            resourceType=ResourceType.USER_INFO.value,
            value=to_serialized_value(info),
            primaryKey=info.id,
            secondaryKeys=email_keys,
        )

        removal_status = StatusClass(removed=info.isDeleted)

        yield from MetadataChangeProposalWrapper.construct_many(
            resource_urn,
            aspects=[platform_instance, resource_info, removal_status],
        )
|
|
182
|
+
|
|
183
|
+
|
|
45
184
|
@dataclass
|
|
46
185
|
class CorpUser:
|
|
47
186
|
urn: Optional[str] = None
|
|
@@ -52,18 +191,26 @@ class CorpUser:
|
|
|
52
191
|
phone: Optional[str] = None
|
|
53
192
|
real_name: Optional[str] = None
|
|
54
193
|
slack_display_name: Optional[str] = None
|
|
194
|
+
team_id: Optional[str] = None
|
|
195
|
+
team_domain: Optional[str] = None
|
|
196
|
+
is_team_enterprise: Optional[bool] = None
|
|
55
197
|
|
|
56
198
|
|
|
57
199
|
class SlackSourceConfig(
|
|
58
200
|
StatefulIngestionConfigBase,
|
|
59
201
|
):
|
|
60
202
|
bot_token: SecretStr = Field(
|
|
61
|
-
description="Bot token for the Slack workspace. Needs `users:read`, `users:read.email
|
|
203
|
+
description="Bot token for the Slack workspace. Needs `users:read`, `users:read.email`, `users.profile:read`, and `team:read` scopes.",
|
|
62
204
|
)
|
|
63
205
|
enrich_user_metadata: bool = Field(
|
|
64
206
|
type=bool,
|
|
65
207
|
default=True,
|
|
66
|
-
description="
|
|
208
|
+
description="When enabled, will enrich provisioned DataHub users' metadata with information from Slack.",
|
|
209
|
+
)
|
|
210
|
+
ingest_users: bool = Field(
|
|
211
|
+
type=bool,
|
|
212
|
+
default=True,
|
|
213
|
+
description="Whether to ingest users. When set to true, will ingest all users in the Slack workspace (as platform resources) to simplify user enrichment after they are provisioned on DataHub.",
|
|
67
214
|
)
|
|
68
215
|
api_requests_per_min: int = Field(
|
|
69
216
|
type=int,
|
|
@@ -96,9 +243,11 @@ class SlackSourceConfig(
|
|
|
96
243
|
class SlackSourceReport(StaleEntityRemovalSourceReport):
    """Ingestion report for the Slack source, extending the stale-entity-removal report with counters."""

    # NOTE(review): the code that increments these counters is not visible
    # here; the descriptions below are inferred from the field names.
    # Presumably the number of channels reported.
    channels_reported: int = 0
    # Presumably the number of archived channels among those reported.
    archived_channels_reported: int = 0
    # Presumably the number of Slack users reported.
    users_reported: int = 0
|
|
99
247
|
|
|
100
248
|
|
|
101
249
|
PLATFORM_NAME = "slack"
|
|
250
|
+
DATA_PLATFORM_SLACK_URN: str = builder.make_data_platform_urn(PLATFORM_NAME)
|
|
102
251
|
|
|
103
252
|
|
|
104
253
|
@platform_name("Slack")
|
|
@@ -124,6 +273,38 @@ class SlackSource(StatefulIngestionSourceBase):
|
|
|
124
273
|
def get_slack_client(self) -> WebClient:
    """Create a new Slack WebClient authenticated with the configured bot token."""
    bot_token = self.config.bot_token.get_secret_value()
    return WebClient(token=bot_token)
|
|
126
275
|
|
|
276
|
+
@staticmethod
def populate_slack_member_from_response(
    user: Dict[str, Any], slack_instance: SlackInstance
) -> SlackUserDetails:
    """
    Convert one member entry of a Slack user listing into SlackUserDetails.

    ``user`` must contain "id", "name" and "team_id" (a KeyError is raised
    otherwise); every other field is optional and read from either the member
    entry itself or its nested "profile" dictionary.
    """
    slack_profile = user.get("profile", {})

    return SlackUserDetails(
        slack_user_info=SlackUserInfo(
            # Identity
            slackInstance=slack_instance.to_platform_instance_urn(),
            id=user["id"],
            name=user["name"],
            realName=user.get("real_name", ""),
            displayName=slack_profile.get("display_name", ""),
            email=slack_profile.get("email"),
            teamId=user["team_id"],
            # Flags; each defaults to False when Slack omits it
            isDeleted=user.get("deleted", False),
            isAdmin=user.get("is_admin", False),
            isOwner=user.get("is_owner", False),
            isPrimaryOwner=user.get("is_primary_owner", False),
            isBot=user.get("is_bot", False),
            # Locale
            timezone=user.get("tz"),
            timezoneOffset=user.get("tz_offset"),
            # Profile details
            title=slack_profile.get("title"),
            phone=slack_profile.get("phone"),
            # Using 192px image as an example
            profilePictureUrl=slack_profile.get("image_192"),
            statusText=slack_profile.get("status_text"),
            statusEmoji=slack_profile.get("status_emoji"),
            lastUpdatedSeconds=user.get("updated"),
        )
    )
|
|
307
|
+
|
|
127
308
|
def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
|
|
128
309
|
return [
|
|
129
310
|
*super().get_workunit_processors(),
|
|
@@ -143,46 +324,108 @@ class SlackSource(StatefulIngestionSourceBase):
|
|
|
143
324
|
logger.info(auth_resp.data)
|
|
144
325
|
if self.config.ingest_public_channels:
|
|
145
326
|
yield from self.get_public_channels()
|
|
146
|
-
if self.config.enrich_user_metadata:
|
|
327
|
+
if self.config.enrich_user_metadata or self.config.ingest_users:
|
|
147
328
|
yield from self.get_user_info()
|
|
148
329
|
|
|
330
|
+
def _get_datahub_user_info(
    self,
) -> Dict[str, Tuple[CorpUser, Optional[CorpUserEditableInfoClass]]]:
    """
    Index the DataHub users returned by ``get_user_to_be_updated`` by email.

    Each value is the (user, editable properties) pair yielded for that
    email. When several users share an email, the last one yielded wins.
    """
    users_by_email: Dict[
        str, Tuple[CorpUser, Optional[CorpUserEditableInfoClass]]
    ] = {}
    for corp_user, editable_info in self.get_user_to_be_updated():
        # get_user_to_be_updated ensures that the email field is not None;
        # an empty email is skipped here as well.
        if not corp_user.email:
            continue
        users_by_email[corp_user.email] = (corp_user, editable_info)
    return users_by_email
|
|
340
|
+
|
|
149
341
|
def get_user_info(self) -> Iterable[MetadataWorkUnit]:
    """
    Emit work units for the Slack workspace and its users.

    Steps:
      1. Fetch the team information and emit the platform instance aspects.
         If the team cannot be fetched, a failure is reported and nothing
         else is emitted.
      2. Page through the Slack user listing. For every member:
         - when ``ingest_users`` is enabled, emit the member as a platform
           resource;
         - when ``enrich_user_metadata`` is enabled and the member's email
           matches a DataHub user, emit an updated CorpUserEditableInfo
           (only if something changed), the SlackUserInfo aspect on the
           corp user, and the corp user's Slack notification settings.

    If a user-listing call comes back with ``ok`` false, a failure is
    reported and iteration stops.
    """
    # Get team information to populate for all users
    slack_instance: Optional[SlackInstance] = None
    with self.rate_limiter:
        team_response = self.get_slack_client().team_info()
    if team_response and "team" in team_response:
        team_info = team_response["team"]
        slack_instance = SlackInstance(id=team_info.get("id"))
        slack_instance = slack_instance.with_slack_team_info(team_info)

    if slack_instance is None:
        # Without the team there is no platform instance to attach users to.
        # Report and stop instead of tripping an assertion afterwards.
        logger.error("Failed to fetch team information")
        self.report.report_failure(
            "team_info", "Failed to fetch team information for users"
        )
        return

    for mcp in slack_instance.to_mcps():
        yield mcp.as_workunit()

    # Fetch all DataHub users that need to be updated
    datahub_users: Dict[
        str, Tuple[CorpUser, Optional[CorpUserEditableInfoClass]]
    ] = (
        self._get_datahub_user_info() if self.config.enrich_user_metadata else {}
    )

    cursor = None
    while True:
        with self.rate_limiter:
            response = self.get_slack_client().users_list(cursor=cursor)
        assert isinstance(response.data, dict)
        if not response.data["ok"]:
            self.report.report_failure("users", "Failed to fetch users")
            return

        # No graph assertion here: the helpers that actually touch the graph
        # (get_user_to_be_updated, emit_corp_user_slack_settings) assert it
        # themselves, and ingest-only runs do not need a graph at all.
        for user in response.data["members"]:
            # Query all slack users and ingest them into the generic
            # slackMember aspect
            slack_user_details: SlackUserDetails = (
                self.populate_slack_member_from_response(user, slack_instance)
            )
            if self.config.ingest_users:
                for mcp in slack_user_details.to_mcps():
                    yield mcp.as_workunit()

            platform_resource_urn = slack_user_details.get_resource_urn()
            # If user is in DataHub, compute and emit CorpUserEditableInfo
            # aspect. This code will be removed once we have server side
            # processing of raw slackMember aspects. This code path can also
            # be turned off by setting enrich_user_metadata to False.
            # A member without a "profile" entry is treated as having no email.
            slack_user_profile = user.get("profile", {})
            user_obj_props_tuple = datahub_users.get(
                slack_user_profile.get("email")
            )
            if user_obj_props_tuple is None:
                # User is not in DataHub or enrichment is disabled
                continue

            user_obj, editable_properties = user_obj_props_tuple
            user_obj.slack_id = user.get("id")
            user_obj.title = slack_user_profile.get("title")
            user_obj.image_url = slack_user_profile.get("image_192")
            user_obj.phone = slack_user_profile.get("phone")
            user_obj.real_name = slack_user_profile.get("real_name")
            user_obj.slack_display_name = slack_user_profile.get("display_name")

            corpuser_editable_info = editable_properties or (
                CorpUserEditableInfoClass()
            )
            emittable_corpuser_editable_info = self.populate_corpuser_editable_info(
                corpuser_editable_info,
                user_obj,
                platform_resource_urn=platform_resource_urn,
                slack_instance=slack_instance,
            )
            if emittable_corpuser_editable_info:
                yield MetadataChangeProposalWrapper(
                    entityUrn=user_obj.urn, aspect=emittable_corpuser_editable_info
                ).as_workunit()
                # if we update corpusereditable info, we also update
                # slackuserinfo. This will be removed once we have server
                # side processing of raw slackMember aspects.
                yield MetadataChangeProposalWrapper(
                    entityUrn=user_obj.urn,
                    aspect=slack_user_details.slack_user_info,
                ).as_workunit()
            yield from self.emit_corp_user_slack_settings(user_obj)

        # A missing or empty cursor means the last page was reached. Reading it
        # with .get avoids turning a missing value into the truthy string
        # "None", which would loop forever.
        response_metadata = response.data.get("response_metadata") or {}
        cursor = response_metadata.get("next_cursor")
        if not cursor:
            break
|
|
428
|
+
|
|
186
429
|
def _get_channel_info(
|
|
187
430
|
self, cursor: Optional[str]
|
|
188
431
|
) -> Tuple[List[MetadataWorkUnit], Optional[str]]:
|
|
@@ -259,6 +502,58 @@ class SlackSource(StatefulIngestionSourceBase):
|
|
|
259
502
|
cursor = str(response.data["response_metadata"]["next_cursor"])
|
|
260
503
|
return result_channels, cursor
|
|
261
504
|
|
|
505
|
+
def populate_corpuser_editable_info(
    self,
    corpuser_editable_info: CorpUserEditableInfoClass,
    user_obj: CorpUser,
    platform_resource_urn: str,
    slack_instance: SlackInstance,
) -> Optional[CorpUserEditableInfoClass]:
    """
    Fill gaps in a CorpUserEditableInfo aspect with data taken from Slack.

    The aspect is modified in place. It is returned when at least one field
    was changed, in which case ``platform_resource_urn`` is also recorded in
    its ``informationSources``; when nothing had to change, None is returned.

    ``slack_instance`` is accepted but not used by this method.
    """
    info = corpuser_editable_info
    changed = False

    # Plain fields: Slack only supplies a value when DataHub has none.
    if user_obj.email and not info.email:
        info.email = user_obj.email
        changed = True
    if user_obj.slack_id and not info.slack:
        info.slack = user_obj.slack_id
        changed = True
    if user_obj.title and not info.title:
        info.title = user_obj.title
        changed = True

    # Picture: replace a missing/default picture, or refresh a Slack-hosted
    # picture that differs from the current Slack one.
    if user_obj.image_url:
        picture_is_replaceable = is_picture_default_or_missing(info.pictureLink) or (
            is_slack_image(info.pictureLink)
            and user_obj.image_url != info.pictureLink
        )
        if picture_is_replaceable:
            info.pictureLink = user_obj.image_url
            changed = True

    if user_obj.phone and not info.phone:
        info.phone = user_obj.phone
        changed = True

    # Display name: use the real name when none is set, or when the display
    # name is merely the email address (compared against the email as it
    # stands after the update above).
    if user_obj.real_name and (
        not info.displayName or info.displayName == info.email
    ):
        info.displayName = user_obj.real_name
        changed = True

    if not changed:
        return None

    # update informationSources
    sources = info.informationSources or []
    info.informationSources = sources
    if platform_resource_urn not in sources:
        sources.append(platform_resource_urn)
    return info
|
|
556
|
+
|
|
262
557
|
def get_public_channels(self) -> Iterable[MetadataWorkUnit]:
|
|
263
558
|
cursor = None
|
|
264
559
|
while True:
|
|
@@ -270,58 +565,80 @@ class SlackSource(StatefulIngestionSourceBase):
|
|
|
270
565
|
if not cursor:
|
|
271
566
|
break
|
|
272
567
|
|
|
273
|
-
def
|
|
274
|
-
|
|
568
|
+
def emit_slack_member_aspect(
    self, user: SlackUserInfo
) -> Iterable[MetadataWorkUnit]:
    """Yield one work unit for each change proposal SlackUserDetails produces for ``user``."""
    yield from (
        proposal.as_workunit()
        for proposal in SlackUserDetails(slack_user_info=user).to_mcps()
    )
|
|
574
|
+
|
|
575
|
+
def emit_corp_user_slack_settings(
    self, user_obj: CorpUser
) -> Iterable[MetadataWorkUnit]:
    """
    Set the Slack user handle in a corp user's notification settings.

    Nothing is emitted when the user has no URN, when the user has no
    CorpUserSettings aspect yet, or when a Slack user handle is already
    configured. Otherwise the settings are updated with ``user_obj.slack_id``
    as the handle and emitted as a single work unit.
    """
    assert self.ctx.graph is not None

    if not user_obj.urn:
        return

    corp_user_settings = self.ctx.graph.get_aspect(
        user_obj.urn, CorpUserSettingsClass
    )
    if not corp_user_settings:
        return

    existing = corp_user_settings.notificationSettings
    # An existing handle is never overwritten.
    if existing and existing.slackSettings and existing.slackSettings.userHandle:
        return

    slack_settings = SlackNotificationSettingsClass(userHandle=user_obj.slack_id)
    if existing:
        existing.slackSettings = slack_settings
    else:
        corp_user_settings.notificationSettings = NotificationSettingsClass(
            sinkTypes=[],
            slackSettings=slack_settings,
        )

    proposal = MetadataChangeProposalWrapper(
        entityUrn=user_obj.urn,
        aspect=corp_user_settings,
    )
    yield MetadataWorkUnit(id=f"{user_obj.urn}", mcp=proposal)
|
|
615
|
+
|
|
616
|
+
def get_user_to_be_updated(
    self,
) -> Iterable[Tuple[CorpUser, Optional[CorpUserEditableInfoClass]]]:
    """
    Yield every DataHub corp user for which an email address can be determined.

    The email comes from the user's CorpUserEditableInfo aspect when it has
    one; otherwise the id part of the URN is used if it contains "@". Users
    without an email are skipped. Each item is the CorpUser (with ``urn`` and
    ``email`` set) together with the editable info aspect, which may be None.
    """
    graph = self.ctx.graph
    assert graph is not None

    for urn in graph.get_urns_by_filter(entity_types=["corpuser"], query="*"):
        user_obj = CorpUser(urn=urn)
        editable_properties = graph.get_aspect(urn, CorpUserEditableInfoClass)

        if editable_properties and editable_properties.email:
            user_obj.email = editable_properties.email
        else:
            # Fall back to the URN id when it looks like an email address.
            urn_id = Urn.from_string(user_obj.urn).get_entity_id_as_string()
            if "@" in urn_id:
                user_obj.email = urn_id

        if user_obj.email is None:
            continue
        yield (user_obj, editable_properties)
|
|
319
636
|
|
|
320
637
|
@retry(
|
|
321
638
|
wait=wait_exponential(multiplier=2, min=4, max=60),
|
|
322
639
|
before_sleep=before_sleep_log(logger, logging.ERROR, True),
|
|
323
640
|
)
|
|
324
|
-
def
|
|
641
|
+
def get_user_to_be_updated_oss(self) -> Iterable[CorpUser]:
|
|
325
642
|
graphql_query = textwrap.dedent(
|
|
326
643
|
"""
|
|
327
644
|
query listUsers($input: ListUsersInput!) {
|
|
@@ -100,7 +100,15 @@ class SnowflakeFilterConfig(SQLFilterConfig):
|
|
|
100
100
|
|
|
101
101
|
stream_pattern: AllowDenyPattern = Field(
|
|
102
102
|
default=AllowDenyPattern.allow_all(),
|
|
103
|
-
description="Regex patterns for streams to filter in ingestion.
|
|
103
|
+
description="Regex patterns for streams to filter in ingestion. Specify regex to match the entire view name in database.schema.view format. e.g. to match all views starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
procedure_pattern: AllowDenyPattern = Field(
|
|
107
|
+
default=AllowDenyPattern.allow_all(),
|
|
108
|
+
description="Regex patterns for procedures to filter in ingestion. "
|
|
109
|
+
"Specify regex to match the entire procedure name in database.schema.procedure format. "
|
|
110
|
+
"e.g. to match all procedures starting with customer in Customer database and public schema,"
|
|
111
|
+
" use the regex 'Customer.public.customer.*'",
|
|
104
112
|
)
|
|
105
113
|
|
|
106
114
|
match_fully_qualified_names: bool = Field(
|
|
@@ -284,6 +292,11 @@ class SnowflakeV2Config(
|
|
|
284
292
|
description="If enabled, streams will be ingested as separate entities from tables/views.",
|
|
285
293
|
)
|
|
286
294
|
|
|
295
|
+
include_procedures: bool = Field(
|
|
296
|
+
default=True,
|
|
297
|
+
description="If enabled, procedures will be ingested as pipelines/tasks.",
|
|
298
|
+
)
|
|
299
|
+
|
|
287
300
|
structured_property_pattern: AllowDenyPattern = Field(
|
|
288
301
|
default=AllowDenyPattern.allow_all(),
|
|
289
302
|
description=(
|
|
@@ -403,6 +403,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
403
403
|
res["session_id"],
|
|
404
404
|
res["query_start_time"],
|
|
405
405
|
object_modified_by_ddl,
|
|
406
|
+
res["query_type"],
|
|
406
407
|
)
|
|
407
408
|
if known_ddl_entry:
|
|
408
409
|
return known_ddl_entry
|
|
@@ -537,40 +538,42 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
537
538
|
session_id: str,
|
|
538
539
|
timestamp: datetime,
|
|
539
540
|
object_modified_by_ddl: dict,
|
|
541
|
+
query_type: str,
|
|
540
542
|
) -> Optional[Union[TableRename, TableSwap]]:
|
|
541
543
|
timestamp = timestamp.astimezone(timezone.utc)
|
|
542
|
-
if
|
|
543
|
-
"operationType"
|
|
544
|
-
|
|
545
|
-
|
|
544
|
+
if (
|
|
545
|
+
object_modified_by_ddl["operationType"] == "ALTER"
|
|
546
|
+
and query_type == "RENAME_TABLE"
|
|
547
|
+
and object_modified_by_ddl["properties"].get("objectName")
|
|
548
|
+
):
|
|
549
|
+
original_un = self.identifiers.gen_dataset_urn(
|
|
546
550
|
self.identifiers.get_dataset_identifier_from_qualified_name(
|
|
547
551
|
object_modified_by_ddl["objectName"]
|
|
548
552
|
)
|
|
549
553
|
)
|
|
550
554
|
|
|
551
|
-
|
|
555
|
+
new_urn = self.identifiers.gen_dataset_urn(
|
|
552
556
|
self.identifiers.get_dataset_identifier_from_qualified_name(
|
|
553
|
-
object_modified_by_ddl["properties"]["
|
|
557
|
+
object_modified_by_ddl["properties"]["objectName"]["value"]
|
|
554
558
|
)
|
|
555
559
|
)
|
|
556
|
-
|
|
557
|
-
return TableSwap(urn1, urn2, query, session_id, timestamp)
|
|
560
|
+
return TableRename(original_un, new_urn, query, session_id, timestamp)
|
|
558
561
|
elif object_modified_by_ddl[
|
|
559
562
|
"operationType"
|
|
560
|
-
] == "
|
|
561
|
-
|
|
563
|
+
] == "ALTER" and object_modified_by_ddl["properties"].get("swapTargetName"):
|
|
564
|
+
urn1 = self.identifiers.gen_dataset_urn(
|
|
562
565
|
self.identifiers.get_dataset_identifier_from_qualified_name(
|
|
563
566
|
object_modified_by_ddl["objectName"]
|
|
564
567
|
)
|
|
565
568
|
)
|
|
566
569
|
|
|
567
|
-
|
|
570
|
+
urn2 = self.identifiers.gen_dataset_urn(
|
|
568
571
|
self.identifiers.get_dataset_identifier_from_qualified_name(
|
|
569
|
-
object_modified_by_ddl["properties"]["
|
|
572
|
+
object_modified_by_ddl["properties"]["swapTargetName"]["value"]
|
|
570
573
|
)
|
|
571
574
|
)
|
|
572
575
|
|
|
573
|
-
return
|
|
576
|
+
return TableSwap(urn1, urn2, query, session_id, timestamp)
|
|
574
577
|
else:
|
|
575
578
|
self.report.num_ddl_queries_dropped += 1
|
|
576
579
|
return None
|