acryl-datahub 1.0.0.1rc1__py3-none-any.whl → 1.0.0.1rc3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (78)
  1. {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/METADATA +2575 -2574
  2. {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/RECORD +77 -60
  3. {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/entry_points.txt +2 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +1 -28
  7. datahub/emitter/request_helper.py +19 -14
  8. datahub/emitter/rest_emitter.py +2 -2
  9. datahub/ingestion/api/source.py +6 -2
  10. datahub/ingestion/api/source_helpers.py +6 -2
  11. datahub/ingestion/extractor/schema_util.py +1 -0
  12. datahub/ingestion/graph/client.py +6 -11
  13. datahub/ingestion/source/common/data_platforms.py +23 -0
  14. datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
  15. datahub/ingestion/source/common/subtypes.py +16 -1
  16. datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
  17. datahub/ingestion/source/dbt/dbt_common.py +6 -4
  18. datahub/ingestion/source/dbt/dbt_core.py +4 -6
  19. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  20. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  21. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  22. datahub/ingestion/source/dremio/dremio_source.py +96 -117
  23. datahub/ingestion/source/hex/__init__.py +0 -0
  24. datahub/ingestion/source/hex/api.py +394 -0
  25. datahub/ingestion/source/hex/constants.py +3 -0
  26. datahub/ingestion/source/hex/hex.py +167 -0
  27. datahub/ingestion/source/hex/mapper.py +372 -0
  28. datahub/ingestion/source/hex/model.py +68 -0
  29. datahub/ingestion/source/iceberg/iceberg.py +62 -66
  30. datahub/ingestion/source/mlflow.py +217 -8
  31. datahub/ingestion/source/mode.py +11 -1
  32. datahub/ingestion/source/openapi.py +69 -34
  33. datahub/ingestion/source/powerbi/powerbi.py +29 -23
  34. datahub/ingestion/source/s3/source.py +11 -0
  35. datahub/ingestion/source/slack/slack.py +399 -82
  36. datahub/ingestion/source/snowflake/constants.py +1 -0
  37. datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
  38. datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
  39. datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
  40. datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
  41. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
  42. datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
  43. datahub/ingestion/source/sql/mssql/job_models.py +15 -1
  44. datahub/ingestion/source/sql/mssql/source.py +8 -4
  45. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  46. datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
  47. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
  48. datahub/ingestion/source/superset.py +15 -6
  49. datahub/ingestion/source/vertexai/__init__.py +0 -0
  50. datahub/ingestion/source/vertexai/vertexai.py +1055 -0
  51. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  52. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
  53. datahub/metadata/_schema_classes.py +472 -1
  54. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  55. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  56. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  57. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  58. datahub/metadata/schema.avsc +309 -0
  59. datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
  60. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  61. datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
  62. datahub/metadata/schemas/Deprecation.avsc +2 -0
  63. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  64. datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
  65. datahub/metadata/schemas/QueryProperties.avsc +20 -0
  66. datahub/metadata/schemas/Siblings.avsc +2 -0
  67. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  68. datahub/sdk/dataset.py +122 -0
  69. datahub/sdk/entity.py +99 -3
  70. datahub/sdk/entity_client.py +27 -3
  71. datahub/sdk/main_client.py +22 -0
  72. datahub/sdk/search_filters.py +4 -4
  73. datahub/sql_parsing/split_statements.py +5 -1
  74. datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
  75. datahub/sql_parsing/tool_meta_extractor.py +27 -2
  76. datahub/ingestion/source/vertexai.py +0 -695
  77. {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info/licenses}/LICENSE +0 -0
  78. {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,16 @@
1
+ import json
1
2
  import logging
2
3
  import textwrap
3
4
  from dataclasses import dataclass
4
- from typing import Iterable, List, Optional, Tuple
5
+ from typing import Any, Dict, Iterable, List, Optional, Tuple
5
6
 
6
- from pydantic import Field, SecretStr
7
+ from pydantic import BaseModel, Field, SecretStr
7
8
  from slack_sdk import WebClient
8
9
  from tenacity import retry, wait_exponential
9
10
  from tenacity.before_sleep import before_sleep_log
10
11
 
11
12
  import datahub.emitter.mce_builder as builder
13
+ from datahub.emitter.mce_builder import datahub_guid, make_dataplatform_instance_urn
12
14
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
13
15
  from datahub.ingestion.api.common import PipelineContext
14
16
  from datahub.ingestion.api.decorators import (
@@ -32,16 +34,153 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
32
34
  )
33
35
  from datahub.metadata.schema_classes import (
34
36
  CorpUserEditableInfoClass,
37
+ CorpUserSettingsClass,
38
+ DataPlatformInstanceClass,
39
+ DataPlatformInstancePropertiesClass,
35
40
  DatasetPropertiesClass,
36
41
  DeprecationClass,
42
+ NotificationSettingsClass,
43
+ PlatformResourceInfoClass,
44
+ SerializedValueClass,
45
+ SerializedValueContentTypeClass,
46
+ SerializedValueSchemaTypeClass,
47
+ SlackNotificationSettingsClass,
48
+ SlackUserInfoClass as SlackUserInfo,
49
+ StatusClass,
37
50
  SubTypesClass,
51
+ _Aspect,
38
52
  )
39
53
  from datahub.utilities.ratelimiter import RateLimiter
54
+ from datahub.utilities.str_enum import StrEnum
40
55
  from datahub.utilities.urns.urn import Urn
41
56
 
42
57
  logger: logging.Logger = logging.getLogger(__name__)
43
58
 
44
59
 
60
+ # TODO: Relocate this function to a utility module
61
+ def is_picture_default_or_missing(picture_link: Optional[str]) -> bool:
62
+ if not picture_link:
63
+ return True
64
+ return picture_link.endswith("default_avatar.png")
65
+
66
+
67
+ def is_slack_image(picture_link: Optional[str]) -> bool:
68
+ """
69
+ Guesses if the picture link is a slack image.
70
+ """
71
+ if not picture_link:
72
+ return False
73
+ return "slack-edge.com" in picture_link
74
+
75
+
76
+ class ResourceType(StrEnum):
77
+ USER_INFO = "user-info"
78
+ CHANNEL_INFO = "channel-info"
79
+
80
+
81
+ class SlackInstance(BaseModel):
82
+ id: str
83
+ name: Optional[str] = None
84
+ description: Optional[str] = None
85
+ external_url: Optional[str] = None
86
+ custom_properties: Optional[Dict[str, str]] = None
87
+
88
+ def to_platform_instance_urn(self) -> str:
89
+ return make_dataplatform_instance_urn(
90
+ platform=DATA_PLATFORM_SLACK_URN, instance=self.id
91
+ )
92
+
93
+ def with_slack_team_info(self, team_info: dict) -> "SlackInstance":
94
+ """
95
+ team_info looks like this
96
+ {'id': 'T22BUCL1LKW', 'name': 'DataHub', 'url': 'https://datahubspace.slack.com/', 'domain': 'datahub', 'email_domain': '', 'icon': {'image_default': False, 'image_34': 'https://avatars.slack-edge.com/2021-07-05/2228585180071_63e6f300a919abc488bb_34.png', 'image_44': 'https://avatars.slack-edge.com/2021-07-05/2228585180071_63e6f300a919abc488bb_44.png', 'image_68': 'https://avatars.slack-edge.com/2021-07-05/2228585180071_63e6f300a919abc488bb_68.png', 'image_88': 'https://avatars.slack-edge.com/2021-07-05/2228585180071_63e6f300a919abc488bb_88.png', 'image_102': 'https://avatars.slack-edge.com/2021-07-05/2228585180071_63e6f300a919abc488bb_102.png', 'image_230': 'https://avatars.slack-edge.com/2021-07-05/2228585180071_63e6f300a919abc488bb_230.png', 'image_132': 'https://avatars.slack-edge.com/2021-07-05/2228585180071_63e6f300a919abc488bb_132.png'}, 'avatar_base_url': 'https://ca.slack-edge.com/', 'is_verified': False, 'external_org_migrations': {'date_updated': 1722672564, 'current': []}, 'discoverable': 'closed', 'enterprise_id': 'E06TPM5T1G9', 'enterprise_name': 'DataHub', 'enterprise_domain': 'datahubspace', 'lob_sales_home_enabled': False}
97
+ """
98
+ self.name = team_info.get("name")
99
+ self.description = team_info.get("name")
100
+ self.external_url = team_info.get("url")
101
+ self.custom_properties = {
102
+ k: v
103
+ for k, v in {
104
+ "domain": team_info.get("domain"),
105
+ "enterprise_id": team_info.get("enterprise_id"),
106
+ "enterprise_name": team_info.get("enterprise_name"),
107
+ "enterprise_domain": team_info.get("enterprise_domain"),
108
+ "icon": team_info.get("icon", {}).get("image_102"),
109
+ }.items()
110
+ if v is not None
111
+ }
112
+ return self
113
+
114
+ def to_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
115
+ return [
116
+ MetadataChangeProposalWrapper(
117
+ entityUrn=self.to_platform_instance_urn(),
118
+ aspect=DataPlatformInstancePropertiesClass(
119
+ name=self.name or self.id,
120
+ description=self.description,
121
+ externalUrl=self.external_url or None,
122
+ customProperties=self.custom_properties or {},
123
+ ),
124
+ )
125
+ ]
126
+
127
+
128
+ def to_serialized_value(value: _Aspect) -> SerializedValueClass:
129
+ # HACK: we remove the .pegasus2avro from the schema type since we want to refer to
130
+ # the original pdl type
131
+ schema_type = value.RECORD_SCHEMA.fullname.replace(".pegasus2avro", "")
132
+ serialized_value = SerializedValueClass(
133
+ blob=json.dumps(value.to_obj()).encode("utf-8"),
134
+ contentType=SerializedValueContentTypeClass.JSON,
135
+ schemaType=SerializedValueSchemaTypeClass.PEGASUS,
136
+ schemaRef=schema_type,
137
+ )
138
+ return serialized_value
139
+
140
+
141
+ class SlackUserDetails:
142
+ def __init__(self, slack_user_info: SlackUserInfo):
143
+ self.slack_user_info = slack_user_info
144
+
145
+ def to_guid(self) -> str:
146
+ """
147
+ A slack user is uniquely identified by the combination of their id and teamId.
148
+ """
149
+ return datahub_guid(
150
+ {"id": self.slack_user_info.id, "dpi": self.slack_user_info.teamId}
151
+ )
152
+
153
+ def get_resource_urn(self) -> str:
154
+ return f"urn:li:platformResource:{self.to_guid()}"
155
+
156
+ def to_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
157
+ resource_urn = self.get_resource_urn()
158
+
159
+ dpi = DataPlatformInstanceClass(
160
+ platform=DATA_PLATFORM_SLACK_URN,
161
+ instance=self.slack_user_info.slackInstance,
162
+ )
163
+
164
+ secondary_keys = []
165
+ if self.slack_user_info.email:
166
+ secondary_keys.append(self.slack_user_info.email)
167
+
168
+ resource_info = PlatformResourceInfoClass(
169
+ resourceType=ResourceType.USER_INFO.value,
170
+ value=to_serialized_value(self.slack_user_info),
171
+ primaryKey=self.slack_user_info.id,
172
+ secondaryKeys=secondary_keys,
173
+ )
174
+
175
+ status = StatusClass(
176
+ removed=self.slack_user_info.isDeleted,
177
+ )
178
+
179
+ yield from MetadataChangeProposalWrapper.construct_many(
180
+ resource_urn, aspects=[dpi, resource_info, status]
181
+ )
182
+
183
+
45
184
  @dataclass
46
185
  class CorpUser:
47
186
  urn: Optional[str] = None
@@ -52,18 +191,26 @@ class CorpUser:
52
191
  phone: Optional[str] = None
53
192
  real_name: Optional[str] = None
54
193
  slack_display_name: Optional[str] = None
194
+ team_id: Optional[str] = None
195
+ team_domain: Optional[str] = None
196
+ is_team_enterprise: Optional[bool] = None
55
197
 
56
198
 
57
199
  class SlackSourceConfig(
58
200
  StatefulIngestionConfigBase,
59
201
  ):
60
202
  bot_token: SecretStr = Field(
61
- description="Bot token for the Slack workspace. Needs `users:read`, `users:read.email` and `users.profile:read` scopes.",
203
+ description="Bot token for the Slack workspace. Needs `users:read`, `users:read.email`, `users.profile:read`, and `team:read` scopes.",
62
204
  )
63
205
  enrich_user_metadata: bool = Field(
64
206
  type=bool,
65
207
  default=True,
66
- description="Whether to enrich user metadata.",
208
+ description="When enabled, will enrich provisioned DataHub users' metadata with information from Slack.",
209
+ )
210
+ ingest_users: bool = Field(
211
+ type=bool,
212
+ default=True,
213
+ description="Whether to ingest users. When set to true, will ingest all users in the Slack workspace (as platform resources) to simplify user enrichment after they are provisioned on DataHub.",
67
214
  )
68
215
  api_requests_per_min: int = Field(
69
216
  type=int,
@@ -96,9 +243,11 @@ class SlackSourceConfig(
96
243
  class SlackSourceReport(StaleEntityRemovalSourceReport):
97
244
  channels_reported: int = 0
98
245
  archived_channels_reported: int = 0
246
+ users_reported: int = 0
99
247
 
100
248
 
101
249
  PLATFORM_NAME = "slack"
250
+ DATA_PLATFORM_SLACK_URN: str = builder.make_data_platform_urn(PLATFORM_NAME)
102
251
 
103
252
 
104
253
  @platform_name("Slack")
@@ -124,6 +273,38 @@ class SlackSource(StatefulIngestionSourceBase):
124
273
  def get_slack_client(self) -> WebClient:
125
274
  return WebClient(token=self.config.bot_token.get_secret_value())
126
275
 
276
+ @staticmethod
277
+ def populate_slack_member_from_response(
278
+ user: Dict[str, Any], slack_instance: SlackInstance
279
+ ) -> SlackUserDetails:
280
+ profile = user.get("profile", {})
281
+
282
+ user_info = SlackUserInfo(
283
+ slackInstance=slack_instance.to_platform_instance_urn(),
284
+ id=user["id"],
285
+ name=user["name"],
286
+ realName=user.get("real_name", ""),
287
+ displayName=profile.get("display_name", ""),
288
+ email=profile.get("email"),
289
+ teamId=user["team_id"],
290
+ isDeleted=user.get("deleted", False),
291
+ isAdmin=user.get("is_admin", False),
292
+ isOwner=user.get("is_owner", False),
293
+ isPrimaryOwner=user.get("is_primary_owner", False),
294
+ isBot=user.get("is_bot", False),
295
+ timezone=user.get("tz"),
296
+ timezoneOffset=user.get("tz_offset"),
297
+ title=profile.get("title"),
298
+ phone=profile.get("phone"),
299
+ profilePictureUrl=profile.get(
300
+ "image_192"
301
+ ), # Using 192px image as an example
302
+ statusText=profile.get("status_text"),
303
+ statusEmoji=profile.get("status_emoji"),
304
+ lastUpdatedSeconds=user.get("updated"),
305
+ )
306
+ return SlackUserDetails(slack_user_info=user_info)
307
+
127
308
  def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
128
309
  return [
129
310
  *super().get_workunit_processors(),
@@ -143,46 +324,108 @@ class SlackSource(StatefulIngestionSourceBase):
143
324
  logger.info(auth_resp.data)
144
325
  if self.config.ingest_public_channels:
145
326
  yield from self.get_public_channels()
146
- if self.config.enrich_user_metadata:
327
+ if self.config.enrich_user_metadata or self.config.ingest_users:
147
328
  yield from self.get_user_info()
148
329
 
330
+ def _get_datahub_user_info(
331
+ self,
332
+ ) -> Dict[str, Tuple[CorpUser, Optional[CorpUserEditableInfoClass]]]:
333
+ # get_user_to_be_updated ensures that the email field is not None
334
+ users = {
335
+ user_obj.email: (user_obj, editable_properties)
336
+ for user_obj, editable_properties in self.get_user_to_be_updated()
337
+ if user_obj.email
338
+ }
339
+ return users
340
+
149
341
  def get_user_info(self) -> Iterable[MetadataWorkUnit]:
150
- assert self.ctx.graph is not None
151
- for user_obj in self.get_user_to_be_updated():
152
- self.populate_slack_id_from_email(user_obj)
153
- if user_obj.slack_id is None:
154
- continue
155
- self.populate_user_profile(user_obj)
156
- if user_obj.urn is None:
157
- continue
158
- logger.info(f"User: {user_obj}")
159
- corpuser_editable_info = (
160
- self.ctx.graph.get_aspect(
161
- entity_urn=user_obj.urn, aspect_type=CorpUserEditableInfoClass
162
- )
163
- or CorpUserEditableInfoClass()
164
- )
165
- corpuser_editable_info.email = user_obj.email
166
- corpuser_editable_info.slack = user_obj.slack_id
167
- corpuser_editable_info.title = user_obj.title
168
- if user_obj.image_url:
169
- corpuser_editable_info.pictureLink = user_obj.image_url
170
- if user_obj.phone:
171
- corpuser_editable_info.phone = user_obj.phone
172
- if (
173
- not corpuser_editable_info.displayName
174
- or corpuser_editable_info.displayName == corpuser_editable_info.email
175
- ):
176
- # let's fill out a real name
177
- corpuser_editable_info.displayName = user_obj.real_name
178
- yield MetadataWorkUnit(
179
- id=f"{user_obj.urn}",
180
- mcp=MetadataChangeProposalWrapper(
181
- entityUrn=user_obj.urn,
182
- aspect=corpuser_editable_info,
183
- ),
342
+ # Get team information to populate for all users
343
+ slack_instance: Optional[SlackInstance] = None
344
+ with self.rate_limiter:
345
+ team_response = self.get_slack_client().team_info()
346
+ if team_response and "team" in team_response:
347
+ team_info = team_response["team"]
348
+ slack_instance = SlackInstance(id=team_info.get("id"))
349
+ slack_instance = slack_instance.with_slack_team_info(team_info)
350
+
351
+ if slack_instance:
352
+ for mcp in slack_instance.to_mcps():
353
+ yield mcp.as_workunit()
354
+ else:
355
+ logger.error("Failed to fetch team information")
356
+ self.report.report_failure(
357
+ "team_info", "Failed to fetch team information for users"
184
358
  )
185
359
 
360
+ assert slack_instance
361
+
362
+ # Fetch all DataHub users that need to be updated
363
+ if self.config.enrich_user_metadata:
364
+ datahub_users = self._get_datahub_user_info()
365
+ else:
366
+ datahub_users = {}
367
+ cursor = None
368
+ while True:
369
+ with self.rate_limiter:
370
+ response = self.get_slack_client().users_list(cursor=cursor)
371
+ assert isinstance(response.data, dict)
372
+ if not response.data["ok"]:
373
+ self.report.report_failure("users", "Failed to fetch users")
374
+ return
375
+
376
+ assert self.ctx.graph is not None
377
+ for user in response.data["members"]:
378
+ # Query all slack users and ingest them into the generic
379
+ # slackMember aspect
380
+ slack_user_details: SlackUserDetails = (
381
+ self.populate_slack_member_from_response(user, slack_instance)
382
+ )
383
+ if self.config.ingest_users:
384
+ for mcp in slack_user_details.to_mcps():
385
+ yield mcp.as_workunit()
386
+
387
+ platform_resource_urn = slack_user_details.get_resource_urn()
388
+ # If user is in DataHub, compute and emit CorpUserEditableInfo
389
+ # aspect. This code will be removed once we have server side
390
+ # processing of raw slackMember aspects. This code path can also
391
+ # be turned off by setting enrich_user_metadata to False.
392
+ user_obj_props_tuple = datahub_users.get(user["profile"].get("email"))
393
+ if user_obj_props_tuple is None:
394
+ # User is not in DataHub or enrichment is disabled
395
+ continue
396
+ user_obj, editable_properties = user_obj_props_tuple
397
+ slack_user_profile = user.get("profile", {})
398
+ user_obj.slack_id = user.get("id")
399
+ user_obj.title = slack_user_profile.get("title")
400
+ user_obj.image_url = slack_user_profile.get("image_192")
401
+ user_obj.phone = slack_user_profile.get("phone")
402
+ user_obj.real_name = slack_user_profile.get("real_name")
403
+ user_obj.slack_display_name = slack_user_profile.get("display_name")
404
+ corpuser_editable_info = editable_properties or (
405
+ CorpUserEditableInfoClass()
406
+ )
407
+ emittable_corpuser_editable_info = self.populate_corpuser_editable_info(
408
+ corpuser_editable_info,
409
+ user_obj,
410
+ platform_resource_urn=platform_resource_urn,
411
+ slack_instance=slack_instance,
412
+ )
413
+ if emittable_corpuser_editable_info:
414
+ yield MetadataChangeProposalWrapper(
415
+ entityUrn=user_obj.urn, aspect=emittable_corpuser_editable_info
416
+ ).as_workunit()
417
+ # if we update corpusereditable info, we also update
418
+ # slackuserinfo. This will be removed once we have server
419
+ # side processing of raw slackMember aspects.
420
+ yield MetadataChangeProposalWrapper(
421
+ entityUrn=user_obj.urn,
422
+ aspect=slack_user_details.slack_user_info,
423
+ ).as_workunit()
424
+ yield from self.emit_corp_user_slack_settings(user_obj)
425
+ cursor = str(response.data["response_metadata"]["next_cursor"])
426
+ if not cursor:
427
+ break
428
+
186
429
  def _get_channel_info(
187
430
  self, cursor: Optional[str]
188
431
  ) -> Tuple[List[MetadataWorkUnit], Optional[str]]:
@@ -259,6 +502,58 @@ class SlackSource(StatefulIngestionSourceBase):
259
502
  cursor = str(response.data["response_metadata"]["next_cursor"])
260
503
  return result_channels, cursor
261
504
 
505
+ def populate_corpuser_editable_info(
506
+ self,
507
+ corpuser_editable_info: CorpUserEditableInfoClass,
508
+ user_obj: CorpUser,
509
+ platform_resource_urn: str,
510
+ slack_instance: SlackInstance,
511
+ ) -> Optional[CorpUserEditableInfoClass]:
512
+ """
513
+ Populate CorpUserEditableInfo aspect with user information from Slack.
514
+ If changes are not required, None is returned.
515
+ If changes are required, the updated aspect is returned.
516
+ """
517
+ mutation_required = False
518
+ if not corpuser_editable_info.email and user_obj.email:
519
+ mutation_required = True
520
+ corpuser_editable_info.email = user_obj.email
521
+ if not corpuser_editable_info.slack and user_obj.slack_id:
522
+ mutation_required = True
523
+ corpuser_editable_info.slack = user_obj.slack_id
524
+ if not corpuser_editable_info.title and user_obj.title:
525
+ mutation_required = True
526
+ corpuser_editable_info.title = user_obj.title
527
+ if user_obj.image_url and (
528
+ is_picture_default_or_missing(corpuser_editable_info.pictureLink)
529
+ or (
530
+ is_slack_image(corpuser_editable_info.pictureLink)
531
+ and user_obj.image_url != corpuser_editable_info.pictureLink
532
+ )
533
+ ):
534
+ mutation_required = True
535
+ corpuser_editable_info.pictureLink = user_obj.image_url
536
+ if user_obj.phone and not corpuser_editable_info.phone:
537
+ mutation_required = True
538
+ corpuser_editable_info.phone = user_obj.phone
539
+ if (
540
+ not corpuser_editable_info.displayName
541
+ or corpuser_editable_info.displayName == corpuser_editable_info.email
542
+ ) and user_obj.real_name:
543
+ mutation_required = True
544
+ corpuser_editable_info.displayName = user_obj.real_name
545
+ if mutation_required:
546
+ # update informationSources
547
+ corpuser_editable_info.informationSources = (
548
+ []
549
+ if not corpuser_editable_info.informationSources
550
+ else corpuser_editable_info.informationSources
551
+ )
552
+ if platform_resource_urn not in corpuser_editable_info.informationSources:
553
+ corpuser_editable_info.informationSources.append(platform_resource_urn)
554
+ return corpuser_editable_info
555
+ return None
556
+
262
557
  def get_public_channels(self) -> Iterable[MetadataWorkUnit]:
263
558
  cursor = None
264
559
  while True:
@@ -270,58 +565,80 @@ class SlackSource(StatefulIngestionSourceBase):
270
565
  if not cursor:
271
566
  break
272
567
 
273
- def populate_user_profile(self, user_obj: CorpUser) -> None:
274
- if not user_obj.slack_id:
568
+ def emit_slack_member_aspect(
569
+ self, user: SlackUserInfo
570
+ ) -> Iterable[MetadataWorkUnit]:
571
+ slack_user = SlackUserDetails(slack_user_info=user)
572
+ for mcp in slack_user.to_mcps():
573
+ yield mcp.as_workunit()
574
+
575
+ def emit_corp_user_slack_settings(
576
+ self, user_obj: CorpUser
577
+ ) -> Iterable[MetadataWorkUnit]:
578
+ assert self.ctx.graph is not None
579
+
580
+ if not user_obj.urn:
275
581
  return
276
- try:
277
- # https://api.slack.com/methods/users.profile.get
278
- with self.rate_limiter:
279
- if self._use_users_info:
280
- user_profile_res = self.get_slack_client().users_info(
281
- user=user_obj.slack_id
282
- )
283
- user_profile_res = user_profile_res.get("user", {})
284
- else:
285
- user_profile_res = self.get_slack_client().users_profile_get(
286
- user=user_obj.slack_id
287
- )
288
- logger.debug(f"User profile: {user_profile_res}")
289
- user_profile = user_profile_res.get("profile", {})
290
- user_obj.title = user_profile.get("title")
291
- user_obj.image_url = user_profile.get("image_192")
292
- user_obj.phone = user_profile.get("phone")
293
- user_obj.real_name = user_profile.get("real_name")
294
- user_obj.slack_display_name = user_profile.get("display_name")
295
-
296
- except Exception as e:
297
- if "missing_scope" in str(e):
298
- if self._use_users_info:
299
- raise e
300
- self._use_users_info = True
301
- self.populate_user_profile(user_obj)
582
+
583
+ corp_user_settings = self.ctx.graph.get_aspect(
584
+ user_obj.urn, CorpUserSettingsClass
585
+ )
586
+ if not corp_user_settings:
302
587
  return
303
588
 
304
- def populate_slack_id_from_email(self, user_obj: CorpUser) -> None:
305
- if user_obj.email is None:
589
+ notification_settings = corp_user_settings.notificationSettings
590
+
591
+ if not notification_settings:
592
+ corp_user_settings.notificationSettings = NotificationSettingsClass(
593
+ sinkTypes=[],
594
+ slackSettings=SlackNotificationSettingsClass(
595
+ userHandle=user_obj.slack_id
596
+ ),
597
+ )
598
+ elif (
599
+ not notification_settings.slackSettings
600
+ or not notification_settings.slackSettings.userHandle
601
+ ):
602
+ notification_settings.slackSettings = SlackNotificationSettingsClass(
603
+ userHandle=user_obj.slack_id
604
+ )
605
+ else:
306
606
  return
307
- try:
308
- # https://api.slack.com/methods/users.lookupByEmail
309
- with self.rate_limiter:
310
- user_info_res = self.get_slack_client().users_lookupByEmail(
311
- email=user_obj.email
312
- )
313
- user_info = user_info_res.get("user", {})
314
- user_obj.slack_id = user_info.get("id")
315
- except Exception as e:
316
- if "users_not_found" in str(e):
317
- return
318
- raise e
607
+
608
+ yield MetadataWorkUnit(
609
+ id=f"{user_obj.urn}",
610
+ mcp=MetadataChangeProposalWrapper(
611
+ entityUrn=user_obj.urn,
612
+ aspect=corp_user_settings,
613
+ ),
614
+ )
615
+
616
+ def get_user_to_be_updated(
617
+ self,
618
+ ) -> Iterable[Tuple[CorpUser, Optional[CorpUserEditableInfoClass]]]:
619
+ assert self.ctx.graph is not None
620
+ for urn in self.ctx.graph.get_urns_by_filter(
621
+ entity_types=["corpuser"], query="*"
622
+ ):
623
+ user_obj = CorpUser()
624
+ user_obj.urn = urn
625
+ editable_properties = self.ctx.graph.get_aspect(
626
+ urn, CorpUserEditableInfoClass
627
+ )
628
+ if editable_properties and editable_properties.email:
629
+ user_obj.email = editable_properties.email
630
+ else:
631
+ urn_id = Urn.from_string(user_obj.urn).get_entity_id_as_string()
632
+ if "@" in urn_id:
633
+ user_obj.email = urn_id
634
+ if user_obj.email is not None:
635
+ yield (user_obj, editable_properties)
319
636
 
320
637
  @retry(
321
638
  wait=wait_exponential(multiplier=2, min=4, max=60),
322
639
  before_sleep=before_sleep_log(logger, logging.ERROR, True),
323
640
  )
324
- def get_user_to_be_updated(self) -> Iterable[CorpUser]:
641
+ def get_user_to_be_updated_oss(self) -> Iterable[CorpUser]:
325
642
  graphql_query = textwrap.dedent(
326
643
  """
327
644
  query listUsers($input: ListUsersInput!) {
@@ -54,6 +54,7 @@ class SnowflakeObjectDomain(StrEnum):
54
54
  COLUMN = "column"
55
55
  ICEBERG_TABLE = "iceberg table"
56
56
  STREAM = "stream"
57
+ PROCEDURE = "procedure"
57
58
 
58
59
 
59
60
  GENERIC_PERMISSION_ERROR_KEY = "permission-error"
@@ -100,7 +100,15 @@ class SnowflakeFilterConfig(SQLFilterConfig):
100
100
 
101
101
  stream_pattern: AllowDenyPattern = Field(
102
102
  default=AllowDenyPattern.allow_all(),
103
- description="Regex patterns for streams to filter in ingestion. Note: Defaults to table_pattern if not specified. Specify regex to match the entire view name in database.schema.view format. e.g. to match all views starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
103
+ description="Regex patterns for streams to filter in ingestion. Specify regex to match the entire view name in database.schema.view format. e.g. to match all views starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
104
+ )
105
+
106
+ procedure_pattern: AllowDenyPattern = Field(
107
+ default=AllowDenyPattern.allow_all(),
108
+ description="Regex patterns for procedures to filter in ingestion. "
109
+ "Specify regex to match the entire procedure name in database.schema.procedure format. "
110
+ "e.g. to match all procedures starting with customer in Customer database and public schema,"
111
+ " use the regex 'Customer.public.customer.*'",
104
112
  )
105
113
 
106
114
  match_fully_qualified_names: bool = Field(
@@ -284,6 +292,11 @@ class SnowflakeV2Config(
284
292
  description="If enabled, streams will be ingested as separate entities from tables/views.",
285
293
  )
286
294
 
295
+ include_procedures: bool = Field(
296
+ default=True,
297
+ description="If enabled, procedures will be ingested as pipelines/tasks.",
298
+ )
299
+
287
300
  structured_property_pattern: AllowDenyPattern = Field(
288
301
  default=AllowDenyPattern.allow_all(),
289
302
  description=(
@@ -164,6 +164,23 @@ class SnowflakeQuery:
164
164
  and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
165
165
  order by table_schema, table_name"""
166
166
 
167
+ @staticmethod
168
+ def procedures_for_database(db_name: Optional[str]) -> str:
169
+ db_clause = f'"{db_name}".' if db_name is not None else ""
170
+ return f"""
171
+ SELECT procedure_catalog AS "PROCEDURE_CATALOG",
172
+ procedure_schema AS "PROCEDURE_SCHEMA",
173
+ procedure_name AS "PROCEDURE_NAME",
174
+ procedure_language AS "PROCEDURE_LANGUAGE",
175
+ argument_signature AS "ARGUMENT_SIGNATURE",
176
+ data_type AS "PROCEDURE_RETURN_TYPE",
177
+ procedure_definition AS "PROCEDURE_DEFINITION",
178
+ created AS "CREATED",
179
+ last_altered AS "LAST_ALTERED",
180
+ comment AS "COMMENT"
181
+ FROM {db_clause}information_schema.procedures
182
+ order by procedure_schema, procedure_name"""
183
+
167
184
  @staticmethod
168
185
  def get_all_tags():
169
186
  return """
@@ -105,6 +105,7 @@ class SnowflakeV2Report(
105
105
  databases_scanned: int = 0
106
106
  tags_scanned: int = 0
107
107
  streams_scanned: int = 0
108
+ procedures_scanned: int = 0
108
109
 
109
110
  include_usage_stats: bool = False
110
111
  include_operational_stats: bool = False
@@ -163,6 +164,8 @@ class SnowflakeV2Report(
163
164
  self.tags_scanned += 1
164
165
  elif ent_type == "stream":
165
166
  self.streams_scanned += 1
167
+ elif ent_type == "procedure":
168
+ self.procedures_scanned += 1
166
169
  else:
167
170
  raise KeyError(f"Unknown entity {ent_type}.")
168
171