acryl-datahub 0.15.0.1rc9__py3-none-any.whl → 0.15.0.1rc11__py3-none-any.whl

This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their respective public registries.

Note: this release of acryl-datahub has been flagged as potentially problematic.

@@ -1,4 +1,4 @@
- datahub/__init__.py,sha256=2793dupxo-Ov36jB1Z_p3H61xA3Rxhr1VhzHSdVOKhY,576
+ datahub/__init__.py,sha256=Jz7rOAS6YmpfTiD-EFzoeXEIAehGfC59afdTNfvT33s,577
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -273,7 +273,7 @@ datahub/ingestion/source/datahub/datahub_source.py,sha256=2jDnsHEzpGhr00qQI9unSU
  datahub/ingestion/source/datahub/report.py,sha256=VHBfCbwFRzdLdB7hQG9ST4EiZxl_vBCU0XxGcZR6Xxs,940
  datahub/ingestion/source/datahub/state.py,sha256=PZoT7sSK1wadVf5vN6phrgr7I6LL7ePP-EJjP1OO0bQ,3507
  datahub/ingestion/source/dbt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/dbt/dbt_cloud.py,sha256=3bfcCi7xBvlCTGjnDCnyOShsxgVRn7wUYJOid_WT_Vk,17643
+ datahub/ingestion/source/dbt/dbt_cloud.py,sha256=Joh4AIjlu-UVJw_Hu32bPxT9w25RX4JfUnUhVpiJcJw,18005
  datahub/ingestion/source/dbt/dbt_common.py,sha256=0ddiqNx9sUAGZYDQ8tSr5Qh5ti-kgC4saW1yRRNJXgg,80493
  datahub/ingestion/source/dbt/dbt_core.py,sha256=m6cA9vVd4Nh2arc-T2_xeQoxvreRbMhTDIJuYsx3wHc,22722
  datahub/ingestion/source/dbt/dbt_tests.py,sha256=Q5KISW_AOOWqyxmyOgJQquyX7xlfOqKu9WhrHoLKC0M,9881
@@ -334,7 +334,7 @@ datahub/ingestion/source/looker/looker_common.py,sha256=KObx5ZTfldN2EO11eb1LrHI-
  datahub/ingestion/source/looker/looker_config.py,sha256=87WAgdJ_QWdTq25RBwgIqfc2kq7dubSpzbEtXb2ihMw,13182
  datahub/ingestion/source/looker/looker_connection.py,sha256=yDmC6lDsHmL2e_Pw8ULylwOIHPWPp_6gT1iyLvD0fTw,2075
  datahub/ingestion/source/looker/looker_constant.py,sha256=GMKYtNXlpojPxa9azridKfcGLSJwKdUCTesp7U8dIrQ,402
- datahub/ingestion/source/looker/looker_dataclasses.py,sha256=ULWLFWsV2cKmTuOFavD8QjEBmnXmvjyr8RbUB62DwJQ,12178
+ datahub/ingestion/source/looker/looker_dataclasses.py,sha256=LjrP5m_A4HV-XeFlSNGVYNuyF0ulxp_qwB82Ss4Iycs,12200
  datahub/ingestion/source/looker/looker_file_loader.py,sha256=c1ewDrIb9VJg1o-asbwX9gL83kgL01vIETzzbmZIhmw,4267
  datahub/ingestion/source/looker/looker_lib_wrapper.py,sha256=0gaYjBv4wkbbLWVgvaAV6JyWAFb0utTG6TCve2d9xss,11511
  datahub/ingestion/source/looker/looker_liquid_tag.py,sha256=mO4G4MNA4YZFvZaDBpdiJ2vP3irC82kY34RdaK4Pbfs,3100
@@ -370,7 +370,7 @@ datahub/ingestion/source/powerbi/m_query/tree_function.py,sha256=h77DunhlgOP0fAg
  datahub/ingestion/source/powerbi/m_query/validator.py,sha256=crG-VZy2XPieiDliP9yVMgiFcc8b2xbZyDFEATXqEAQ,1155
  datahub/ingestion/source/powerbi/rest_api_wrapper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py,sha256=xqAsnNUCP44Wd1rE1m_phbKtNCMJTFJfOX4_2varadg,8298
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py,sha256=O2XTVBdXteIgQF8Lss_t2RhRSsRMmMyWrAoNonDMQFI,39604
+ datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py,sha256=8_IIYzcGQR5jcJ3NKg_tIa7VobUEBXzVpvFBaFPUToM,39598
  datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py,sha256=3nGU-_KQe1WMIAPdxtuzulqpAreNsqi0vX0XdrddCU8,26184
  datahub/ingestion/source/powerbi/rest_api_wrapper/profiling_utils.py,sha256=bgcPheyqOj6KdRjDyANDK5yggItglcBIjbGFIwAxSds,1392
  datahub/ingestion/source/powerbi/rest_api_wrapper/query.py,sha256=VNw1Uvli6g0pnu9FpigYmnCdEPbVEipz7vdZU_WmHf4,616
@@ -429,21 +429,21 @@ datahub/ingestion/source/snowflake/constants.py,sha256=22n-0r04nuy-ImxWFFpmbrt_G
  datahub/ingestion/source/snowflake/oauth_config.py,sha256=ol9D3RmruGStJAeL8PYSQguSqcD2HfkjPkMF2AB_eZs,1277
  datahub/ingestion/source/snowflake/oauth_generator.py,sha256=fu2VnREGuJXeTqIV2jx4TwieVnznf83HQkrE0h2DGGM,3423
  datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81xxdeizJn9nJCZ_nMIXgk9N6pEk5o,4803
- datahub/ingestion/source/snowflake/snowflake_config.py,sha256=_Ew2nJRoKC9e-SyrhOqn730c4FEhQE3U4bbY6RFV004,17973
+ datahub/ingestion/source/snowflake/snowflake_config.py,sha256=jQGSa7ZQs3EsXB9ANShZ4xv9RqrhRfVHRSLeFiDwwxc,17974
  datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJY5rqKNNodXxzg3SS5DF7oA4WXArOA,17793
  datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
  datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=5Lpy_irZlbOFJbvVkgsZSBjdLCT3VZNjlEvttzSQAU4,21121
  datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
- datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=8QEihOfivalVR9vLo6vCUL-vnZfAGgMio0uhPYX0jTo,25883
- datahub/ingestion/source/snowflake/snowflake_query.py,sha256=885pyVnLf8wwTTuWkJ-Q01gKE7Xt518QPbFkrN-vd7o,38310
+ datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=Lhc5FAx8pXiUyfODGNkQJhjThSCIjPqG2R82dHN-jg0,26889
+ datahub/ingestion/source/snowflake/snowflake_query.py,sha256=5po2FWz41UVowykJYbTFGxsltbmlHBCPcHG20VOhdOE,38469
  datahub/ingestion/source/snowflake/snowflake_report.py,sha256=_-rD7Q4MzKY8fYzJHSBnGX4gurwujL3UoRzcP_TZURs,6468
  datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=z5ZPgh-TILAz0DeIwDxRCsj980CM2BbftXiFpM1dV_Y,21674
  datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=3AxvKfK9WV9x2f2XNuJ-Cmy4szmXKm1Ky0haRVvyC6w,42340
  datahub/ingestion/source/snowflake/snowflake_shares.py,sha256=maZyFkfrbVogEFM0tTKRiNp9c_1muv6YfleSd3q0umI,6341
  datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=kTmuCtRnvHqM8WBYhWeK4XafJq3ssFL9kcS03jEeWT4,5506
  datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=fyfWmFVz2WZrpTJWNIe9m0WpDHgeFrGPf8diORJZUwo,6212
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=PEmYNMXJRUvLQmVd8juVqjokfuSPuH9ppcM0ruXamxA,24807
- datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=YczNEupY89jeegjR2_1pT4bPi9wQ69EIhGpzyCe9Jdg,12600
+ datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=0rXgz8bvRiI9SYVMa0UGLeg_DcjqBy6kQsdq0Uq0HVk,24685
+ datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=MoI8-DR9tuMuHMBQcpDo4GFjvcoQZWLNkdFZsTkgK-M,12786
  datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=hIWtzlxuSQ_3w48o4AF2l9CQOcWIe6AmD07I89sH2B0,31860
  datahub/ingestion/source/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/sql/athena.py,sha256=G3cIY8H_76lIUAzQWW2kLnZOEsfbakmojxbiHb3dYZ8,24059
@@ -491,7 +491,7 @@ datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider
  datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py,sha256=xsH7Ao_05VTjqpkzLkhdf5B1ULMzFoD8vkJJIJU9w-U,4077
  datahub/ingestion/source/state_provider/state_provider_registry.py,sha256=SVq4mIyGNmLXE9OZx1taOiNPqDoQp03-Ot9rYnB5F3k,401
  datahub/ingestion/source/tableau/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/tableau/tableau.py,sha256=nu71B56vB6h5io5GcMXQPlYVCbE-UNAtdxHCm8nXr9o,139751
+ datahub/ingestion/source/tableau/tableau.py,sha256=Z_x3aeHPi0mq1cKY1tOzQakPLE0bX40I14iuFwAOmU8,140093
  datahub/ingestion/source/tableau/tableau_common.py,sha256=9gQLq_3BlAsKll83uVlnWJRWaIDtFtREUyuimXF13Z0,26219
  datahub/ingestion/source/tableau/tableau_constant.py,sha256=ZcAeHsQUXVVL26ORly0ByZk_GJAFbxaKuJAlX_sYMac,2686
  datahub/ingestion/source/tableau/tableau_server_wrapper.py,sha256=nSyx9RzC6TCQDm-cTVJ657qT8iDwzk_8JMKpohhmOc4,1046
@@ -982,8 +982,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-0.15.0.1rc9.dist-info/METADATA,sha256=nUI5E0nMS2Ng9RLK_q6N4VmqhzakT3CIw34UEqv8E1E,173642
- acryl_datahub-0.15.0.1rc9.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- acryl_datahub-0.15.0.1rc9.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
- acryl_datahub-0.15.0.1rc9.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-0.15.0.1rc9.dist-info/RECORD,,
+ acryl_datahub-0.15.0.1rc11.dist-info/METADATA,sha256=PHoYjaieZmYEqeCLCRvjDKfPAMOtdDmUc98nFYjASOY,173645
+ acryl_datahub-0.15.0.1rc11.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ acryl_datahub-0.15.0.1rc11.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
+ acryl_datahub-0.15.0.1rc11.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-0.15.0.1rc11.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
 
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "0.15.0.1rc9"
+ __version__ = "0.15.0.1rc11"
 
 
  def is_dev_mode() -> bool:
datahub/ingestion/source/dbt/dbt_cloud.py CHANGED
@@ -1,7 +1,7 @@
  import logging
  from datetime import datetime
  from json import JSONDecodeError
- from typing import Dict, List, Optional, Tuple
+ from typing import Dict, List, Literal, Optional, Tuple
  from urllib.parse import urlparse
 
  import dateutil.parser
@@ -62,6 +62,11 @@ class DBTCloudConfig(DBTCommonConfig):
          description="The ID of the run to ingest metadata from. If not specified, we'll default to the latest run.",
      )
 
+     external_url_mode: Literal["explore", "ide"] = Field(
+         default="explore",
+         description='Where should the "View in dbt" link point to - either the "Explore" UI or the dbt Cloud IDE',
+     )
+
      @root_validator(pre=True)
      def set_metadata_endpoint(cls, values: dict) -> dict:
          if values.get("access_url") and not values.get("metadata_endpoint"):
@@ -527,5 +532,7 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
          )
 
      def get_external_url(self, node: DBTNode) -> Optional[str]:
-         # TODO: Once dbt Cloud supports deep linking to specific files, we can use that.
-         return f"{self.config.access_url}/develop/{self.config.account_id}/projects/{self.config.project_id}"
+         if self.config.external_url_mode == "explore":
+             return f"{self.config.access_url}/explore/{self.config.account_id}/projects/{self.config.project_id}/environments/production/details/{node.dbt_name}"
+         else:
+             return f"{self.config.access_url}/develop/{self.config.account_id}/projects/{self.config.project_id}"
datahub/ingestion/source/looker/looker_dataclasses.py CHANGED
@@ -186,16 +186,16 @@ class LookerModel:
              f"traversal_path={traversal_path}, included_files = {included_files}, seen_so_far: {seen_so_far}"
          )
          if "*" not in inc and not included_files:
-             reporter.report_failure(
+             reporter.warning(
                  title="Error Resolving Include",
-                 message=f"Cannot resolve include {inc}",
-                 context=f"Path: {path}",
+                 message="Cannot resolve included file",
+                 context=f"Include: {inc}, path: {path}, traversal_path: {traversal_path}",
              )
          elif not included_files:
-             reporter.report_failure(
+             reporter.warning(
                  title="Error Resolving Include",
-                 message=f"Did not resolve anything for wildcard include {inc}",
-                 context=f"Path: {path}",
+                 message="Did not find anything matching the wildcard include",
+                 context=f"Include: {inc}, path: {path}, traversal_path: {traversal_path}",
              )
          # only load files that we haven't seen so far
          included_files = [x for x in included_files if x not in seen_so_far]
@@ -231,9 +231,7 @@ class LookerModel:
                          source_config,
                          reporter,
                          seen_so_far,
-                         traversal_path=traversal_path
-                         + "."
-                         + pathlib.Path(included_file).stem,
+                         traversal_path=f"{traversal_path} -> {pathlib.Path(included_file).stem}",
                      )
                  )
              except Exception as e:
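Include-resolution problems are now reported as warnings rather than failures, and the traversal breadcrumb uses an arrow separator so nested includes read as a path instead of a dotted name. A toy illustration of the new breadcrumb format (file names invented):

```python
import pathlib

# Simulate recursing through two nested includes, as resolve_includes does.
traversal_path = "base_model"
for included_file in ["includes/orders.view.lkml", "includes/users.view.lkml"]:
    traversal_path = f"{traversal_path} -> {pathlib.Path(included_file).stem}"

print(traversal_path)  # base_model -> orders.view -> users.view
```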
datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py CHANGED
@@ -84,13 +84,14 @@ class DataResolverBase(ABC):
          tenant_id: str,
          metadata_api_timeout: int,
      ):
-         self.__access_token: Optional[str] = None
-         self.__access_token_expiry_time: Optional[datetime] = None
-         self.__tenant_id = tenant_id
+         self._access_token: Optional[str] = None
+         self._access_token_expiry_time: Optional[datetime] = None
+
+         self._tenant_id = tenant_id
          # Test connection by generating access token
          logger.info(f"Trying to connect to {self._get_authority_url()}")
          # Power-Bi Auth (Service Principal Auth)
-         self.__msal_client = msal.ConfidentialClientApplication(
+         self._msal_client = msal.ConfidentialClientApplication(
              client_id,
              client_credential=client_secret,
              authority=DataResolverBase.AUTHORITY + tenant_id,
@@ -168,18 +169,18 @@ class DataResolverBase(ABC):
          pass
 
      def _get_authority_url(self):
-         return f"{DataResolverBase.AUTHORITY}{self.__tenant_id}"
+         return f"{DataResolverBase.AUTHORITY}{self._tenant_id}"
 
      def get_authorization_header(self):
          return {Constant.Authorization: self.get_access_token()}
 
-     def get_access_token(self):
-         if self.__access_token is not None and not self._is_access_token_expired():
-             return self.__access_token
+     def get_access_token(self) -> str:
+         if self._access_token is not None and not self._is_access_token_expired():
+             return self._access_token
 
          logger.info("Generating PowerBi access token")
 
-         auth_response = self.__msal_client.acquire_token_for_client(
+         auth_response = self._msal_client.acquire_token_for_client(
              scopes=[DataResolverBase.SCOPE]
          )
 
@@ -193,24 +194,24 @@ class DataResolverBase(ABC):
 
          logger.info("Generated PowerBi access token")
 
-         self.__access_token = "Bearer {}".format(
+         self._access_token = "Bearer {}".format(
              auth_response.get(Constant.ACCESS_TOKEN)
          )
          safety_gap = 300
-         self.__access_token_expiry_time = datetime.now() + timedelta(
+         self._access_token_expiry_time = datetime.now() + timedelta(
              seconds=(
                  max(auth_response.get(Constant.ACCESS_TOKEN_EXPIRY, 0) - safety_gap, 0)
              )
          )
 
-         logger.debug(f"{Constant.PBIAccessToken}={self.__access_token}")
+         logger.debug(f"{Constant.PBIAccessToken}={self._access_token}")
 
-         return self.__access_token
+         return self._access_token
 
      def _is_access_token_expired(self) -> bool:
-         if not self.__access_token_expiry_time:
+         if not self._access_token_expiry_time:
              return True
-         return self.__access_token_expiry_time < datetime.now()
+         return self._access_token_expiry_time < datetime.now()
 
      def get_dashboards(self, workspace: Workspace) -> List[Dashboard]:
          """
datahub/ingestion/source/snowflake/snowflake_config.py CHANGED
@@ -138,12 +138,20 @@ class SnowflakeIdentifierConfig(
          description="Whether to convert dataset urns to lowercase.",
      )
 
-
- class SnowflakeUsageConfig(BaseUsageConfig):
      email_domain: Optional[str] = pydantic.Field(
          default=None,
          description="Email domain of your organization so users can be displayed on UI appropriately.",
      )
+
+     email_as_user_identifier: bool = Field(
+         default=True,
+         description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is "
+         "provided, generates email addresses for snowflake users with unset emails, based on their "
+         "username.",
+     )
+
+
+ class SnowflakeUsageConfig(BaseUsageConfig):
      apply_view_usage_to_tables: bool = pydantic.Field(
          default=False,
          description="Whether to apply view's usage to its base tables. If set to True, usage is applied to base tables only.",
@@ -267,13 +275,6 @@ class SnowflakeV2Config(
          " Map of share name -> details of share.",
      )
 
-     email_as_user_identifier: bool = Field(
-         default=True,
-         description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is "
-         "provided, generates email addresses for snowflake users with unset emails, based on their "
-         "username.",
-     )
-
      include_assertion_results: bool = Field(
          default=False,
          description="Whether to ingest assertion run results for assertions created using Datahub"
datahub/ingestion/source/snowflake/snowflake_queries.py CHANGED
@@ -66,6 +66,11 @@ from datahub.utilities.perf_timer import PerfTimer
 
  logger = logging.getLogger(__name__)
 
+ # Define a type alias
+ UserName = str
+ UserEmail = str
+ UsersMapping = Dict[UserName, UserEmail]
+
 
  class SnowflakeQueriesExtractorConfig(ConfigModel):
      # TODO: Support stateful ingestion for the time windows.
@@ -114,11 +119,13 @@ class SnowflakeQueriesSourceConfig(
  class SnowflakeQueriesExtractorReport(Report):
      copy_history_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
      query_log_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+     users_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
 
      audit_log_load_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
      sql_aggregator: Optional[SqlAggregatorReport] = None
 
      num_ddl_queries_dropped: int = 0
+     num_users: int = 0
 
 
  @dataclass
@@ -225,6 +232,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
      def get_workunits_internal(
          self,
      ) -> Iterable[MetadataWorkUnit]:
+         with self.report.users_fetch_timer:
+             users = self.fetch_users()
+
          # TODO: Add some logic to check if the cached audit log is stale or not.
          audit_log_file = self.local_temp_path / "audit_log.sqlite"
          use_cached_audit_log = audit_log_file.exists()
@@ -248,7 +258,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                  queries.append(entry)
 
          with self.report.query_log_fetch_timer:
-             for entry in self.fetch_query_log():
+             for entry in self.fetch_query_log(users):
                  queries.append(entry)
 
          with self.report.audit_log_load_timer:
@@ -263,6 +273,25 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
          shared_connection.close()
          audit_log_file.unlink(missing_ok=True)
 
+     def fetch_users(self) -> UsersMapping:
+         users: UsersMapping = dict()
+         with self.structured_reporter.report_exc("Error fetching users from Snowflake"):
+             logger.info("Fetching users from Snowflake")
+             query = SnowflakeQuery.get_all_users()
+             resp = self.connection.query(query)
+
+             for row in resp:
+                 try:
+                     users[row["NAME"]] = row["EMAIL"]
+                     self.report.num_users += 1
+                 except Exception as e:
+                     self.structured_reporter.warning(
+                         "Error parsing user row",
+                         context=f"{row}",
+                         exc=e,
+                     )
+         return users
+
      def fetch_copy_history(self) -> Iterable[KnownLineageMapping]:
          # Derived from _populate_external_lineage_from_copy_history.
@@ -298,7 +327,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
              yield result
 
      def fetch_query_log(
-         self,
+         self, users: UsersMapping
      ) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap]]:
          query_log_query = _build_enriched_query_log_query(
              start_time=self.config.window.start_time,
@@ -319,7 +348,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
              assert isinstance(row, dict)
              try:
-                 entry = self._parse_audit_log_row(row)
+                 entry = self._parse_audit_log_row(row, users)
              except Exception as e:
                  self.structured_reporter.warning(
                      "Error parsing query log row",
@@ -331,7 +360,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                  yield entry
 
      def _parse_audit_log_row(
-         self, row: Dict[str, Any]
+         self, row: Dict[str, Any], users: UsersMapping
      ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery]]:
          json_fields = {
              "DIRECT_OBJECTS_ACCESSED",
@@ -430,9 +459,11 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
              )
          )
 
-         # TODO: Fetch email addresses from Snowflake to map user -> email
-         # TODO: Support email_domain fallback for generating user urns.
-         user = CorpUserUrn(self.identifiers.snowflake_identifier(res["user_name"]))
+         user = CorpUserUrn(
+             self.identifiers.get_user_identifier(
+                 res["user_name"], users.get(res["user_name"])
+             )
+         )
 
          timestamp: datetime = res["query_start_time"]
          timestamp = timestamp.astimezone(timezone.utc)
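The queries extractor now fetches a name → email mapping once up front and threads it through query-log parsing, so each audit-log row can be attributed with an email-based urn. A minimal sketch of the lookup flow, using invented data rather than a live Snowflake connection:

```python
from typing import Dict, Optional

UserName = str
UserEmail = str
UsersMapping = Dict[UserName, UserEmail]

# Stand-in for the result of fetch_users(); the data here is invented.
users: UsersMapping = {"ALICE": "alice@corp.com"}


def urn_input(user_name: str, users: UsersMapping) -> Optional[str]:
    # users.get() yields None for users missing from the mapping;
    # get_user_identifier then falls back to username/email_domain handling.
    return users.get(user_name)


print(urn_input("ALICE", users))    # alice@corp.com
print(urn_input("SVC_ETL", users))  # None -> fallback path
```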
datahub/ingestion/source/snowflake/snowflake_query.py CHANGED
@@ -947,4 +947,8 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
          AND METRIC_NAME ilike '{pattern}' escape '{escape_pattern}'
          ORDER BY MEASUREMENT_TIME ASC;
 
-         """
+         """
+
+     @staticmethod
+     def get_all_users() -> str:
+         return """SELECT name as "NAME", email as "EMAIL" FROM SNOWFLAKE.ACCOUNT_USAGE.USERS"""
datahub/ingestion/source/snowflake/snowflake_usage_v2.py CHANGED
@@ -342,10 +342,9 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
              filtered_user_counts.append(
                  DatasetUserUsageCounts(
                      user=make_user_urn(
-                         self.get_user_identifier(
+                         self.identifiers.get_user_identifier(
                              user_count["user_name"],
                              user_email,
-                             self.config.email_as_user_identifier,
                          )
                      ),
                      count=user_count["total"],
@@ -453,9 +452,7 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
          reported_time: int = int(time.time() * 1000)
          last_updated_timestamp: int = int(start_time.timestamp() * 1000)
          user_urn = make_user_urn(
-             self.get_user_identifier(
-                 user_name, user_email, self.config.email_as_user_identifier
-             )
+             self.identifiers.get_user_identifier(user_name, user_email)
          )
 
          # NOTE: In earlier `snowflake-usage` connector this was base_objects_accessed, which is incorrect
datahub/ingestion/source/snowflake/snowflake_utils.py CHANGED
@@ -300,6 +300,28 @@ class SnowflakeIdentifierBuilder:
      def get_quoted_identifier_for_table(db_name, schema_name, table_name):
          return f'"{db_name}"."{schema_name}"."{table_name}"'
 
+     # Note - decide how to construct user urns.
+     # Historically urns were created using part before @ from user's email.
+     # Users without email were skipped from both user entries as well as aggregates.
+     # However email is not mandatory field in snowflake user, user_name is always present.
+     def get_user_identifier(
+         self,
+         user_name: str,
+         user_email: Optional[str],
+     ) -> str:
+         if user_email:
+             return self.snowflake_identifier(
+                 user_email
+                 if self.identifier_config.email_as_user_identifier is True
+                 else user_email.split("@")[0]
+             )
+         return self.snowflake_identifier(
+             f"{user_name}@{self.identifier_config.email_domain}"
+             if self.identifier_config.email_as_user_identifier is True
+             and self.identifier_config.email_domain is not None
+             else user_name
+         )
+
 
  class SnowflakeCommonMixin(SnowflakeStructuredReportMixin):
      platform = "snowflake"
@@ -315,24 +337,6 @@ class SnowflakeCommonMixin(SnowflakeStructuredReportMixin):
      def identifiers(self) -> SnowflakeIdentifierBuilder:
          return SnowflakeIdentifierBuilder(self.config, self.report)
 
-     # Note - decide how to construct user urns.
-     # Historically urns were created using part before @ from user's email.
-     # Users without email were skipped from both user entries as well as aggregates.
-     # However email is not mandatory field in snowflake user, user_name is always present.
-     def get_user_identifier(
-         self,
-         user_name: str,
-         user_email: Optional[str],
-         email_as_user_identifier: bool,
-     ) -> str:
-         if user_email:
-             return self.identifiers.snowflake_identifier(
-                 user_email
-                 if email_as_user_identifier is True
-                 else user_email.split("@")[0]
-             )
-         return self.identifiers.snowflake_identifier(user_name)
-
      # TODO: Revisit this after stateful ingestion can commit checkpoint
      # for failures that do not affect the checkpoint
      # TODO: Add additional parameters to match the signature of the .warning and .failure methods
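The relocated `get_user_identifier` reads its flags from the identifier config and gains an `email_domain` fallback: users without an email can now get a synthesized address instead of a bare username. A standalone re-statement of the branch logic, for illustration (the real method additionally passes the result through `snowflake_identifier()` for normalization):

```python
from typing import Optional


def user_identifier(
    user_name: str,
    user_email: Optional[str],
    email_as_user_identifier: bool = True,
    email_domain: Optional[str] = None,
) -> str:
    if user_email:
        # Email known: use it whole, or just the part before the @.
        return user_email if email_as_user_identifier else user_email.split("@")[0]
    if email_as_user_identifier and email_domain is not None:
        return f"{user_name}@{email_domain}"  # new fallback: synthesize an email
    return user_name


print(user_identifier("alice", "alice@corp.com"))                 # alice@corp.com
print(user_identifier("alice", "alice@corp.com", False))          # alice
print(user_identifier("svc_etl", None, email_domain="corp.com"))  # svc_etl@corp.com
print(user_identifier("svc_etl", None))                           # svc_etl
```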
datahub/ingestion/source/tableau/tableau.py CHANGED
@@ -186,6 +186,15 @@ try:
  except ImportError:
      REAUTHENTICATE_ERRORS = (NonXMLResponseError,)
 
+ RETRIABLE_ERROR_CODES = [
+     408,  # Request Timeout
+     429,  # Too Many Requests
+     500,  # Internal Server Error
+     502,  # Bad Gateway
+     503,  # Service Unavailable
+     504,  # Gateway Timeout
+ ]
+
  logger: logging.Logger = logging.getLogger(__name__)
 
  # Replace / with |
@@ -287,7 +296,7 @@ class TableauConnectionConfig(ConfigModel):
              max_retries=Retry(
                  total=self.max_retries,
                  backoff_factor=1,
-                 status_forcelist=[429, 500, 502, 503, 504],
+                 status_forcelist=RETRIABLE_ERROR_CODES,
              )
          )
          server._session.mount("http://", adapter)
@@ -1212,9 +1221,11 @@ class TableauSiteSource:
 
          except InternalServerError as ise:
              # In some cases Tableau Server returns 504 error, which is a timeout error, so it worths to retry.
-             if ise.code == 504:
+             # Extended with other retryable errors.
+             if ise.code in RETRIABLE_ERROR_CODES:
                  if retries_remaining <= 0:
                      raise ise
+                 logger.info(f"Retrying query due to error {ise.code}")
                  return self.get_connection_object_page(
                      query=query,
                      connection_type=connection_type,
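The inline status list becomes a module-level constant shared by both the REST session adapter and the GraphQL retry path, and 408 joins the retry set. A self-contained sketch of the session-level half of this, assuming the standard requests/urllib3 retry API:

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

RETRIABLE_ERROR_CODES = [408, 429, 500, 502, 503, 504]

session = requests.Session()
adapter = HTTPAdapter(
    max_retries=Retry(
        total=5,  # illustrative cap; the connector wires in its max_retries config
        backoff_factor=1,  # exponential backoff between attempts
        status_forcelist=RETRIABLE_ERROR_CODES,  # retry only on these statuses
    )
)
# Every request made through this session now retries transparently.
session.mount("http://", adapter)
session.mount("https://", adapter)
```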