acryl-datahub 0.15.0.1rc8__py3-none-any.whl → 0.15.0.1rc10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of acryl-datahub has been flagged as a potentially problematic release.

@@ -1,4 +1,4 @@
- datahub/__init__.py,sha256=Ed_HdiA9eGLLG0fhJKPwruUxl4bgAPR8p2MDlRHqts8,576
+ datahub/__init__.py,sha256=_-iwjV9mhNtK3Q_48sB1x7crxfllh3ay-QVv4WQ8458,577
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -217,7 +217,7 @@ datahub/ingestion/source/abs/report.py,sha256=fzkTdTewYlWrTk4f2Cyl-e8RV4qw9wEVtm
  datahub/ingestion/source/abs/source.py,sha256=pzxW-R_cWGKPneEhX8JWdTZiX2k1kAZOPKgMxp9mAEI,24533
  datahub/ingestion/source/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/aws/aws_common.py,sha256=DfdQgkJ_s2isFx8WvqKTlAcBk4KE8SgfpmA5BgC3fgY,17716
- datahub/ingestion/source/aws/glue.py,sha256=fX0dtaVVq174ZS0aBJvZFYK8ligfZX5EU3pdS3j1KQs,56215
+ datahub/ingestion/source/aws/glue.py,sha256=r7y1MPDK__BKX_mrJjVa_CEmSXM3Pa02gt19o0sSLE8,56815
  datahub/ingestion/source/aws/s3_boto_utils.py,sha256=Wyp9k9tapsCuw9dyH4FCXJr_wmeLaYFoCtKvrV6SEDk,3892
  datahub/ingestion/source/aws/s3_util.py,sha256=OFypcgmVC6jnZM90-gjcPpAMtTV1lbnreCaMhCzNlzs,2149
  datahub/ingestion/source/aws/sagemaker.py,sha256=Bl2tkBYnrindgx61VHYgNovUF_Kp_fXNcivQn28vC2w,5254
@@ -273,7 +273,7 @@ datahub/ingestion/source/datahub/datahub_source.py,sha256=2jDnsHEzpGhr00qQI9unSU
  datahub/ingestion/source/datahub/report.py,sha256=VHBfCbwFRzdLdB7hQG9ST4EiZxl_vBCU0XxGcZR6Xxs,940
  datahub/ingestion/source/datahub/state.py,sha256=PZoT7sSK1wadVf5vN6phrgr7I6LL7ePP-EJjP1OO0bQ,3507
  datahub/ingestion/source/dbt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/dbt/dbt_cloud.py,sha256=3bfcCi7xBvlCTGjnDCnyOShsxgVRn7wUYJOid_WT_Vk,17643
+ datahub/ingestion/source/dbt/dbt_cloud.py,sha256=Joh4AIjlu-UVJw_Hu32bPxT9w25RX4JfUnUhVpiJcJw,18005
  datahub/ingestion/source/dbt/dbt_common.py,sha256=0ddiqNx9sUAGZYDQ8tSr5Qh5ti-kgC4saW1yRRNJXgg,80493
  datahub/ingestion/source/dbt/dbt_core.py,sha256=m6cA9vVd4Nh2arc-T2_xeQoxvreRbMhTDIJuYsx3wHc,22722
  datahub/ingestion/source/dbt/dbt_tests.py,sha256=Q5KISW_AOOWqyxmyOgJQquyX7xlfOqKu9WhrHoLKC0M,9881
@@ -302,9 +302,9 @@ datahub/ingestion/source/fivetran/fivetran.py,sha256=uKbM5czPz-6LOseoh1FwavWDIuL
  datahub/ingestion/source/fivetran/fivetran_log_api.py,sha256=EAak3hJpe75WZSgz6wP_CyAT5Cian2N4a-lb8x1NKHk,12776
  datahub/ingestion/source/fivetran/fivetran_query.py,sha256=vLrTj7e-0NxZ2U4bWTB57pih42WirqPlUvwtIRfStlQ,5275
  datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/gc/datahub_gc.py,sha256=AHlKGwDD-E_TEHcJIpRtwk6ikjT-KiyfTo-BXZnMSk0,12114
+ datahub/ingestion/source/gc/datahub_gc.py,sha256=WOg3yIaNmwdbSTwytKeSfIUihsM7FMYBip9u2Dnwk3c,12849
  datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=u90XEmW1vRFbvp4CQ8ujPxTGJUyJqO2U6ApcI6mFrjE,16588
- datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=cHJmxz4NmA7VjTX2iGEo3wZ_SDrjC_rCQcnRxKgfUVI,8713
+ datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=sZbdkg3MuPVGf8eeeRg_2khGMZ01QoH4dgJiTxf7Srg,9813
  datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=wRnRaIVUG483tY4nyDkEn6Xi2RL5MjrVvoCoZimqwSg,7514
  datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/gcs/gcs_source.py,sha256=iwvj4JwjyVWRP1Vq106sUtQhh0GuOYVSu9zCa1wCZN0,6189
@@ -334,7 +334,7 @@ datahub/ingestion/source/looker/looker_common.py,sha256=KObx5ZTfldN2EO11eb1LrHI-
  datahub/ingestion/source/looker/looker_config.py,sha256=87WAgdJ_QWdTq25RBwgIqfc2kq7dubSpzbEtXb2ihMw,13182
  datahub/ingestion/source/looker/looker_connection.py,sha256=yDmC6lDsHmL2e_Pw8ULylwOIHPWPp_6gT1iyLvD0fTw,2075
  datahub/ingestion/source/looker/looker_constant.py,sha256=GMKYtNXlpojPxa9azridKfcGLSJwKdUCTesp7U8dIrQ,402
- datahub/ingestion/source/looker/looker_dataclasses.py,sha256=ULWLFWsV2cKmTuOFavD8QjEBmnXmvjyr8RbUB62DwJQ,12178
+ datahub/ingestion/source/looker/looker_dataclasses.py,sha256=LjrP5m_A4HV-XeFlSNGVYNuyF0ulxp_qwB82Ss4Iycs,12200
  datahub/ingestion/source/looker/looker_file_loader.py,sha256=c1ewDrIb9VJg1o-asbwX9gL83kgL01vIETzzbmZIhmw,4267
  datahub/ingestion/source/looker/looker_lib_wrapper.py,sha256=0gaYjBv4wkbbLWVgvaAV6JyWAFb0utTG6TCve2d9xss,11511
  datahub/ingestion/source/looker/looker_liquid_tag.py,sha256=mO4G4MNA4YZFvZaDBpdiJ2vP3irC82kY34RdaK4Pbfs,3100
@@ -370,7 +370,7 @@ datahub/ingestion/source/powerbi/m_query/tree_function.py,sha256=h77DunhlgOP0fAg
  datahub/ingestion/source/powerbi/m_query/validator.py,sha256=crG-VZy2XPieiDliP9yVMgiFcc8b2xbZyDFEATXqEAQ,1155
  datahub/ingestion/source/powerbi/rest_api_wrapper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py,sha256=xqAsnNUCP44Wd1rE1m_phbKtNCMJTFJfOX4_2varadg,8298
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py,sha256=O2XTVBdXteIgQF8Lss_t2RhRSsRMmMyWrAoNonDMQFI,39604
+ datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py,sha256=8_IIYzcGQR5jcJ3NKg_tIa7VobUEBXzVpvFBaFPUToM,39598
  datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py,sha256=3nGU-_KQe1WMIAPdxtuzulqpAreNsqi0vX0XdrddCU8,26184
  datahub/ingestion/source/powerbi/rest_api_wrapper/profiling_utils.py,sha256=bgcPheyqOj6KdRjDyANDK5yggItglcBIjbGFIwAxSds,1392
  datahub/ingestion/source/powerbi/rest_api_wrapper/query.py,sha256=VNw1Uvli6g0pnu9FpigYmnCdEPbVEipz7vdZU_WmHf4,616
@@ -429,21 +429,21 @@ datahub/ingestion/source/snowflake/constants.py,sha256=22n-0r04nuy-ImxWFFpmbrt_G
  datahub/ingestion/source/snowflake/oauth_config.py,sha256=ol9D3RmruGStJAeL8PYSQguSqcD2HfkjPkMF2AB_eZs,1277
  datahub/ingestion/source/snowflake/oauth_generator.py,sha256=fu2VnREGuJXeTqIV2jx4TwieVnznf83HQkrE0h2DGGM,3423
  datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81xxdeizJn9nJCZ_nMIXgk9N6pEk5o,4803
- datahub/ingestion/source/snowflake/snowflake_config.py,sha256=_Ew2nJRoKC9e-SyrhOqn730c4FEhQE3U4bbY6RFV004,17973
+ datahub/ingestion/source/snowflake/snowflake_config.py,sha256=jQGSa7ZQs3EsXB9ANShZ4xv9RqrhRfVHRSLeFiDwwxc,17974
  datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJY5rqKNNodXxzg3SS5DF7oA4WXArOA,17793
  datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
  datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=5Lpy_irZlbOFJbvVkgsZSBjdLCT3VZNjlEvttzSQAU4,21121
  datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
- datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=8QEihOfivalVR9vLo6vCUL-vnZfAGgMio0uhPYX0jTo,25883
- datahub/ingestion/source/snowflake/snowflake_query.py,sha256=885pyVnLf8wwTTuWkJ-Q01gKE7Xt518QPbFkrN-vd7o,38310
+ datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=Lhc5FAx8pXiUyfODGNkQJhjThSCIjPqG2R82dHN-jg0,26889
+ datahub/ingestion/source/snowflake/snowflake_query.py,sha256=5po2FWz41UVowykJYbTFGxsltbmlHBCPcHG20VOhdOE,38469
  datahub/ingestion/source/snowflake/snowflake_report.py,sha256=_-rD7Q4MzKY8fYzJHSBnGX4gurwujL3UoRzcP_TZURs,6468
  datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=z5ZPgh-TILAz0DeIwDxRCsj980CM2BbftXiFpM1dV_Y,21674
  datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=3AxvKfK9WV9x2f2XNuJ-Cmy4szmXKm1Ky0haRVvyC6w,42340
  datahub/ingestion/source/snowflake/snowflake_shares.py,sha256=maZyFkfrbVogEFM0tTKRiNp9c_1muv6YfleSd3q0umI,6341
  datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=kTmuCtRnvHqM8WBYhWeK4XafJq3ssFL9kcS03jEeWT4,5506
  datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=fyfWmFVz2WZrpTJWNIe9m0WpDHgeFrGPf8diORJZUwo,6212
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=PEmYNMXJRUvLQmVd8juVqjokfuSPuH9ppcM0ruXamxA,24807
- datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=YczNEupY89jeegjR2_1pT4bPi9wQ69EIhGpzyCe9Jdg,12600
+ datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=0rXgz8bvRiI9SYVMa0UGLeg_DcjqBy6kQsdq0Uq0HVk,24685
+ datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=MoI8-DR9tuMuHMBQcpDo4GFjvcoQZWLNkdFZsTkgK-M,12786
  datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=hIWtzlxuSQ_3w48o4AF2l9CQOcWIe6AmD07I89sH2B0,31860
  datahub/ingestion/source/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/sql/athena.py,sha256=G3cIY8H_76lIUAzQWW2kLnZOEsfbakmojxbiHb3dYZ8,24059
@@ -517,7 +517,7 @@ datahub/ingestion/source_config/csv_enricher.py,sha256=IROxxfFJA56dHkmmbjjhb7h1p
  datahub/ingestion/source_config/operation_config.py,sha256=Q0NlqiEh4s4DFIII5NsAp5hxWTVyyJz-ldcQmH-B47s,3504
  datahub/ingestion/source_config/pulsar.py,sha256=sklDkh62CrWV-i7Ifh6R3T3smYVso6gyRJG8HVc6RdA,5533
  datahub/ingestion/source_report/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source_report/ingestion_stage.py,sha256=w6qTnJm_-eoTiGxwS7cFnhdIfsv8omC6H5e0qw5t4Jc,1587
+ datahub/ingestion/source_report/ingestion_stage.py,sha256=gbYmnio3fAOsjh_RzU3j_5UGu7bYBwUM4bm7S8ID_IU,1649
  datahub/ingestion/source_report/pulsar.py,sha256=iKhzy644AjoFTV-gxyqBoXKMLwSMPxJFxU-3WDQRww0,1037
  datahub/ingestion/source_report/time_window.py,sha256=9yI5l2S1DcF7ClvUHLeN8m62I5vlhV9k-aQqSZh2l7w,229
  datahub/ingestion/transformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -982,8 +982,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-0.15.0.1rc8.dist-info/METADATA,sha256=mW2V4Czvd-ZE_mUJX8XkNZxNwnBa-gLJxebl0KWsM2A,173642
- acryl_datahub-0.15.0.1rc8.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- acryl_datahub-0.15.0.1rc8.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
- acryl_datahub-0.15.0.1rc8.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-0.15.0.1rc8.dist-info/RECORD,,
+ acryl_datahub-0.15.0.1rc10.dist-info/METADATA,sha256=GCgEH25gXF0roGuAivBGRw1IyiBv_Xv4wbWj9jGlpIo,173645
+ acryl_datahub-0.15.0.1rc10.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ acryl_datahub-0.15.0.1rc10.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
+ acryl_datahub-0.15.0.1rc10.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-0.15.0.1rc10.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings

  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "0.15.0.1rc8"
+ __version__ = "0.15.0.1rc10"


  def is_dev_mode() -> bool:
datahub/ingestion/source/aws/glue.py CHANGED
@@ -52,6 +52,7 @@ from datahub.ingestion.api.decorators import (
  platform_name,
  support_status,
  )
+ from datahub.ingestion.api.report import EntityFilterReport
  from datahub.ingestion.api.source import MetadataWorkUnitProcessor
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.aws import s3_util
@@ -115,7 +116,6 @@ from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_col

  logger = logging.getLogger(__name__)

-
  DEFAULT_PLATFORM = "glue"
  VALID_PLATFORMS = [DEFAULT_PLATFORM, "athena"]

@@ -220,6 +220,7 @@ class GlueSourceConfig(
  class GlueSourceReport(StaleEntityRemovalSourceReport):
  tables_scanned = 0
  filtered: List[str] = dataclass_field(default_factory=list)
+ databases: EntityFilterReport = EntityFilterReport.field(type="database")

  num_job_script_location_missing: int = 0
  num_job_script_location_invalid: int = 0
@@ -668,6 +669,7 @@ class GlueSource(StatefulIngestionSourceBase):
  return MetadataWorkUnit(id=f'{job_name}-{node["Id"]}', mce=mce)

  def get_all_databases(self) -> Iterable[Mapping[str, Any]]:
+ logger.debug("Getting all databases")
  # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetDatabases.html
  paginator = self.glue_client.get_paginator("get_databases")

@@ -684,10 +686,18 @@ class GlueSource(StatefulIngestionSourceBase):
  pattern += "[?!TargetDatabase]"

  for database in paginator_response.search(pattern):
- if self.source_config.database_pattern.allowed(database["Name"]):
+ if (not self.source_config.database_pattern.allowed(database["Name"])) or (
+ self.source_config.catalog_id
+ and database.get("CatalogId")
+ and database.get("CatalogId") != self.source_config.catalog_id
+ ):
+ self.report.databases.dropped(database["Name"])
+ else:
+ self.report.databases.processed(database["Name"])
  yield database

  def get_tables_from_database(self, database: Mapping[str, Any]) -> Iterable[Dict]:
+ logger.debug(f"Getting tables from database {database['Name']}")
  # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetTables.html
  paginator = self.glue_client.get_paginator("get_tables")
  database_name = database["Name"]
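The last hunk above changes database selection from a simple allow-list check to a combined check: a database is dropped when its name fails `database_pattern`, or when a `catalog_id` is configured and the database reports a different `CatalogId`, and each decision is recorded on the new `databases` filter report. A minimal, self-contained sketch of that predicate (the helper name is hypothetical, and the config/report objects are collapsed into plain arguments):

```python
from typing import Any, Mapping, Optional


def should_drop_database(
    database: Mapping[str, Any],
    name_allowed: bool,
    configured_catalog_id: Optional[str],
) -> bool:
    # Mirrors the condition in get_all_databases(): drop on a pattern miss,
    # or when the database's CatalogId differs from the configured catalog_id.
    catalog_mismatch = bool(
        configured_catalog_id
        and database.get("CatalogId")
        and database.get("CatalogId") != configured_catalog_id
    )
    return (not name_allowed) or catalog_mismatch


# A database that belongs to another catalog is dropped even if its name matches.
assert should_drop_database({"Name": "sales", "CatalogId": "222"}, True, "111")
assert not should_drop_database({"Name": "sales", "CatalogId": "111"}, True, "111")
```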
datahub/ingestion/source/dbt/dbt_cloud.py CHANGED
@@ -1,7 +1,7 @@
  import logging
  from datetime import datetime
  from json import JSONDecodeError
- from typing import Dict, List, Optional, Tuple
+ from typing import Dict, List, Literal, Optional, Tuple
  from urllib.parse import urlparse

  import dateutil.parser
@@ -62,6 +62,11 @@ class DBTCloudConfig(DBTCommonConfig):
  description="The ID of the run to ingest metadata from. If not specified, we'll default to the latest run.",
  )

+ external_url_mode: Literal["explore", "ide"] = Field(
+ default="explore",
+ description='Where should the "View in dbt" link point to - either the "Explore" UI or the dbt Cloud IDE',
+ )
+
  @root_validator(pre=True)
  def set_metadata_endpoint(cls, values: dict) -> dict:
  if values.get("access_url") and not values.get("metadata_endpoint"):
@@ -527,5 +532,7 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
  )

  def get_external_url(self, node: DBTNode) -> Optional[str]:
- # TODO: Once dbt Cloud supports deep linking to specific files, we can use that.
- return f"{self.config.access_url}/develop/{self.config.account_id}/projects/{self.config.project_id}"
+ if self.config.external_url_mode == "explore":
+ return f"{self.config.access_url}/explore/{self.config.account_id}/projects/{self.config.project_id}/environments/production/details/{node.dbt_name}"
+ else:
+ return f"{self.config.access_url}/develop/{self.config.account_id}/projects/{self.config.project_id}"
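The new `external_url_mode` option switches the "View in dbt" link between dbt Cloud's Explore UI (a per-node deep link) and the IDE (a project-level link). A hedged, standalone sketch of the two URL shapes produced by `get_external_url` above; the concrete values below are placeholders, not real identifiers:

```python
def external_url(mode: str, access_url: str, account_id: int, project_id: int, dbt_name: str) -> str:
    # "explore" deep-links to the node in the Explore UI; anything else falls
    # back to the project-level IDE link, as in the diff above.
    if mode == "explore":
        return (
            f"{access_url}/explore/{account_id}/projects/{project_id}"
            f"/environments/production/details/{dbt_name}"
        )
    return f"{access_url}/develop/{account_id}/projects/{project_id}"


print(external_url("explore", "https://cloud.getdbt.com", 1234, 5678, "model.jaffle_shop.orders"))
print(external_url("ide", "https://cloud.getdbt.com", 1234, 5678, "model.jaffle_shop.orders"))
```

Since the default is `"explore"`, setups that relied on the old IDE link can set `external_url_mode` to `"ide"` to keep the previous behavior.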
datahub/ingestion/source/gc/datahub_gc.py CHANGED
@@ -34,6 +34,7 @@ from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import (
  SoftDeletedEntitiesCleanupConfig,
  SoftDeletedEntitiesReport,
  )
+ from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport

  logger = logging.getLogger(__name__)

@@ -86,6 +87,7 @@ class DataHubGcSourceReport(
  DataProcessCleanupReport,
  SoftDeletedEntitiesReport,
  DatahubExecutionRequestCleanupReport,
+ IngestionStageReport,
  ):
  expired_tokens_revoked: int = 0

@@ -139,31 +141,40 @@ class DataHubGcSource(Source):
  ) -> Iterable[MetadataWorkUnit]:
  if self.config.cleanup_expired_tokens:
  try:
+ self.report.report_ingestion_stage_start("Expired Token Cleanup")
  self.revoke_expired_tokens()
  except Exception as e:
  self.report.failure("While trying to cleanup expired token ", exc=e)
  if self.config.truncate_indices:
  try:
+ self.report.report_ingestion_stage_start("Truncate Indices")
  self.truncate_indices()
  except Exception as e:
  self.report.failure("While trying to truncate indices ", exc=e)
  if self.config.soft_deleted_entities_cleanup.enabled:
  try:
+ self.report.report_ingestion_stage_start(
+ "Soft Deleted Entities Cleanup"
+ )
  self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
  except Exception as e:
  self.report.failure(
  "While trying to cleanup soft deleted entities ", exc=e
  )
- if self.config.execution_request_cleanup.enabled:
- try:
- self.execution_request_cleanup.run()
- except Exception as e:
- self.report.failure("While trying to cleanup execution request ", exc=e)
  if self.config.dataprocess_cleanup.enabled:
  try:
+ self.report.report_ingestion_stage_start("Data Process Cleanup")
  yield from self.dataprocess_cleanup.get_workunits_internal()
  except Exception as e:
  self.report.failure("While trying to cleanup data process ", exc=e)
+ if self.config.execution_request_cleanup.enabled:
+ try:
+ self.report.report_ingestion_stage_start("Execution request Cleanup")
+ self.execution_request_cleanup.run()
+ except Exception as e:
+ self.report.failure("While trying to cleanup execution request ", exc=e)
+ # Otherwise last stage's duration does not get calculated.
+ self.report.report_ingestion_stage_start("End")
  yield from []

  def truncate_indices(self) -> None:
@@ -281,6 +292,8 @@ class DataHubGcSource(Source):
  list_access_tokens = expired_tokens_res.get("listAccessTokens", {})
  tokens = list_access_tokens.get("tokens", [])
  total = list_access_tokens.get("total", 0)
+ if tokens == []:
+ break
  for token in tokens:
  self.report.expired_tokens_revoked += 1
  token_id = token["id"]
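Each GC phase now reports an ingestion stage before it runs, the execution-request cleanup is moved after the data-process cleanup, and a final "End" stage is reported so the last real stage's duration gets closed out; token revocation also stops paging as soon as an empty token batch comes back. A simplified illustration of the stage-timing pattern (this is not the actual `IngestionStageReport`, just a sketch of why the trailing marker matters):

```python
import time
from typing import Dict, Optional


class StageTimer:
    """Records how long each named stage ran; starting a new stage closes the previous one."""

    def __init__(self) -> None:
        self.durations: Dict[str, float] = {}
        self._current: Optional[str] = None
        self._started_at: float = 0.0

    def report_ingestion_stage_start(self, stage: str) -> None:
        if self._current is not None:
            # Close out the previous stage before opening the next one.
            self.durations[self._current] = time.monotonic() - self._started_at
        self._current = stage
        self._started_at = time.monotonic()


timer = StageTimer()
timer.report_ingestion_stage_start("Expired Token Cleanup")
timer.report_ingestion_stage_start("Truncate Indices")
timer.report_ingestion_stage_start("End")  # without this, "Truncate Indices" would never get a duration
print(timer.durations)
```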
datahub/ingestion/source/gc/execution_request_cleanup.py CHANGED
@@ -1,3 +1,4 @@
+ import datetime
  import logging
  import time
  from typing import Any, Dict, Iterator, Optional
@@ -42,16 +43,28 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
  description="Global switch for this cleanup task",
  )

+ runtime_limit_seconds: int = Field(
+ default=3600,
+ description="Maximum runtime in seconds for the cleanup task",
+ )
+
+ max_read_errors: int = Field(
+ default=10,
+ description="Maximum number of read errors before aborting",
+ )
+
  def keep_history_max_milliseconds(self):
  return self.keep_history_max_days * 24 * 3600 * 1000


  class DatahubExecutionRequestCleanupReport(SourceReport):
- execution_request_cleanup_records_read: int = 0
- execution_request_cleanup_records_preserved: int = 0
- execution_request_cleanup_records_deleted: int = 0
- execution_request_cleanup_read_errors: int = 0
- execution_request_cleanup_delete_errors: int = 0
+ ergc_records_read: int = 0
+ ergc_records_preserved: int = 0
+ ergc_records_deleted: int = 0
+ ergc_read_errors: int = 0
+ ergc_delete_errors: int = 0
+ ergc_start_time: Optional[datetime.datetime] = None
+ ergc_end_time: Optional[datetime.datetime] = None


  class CleanupRecord(BaseModel):
@@ -124,6 +137,13 @@ class DatahubExecutionRequestCleanup:
  params.update(overrides)

  while True:
+ if self._reached_runtime_limit():
+ break
+ if self.report.ergc_read_errors >= self.config.max_read_errors:
+ self.report.failure(
+ f"ergc({self.instance_id}): too many read errors, aborting."
+ )
+ break
  try:
  url = f"{self.graph.config.server}/openapi/v2/entity/{DATAHUB_EXECUTION_REQUEST_ENTITY_NAME}"
  response = self.graph._session.get(url, headers=headers, params=params)
@@ -141,7 +161,7 @@
  logger.error(
  f"ergc({self.instance_id}): failed to fetch next batch of execution requests: {e}"
  )
- self.report.execution_request_cleanup_read_errors += 1
+ self.report.ergc_read_errors += 1

  def _scroll_garbage_records(self):
  state: Dict[str, Dict] = {}
@@ -150,7 +170,7 @@
  running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000

  for entry in self._scroll_execution_requests():
- self.report.execution_request_cleanup_records_read += 1
+ self.report.ergc_records_read += 1
  key = entry.ingestion_source

  # Always delete corrupted records
@@ -171,7 +191,7 @@

  # Do not delete if number of requests is below minimum
  if state[key]["count"] < self.config.keep_history_min_count:
- self.report.execution_request_cleanup_records_preserved += 1
+ self.report.ergc_records_preserved += 1
  continue

  # Do not delete if number of requests do not exceed allowed maximum,
@@ -179,7 +199,7 @@
  if (state[key]["count"] < self.config.keep_history_max_count) and (
  entry.requested_at > state[key]["cutoffTimestamp"]
  ):
- self.report.execution_request_cleanup_records_preserved += 1
+ self.report.ergc_records_preserved += 1
  continue

  # Do not delete if status is RUNNING or PENDING and created within last month. If the record is >month old and it did not
@@ -188,7 +208,7 @@
  "RUNNING",
  "PENDING",
  ]:
- self.report.execution_request_cleanup_records_preserved += 1
+ self.report.ergc_records_preserved += 1
  continue

  # Otherwise delete current record
@@ -200,7 +220,7 @@
  f"record timestamp: {entry.requested_at}."
  )
  )
- self.report.execution_request_cleanup_records_deleted += 1
+ self.report.ergc_records_deleted += 1
  yield entry

  def _delete_entry(self, entry: CleanupRecord) -> None:
@@ -210,17 +230,31 @@
  )
  self.graph.delete_entity(entry.urn, True)
  except Exception as e:
- self.report.execution_request_cleanup_delete_errors += 1
+ self.report.ergc_delete_errors += 1
  logger.error(
  f"ergc({self.instance_id}): failed to delete ExecutionRequest {entry.request_id}: {e}"
  )

+ def _reached_runtime_limit(self) -> bool:
+ if (
+ self.config.runtime_limit_seconds
+ and self.report.ergc_start_time
+ and (
+ datetime.datetime.now() - self.report.ergc_start_time
+ >= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
+ )
+ ):
+ logger.info(f"ergc({self.instance_id}): max runtime reached.")
+ return True
+ return False
+
  def run(self) -> None:
  if not self.config.enabled:
  logger.info(
  f"ergc({self.instance_id}): ExecutionRequest cleaner is disabled."
  )
  return
+ self.report.ergc_start_time = datetime.datetime.now()

  logger.info(
  (
@@ -232,8 +266,11 @@
  )
  )
  for entry in self._scroll_garbage_records():
+ if self._reached_runtime_limit():
+ break
  self._delete_entry(entry)

+ self.report.ergc_end_time = datetime.datetime.now()
  logger.info(
  f"ergc({self.instance_id}): Finished cleanup of ExecutionRequest records."
  )
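The cleanup loop is now bounded in two ways: `runtime_limit_seconds` caps how long the scroll and delete phases may run (checked via the new `_reached_runtime_limit`), and `max_read_errors` aborts the scroll after too many failed fetches, with start/end timestamps recorded on the report. A simplified, self-contained sketch of that bounded-scroll pattern (the page callables and error type are stand-ins, not the real OpenAPI calls):

```python
import datetime
from typing import Callable, Iterable, Iterator, List


def scroll_with_limits(
    pages: List[Callable[[], Iterable[str]]],
    runtime_limit_seconds: int = 3600,
    max_read_errors: int = 10,
) -> Iterator[str]:
    # Record a start time up front, then bail out of the paging loop once the
    # elapsed time exceeds the limit or too many read errors have accumulated.
    start = datetime.datetime.now()
    read_errors = 0
    for page in pages:
        if datetime.datetime.now() - start >= datetime.timedelta(seconds=runtime_limit_seconds):
            break  # runtime limit reached
        if read_errors >= max_read_errors:
            break  # too many failed fetches, abort
        try:
            yield from page()
        except IOError:
            read_errors += 1


def ok_page() -> List[str]:
    return ["record-1", "record-2"]


def failing_page() -> List[str]:
    raise IOError("fetch failed")


# The failing page counts as one read error; scrolling continues afterwards.
print(list(scroll_with_limits([ok_page, failing_page, ok_page], max_read_errors=2)))
```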
datahub/ingestion/source/looker/looker_dataclasses.py CHANGED
@@ -186,16 +186,16 @@ class LookerModel:
  f"traversal_path={traversal_path}, included_files = {included_files}, seen_so_far: {seen_so_far}"
  )
  if "*" not in inc and not included_files:
- reporter.report_failure(
+ reporter.warning(
  title="Error Resolving Include",
- message=f"Cannot resolve include {inc}",
- context=f"Path: {path}",
+ message="Cannot resolve included file",
+ context=f"Include: {inc}, path: {path}, traversal_path: {traversal_path}",
  )
  elif not included_files:
- reporter.report_failure(
+ reporter.warning(
  title="Error Resolving Include",
- message=f"Did not resolve anything for wildcard include {inc}",
- context=f"Path: {path}",
+ message="Did not find anything matching the wildcard include",
+ context=f"Include: {inc}, path: {path}, traversal_path: {traversal_path}",
  )
  # only load files that we haven't seen so far
  included_files = [x for x in included_files if x not in seen_so_far]
@@ -231,9 +231,7 @@
  source_config,
  reporter,
  seen_so_far,
- traversal_path=traversal_path
- + "."
- + pathlib.Path(included_file).stem,
+ traversal_path=f"{traversal_path} -> {pathlib.Path(included_file).stem}",
  )
  )
  except Exception as e:
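Unresolved includes are downgraded from failures to warnings with richer context, and traversal paths are now joined with " -> " instead of ".", which stays readable when file stems themselves contain dots. A tiny illustration of the new formatting (the values are made up):

```python
import pathlib

traversal_path = "base_model"
included_file = "includes/orders.view.lkml"
# Prints: base_model -> orders.view
print(f"{traversal_path} -> {pathlib.Path(included_file).stem}")
```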
datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py CHANGED
@@ -84,13 +84,14 @@ class DataResolverBase(ABC):
  tenant_id: str,
  metadata_api_timeout: int,
  ):
- self.__access_token: Optional[str] = None
- self.__access_token_expiry_time: Optional[datetime] = None
- self.__tenant_id = tenant_id
+ self._access_token: Optional[str] = None
+ self._access_token_expiry_time: Optional[datetime] = None
+
+ self._tenant_id = tenant_id
  # Test connection by generating access token
  logger.info(f"Trying to connect to {self._get_authority_url()}")
  # Power-Bi Auth (Service Principal Auth)
- self.__msal_client = msal.ConfidentialClientApplication(
+ self._msal_client = msal.ConfidentialClientApplication(
  client_id,
  client_credential=client_secret,
  authority=DataResolverBase.AUTHORITY + tenant_id,
@@ -168,18 +169,18 @@ class DataResolverBase(ABC):
  pass

  def _get_authority_url(self):
- return f"{DataResolverBase.AUTHORITY}{self.__tenant_id}"
+ return f"{DataResolverBase.AUTHORITY}{self._tenant_id}"

  def get_authorization_header(self):
  return {Constant.Authorization: self.get_access_token()}

- def get_access_token(self):
- if self.__access_token is not None and not self._is_access_token_expired():
- return self.__access_token
+ def get_access_token(self) -> str:
+ if self._access_token is not None and not self._is_access_token_expired():
+ return self._access_token

  logger.info("Generating PowerBi access token")

- auth_response = self.__msal_client.acquire_token_for_client(
+ auth_response = self._msal_client.acquire_token_for_client(
  scopes=[DataResolverBase.SCOPE]
  )

@@ -193,24 +194,24 @@ class DataResolverBase(ABC):

  logger.info("Generated PowerBi access token")

- self.__access_token = "Bearer {}".format(
+ self._access_token = "Bearer {}".format(
  auth_response.get(Constant.ACCESS_TOKEN)
  )
  safety_gap = 300
- self.__access_token_expiry_time = datetime.now() + timedelta(
+ self._access_token_expiry_time = datetime.now() + timedelta(
  seconds=(
  max(auth_response.get(Constant.ACCESS_TOKEN_EXPIRY, 0) - safety_gap, 0)
  )
  )

- logger.debug(f"{Constant.PBIAccessToken}={self.__access_token}")
+ logger.debug(f"{Constant.PBIAccessToken}={self._access_token}")

- return self.__access_token
+ return self._access_token

  def _is_access_token_expired(self) -> bool:
- if not self.__access_token_expiry_time:
+ if not self._access_token_expiry_time:
  return True
- return self.__access_token_expiry_time < datetime.now()
+ return self._access_token_expiry_time < datetime.now()

  def get_dashboards(self, workspace: Workspace) -> List[Dashboard]:
  """
datahub/ingestion/source/snowflake/snowflake_config.py CHANGED
@@ -138,12 +138,20 @@ class SnowflakeIdentifierConfig(
  description="Whether to convert dataset urns to lowercase.",
  )

-
- class SnowflakeUsageConfig(BaseUsageConfig):
  email_domain: Optional[str] = pydantic.Field(
  default=None,
  description="Email domain of your organization so users can be displayed on UI appropriately.",
  )
+
+ email_as_user_identifier: bool = Field(
+ default=True,
+ description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is "
+ "provided, generates email addresses for snowflake users with unset emails, based on their "
+ "username.",
+ )
+
+
+ class SnowflakeUsageConfig(BaseUsageConfig):
  apply_view_usage_to_tables: bool = pydantic.Field(
  default=False,
  description="Whether to apply view's usage to its base tables. If set to True, usage is applied to base tables only.",
@@ -267,13 +275,6 @@ class SnowflakeV2Config(
  " Map of share name -> details of share.",
  )

- email_as_user_identifier: bool = Field(
- default=True,
- description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is "
- "provided, generates email addresses for snowflake users with unset emails, based on their "
- "username.",
- )
-
  include_assertion_results: bool = Field(
  default=False,
  description="Whether to ingest assertion run results for assertions created using Datahub"
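Both `email_domain` (previously on `SnowflakeUsageConfig`) and `email_as_user_identifier` (previously on `SnowflakeV2Config`) now live on `SnowflakeIdentifierConfig`, grouping the identifier-related settings together. A hypothetical sketch of how the two settings interact, based on their field descriptions above rather than the actual Snowflake source code:

```python
from typing import Optional


def resolve_user_identifier(
    username: str,
    email: Optional[str],
    email_as_user_identifier: bool = True,
    email_domain: Optional[str] = None,
) -> str:
    if email_as_user_identifier:
        if email:
            return email
        if email_domain:
            # Synthesize an email for users whose email is unset.
            return f"{username}@{email_domain}"
    return username


print(resolve_user_identifier("jdoe", None, email_domain="example.com"))  # jdoe@example.com
print(resolve_user_identifier("jdoe", None, email_as_user_identifier=False))  # jdoe
```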