acryl-datahub 0.15.0.1rc8__py3-none-any.whl → 0.15.0.1rc10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-0.15.0.1rc8.dist-info → acryl_datahub-0.15.0.1rc10.dist-info}/METADATA +2492 -2492
- {acryl_datahub-0.15.0.1rc8.dist-info → acryl_datahub-0.15.0.1rc10.dist-info}/RECORD +18 -18
- datahub/__init__.py +1 -1
- datahub/ingestion/source/aws/glue.py +12 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
- datahub/ingestion/source/gc/datahub_gc.py +18 -5
- datahub/ingestion/source/gc/execution_request_cleanup.py +49 -12
- datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
- datahub/ingestion/source/snowflake/snowflake_config.py +10 -9
- datahub/ingestion/source/snowflake/snowflake_queries.py +38 -7
- datahub/ingestion/source/snowflake/snowflake_query.py +5 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
- datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
- datahub/ingestion/source_report/ingestion_stage.py +1 -0
- {acryl_datahub-0.15.0.1rc8.dist-info → acryl_datahub-0.15.0.1rc10.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.1rc8.dist-info → acryl_datahub-0.15.0.1rc10.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc8.dist-info → acryl_datahub-0.15.0.1rc10.dist-info}/top_level.txt +0 -0
{acryl_datahub-0.15.0.1rc8.dist-info → acryl_datahub-0.15.0.1rc10.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-datahub/__init__.py,sha256=
+datahub/__init__.py,sha256=_-iwjV9mhNtK3Q_48sB1x7crxfllh3ay-QVv4WQ8458,577
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
 datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -217,7 +217,7 @@ datahub/ingestion/source/abs/report.py,sha256=fzkTdTewYlWrTk4f2Cyl-e8RV4qw9wEVtm
 datahub/ingestion/source/abs/source.py,sha256=pzxW-R_cWGKPneEhX8JWdTZiX2k1kAZOPKgMxp9mAEI,24533
 datahub/ingestion/source/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/aws/aws_common.py,sha256=DfdQgkJ_s2isFx8WvqKTlAcBk4KE8SgfpmA5BgC3fgY,17716
-datahub/ingestion/source/aws/glue.py,sha256=
+datahub/ingestion/source/aws/glue.py,sha256=r7y1MPDK__BKX_mrJjVa_CEmSXM3Pa02gt19o0sSLE8,56815
 datahub/ingestion/source/aws/s3_boto_utils.py,sha256=Wyp9k9tapsCuw9dyH4FCXJr_wmeLaYFoCtKvrV6SEDk,3892
 datahub/ingestion/source/aws/s3_util.py,sha256=OFypcgmVC6jnZM90-gjcPpAMtTV1lbnreCaMhCzNlzs,2149
 datahub/ingestion/source/aws/sagemaker.py,sha256=Bl2tkBYnrindgx61VHYgNovUF_Kp_fXNcivQn28vC2w,5254
@@ -273,7 +273,7 @@ datahub/ingestion/source/datahub/datahub_source.py,sha256=2jDnsHEzpGhr00qQI9unSU
 datahub/ingestion/source/datahub/report.py,sha256=VHBfCbwFRzdLdB7hQG9ST4EiZxl_vBCU0XxGcZR6Xxs,940
 datahub/ingestion/source/datahub/state.py,sha256=PZoT7sSK1wadVf5vN6phrgr7I6LL7ePP-EJjP1OO0bQ,3507
 datahub/ingestion/source/dbt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/dbt/dbt_cloud.py,sha256=
+datahub/ingestion/source/dbt/dbt_cloud.py,sha256=Joh4AIjlu-UVJw_Hu32bPxT9w25RX4JfUnUhVpiJcJw,18005
 datahub/ingestion/source/dbt/dbt_common.py,sha256=0ddiqNx9sUAGZYDQ8tSr5Qh5ti-kgC4saW1yRRNJXgg,80493
 datahub/ingestion/source/dbt/dbt_core.py,sha256=m6cA9vVd4Nh2arc-T2_xeQoxvreRbMhTDIJuYsx3wHc,22722
 datahub/ingestion/source/dbt/dbt_tests.py,sha256=Q5KISW_AOOWqyxmyOgJQquyX7xlfOqKu9WhrHoLKC0M,9881
@@ -302,9 +302,9 @@ datahub/ingestion/source/fivetran/fivetran.py,sha256=uKbM5czPz-6LOseoh1FwavWDIuL
 datahub/ingestion/source/fivetran/fivetran_log_api.py,sha256=EAak3hJpe75WZSgz6wP_CyAT5Cian2N4a-lb8x1NKHk,12776
 datahub/ingestion/source/fivetran/fivetran_query.py,sha256=vLrTj7e-0NxZ2U4bWTB57pih42WirqPlUvwtIRfStlQ,5275
 datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/gc/datahub_gc.py,sha256=
+datahub/ingestion/source/gc/datahub_gc.py,sha256=WOg3yIaNmwdbSTwytKeSfIUihsM7FMYBip9u2Dnwk3c,12849
 datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=u90XEmW1vRFbvp4CQ8ujPxTGJUyJqO2U6ApcI6mFrjE,16588
-datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=
+datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=sZbdkg3MuPVGf8eeeRg_2khGMZ01QoH4dgJiTxf7Srg,9813
 datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=wRnRaIVUG483tY4nyDkEn6Xi2RL5MjrVvoCoZimqwSg,7514
 datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/gcs/gcs_source.py,sha256=iwvj4JwjyVWRP1Vq106sUtQhh0GuOYVSu9zCa1wCZN0,6189
@@ -334,7 +334,7 @@ datahub/ingestion/source/looker/looker_common.py,sha256=KObx5ZTfldN2EO11eb1LrHI-
 datahub/ingestion/source/looker/looker_config.py,sha256=87WAgdJ_QWdTq25RBwgIqfc2kq7dubSpzbEtXb2ihMw,13182
 datahub/ingestion/source/looker/looker_connection.py,sha256=yDmC6lDsHmL2e_Pw8ULylwOIHPWPp_6gT1iyLvD0fTw,2075
 datahub/ingestion/source/looker/looker_constant.py,sha256=GMKYtNXlpojPxa9azridKfcGLSJwKdUCTesp7U8dIrQ,402
-datahub/ingestion/source/looker/looker_dataclasses.py,sha256=
+datahub/ingestion/source/looker/looker_dataclasses.py,sha256=LjrP5m_A4HV-XeFlSNGVYNuyF0ulxp_qwB82Ss4Iycs,12200
 datahub/ingestion/source/looker/looker_file_loader.py,sha256=c1ewDrIb9VJg1o-asbwX9gL83kgL01vIETzzbmZIhmw,4267
 datahub/ingestion/source/looker/looker_lib_wrapper.py,sha256=0gaYjBv4wkbbLWVgvaAV6JyWAFb0utTG6TCve2d9xss,11511
 datahub/ingestion/source/looker/looker_liquid_tag.py,sha256=mO4G4MNA4YZFvZaDBpdiJ2vP3irC82kY34RdaK4Pbfs,3100
@@ -370,7 +370,7 @@ datahub/ingestion/source/powerbi/m_query/tree_function.py,sha256=h77DunhlgOP0fAg
 datahub/ingestion/source/powerbi/m_query/validator.py,sha256=crG-VZy2XPieiDliP9yVMgiFcc8b2xbZyDFEATXqEAQ,1155
 datahub/ingestion/source/powerbi/rest_api_wrapper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py,sha256=xqAsnNUCP44Wd1rE1m_phbKtNCMJTFJfOX4_2varadg,8298
-datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py,sha256=
+datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py,sha256=8_IIYzcGQR5jcJ3NKg_tIa7VobUEBXzVpvFBaFPUToM,39598
 datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py,sha256=3nGU-_KQe1WMIAPdxtuzulqpAreNsqi0vX0XdrddCU8,26184
 datahub/ingestion/source/powerbi/rest_api_wrapper/profiling_utils.py,sha256=bgcPheyqOj6KdRjDyANDK5yggItglcBIjbGFIwAxSds,1392
 datahub/ingestion/source/powerbi/rest_api_wrapper/query.py,sha256=VNw1Uvli6g0pnu9FpigYmnCdEPbVEipz7vdZU_WmHf4,616
@@ -429,21 +429,21 @@ datahub/ingestion/source/snowflake/constants.py,sha256=22n-0r04nuy-ImxWFFpmbrt_G
 datahub/ingestion/source/snowflake/oauth_config.py,sha256=ol9D3RmruGStJAeL8PYSQguSqcD2HfkjPkMF2AB_eZs,1277
 datahub/ingestion/source/snowflake/oauth_generator.py,sha256=fu2VnREGuJXeTqIV2jx4TwieVnznf83HQkrE0h2DGGM,3423
 datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81xxdeizJn9nJCZ_nMIXgk9N6pEk5o,4803
-datahub/ingestion/source/snowflake/snowflake_config.py,sha256=
+datahub/ingestion/source/snowflake/snowflake_config.py,sha256=jQGSa7ZQs3EsXB9ANShZ4xv9RqrhRfVHRSLeFiDwwxc,17974
 datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJY5rqKNNodXxzg3SS5DF7oA4WXArOA,17793
 datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
 datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=5Lpy_irZlbOFJbvVkgsZSBjdLCT3VZNjlEvttzSQAU4,21121
 datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
-datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=
-datahub/ingestion/source/snowflake/snowflake_query.py,sha256=
+datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=Lhc5FAx8pXiUyfODGNkQJhjThSCIjPqG2R82dHN-jg0,26889
+datahub/ingestion/source/snowflake/snowflake_query.py,sha256=5po2FWz41UVowykJYbTFGxsltbmlHBCPcHG20VOhdOE,38469
 datahub/ingestion/source/snowflake/snowflake_report.py,sha256=_-rD7Q4MzKY8fYzJHSBnGX4gurwujL3UoRzcP_TZURs,6468
 datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=z5ZPgh-TILAz0DeIwDxRCsj980CM2BbftXiFpM1dV_Y,21674
 datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=3AxvKfK9WV9x2f2XNuJ-Cmy4szmXKm1Ky0haRVvyC6w,42340
 datahub/ingestion/source/snowflake/snowflake_shares.py,sha256=maZyFkfrbVogEFM0tTKRiNp9c_1muv6YfleSd3q0umI,6341
 datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=kTmuCtRnvHqM8WBYhWeK4XafJq3ssFL9kcS03jEeWT4,5506
 datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=fyfWmFVz2WZrpTJWNIe9m0WpDHgeFrGPf8diORJZUwo,6212
-datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=
-datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=
+datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=0rXgz8bvRiI9SYVMa0UGLeg_DcjqBy6kQsdq0Uq0HVk,24685
+datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=MoI8-DR9tuMuHMBQcpDo4GFjvcoQZWLNkdFZsTkgK-M,12786
 datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=hIWtzlxuSQ_3w48o4AF2l9CQOcWIe6AmD07I89sH2B0,31860
 datahub/ingestion/source/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/sql/athena.py,sha256=G3cIY8H_76lIUAzQWW2kLnZOEsfbakmojxbiHb3dYZ8,24059
@@ -517,7 +517,7 @@ datahub/ingestion/source_config/csv_enricher.py,sha256=IROxxfFJA56dHkmmbjjhb7h1p
 datahub/ingestion/source_config/operation_config.py,sha256=Q0NlqiEh4s4DFIII5NsAp5hxWTVyyJz-ldcQmH-B47s,3504
 datahub/ingestion/source_config/pulsar.py,sha256=sklDkh62CrWV-i7Ifh6R3T3smYVso6gyRJG8HVc6RdA,5533
 datahub/ingestion/source_report/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source_report/ingestion_stage.py,sha256=
+datahub/ingestion/source_report/ingestion_stage.py,sha256=gbYmnio3fAOsjh_RzU3j_5UGu7bYBwUM4bm7S8ID_IU,1649
 datahub/ingestion/source_report/pulsar.py,sha256=iKhzy644AjoFTV-gxyqBoXKMLwSMPxJFxU-3WDQRww0,1037
 datahub/ingestion/source_report/time_window.py,sha256=9yI5l2S1DcF7ClvUHLeN8m62I5vlhV9k-aQqSZh2l7w,229
 datahub/ingestion/transformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -982,8 +982,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-0.15.0.1rc8.dist-info/METADATA,sha256=
-acryl_datahub-0.15.0.1rc8.dist-info/WHEEL,sha256=
-acryl_datahub-0.15.0.1rc8.dist-info/entry_points.txt,sha256=
-acryl_datahub-0.15.0.1rc8.dist-info/top_level.txt,sha256=
-acryl_datahub-0.15.0.1rc8.dist-info/RECORD,,
+acryl_datahub-0.15.0.1rc10.dist-info/METADATA,sha256=GCgEH25gXF0roGuAivBGRw1IyiBv_Xv4wbWj9jGlpIo,173645
+acryl_datahub-0.15.0.1rc10.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+acryl_datahub-0.15.0.1rc10.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
+acryl_datahub-0.15.0.1rc10.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-0.15.0.1rc10.dist-info/RECORD,,
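Each RECORD row has the form path,sha256=<hash>,<size>, where the hash is the urlsafe-base64 SHA-256 digest of the file contents with padding stripped. A minimal sketch of how such a row is computed (an illustrative helper, not something shipped in the package):

import base64
import hashlib


def record_entry(path: str) -> str:
    # Read the file, hash it, and emit "path,sha256=<digest>,<size>" as in RECORD.
    with open(path, "rb") as f:
        data = f.read()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"


# record_entry("datahub/__main__.py") would reproduce that file's RECORD row.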
datahub/__init__.py
CHANGED
datahub/ingestion/source/aws/glue.py
CHANGED
@@ -52,6 +52,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.report import EntityFilterReport
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws import s3_util
@@ -115,7 +116,6 @@ from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_col
 
 logger = logging.getLogger(__name__)
 
-
 DEFAULT_PLATFORM = "glue"
 VALID_PLATFORMS = [DEFAULT_PLATFORM, "athena"]
 
@@ -220,6 +220,7 @@ class GlueSourceConfig(
 class GlueSourceReport(StaleEntityRemovalSourceReport):
     tables_scanned = 0
     filtered: List[str] = dataclass_field(default_factory=list)
+    databases: EntityFilterReport = EntityFilterReport.field(type="database")
 
     num_job_script_location_missing: int = 0
     num_job_script_location_invalid: int = 0
@@ -668,6 +669,7 @@ class GlueSource(StatefulIngestionSourceBase):
         return MetadataWorkUnit(id=f'{job_name}-{node["Id"]}', mce=mce)
 
     def get_all_databases(self) -> Iterable[Mapping[str, Any]]:
+        logger.debug("Getting all databases")
         # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetDatabases.html
         paginator = self.glue_client.get_paginator("get_databases")
 
@@ -684,10 +686,18 @@ class GlueSource(StatefulIngestionSourceBase):
             pattern += "[?!TargetDatabase]"
 
         for database in paginator_response.search(pattern):
-            if self.source_config.database_pattern.allowed(database["Name"]):
+            if (not self.source_config.database_pattern.allowed(database["Name"])) or (
+                self.source_config.catalog_id
+                and database.get("CatalogId")
+                and database.get("CatalogId") != self.source_config.catalog_id
+            ):
+                self.report.databases.dropped(database["Name"])
+            else:
+                self.report.databases.processed(database["Name"])
                 yield database
 
     def get_tables_from_database(self, database: Mapping[str, Any]) -> Iterable[Dict]:
+        logger.debug(f"Getting tables from database {database['Name']}")
         # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetTables.html
         paginator = self.glue_client.get_paginator("get_tables")
         database_name = database["Name"]
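The reshaped get_all_databases loop reduces to a single keep/drop predicate: a database is dropped when its name fails database_pattern, or when a catalog_id is configured and the database reports a different one, and each outcome is tallied on the new databases EntityFilterReport. A standalone sketch of that predicate, with a plain callable standing in for AllowDenyPattern.allowed:

from typing import Callable, Optional


def should_drop(
    name: str,
    catalog_id: Optional[str],
    configured_catalog_id: Optional[str],
    allowed: Callable[[str], bool],
) -> bool:
    # Drop when the name fails the pattern, or when a catalog is configured
    # and the database belongs to a different one.
    if not allowed(name):
        return True
    return bool(
        configured_catalog_id and catalog_id and catalog_id != configured_catalog_id
    )


assert should_drop("analytics", "111122223333", "444455556666", lambda n: True)
assert not should_drop("analytics", None, "444455556666", lambda n: True)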
datahub/ingestion/source/dbt/dbt_cloud.py
CHANGED
@@ -1,7 +1,7 @@
 import logging
 from datetime import datetime
 from json import JSONDecodeError
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Literal, Optional, Tuple
 from urllib.parse import urlparse
 
 import dateutil.parser
@@ -62,6 +62,11 @@ class DBTCloudConfig(DBTCommonConfig):
         description="The ID of the run to ingest metadata from. If not specified, we'll default to the latest run.",
     )
 
+    external_url_mode: Literal["explore", "ide"] = Field(
+        default="explore",
+        description='Where should the "View in dbt" link point to - either the "Explore" UI or the dbt Cloud IDE',
+    )
+
     @root_validator(pre=True)
     def set_metadata_endpoint(cls, values: dict) -> dict:
         if values.get("access_url") and not values.get("metadata_endpoint"):
@@ -527,5 +532,7 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
         )
 
     def get_external_url(self, node: DBTNode) -> Optional[str]:
-
-
+        if self.config.external_url_mode == "explore":
+            return f"{self.config.access_url}/explore/{self.config.account_id}/projects/{self.config.project_id}/environments/production/details/{node.dbt_name}"
+        else:
+            return f"{self.config.access_url}/develop/{self.config.account_id}/projects/{self.config.project_id}"
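The new external_url_mode switch only changes which dbt Cloud page the "View in dbt" link targets. A self-contained rendering of the two URL shapes, using placeholder account and project values:

def external_url(
    mode: str, access_url: str, account_id: int, project_id: int, dbt_name: str
) -> str:
    # Mirrors the branch in get_external_url: "explore" links to the Explore UI,
    # anything else falls through to the IDE-style /develop URL.
    if mode == "explore":
        return (
            f"{access_url}/explore/{account_id}/projects/{project_id}"
            f"/environments/production/details/{dbt_name}"
        )
    return f"{access_url}/develop/{account_id}/projects/{project_id}"


print(external_url("explore", "https://cloud.getdbt.com", 123, 456, "model.jaffle.orders"))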
datahub/ingestion/source/gc/datahub_gc.py
CHANGED
@@ -34,6 +34,7 @@ from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import (
     SoftDeletedEntitiesCleanupConfig,
     SoftDeletedEntitiesReport,
 )
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 
 logger = logging.getLogger(__name__)
 
@@ -86,6 +87,7 @@ class DataHubGcSourceReport(
     DataProcessCleanupReport,
     SoftDeletedEntitiesReport,
     DatahubExecutionRequestCleanupReport,
+    IngestionStageReport,
 ):
     expired_tokens_revoked: int = 0
 
@@ -139,31 +141,40 @@ class DataHubGcSource(Source):
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.cleanup_expired_tokens:
             try:
+                self.report.report_ingestion_stage_start("Expired Token Cleanup")
                 self.revoke_expired_tokens()
             except Exception as e:
                 self.report.failure("While trying to cleanup expired token ", exc=e)
         if self.config.truncate_indices:
             try:
+                self.report.report_ingestion_stage_start("Truncate Indices")
                 self.truncate_indices()
             except Exception as e:
                 self.report.failure("While trying to truncate indices ", exc=e)
         if self.config.soft_deleted_entities_cleanup.enabled:
             try:
+                self.report.report_ingestion_stage_start(
+                    "Soft Deleted Entities Cleanup"
+                )
                 self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
             except Exception as e:
                 self.report.failure(
                     "While trying to cleanup soft deleted entities ", exc=e
                 )
-        if self.config.execution_request_cleanup.enabled:
-            try:
-                self.execution_request_cleanup.run()
-            except Exception as e:
-                self.report.failure("While trying to cleanup execution request ", exc=e)
         if self.config.dataprocess_cleanup.enabled:
             try:
+                self.report.report_ingestion_stage_start("Data Process Cleanup")
                 yield from self.dataprocess_cleanup.get_workunits_internal()
             except Exception as e:
                 self.report.failure("While trying to cleanup data process ", exc=e)
+        if self.config.execution_request_cleanup.enabled:
+            try:
+                self.report.report_ingestion_stage_start("Execution request Cleanup")
+                self.execution_request_cleanup.run()
+            except Exception as e:
+                self.report.failure("While trying to cleanup execution request ", exc=e)
+        # Otherwise last stage's duration does not get calculated.
+        self.report.report_ingestion_stage_start("End")
         yield from []
 
     def truncate_indices(self) -> None:
@@ -281,6 +292,8 @@ class DataHubGcSource(Source):
             list_access_tokens = expired_tokens_res.get("listAccessTokens", {})
             tokens = list_access_tokens.get("tokens", [])
             total = list_access_tokens.get("total", 0)
+            if tokens == []:
+                break
             for token in tokens:
                 self.report.expired_tokens_revoked += 1
                 token_id = token["id"]
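The report class now mixes in IngestionStageReport, whose report_ingestion_stage_start call both opens a new stage and closes the previous one, which is why the method ends with a synthetic "End" stage. A hypothetical StageTimer (not DataHub's actual class) showing that pattern:

import time
from typing import Dict, Optional


class StageTimer:
    # Hypothetical condensation of the stage-timing pattern: starting a stage
    # records the duration of whichever stage was running before it.
    def __init__(self) -> None:
        self.durations: Dict[str, float] = {}
        self._current: Optional[str] = None
        self._started_at: float = 0.0

    def report_ingestion_stage_start(self, stage: str) -> None:
        if self._current is not None:
            self.durations[self._current] = time.perf_counter() - self._started_at
        self._current = stage
        self._started_at = time.perf_counter()


timer = StageTimer()
timer.report_ingestion_stage_start("Expired Token Cleanup")
timer.report_ingestion_stage_start("End")  # closes and times the previous stage
print(timer.durations)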
datahub/ingestion/source/gc/execution_request_cleanup.py
CHANGED
@@ -1,3 +1,4 @@
+import datetime
 import logging
 import time
 from typing import Any, Dict, Iterator, Optional
@@ -42,16 +43,28 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
         description="Global switch for this cleanup task",
     )
 
+    runtime_limit_seconds: int = Field(
+        default=3600,
+        description="Maximum runtime in seconds for the cleanup task",
+    )
+
+    max_read_errors: int = Field(
+        default=10,
+        description="Maximum number of read errors before aborting",
+    )
+
     def keep_history_max_milliseconds(self):
         return self.keep_history_max_days * 24 * 3600 * 1000
 
 
 class DatahubExecutionRequestCleanupReport(SourceReport):
-
-
-
-
-
+    ergc_records_read: int = 0
+    ergc_records_preserved: int = 0
+    ergc_records_deleted: int = 0
+    ergc_read_errors: int = 0
+    ergc_delete_errors: int = 0
+    ergc_start_time: Optional[datetime.datetime] = None
+    ergc_end_time: Optional[datetime.datetime] = None
 
 
 class CleanupRecord(BaseModel):
@@ -124,6 +137,13 @@ class DatahubExecutionRequestCleanup:
         params.update(overrides)
 
         while True:
+            if self._reached_runtime_limit():
+                break
+            if self.report.ergc_read_errors >= self.config.max_read_errors:
+                self.report.failure(
+                    f"ergc({self.instance_id}): too many read errors, aborting."
+                )
+                break
             try:
                 url = f"{self.graph.config.server}/openapi/v2/entity/{DATAHUB_EXECUTION_REQUEST_ENTITY_NAME}"
                 response = self.graph._session.get(url, headers=headers, params=params)
@@ -141,7 +161,7 @@ class DatahubExecutionRequestCleanup:
                 logger.error(
                     f"ergc({self.instance_id}): failed to fetch next batch of execution requests: {e}"
                 )
-                self.report.
+                self.report.ergc_read_errors += 1
 
     def _scroll_garbage_records(self):
         state: Dict[str, Dict] = {}
@@ -150,7 +170,7 @@ class DatahubExecutionRequestCleanup:
         running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000
 
         for entry in self._scroll_execution_requests():
-            self.report.
+            self.report.ergc_records_read += 1
             key = entry.ingestion_source
 
             # Always delete corrupted records
@@ -171,7 +191,7 @@ class DatahubExecutionRequestCleanup:
 
             # Do not delete if number of requests is below minimum
            if state[key]["count"] < self.config.keep_history_min_count:
-                self.report.
+                self.report.ergc_records_preserved += 1
                 continue
 
             # Do not delete if number of requests do not exceed allowed maximum,
@@ -179,7 +199,7 @@ class DatahubExecutionRequestCleanup:
             if (state[key]["count"] < self.config.keep_history_max_count) and (
                 entry.requested_at > state[key]["cutoffTimestamp"]
             ):
-                self.report.
+                self.report.ergc_records_preserved += 1
                 continue
 
             # Do not delete if status is RUNNING or PENDING and created within last month. If the record is >month old and it did not
@@ -188,7 +208,7 @@ class DatahubExecutionRequestCleanup:
                 "RUNNING",
                 "PENDING",
             ]:
-                self.report.
+                self.report.ergc_records_preserved += 1
                 continue
 
             # Otherwise delete current record
@@ -200,7 +220,7 @@ class DatahubExecutionRequestCleanup:
                     f"record timestamp: {entry.requested_at}."
                 )
             )
-            self.report.
+            self.report.ergc_records_deleted += 1
            yield entry
 
     def _delete_entry(self, entry: CleanupRecord) -> None:
@@ -210,17 +230,31 @@ class DatahubExecutionRequestCleanup:
             )
             self.graph.delete_entity(entry.urn, True)
         except Exception as e:
-            self.report.
+            self.report.ergc_delete_errors += 1
             logger.error(
                 f"ergc({self.instance_id}): failed to delete ExecutionRequest {entry.request_id}: {e}"
             )
 
+    def _reached_runtime_limit(self) -> bool:
+        if (
+            self.config.runtime_limit_seconds
+            and self.report.ergc_start_time
+            and (
+                datetime.datetime.now() - self.report.ergc_start_time
+                >= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
+            )
+        ):
+            logger.info(f"ergc({self.instance_id}): max runtime reached.")
+            return True
+        return False
+
     def run(self) -> None:
         if not self.config.enabled:
             logger.info(
                 f"ergc({self.instance_id}): ExecutionRequest cleaner is disabled."
             )
             return
+        self.report.ergc_start_time = datetime.datetime.now()
 
         logger.info(
             (
@@ -232,8 +266,11 @@ class DatahubExecutionRequestCleanup:
         )
 
         for entry in self._scroll_garbage_records():
+            if self._reached_runtime_limit():
+                break
             self._delete_entry(entry)
 
+        self.report.ergc_end_time = datetime.datetime.now()
         logger.info(
             f"ergc({self.instance_id}): Finished cleanup of ExecutionRequest records."
         )
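The two new guards bound the cleanup by wall-clock time (runtime_limit_seconds, checked both inside the scroll loop and between deletes) and by tolerated read failures (max_read_errors). A standalone sketch of the combined budget, assuming the same defaults:

import datetime


class CleanupBudget:
    # Hypothetical condensation of the two guards; names mirror the new config.
    def __init__(self, runtime_limit_seconds: int = 3600, max_read_errors: int = 10):
        self.start = datetime.datetime.now()
        self.runtime_limit = datetime.timedelta(seconds=runtime_limit_seconds)
        self.max_read_errors = max_read_errors
        self.read_errors = 0

    def exhausted(self) -> bool:
        # Abort on either budget: elapsed wall-clock time or accumulated read errors.
        elapsed = datetime.datetime.now() - self.start
        return elapsed >= self.runtime_limit or self.read_errors >= self.max_read_errors


budget = CleanupBudget(runtime_limit_seconds=1, max_read_errors=3)
print(budget.exhausted())  # False until a second elapses or 3 read errors accrue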
datahub/ingestion/source/looker/looker_dataclasses.py
CHANGED
@@ -186,16 +186,16 @@ class LookerModel:
                     f"traversal_path={traversal_path}, included_files = {included_files}, seen_so_far: {seen_so_far}"
                 )
                 if "*" not in inc and not included_files:
-                    reporter.
+                    reporter.warning(
                         title="Error Resolving Include",
-                        message=
-                        context=f"
+                        message="Cannot resolve included file",
+                        context=f"Include: {inc}, path: {path}, traversal_path: {traversal_path}",
                     )
                 elif not included_files:
-                    reporter.
+                    reporter.warning(
                         title="Error Resolving Include",
-                        message=
-                        context=f"
+                        message="Did not find anything matching the wildcard include",
+                        context=f"Include: {inc}, path: {path}, traversal_path: {traversal_path}",
                     )
                 # only load files that we haven't seen so far
                 included_files = [x for x in included_files if x not in seen_so_far]
@@ -231,9 +231,7 @@ class LookerModel:
                             source_config,
                             reporter,
                             seen_so_far,
-                            traversal_path=traversal_path
-                            + "."
-                            + pathlib.Path(included_file).stem,
+                            traversal_path=f"{traversal_path} -> {pathlib.Path(included_file).stem}",
                         )
                     )
                 except Exception as e:
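The traversal_path change swaps dot-joining for an arrow-separated trail, which stays unambiguous when a file stem itself contains dots:

import pathlib

traversal_path = "my_model"
included_file = "views/orders.view.lkml"
# Old style dot-joins the stem, blurring the boundary between path segments.
old_style = traversal_path + "." + pathlib.Path(included_file).stem
# New style keeps each hop distinct.
new_style = f"{traversal_path} -> {pathlib.Path(included_file).stem}"
print(old_style)  # my_model.orders.view
print(new_style)  # my_model -> orders.view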
datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py
CHANGED
@@ -84,13 +84,14 @@ class DataResolverBase(ABC):
         tenant_id: str,
         metadata_api_timeout: int,
     ):
-        self.
-        self.
-
+        self._access_token: Optional[str] = None
+        self._access_token_expiry_time: Optional[datetime] = None
+
+        self._tenant_id = tenant_id
         # Test connection by generating access token
         logger.info(f"Trying to connect to {self._get_authority_url()}")
         # Power-Bi Auth (Service Principal Auth)
-        self.
+        self._msal_client = msal.ConfidentialClientApplication(
             client_id,
             client_credential=client_secret,
             authority=DataResolverBase.AUTHORITY + tenant_id,
@@ -168,18 +169,18 @@ class DataResolverBase(ABC):
         pass
 
     def _get_authority_url(self):
-        return f"{DataResolverBase.AUTHORITY}{self.
+        return f"{DataResolverBase.AUTHORITY}{self._tenant_id}"
 
     def get_authorization_header(self):
         return {Constant.Authorization: self.get_access_token()}
 
-    def get_access_token(self):
-        if self.
-        return self.
+    def get_access_token(self) -> str:
+        if self._access_token is not None and not self._is_access_token_expired():
+            return self._access_token
 
         logger.info("Generating PowerBi access token")
 
-        auth_response = self.
+        auth_response = self._msal_client.acquire_token_for_client(
             scopes=[DataResolverBase.SCOPE]
         )
 
@@ -193,24 +194,24 @@ class DataResolverBase(ABC):
 
         logger.info("Generated PowerBi access token")
 
-        self.
+        self._access_token = "Bearer {}".format(
             auth_response.get(Constant.ACCESS_TOKEN)
         )
         safety_gap = 300
-        self.
+        self._access_token_expiry_time = datetime.now() + timedelta(
             seconds=(
                 max(auth_response.get(Constant.ACCESS_TOKEN_EXPIRY, 0) - safety_gap, 0)
             )
         )
 
-        logger.debug(f"{Constant.PBIAccessToken}={self.
+        logger.debug(f"{Constant.PBIAccessToken}={self._access_token}")
 
-        return self.
+        return self._access_token
 
     def _is_access_token_expired(self) -> bool:
-        if not self.
+        if not self._access_token_expiry_time:
             return True
-        return self.
+        return self._access_token_expiry_time < datetime.now()
 
     def get_dashboards(self, workspace: Workspace) -> List[Dashboard]:
         """
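get_access_token now caches the bearer token and only re-acquires it through MSAL once a safety-gapped expiry passes. A reduced sketch of that caching, assuming the standard Power BI scope and placeholder credentials:

from datetime import datetime, timedelta
from typing import Optional

import msal


class TokenCache:
    # Illustrative stand-in for the resolver's caching; client/tenant values
    # are placeholders, not DataHub defaults.
    def __init__(self, client_id: str, client_secret: str, tenant_id: str) -> None:
        self._access_token: Optional[str] = None
        self._expiry: Optional[datetime] = None
        self._app = msal.ConfidentialClientApplication(
            client_id,
            client_credential=client_secret,
            authority=f"https://login.microsoftonline.com/{tenant_id}",
        )

    def get(self) -> str:
        # Reuse the cached token until the safety-gapped expiry passes.
        if self._access_token and self._expiry and self._expiry > datetime.now():
            return self._access_token
        resp = self._app.acquire_token_for_client(
            scopes=["https://analysis.windows.net/powerbi/api/.default"]
        )
        self._access_token = "Bearer {}".format(resp["access_token"])
        # Renew 300s early so in-flight requests never race the real expiry.
        self._expiry = datetime.now() + timedelta(
            seconds=max(resp.get("expires_in", 0) - 300, 0)
        )
        return self._access_token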
datahub/ingestion/source/snowflake/snowflake_config.py
CHANGED
@@ -138,12 +138,20 @@ class SnowflakeIdentifierConfig(
         description="Whether to convert dataset urns to lowercase.",
     )
 
-
-class SnowflakeUsageConfig(BaseUsageConfig):
     email_domain: Optional[str] = pydantic.Field(
         default=None,
         description="Email domain of your organization so users can be displayed on UI appropriately.",
     )
+
+    email_as_user_identifier: bool = Field(
+        default=True,
+        description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is "
+        "provided, generates email addresses for snowflake users with unset emails, based on their "
+        "username.",
+    )
+
+
+class SnowflakeUsageConfig(BaseUsageConfig):
     apply_view_usage_to_tables: bool = pydantic.Field(
         default=False,
         description="Whether to apply view's usage to its base tables. If set to True, usage is applied to base tables only.",
@@ -267,13 +275,6 @@ class SnowflakeV2Config(
         " Map of share name -> details of share.",
     )
 
-    email_as_user_identifier: bool = Field(
-        default=True,
-        description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is "
-        "provided, generates email addresses for snowflake users with unset emails, based on their "
-        "username.",
-    )
-
     include_assertion_results: bool = Field(
         default=False,
         description="Whether to ingest assertion run results for assertions created using Datahub"