acryl-datahub 0.15.0rc13__py3-none-any.whl → 0.15.0rc14__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

@@ -1,4 +1,4 @@
- datahub/__init__.py,sha256=uasr1AqGhDFOT97_MigdNOSRw0ewVjoBizWutajHjN4,575
+ datahub/__init__.py,sha256=TT6lQ2hGZTB3ZKUJKer0P4dfP-zsktBs_ZiZhHSPcV8,575
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -200,13 +200,13 @@ datahub/ingestion/source/mongodb.py,sha256=vZue4Nz0xaBoCUsQr3_0OIRkWRxeE_IH_Y_QK
  datahub/ingestion/source/nifi.py,sha256=ttsjZ9aRUvINmewvKFIQD8Rwa4jcl35WFG-F-jPGPWQ,56146
  datahub/ingestion/source/openapi.py,sha256=3ea2ORz1cuq4e7L2hSjxG9Cw3__pVoJ5UNYTJS3EnKU,17386
  datahub/ingestion/source/openapi_parser.py,sha256=1_68wHWe_SzWYEyC1YVDw9vxoadKjW1yv8DecvyIhwY,13606
- datahub/ingestion/source/preset.py,sha256=eq7h1qKs8nfSBVot1ofN-YgZhw_rzq8DG4cKOGfDHko,3948
+ datahub/ingestion/source/preset.py,sha256=fByqamRLnXxsfCGdLPzWN_5LJR_s2_G2f_zwSKUc8EA,3981
  datahub/ingestion/source/pulsar.py,sha256=H8XJC7xIX8Kdkd7006PxllAGVO_Pjza8Xx9VUBOvpPc,19827
  datahub/ingestion/source/redash.py,sha256=E-a14X19zppPun7_-S-pZ2lRiw1-68QiT-jL7bDzG10,32057
  datahub/ingestion/source/salesforce.py,sha256=S6LSM6mzl8-zKbrJPoINhM1SCpYfM244Xb74pbEI-J0,31792
  datahub/ingestion/source/source_registry.py,sha256=a2mLjJPLkSI-gYCTb_7U7Jo4D8jGknNQ_yScPIihXFk,1208
  datahub/ingestion/source/sql_queries.py,sha256=Ip7UZub7fgMh7P5jL_zJPY7lSkc9GGTy8GJ8lqZrcsE,9502
- datahub/ingestion/source/superset.py,sha256=yf-Jb1eKle3zyf_hPpRrFT4kuyj4ZEnG0SyveSjSHXo,19579
+ datahub/ingestion/source/superset.py,sha256=5hUI83QEArHoDy5tb8rx5P6t-1louxX1Ki1XIplIuFo,24777
  datahub/ingestion/source/abs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/abs/config.py,sha256=Doecl1mA6JshJTNar7oTVR7wnWl4gMu64MBHp3hIVJc,6737
  datahub/ingestion/source/abs/datalake_profiler_config.py,sha256=qh38q-Zw8TUTZD5RF0_hSoEfR6BilNGXyKPRsq1KQKE,3600
@@ -214,17 +214,17 @@ datahub/ingestion/source/abs/profiling.py,sha256=yKNCKpr6w7qpCH-baeSkNE9VjkN6eBo
  datahub/ingestion/source/abs/report.py,sha256=fzkTdTewYlWrTk4f2Cyl-e8RV4qw9wEVtm0cdKD-Xgw,542
  datahub/ingestion/source/abs/source.py,sha256=eH3SIWnbJH30VGNsdNOExFfjfyR9bLuS6KKzMsi6vz4,24339
  datahub/ingestion/source/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/aws/aws_common.py,sha256=_MCtiRLZq1iYMFPXMkguA2FKAAx1qPA7F0MWWk27buw,9971
+ datahub/ingestion/source/aws/aws_common.py,sha256=BqDe19yqHdwj6_sjIryGZ9_5lsAJ0PZhfPfGqLZrCcE,10649
  datahub/ingestion/source/aws/glue.py,sha256=fX0dtaVVq174ZS0aBJvZFYK8ligfZX5EU3pdS3j1KQs,56215
  datahub/ingestion/source/aws/s3_boto_utils.py,sha256=Wyp9k9tapsCuw9dyH4FCXJr_wmeLaYFoCtKvrV6SEDk,3892
  datahub/ingestion/source/aws/s3_util.py,sha256=OFypcgmVC6jnZM90-gjcPpAMtTV1lbnreCaMhCzNlzs,2149
- datahub/ingestion/source/aws/sagemaker.py,sha256=23m8a9-VofWDJZWm4uCrf0MLkFZKbxce7839qDYTh7w,4995
+ datahub/ingestion/source/aws/sagemaker.py,sha256=Bl2tkBYnrindgx61VHYgNovUF_Kp_fXNcivQn28vC2w,5254
  datahub/ingestion/source/aws/sagemaker_processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/aws/sagemaker_processors/common.py,sha256=SSvpOszqJAHoZB3jMZgx8odInEy3lTZZCThhpOWkyvE,2012
+ datahub/ingestion/source/aws/sagemaker_processors/common.py,sha256=NvYfI8LHgDvhEZE7qp6qF1NSZ0_SQKhg3ivtdjsdpFg,2172
  datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py,sha256=bnx6uKwXvzafYhcIl112INTMmotu6xy8FjFNhTO4b6c,10384
  datahub/ingestion/source/aws/sagemaker_processors/job_classes.py,sha256=CfJkzjZU2uvZvw7qvmxfNgeWI1EvgHFY-7bn5Ih71no,9154
- datahub/ingestion/source/aws/sagemaker_processors/jobs.py,sha256=OHLiqeZCTR9GgfmSx6O8oX9ZCd983RiFnx23JTiKZ3I,32395
- datahub/ingestion/source/aws/sagemaker_processors/lineage.py,sha256=dvSCoiZhJLN4Hic5nRH3REI7SxMdMsm_4Ugmv0U8Zdg,9290
+ datahub/ingestion/source/aws/sagemaker_processors/jobs.py,sha256=aHgQ4QMufdWAA62TNBoEPT3YSQKXg39IJ2-6MZXs8sw,32915
+ datahub/ingestion/source/aws/sagemaker_processors/lineage.py,sha256=TcT8xmVuQDQdlRKYaCRXbFjMcW5brfSWFBNpoRdPx1o,9789
  datahub/ingestion/source/aws/sagemaker_processors/models.py,sha256=6Ltmy6MAwbexN_JRYu7LXlAKpihXGlW4WXxo7qdwEF8,19845
  datahub/ingestion/source/azure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/azure/abs_folder_utils.py,sha256=7skXus-4fSIoKpqCeU-GG0ch1oF2SJSYDZ1JMB_Onso,7605
@@ -301,7 +301,7 @@ datahub/ingestion/source/fivetran/fivetran_log_api.py,sha256=EAak3hJpe75WZSgz6wP
  datahub/ingestion/source/fivetran/fivetran_query.py,sha256=vLrTj7e-0NxZ2U4bWTB57pih42WirqPlUvwtIRfStlQ,5275
  datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/gc/datahub_gc.py,sha256=f6Erj3KfD0Hx3ydwL5MUVCZgFzS9c6U2Pkr54JLIUOA,12394
- datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=GOdLVYYfmmHV7Xtr-cwCTiQiRSVwq80Nu0EmARyoEeo,15323
+ datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=ficLiRb9DEx4YFXZqWO8o-6ndVIrNW_yR-Yn2SXfDxc,15836
  datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=cHJmxz4NmA7VjTX2iGEo3wZ_SDrjC_rCQcnRxKgfUVI,8713
  datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=_tms5AqNAJRDRzQmyN_VydzXbdME2lkvTwa5u1La5z8,7353
  datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -355,12 +355,12 @@ datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py,sha256=AIU89l
  datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule,sha256=PRWzuZMMhKdOVoAaE8csHvUFbZHxYe5meJHgrqlgiuw,19795
  datahub/ingestion/source/powerbi/powerbi.py,sha256=7UsAEqaFlkWONcXJdQ2hotUYYn46ks6Fe71KXEMh7lI,54495
  datahub/ingestion/source/powerbi/m_query/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/powerbi/m_query/data_classes.py,sha256=s2Nckmr50hxae5gPFcIfpyLzYpaMH56Q9nsDsEgi_-k,2243
+ datahub/ingestion/source/powerbi/m_query/data_classes.py,sha256=EbaEasEOGZ73jz0cQofH9ez65wSvRBof0R6GQaIVLnM,2009
  datahub/ingestion/source/powerbi/m_query/native_sql_parser.py,sha256=zzKVDGeUM3Yv3-zNah4D6mSnr6jXsstNuLmzczcPQEE,3683
  datahub/ingestion/source/powerbi/m_query/parser.py,sha256=pB1LGdb02Ryf8Pr8JPSgiOmLE6mEAgDbKodtKOOY6LU,5782
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py,sha256=K9kyX-pMFjOOCAvedKXXce7xG-cHwTGQTrBT5GFXEIg,32865
- datahub/ingestion/source/powerbi/m_query/resolver.py,sha256=v2DJYT70Vtw9NyIwFecP7tHGcVxBWH9UqV7vbpYPhws,16985
- datahub/ingestion/source/powerbi/m_query/tree_function.py,sha256=i7HM9Oyj9XdJpNfG2lE8puxeuc47aSJ-5dPdBEcw2WU,6165
+ datahub/ingestion/source/powerbi/m_query/pattern_handler.py,sha256=h1gBh9Gz1zF3m-hA0WMQaBdEtcZUW9vScNKEvqIWCfk,32442
+ datahub/ingestion/source/powerbi/m_query/resolver.py,sha256=TMt84_JaCazpP7vcneW0O3cUjtbIuh8Yid78JWfDxsI,16953
+ datahub/ingestion/source/powerbi/m_query/tree_function.py,sha256=h77DunhlgOP0fAg8UXDXxxInOi7Pay85_d1Ca4YqyKs,6134
  datahub/ingestion/source/powerbi/m_query/validator.py,sha256=crG-VZy2XPieiDliP9yVMgiFcc8b2xbZyDFEATXqEAQ,1155
  datahub/ingestion/source/powerbi/rest_api_wrapper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py,sha256=xqAsnNUCP44Wd1rE1m_phbKtNCMJTFJfOX4_2varadg,8298
@@ -423,13 +423,13 @@ datahub/ingestion/source/snowflake/constants.py,sha256=22n-0r04nuy-ImxWFFpmbrt_G
  datahub/ingestion/source/snowflake/oauth_config.py,sha256=ol9D3RmruGStJAeL8PYSQguSqcD2HfkjPkMF2AB_eZs,1277
  datahub/ingestion/source/snowflake/oauth_generator.py,sha256=fu2VnREGuJXeTqIV2jx4TwieVnznf83HQkrE0h2DGGM,3423
  datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81xxdeizJn9nJCZ_nMIXgk9N6pEk5o,4803
- datahub/ingestion/source/snowflake/snowflake_config.py,sha256=XkBDHH1BWGAp4FhBZCHB3LrPjBxJ28GcyeQgvv-uBxA,18704
+ datahub/ingestion/source/snowflake/snowflake_config.py,sha256=BzxM1EYIV2-KdZz3jphCAhitjpMUadiLLEbL6s-Z2J4,18707
  datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJY5rqKNNodXxzg3SS5DF7oA4WXArOA,17793
  datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
  datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=w2CPm5XEU-KMUSIpb58aKOaxTDHfM5NvghutCVRicy4,23247
  datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
  datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=fu-8S9eADIXZcd_kHc6cBeMa-on9RF9qG3yqjJnS3DE,26085
- datahub/ingestion/source/snowflake/snowflake_query.py,sha256=oNmtg-ZVcZ3-w1X5t-JGv2qTH64Z0qzEnaZaRxbRquo,38035
+ datahub/ingestion/source/snowflake/snowflake_query.py,sha256=e6WodpmNto-I8lmexRd7VO0lxJDxM66MCGnG5dzr1Dk,38067
  datahub/ingestion/source/snowflake/snowflake_report.py,sha256=KjNvYufQMVkFP7F5sEFumKorkiFAmFVCQ1jYqXr0ev0,6419
  datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=fatrKpBUY9CnzXhLJcFlHkHGt0QWFhkYH9ZXwWoQCLA,20392
  datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=JjzhhyEN9QBUv-64sHhkq-4Vq1XhDtz9npLMiqlSICo,38893
@@ -452,7 +452,7 @@ datahub/ingestion/source/sql/mysql.py,sha256=nDWK4YbqomcJgnit9b8geUGrp_3eix4bt0_
  datahub/ingestion/source/sql/oracle.py,sha256=ibBtjaneCFto-Rw3k2OxsbT3YHgux1aCtPtv5oA8St4,24533
  datahub/ingestion/source/sql/postgres.py,sha256=uC1kYEI8VdxiZ1Y9IxMWzwmg11wtMqYN0e2fkok1rxo,11972
  datahub/ingestion/source/sql/presto.py,sha256=PB-CS5MX2dSRFRHjlxfkLHGXLZXFNCsVAAyRBtY6HMg,3611
- datahub/ingestion/source/sql/sql_common.py,sha256=yvYUz7cAkzynxLiZht38TOca0hWPXotalQhZPYd54yc,51114
+ datahub/ingestion/source/sql/sql_common.py,sha256=FMLbRQVSxLWjLNTiBBQVb9SG4F7h9lwsfNzAoKhOyAU,51356
  datahub/ingestion/source/sql/sql_config.py,sha256=M-l_uXau0ODolLZHBzAXhy-Rq5yYxvJ6cLbCIea7Mww,9449
  datahub/ingestion/source/sql/sql_generic.py,sha256=9AERvkK8kdJUeDOzCYJDb93xdv6Z4DGho0NfeHj5Uyg,2740
  datahub/ingestion/source/sql/sql_generic_profiler.py,sha256=6QbhkQH_F13GV1HsavVTq3BE9F7Pr_vfGOjCX2o2c60,11675
@@ -467,7 +467,7 @@ datahub/ingestion/source/sql/two_tier_sql_source.py,sha256=YDrGBb5WKVls6qv17QU5f
  datahub/ingestion/source/sql/vertica.py,sha256=_9OgSgIgqBml0av063rb8nACiT3SAmzpw0ouyF91wv8,33382
  datahub/ingestion/source/sql/mssql/__init__.py,sha256=1agpl8S_uDW40olkhCX_W19dbr5GO9qgjS3R7pLRZSk,87
  datahub/ingestion/source/sql/mssql/job_models.py,sha256=eMyR0Efl5kvi7QNgNXzd5_6PdDKYly_552Y8OGSj9PY,6012
- datahub/ingestion/source/sql/mssql/source.py,sha256=lOfvNUGfxLmgCAops2jwm1OTBvJxgBivHcEj-9DEaAE,30390
+ datahub/ingestion/source/sql/mssql/source.py,sha256=-B0bnFKEReciYzQ4p_2xJJzdn-H8vYz2MQ_h-1B0ibs,30329
  datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py,sha256=RpnvKPalAAaOD_eUg8bZ4VkGTSeLFWuy0mefwc4s3x8,2837
  datahub/ingestion/source/state/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/state/checkpoint.py,sha256=x9Xww-MIFXSKjeg1tOZXE72LehCm5OfKy3HfucgIRWM,8833
@@ -974,8 +974,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-0.15.0rc13.dist-info/METADATA,sha256=ZDtXy_rVYg4PJ-roJ3o-vXLsdQl6snJmceHd8l03Qm4,174408
- acryl_datahub-0.15.0rc13.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- acryl_datahub-0.15.0rc13.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
- acryl_datahub-0.15.0rc13.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-0.15.0rc13.dist-info/RECORD,,
+ acryl_datahub-0.15.0rc14.dist-info/METADATA,sha256=GAon0PKaDuM17zZePUxkbcfczQk1bvUN_FVAcWCPVgI,174408
+ acryl_datahub-0.15.0rc14.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ acryl_datahub-0.15.0rc14.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
+ acryl_datahub-0.15.0rc14.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-0.15.0rc14.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings

  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "0.15.0rc13"
+ __version__ = "0.15.0rc14"


  def is_dev_mode() -> bool:
datahub/ingestion/source/aws/aws_common.py CHANGED
@@ -1,5 +1,5 @@
  from datetime import datetime, timedelta, timezone
- from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union

  import boto3
  from boto3.session import Session
@@ -107,6 +107,14 @@ class AwsConnectionConfig(ConfigModel):
  default=None,
  description="A set of proxy configs to use with AWS. See the [botocore.config](https://botocore.amazonaws.com/v1/documentation/api/latest/reference/config.html) docs for details.",
  )
+ aws_retry_num: int = Field(
+ default=5,
+ description="Number of times to retry failed AWS requests. See the [botocore.retry](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html) docs for details.",
+ )
+ aws_retry_mode: Literal["legacy", "standard", "adaptive"] = Field(
+ default="standard",
+ description="Retry mode to use for failed AWS requests. See the [botocore.retry](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html) docs for details.",
+ )

  read_timeout: float = Field(
  default=DEFAULT_TIMEOUT,
@@ -199,6 +207,10 @@ class AwsConnectionConfig(ConfigModel):
  return Config(
  proxies=self.aws_proxy,
  read_timeout=self.read_timeout,
+ retries={
+ "max_attempts": self.aws_retry_num,
+ "mode": self.aws_retry_mode,
+ },
  **self.aws_advanced_config,
  )

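The two new retry options above are handed straight to botocore's retry configuration when the connection settings are turned into a botocore Config object. A minimal standalone sketch of that mapping, using plain boto3/botocore rather than the DataHub config class (the sagemaker client is just an example):

    import boto3
    from botocore.config import Config

    # Equivalent of the new defaults aws_retry_num=5, aws_retry_mode="standard":
    # botocore retries take a max_attempts count and a mode of
    # "legacy", "standard", or "adaptive".
    retry_config = Config(retries={"max_attempts": 5, "mode": "standard"})

    # Any client built with this config retries throttled or failed API calls.
    sagemaker_client = boto3.client("sagemaker", config=retry_config)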
datahub/ingestion/source/aws/sagemaker.py CHANGED
@@ -1,3 +1,4 @@
+ import logging
  from collections import defaultdict
  from typing import TYPE_CHECKING, DefaultDict, Dict, Iterable, List, Optional

@@ -36,6 +37,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
  if TYPE_CHECKING:
  from mypy_boto3_sagemaker import SageMakerClient

+ logger = logging.getLogger(__name__)
+

  @platform_name("SageMaker")
  @config_class(SagemakerSourceConfig)
@@ -75,6 +78,7 @@ class SagemakerSource(StatefulIngestionSourceBase):
  ]

  def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+ logger.info("Starting SageMaker ingestion...")
  # get common lineage graph
  lineage_processor = LineageProcessor(
  sagemaker_client=self.sagemaker_client, env=self.env, report=self.report
@@ -83,6 +87,7 @@ class SagemakerSource(StatefulIngestionSourceBase):

  # extract feature groups if specified
  if self.source_config.extract_feature_groups:
+ logger.info("Extracting feature groups...")
  feature_group_processor = FeatureGroupProcessor(
  sagemaker_client=self.sagemaker_client, env=self.env, report=self.report
  )
@@ -95,6 +100,7 @@ class SagemakerSource(StatefulIngestionSourceBase):

  # extract jobs if specified
  if self.source_config.extract_jobs is not False:
+ logger.info("Extracting jobs...")
  job_processor = JobProcessor(
  sagemaker_client=self.client_factory.get_client,
  env=self.env,
@@ -109,6 +115,8 @@ class SagemakerSource(StatefulIngestionSourceBase):

  # extract models if specified
  if self.source_config.extract_models:
+ logger.info("Extracting models...")
+
  model_processor = ModelProcessor(
  sagemaker_client=self.sagemaker_client,
  env=self.env,
datahub/ingestion/source/aws/sagemaker_processors/common.py CHANGED
@@ -40,8 +40,11 @@ class SagemakerSourceReport(StaleEntityRemovalSourceReport):
  groups_scanned = 0
  models_scanned = 0
  jobs_scanned = 0
+ jobs_processed = 0
  datasets_scanned = 0
  filtered: List[str] = field(default_factory=list)
+ model_endpoint_lineage = 0
+ model_group_lineage = 0

  def report_feature_group_scanned(self) -> None:
  self.feature_groups_scanned += 1
@@ -58,6 +61,9 @@ class SagemakerSourceReport(StaleEntityRemovalSourceReport):
  def report_model_scanned(self) -> None:
  self.models_scanned += 1

+ def report_job_processed(self) -> None:
+ self.jobs_processed += 1
+
  def report_job_scanned(self) -> None:
  self.jobs_scanned += 1

datahub/ingestion/source/aws/sagemaker_processors/jobs.py CHANGED
@@ -1,3 +1,4 @@
+ import logging
  from collections import defaultdict
  from dataclasses import dataclass, field
  from enum import Enum
@@ -49,6 +50,8 @@ from datahub.metadata.schema_classes import (
  if TYPE_CHECKING:
  from mypy_boto3_sagemaker import SageMakerClient

+ logger = logging.getLogger(__name__)
+
  JobInfo = TypeVar(
  "JobInfo",
  AutoMlJobInfo,
@@ -274,15 +277,18 @@ class JobProcessor:
  )

  def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+ logger.info("Getting all SageMaker jobs")
  jobs = self.get_all_jobs()

  processed_jobs: Dict[str, SageMakerJob] = {}

+ logger.info("Processing SageMaker jobs")
  # first pass: process jobs and collect datasets used
+ logger.info("first pass: process jobs and collect datasets used")
  for job in jobs:
  job_type = job_type_to_info[job["type"]]
  job_name = job[job_type.list_name_key]
-
+ logger.debug(f"Processing job {job_name} with type {job_type}")
  job_details = self.get_job_details(job_name, job["type"])

  processed_job = getattr(self, job_type.processor)(job_details)
@@ -293,6 +299,9 @@ class JobProcessor:
  # second pass:
  # - move output jobs to inputs
  # - aggregate i/o datasets
+ logger.info(
+ "second pass: move output jobs to inputs and aggregate i/o datasets"
+ )
  for job_urn in sorted(processed_jobs):
  processed_job = processed_jobs[job_urn]

@@ -301,6 +310,7 @@ class JobProcessor:

  all_datasets.update(processed_job.input_datasets)
  all_datasets.update(processed_job.output_datasets)
+ self.report.report_job_processed()

  # yield datasets
  for dataset_urn, dataset in all_datasets.items():
@@ -322,6 +332,7 @@ class JobProcessor:
  self.report.report_dataset_scanned()

  # third pass: construct and yield MCEs
+ logger.info("third pass: construct and yield MCEs")
  for job_urn in sorted(processed_jobs):
  processed_job = processed_jobs[job_urn]
  job_snapshot = processed_job.job_snapshot
datahub/ingestion/source/aws/sagemaker_processors/lineage.py CHANGED
@@ -1,3 +1,4 @@
+ import logging
  from collections import defaultdict
  from dataclasses import dataclass, field
  from typing import TYPE_CHECKING, Any, DefaultDict, Dict, List, Set
@@ -6,6 +7,8 @@ from datahub.ingestion.source.aws.sagemaker_processors.common import (
  SagemakerSourceReport,
  )

+ logger = logging.getLogger(__name__)
+
  if TYPE_CHECKING:
  from mypy_boto3_sagemaker import SageMakerClient
  from mypy_boto3_sagemaker.type_defs import (
@@ -88,7 +91,6 @@ class LineageProcessor:
  paginator = self.sagemaker_client.get_paginator("list_contexts")
  for page in paginator.paginate():
  contexts += page["ContextSummaries"]
-
  return contexts

  def get_incoming_edges(self, node_arn: str) -> List["AssociationSummaryTypeDef"]:
@@ -225,27 +227,32 @@ class LineageProcessor:
  """
  Get the lineage of all artifacts in SageMaker.
  """
-
+ logger.info("Getting lineage for SageMaker artifacts...")
+ logger.info("Getting all actions")
  for action in self.get_all_actions():
  self.nodes[action["ActionArn"]] = {**action, "node_type": "action"}
+ logger.info("Getting all artifacts")
  for artifact in self.get_all_artifacts():
  self.nodes[artifact["ArtifactArn"]] = {**artifact, "node_type": "artifact"}
+ logger.info("Getting all contexts")
  for context in self.get_all_contexts():
  self.nodes[context["ContextArn"]] = {**context, "node_type": "context"}

+ logger.info("Getting lineage for model deployments and model groups")
  for node_arn, node in self.nodes.items():
+ logger.debug(f"Getting lineage for node {node_arn}")
  # get model-endpoint lineage
  if (
  node["node_type"] == "action"
  and node.get("ActionType") == "ModelDeployment"
  ):
  self.get_model_deployment_lineage(node_arn)
-
+ self.report.model_endpoint_lineage += 1
  # get model-group lineage
  if (
  node["node_type"] == "context"
  and node.get("ContextType") == "ModelGroup"
  ):
  self.get_model_group_lineage(node_arn, node)
-
+ self.report.model_group_lineage += 1
  return self.lineage_info
datahub/ingestion/source/gc/dataprocess_cleanup.py CHANGED
@@ -207,6 +207,9 @@ class DataProcessCleanup:
  assert self.ctx.graph
  dpis = []
  start = 0
+ # This graphql endpoint doesn't support scrolling and therefore after 10k DPIs it causes performance issues on ES
+ # Therefore, we are limiting the max DPIs to 9000
+ max_item = 9000
  while True:
  try:
  job_query_result = self.ctx.graph.execute_graphql(
@@ -226,10 +229,12 @@ class DataProcessCleanup:
  runs = runs_data.get("runs")
  dpis.extend(runs)
  start += batch_size
- if len(runs) < batch_size:
+ if len(runs) < batch_size or start >= max_item:
  break
  except Exception as e:
- logger.error(f"Exception while fetching DPIs for job {job_urn}: {e}")
+ self.report.failure(
+ f"Exception while fetching DPIs for job {job_urn}:", exc=e
+ )
  break
  return dpis

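The max_item cap added above bounds how deep the GraphQL pagination goes, since the endpoint does not support scrolling. A minimal sketch of the resulting loop shape, with a hypothetical fetch_page callable standing in for the execute_graphql call:

    from typing import Callable, List

    def fetch_capped(
        fetch_page: Callable[[int, int], List[dict]],
        batch_size: int = 500,
        max_items: int = 9000,
    ) -> List[dict]:
        # Page through results, stopping on a short page or once the cap is hit.
        results: List[dict] = []
        start = 0
        while True:
            page = fetch_page(start, batch_size)  # hypothetical page fetcher
            results.extend(page)
            start += batch_size
            if len(page) < batch_size or start >= max_items:
                break
        return results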
@@ -254,8 +259,9 @@ class DataProcessCleanup:
  deleted_count_last_n += 1
  futures[future]["deleted"] = True
  except Exception as e:
- logger.error(f"Exception while deleting DPI: {e}")
-
+ self.report.report_failure(
+ f"Exception while deleting DPI: {e}", exc=e
+ )
  if deleted_count_last_n % self.config.batch_size == 0:
  logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
  if self.config.delay:
@@ -289,7 +295,7 @@ class DataProcessCleanup:
  dpis = self.fetch_dpis(job.urn, self.config.batch_size)
  dpis.sort(
  key=lambda x: x["created"]["time"]
- if "created" in x and "time" in x["created"]
+ if x.get("created") and x["created"].get("time")
  else 0,
  reverse=True,
  )
@@ -325,8 +331,8 @@ class DataProcessCleanup:
  continue

  if (
- "created" not in dpi
- or "time" not in dpi["created"]
+ not dpi.get("created")
+ or not dpi["created"].get("time")
  or dpi["created"]["time"] < retention_time * 1000
  ):
  future = executor.submit(
@@ -340,7 +346,7 @@ class DataProcessCleanup:
  deleted_count_retention += 1
  futures[future]["deleted"] = True
  except Exception as e:
- logger.error(f"Exception while deleting DPI: {e}")
+ self.report.report_failure(f"Exception while deleting DPI: {e}", exc=e)

  if deleted_count_retention % self.config.batch_size == 0:
  logger.info(
@@ -351,9 +357,12 @@ class DataProcessCleanup:
  logger.info(f"Sleeping for {self.config.delay} seconds")
  time.sleep(self.config.delay)

- logger.info(
- f"Deleted {deleted_count_retention} DPIs from {job.urn} due to retention"
- )
+ if deleted_count_retention > 0:
+ logger.info(
+ f"Deleted {deleted_count_retention} DPIs from {job.urn} due to retention"
+ )
+ else:
+ logger.debug(f"No DPIs to delete from {job.urn} due to retention")

  def get_data_flows(self) -> Iterable[DataFlowEntity]:
  assert self.ctx.graph
datahub/ingestion/source/powerbi/m_query/data_classes.py CHANGED
@@ -1,5 +1,4 @@
  import os
- from abc import ABC
  from dataclasses import dataclass
  from enum import Enum
  from typing import Any, Dict, List, Optional
@@ -12,18 +11,8 @@ from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo
  TRACE_POWERBI_MQUERY_PARSER = os.getenv("DATAHUB_TRACE_POWERBI_MQUERY_PARSER", False)


- class AbstractIdentifierAccessor(ABC): # To pass lint
- pass
-
-
- # @dataclass
- # class ItemSelector:
- # items: Dict[str, Any]
- # next: Optional[AbstractIdentifierAccessor]
-
-
  @dataclass
- class IdentifierAccessor(AbstractIdentifierAccessor):
+ class IdentifierAccessor:
  """
  statement
  public_order_date = Source{[Schema="public",Item="order_date"]}[Data]
@@ -40,7 +29,7 @@ class IdentifierAccessor(AbstractIdentifierAccessor):

  identifier: str
  items: Dict[str, Any]
- next: Optional[AbstractIdentifierAccessor]
+ next: Optional["IdentifierAccessor"]


  @dataclass
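The change above drops the placeholder AbstractIdentifierAccessor base class; the dataclass now points at itself through a quoted forward reference. A minimal sketch of that pattern with generic names:

    from dataclasses import dataclass
    from typing import Any, Dict, Optional

    @dataclass
    class Accessor:
        identifier: str
        items: Dict[str, Any]
        # Self-referential field: the quoted name is a forward reference,
        # resolved once the class body is fully defined.
        next: Optional["Accessor"]

    chain = Accessor("Source", {"Schema": "public"},
                     Accessor("Data", {"Item": "order_date"}, None))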
datahub/ingestion/source/powerbi/m_query/pattern_handler.py CHANGED
@@ -1,7 +1,7 @@
  import logging
  from abc import ABC, abstractmethod
  from enum import Enum
- from typing import Dict, List, Optional, Tuple, Type, Union, cast
+ from typing import Dict, List, Optional, Tuple, Type, cast

  from lark import Tree

@@ -22,7 +22,6 @@ from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import (
  )
  from datahub.ingestion.source.powerbi.m_query import native_sql_parser, tree_function
  from datahub.ingestion.source.powerbi.m_query.data_classes import (
- AbstractIdentifierAccessor,
  DataAccessFunctionDetail,
  DataPlatformTable,
  FunctionName,
@@ -412,33 +411,25 @@ class DatabricksLineage(AbstractLineage):
  )
  table_detail: Dict[str, str] = {}
  temp_accessor: Optional[
- Union[IdentifierAccessor, AbstractIdentifierAccessor]
+ IdentifierAccessor
  ] = data_access_func_detail.identifier_accessor

  while temp_accessor:
- if isinstance(temp_accessor, IdentifierAccessor):
- # Condition to handle databricks M-query pattern where table, schema and database all are present in
- # the same invoke statement
- if all(
- element in temp_accessor.items
- for element in ["Item", "Schema", "Catalog"]
- ):
- table_detail["Schema"] = temp_accessor.items["Schema"]
- table_detail["Table"] = temp_accessor.items["Item"]
- else:
- table_detail[temp_accessor.items["Kind"]] = temp_accessor.items[
- "Name"
- ]
-
- if temp_accessor.next is not None:
- temp_accessor = temp_accessor.next
- else:
- break
+ # Condition to handle databricks M-query pattern where table, schema and database all are present in
+ # the same invoke statement
+ if all(
+ element in temp_accessor.items
+ for element in ["Item", "Schema", "Catalog"]
+ ):
+ table_detail["Schema"] = temp_accessor.items["Schema"]
+ table_detail["Table"] = temp_accessor.items["Item"]
  else:
- logger.debug(
- "expecting instance to be IdentifierAccessor, please check if parsing is done properly"
- )
- return Lineage.empty()
+ table_detail[temp_accessor.items["Kind"]] = temp_accessor.items["Name"]
+
+ if temp_accessor.next is not None:
+ temp_accessor = temp_accessor.next
+ else:
+ break

  table_reference = self.create_reference_table(
  arg_list=data_access_func_detail.arg_list,
@@ -786,9 +777,10 @@ class NativeQueryLineage(AbstractLineage):
  def create_lineage(
  self, data_access_func_detail: DataAccessFunctionDetail
  ) -> Lineage:
- t1: Tree = cast(
- Tree, tree_function.first_arg_list_func(data_access_func_detail.arg_list)
+ t1: Optional[Tree] = tree_function.first_arg_list_func(
+ data_access_func_detail.arg_list
  )
+ assert t1 is not None
  flat_argument_list: List[Tree] = tree_function.flat_argument_list(t1)

  if len(flat_argument_list) != 2:
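This hunk, like the resolver.py and tree_function.py changes that follow, replaces a typing.cast call with an explicit Optional annotation narrowed by an assert. A small self-contained illustration of the idiom:

    from typing import Optional

    def first_digit(text: str) -> Optional[str]:
        return next((ch for ch in text if ch.isdigit()), None)

    # Instead of cast(str, first_digit(...)), annotate the Optional and narrow it:
    value: Optional[str] = first_digit("rc14")
    assert value is not None  # type checkers narrow value to str past this point
    print(int(value) + 1)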
datahub/ingestion/source/powerbi/m_query/resolver.py CHANGED
@@ -1,6 +1,6 @@
  import logging
  from abc import ABC, abstractmethod
- from typing import Any, Dict, List, Optional, Tuple, Union, cast
+ from typing import Any, Dict, List, Optional, Tuple, Union

  from lark import Tree

@@ -95,14 +95,12 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
  # remove whitespaces and quotes from token
  tokens: List[str] = tree_function.strip_char_from_list(
  tree_function.remove_whitespaces_from_list(
- tree_function.token_values(
- cast(Tree, item_selector), parameters=self.parameters
- )
+ tree_function.token_values(item_selector, parameters=self.parameters)
  ),
  )
  identifier: List[str] = tree_function.token_values(
- cast(Tree, identifier_tree)
- ) # type :ignore
+ identifier_tree, parameters={}
+ )

  # convert tokens to dict
  iterator = iter(tokens)
@@ -238,10 +236,10 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
  def _process_item_selector_expression(
  self, rh_tree: Tree
  ) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
- new_identifier, key_vs_value = self.get_item_selector_tokens( # type: ignore
- cast(Tree, tree_function.first_expression_func(rh_tree))
- )
+ first_expression: Optional[Tree] = tree_function.first_expression_func(rh_tree)
+ assert first_expression is not None

+ new_identifier, key_vs_value = self.get_item_selector_tokens(first_expression)
  return new_identifier, key_vs_value

  @staticmethod
@@ -327,7 +325,7 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
  # The first argument can be a single table argument or list of table.
  # For example Table.Combine({t1,t2},....), here first argument is list of table.
  # Table.AddColumn(t1,....), here first argument is single table.
- for token in cast(List[str], result):
+ for token in result:
  internal(token, identifier_accessor)

  else:
datahub/ingestion/source/powerbi/m_query/tree_function.py CHANGED
@@ -1,6 +1,6 @@
  import logging
  from functools import partial
- from typing import Any, Dict, List, Optional, Union, cast
+ from typing import Any, Dict, List, Optional, Union

  from lark import Token, Tree

@@ -58,7 +58,7 @@ def get_first_rule(tree: Tree, rule: str) -> Optional[Tree]:
  if isinstance(node, Token):
  return None

- for child in cast(Tree, node).children:
+ for child in node.children:
  child_node: Optional[Tree] = internal(child)
  if child_node is not None:
  return child_node
@@ -99,7 +99,7 @@ def token_values(tree: Tree, parameters: Dict[str, str] = {}) -> List[str]:
  logger.debug(f"Unable to resolve parameter reference to {ref}")
  values.append(ref)
  elif isinstance(node, Token):
- values.append(cast(Token, node).value)
+ values.append(node.value)
  return
  else:
  for child in node.children:
datahub/ingestion/source/preset.py CHANGED
@@ -85,6 +85,7 @@ class PresetSource(SupersetSource):
  super().__init__(ctx, config)
  self.config = config
  self.report = StaleEntityRemovalSourceReport()
+ self.platform = "preset"

  def login(self):
  try:
datahub/ingestion/source/snowflake/snowflake_config.py CHANGED
@@ -1,7 +1,7 @@
  import logging
  from collections import defaultdict
  from dataclasses import dataclass
- from typing import Dict, List, Optional, Set, cast
+ from typing import Dict, List, Optional, Set

  import pydantic
  from pydantic import Field, SecretStr, root_validator, validator
@@ -118,9 +118,10 @@ class SnowflakeFilterConfig(SQLFilterConfig):
  )

  # Always exclude reporting metadata for INFORMATION_SCHEMA schema
- if schema_pattern is not None and schema_pattern:
+ if schema_pattern:
  logger.debug("Adding deny for INFORMATION_SCHEMA to schema_pattern.")
- cast(AllowDenyPattern, schema_pattern).deny.append(r".*INFORMATION_SCHEMA$")
+ assert isinstance(schema_pattern, AllowDenyPattern)
+ schema_pattern.deny.append(r".*INFORMATION_SCHEMA$")

  return values
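The deny entry added above is a regular expression anchored at the end of the schema identifier. A quick stdlib check of what it filters (the schema names are examples only):

    import re

    deny = re.compile(r".*INFORMATION_SCHEMA$")

    # Matches any schema path ending in INFORMATION_SCHEMA, regardless of prefix.
    assert deny.match("MY_DB.INFORMATION_SCHEMA")
    assert not deny.match("MY_DB.PUBLIC")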