PyPI - acryl-datahub - Versions diffs - 0.15.0.1rc1__py3-none-any.whl → 0.15.0.1rc3__py3-none-any.whl - Mend

acryl-datahub 0.15.0.1rc1__py3-none-any.whl → 0.15.0.1rc3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of acryl-datahub might be problematic. Click here for more details.

Files changed (20) hide show

{acryl_datahub-0.15.0.1rc1.dist-info → acryl_datahub-0.15.0.1rc3.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-datahub/__init__.py,sha256=gK5aLEGMHMZfg-QUDI5T7mr1ej_5OFVKhCrqqoj_QGk,576
+datahub/__init__.py,sha256=BqWrG3ZFOnsobExCu08Kaj1mOgrBZ-Jmo_XLdalPyI0,576
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
 datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -119,7 +119,7 @@ datahub/emitter/mcp.py,sha256=hAAYziDdkwjazQU0DtWMbQWY8wS09ACrKJbqxoWXdgc,9637
 datahub/emitter/mcp_builder.py,sha256=ju-1dZMKs5dlWcTi4zcNRVmhkfhmfX3JFULZSbgxSFs,9968
 datahub/emitter/mcp_patch_builder.py,sha256=W85q1maVUMpOIo5lwLRn82rLXRVoZ_gurl_a-pvVCpE,4291
 datahub/emitter/request_helper.py,sha256=33ORG3S3OVy97_jlWBRn7yUM5XCIkRN6WSdJvN7Ofcg,670
-datahub/emitter/rest_emitter.py,sha256=3kG_aPKy9pLibd4SJNtdJxn792c5TJliFjjCOw6NoUM,15533
+datahub/emitter/rest_emitter.py,sha256=d5Zjo3GXDu9rUqlSsK9aOx-yEbHjDFZHelfq_ZFeb5M,16393
 datahub/emitter/serialization_helper.py,sha256=q12Avmf70Vy4ttQGMJoTKlE5EsybMKNg2w3MQeZiHvk,3652
 datahub/emitter/sql_parsing_builder.py,sha256=Cr5imZrm3dYDSCACt5MFscgHCtVbHTD6IjUmsvsKoEs,11991
 datahub/emitter/synchronized_file_emitter.py,sha256=s4ATuxalI4GDAkrZTaGSegxBdvvNPZ9jRSdtElU0kNs,1805
@@ -144,6 +144,7 @@ datahub/ingestion/api/transform.py,sha256=X0GpjMJzYkLuZx8MTWxH50cWGm9rGsnn3k188m
 datahub/ingestion/api/workunit.py,sha256=e8n8RfSjHZZm2R4ShNH0UuMtUkMjyqqM2j2t7oL74lo,6327
 datahub/ingestion/api/auto_work_units/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/api/auto_work_units/auto_dataset_properties_aspect.py,sha256=ID_6N3nWl2qohsSGizUCqo3d2MNyDeVbyWroQpSOSsc,5059
+datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py,sha256=9mzg3vhCFPI9zHSi_j3FvJtK8kUvb8PmRSz-TM3tt6k,4323
 datahub/ingestion/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/extractor/extractor_registry.py,sha256=f7CLfW3pr29QZkXSHbp7HjUrsdw7ejQJmot-tiSPcqc,342
 datahub/ingestion/extractor/json_ref_patch.py,sha256=4g3ZWHn7rwS74jUvSXJiGpi-UKHhiSYKKgBeU4E5ukE,1448
@@ -215,7 +216,7 @@ datahub/ingestion/source/abs/profiling.py,sha256=yKNCKpr6w7qpCH-baeSkNE9VjkN6eBo
 datahub/ingestion/source/abs/report.py,sha256=fzkTdTewYlWrTk4f2Cyl-e8RV4qw9wEVtm0cdKD-Xgw,542
 datahub/ingestion/source/abs/source.py,sha256=pzxW-R_cWGKPneEhX8JWdTZiX2k1kAZOPKgMxp9mAEI,24533
 datahub/ingestion/source/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/aws/aws_common.py,sha256=BqDe19yqHdwj6_sjIryGZ9_5lsAJ0PZhfPfGqLZrCcE,10649
+datahub/ingestion/source/aws/aws_common.py,sha256=DfdQgkJ_s2isFx8WvqKTlAcBk4KE8SgfpmA5BgC3fgY,17716
 datahub/ingestion/source/aws/glue.py,sha256=fX0dtaVVq174ZS0aBJvZFYK8ligfZX5EU3pdS3j1KQs,56215
 datahub/ingestion/source/aws/s3_boto_utils.py,sha256=Wyp9k9tapsCuw9dyH4FCXJr_wmeLaYFoCtKvrV6SEDk,3892
 datahub/ingestion/source/aws/s3_util.py,sha256=OFypcgmVC6jnZM90-gjcPpAMtTV1lbnreCaMhCzNlzs,2149
@@ -357,7 +358,7 @@ datahub/ingestion/source/neo4j/neo4j_source.py,sha256=L9WiZ5yZrIDMrgj3gYU9j6zz3T
 datahub/ingestion/source/powerbi/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/powerbi/config.py,sha256=9cgMrwp3DQqv14iTqulAONkbVe5nhycJ1ElkA_275go,22732
 datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py,sha256=-njW1kJOy-LY5JFwJLhVQ0bMBj9NQz5TZhQqsSi_KsM,2285
-datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule,sha256=PRWzuZMMhKdOVoAaE8csHvUFbZHxYe5meJHgrqlgiuw,19795
+datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule,sha256=5df3qvalCS9hZ46DPXs6XDcw9-IofGf8Eol_rUC7LHI,20329
 datahub/ingestion/source/powerbi/powerbi.py,sha256=7UsAEqaFlkWONcXJdQ2hotUYYn46ks6Fe71KXEMh7lI,54495
 datahub/ingestion/source/powerbi/m_query/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/powerbi/m_query/data_classes.py,sha256=EbaEasEOGZ73jz0cQofH9ez65wSvRBof0R6GQaIVLnM,2009
@@ -431,26 +432,26 @@ datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81
 datahub/ingestion/source/snowflake/snowflake_config.py,sha256=LZqnTELtzRNf0vsKG-xXggXyt13S9RYvHOZEZHRjgNk,18851
 datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJY5rqKNNodXxzg3SS5DF7oA4WXArOA,17793
 datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
-datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=suMICPFPvoV6shkjD_14JunLc8jAZBINzlFk2mYldkU,23676
+datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=uMGmMEl4hWEmN7GxMyDBdwlIPAW7WmOnu41kZ0dvCG4,21551
 datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
-datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=fu-8S9eADIXZcd_kHc6cBeMa-on9RF9qG3yqjJnS3DE,26085
+datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=8QEihOfivalVR9vLo6vCUL-vnZfAGgMio0uhPYX0jTo,25883
 datahub/ingestion/source/snowflake/snowflake_query.py,sha256=yDu_1aTAG7eLEh1w1FGmn2-c6NJZURdslnI6fC_4B_0,38723
 datahub/ingestion/source/snowflake/snowflake_report.py,sha256=_-rD7Q4MzKY8fYzJHSBnGX4gurwujL3UoRzcP_TZURs,6468
 datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=z5ZPgh-TILAz0DeIwDxRCsj980CM2BbftXiFpM1dV_Y,21674
-datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=vof3mNImstnlL8kc0OkTHzMIqnbEkt9RmnYBX1JX0oE,40386
+datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=K-KEr3OpwMHye08lXAy-5doUUGoGJP3b-ntJAGU_NBY,42472
 datahub/ingestion/source/snowflake/snowflake_shares.py,sha256=ud3Ah4qHrmSfpD8Od-gPdzwtON9dJa0eqHt-8Yr5h2Q,6366
 datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=kTmuCtRnvHqM8WBYhWeK4XafJq3ssFL9kcS03jEeWT4,5506
 datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=fyfWmFVz2WZrpTJWNIe9m0WpDHgeFrGPf8diORJZUwo,6212
 datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=PEmYNMXJRUvLQmVd8juVqjokfuSPuH9ppcM0ruXamxA,24807
 datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=YczNEupY89jeegjR2_1pT4bPi9wQ69EIhGpzyCe9Jdg,12600
-datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=TlS5d1lpEN74ZP0c8UzUhJZIeBMO3ZIUxRler1p7lnA,31998
+datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=ecaTCJNAQ_IJOPInPGXA3jv1dE5lztSU82UhpBygiq0,31654
 datahub/ingestion/source/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/sql/athena.py,sha256=G3cIY8H_76lIUAzQWW2kLnZOEsfbakmojxbiHb3dYZ8,24059
 datahub/ingestion/source/sql/clickhouse.py,sha256=jzvaXP5Wr0SMhj2rtuvVE821xnfpKiXhO3cm0xblgHs,27299
 datahub/ingestion/source/sql/cockroachdb.py,sha256=XaD7eae34plU9ISRC6PzYX9q6RdT2qkzjH6CpTOgkx4,1443
 datahub/ingestion/source/sql/druid.py,sha256=lhO9CCOlHV-6LjBuAxAxtB9I1pvPtsGSdr63bz6_ilA,2837
 datahub/ingestion/source/sql/hana.py,sha256=0PIvcX0Rz59NyR7Ag5Bv1MBV_UbJwxl9UAopo_xe_CA,1342
-datahub/ingestion/source/sql/hive.py,sha256=AgEo94zyBL-NLZxR5-jQlNwq_R9FQ4AUOe8wpOox0Xs,8871
+datahub/ingestion/source/sql/hive.py,sha256=Ueicc3EhKQSC77IDzl_xcTDDalbH1pvB8mox1oFI4Hw,29738
 datahub/ingestion/source/sql/hive_metastore.py,sha256=PisLrswev583xW0xDJ5yfKWCWm_ZTl1OeuaMcv_SvXc,35865
 datahub/ingestion/source/sql/mariadb.py,sha256=Hm102kmfs_1rd4lsTYhzVMZq5S3B6cyfvpHSzJjqvMw,737
 datahub/ingestion/source/sql/mysql.py,sha256=nDWK4YbqomcJgnit9b8geUGrp_3eix4bt0_k94o7g-0,3350
@@ -471,8 +472,8 @@ datahub/ingestion/source/sql/trino.py,sha256=FEn_BQ3pm23hKx94ek5kk5IXGNYcBqZEhll
 datahub/ingestion/source/sql/two_tier_sql_source.py,sha256=YDrGBb5WKVls6qv17QU5foKrf71SydzEltc3WsVAhQc,5732
 datahub/ingestion/source/sql/vertica.py,sha256=_9OgSgIgqBml0av063rb8nACiT3SAmzpw0ouyF91wv8,33382
 datahub/ingestion/source/sql/mssql/__init__.py,sha256=1agpl8S_uDW40olkhCX_W19dbr5GO9qgjS3R7pLRZSk,87
-datahub/ingestion/source/sql/mssql/job_models.py,sha256=ztXDrD4anhzwWvACIm9fucE2WhMDMKkJ4alMYOQOqWA,7083
-datahub/ingestion/source/sql/mssql/source.py,sha256=ODdsOIbDA3X0E7En6GT15mD49W6RW9sXLwRoUgw2a8I,30925
+datahub/ingestion/source/sql/mssql/job_models.py,sha256=qMA4yzRiX-60ugsFu5ob3tOSgVs0uPTW_O8OHCECS_0,8002
+datahub/ingestion/source/sql/mssql/source.py,sha256=0ivc6svImkYfGYuAPDX4ZsaYZTilohwaV4cg3KWOqNI,31237
 datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py,sha256=RpnvKPalAAaOD_eUg8bZ4VkGTSeLFWuy0mefwc4s3x8,2837
 datahub/ingestion/source/state/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/state/checkpoint.py,sha256=x9Xww-MIFXSKjeg1tOZXE72LehCm5OfKy3HfucgIRWM,8833
@@ -490,7 +491,7 @@ datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider
 datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py,sha256=xsH7Ao_05VTjqpkzLkhdf5B1ULMzFoD8vkJJIJU9w-U,4077
 datahub/ingestion/source/state_provider/state_provider_registry.py,sha256=SVq4mIyGNmLXE9OZx1taOiNPqDoQp03-Ot9rYnB5F3k,401
 datahub/ingestion/source/tableau/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/tableau/tableau.py,sha256=80m5a8g2q5jnm5pFtL4s_SmSeGPQIuZGqGe9RGn6OSI,138248
+datahub/ingestion/source/tableau/tableau.py,sha256=5oyAoF7h92zUTh9NIaYipyRXUFUZSz1oo8hwn46RM2w,138661
 datahub/ingestion/source/tableau/tableau_common.py,sha256=9gQLq_3BlAsKll83uVlnWJRWaIDtFtREUyuimXF13Z0,26219
 datahub/ingestion/source/tableau/tableau_constant.py,sha256=ZcAeHsQUXVVL26ORly0ByZk_GJAFbxaKuJAlX_sYMac,2686
 datahub/ingestion/source/tableau/tableau_server_wrapper.py,sha256=nSyx9RzC6TCQDm-cTVJ657qT8iDwzk_8JMKpohhmOc4,1046
@@ -505,7 +506,7 @@ datahub/ingestion/source/unity/proxy.py,sha256=2-pYQ-3B9UVUwO1yB9iTdi3DqgqZ2JrpQ
 datahub/ingestion/source/unity/proxy_profiling.py,sha256=WLqvYP6MziaisA4LYL4T_GA-kPt6Xdde7bfaYsjYw40,9663
 datahub/ingestion/source/unity/proxy_types.py,sha256=qrvHiwPzl5cPX-KRvcIGGeJVdr0I8XUQmoAI6ErZ-v8,9371
 datahub/ingestion/source/unity/report.py,sha256=0Y-ciHVTI6ZKNCJ5zWoQh3Ze1c_GMqmTMKFwzXDuuOg,2788
-datahub/ingestion/source/unity/source.py,sha256=YdUPCMJtpmvYVnRNnpqb4BVowFobkLvSJ_K2gHwrvCI,41752
+datahub/ingestion/source/unity/source.py,sha256=ydjqJ91q-Mir_aJSmEdo1umgPvQ5la67rBhC04IyV78,41938
 datahub/ingestion/source/unity/usage.py,sha256=igRxYg8usukTAA229uJWi-0y-Zd0yOq9dEBi2k9f15o,11436
 datahub/ingestion/source/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/usage/clickhouse_usage.py,sha256=8nQqNAPKqivywjzsvqH0-HWFwjd4gECpw_xahLXk5ek,9970
@@ -516,7 +517,7 @@ datahub/ingestion/source_config/csv_enricher.py,sha256=IROxxfFJA56dHkmmbjjhb7h1p
 datahub/ingestion/source_config/operation_config.py,sha256=Q0NlqiEh4s4DFIII5NsAp5hxWTVyyJz-ldcQmH-B47s,3504
 datahub/ingestion/source_config/pulsar.py,sha256=sklDkh62CrWV-i7Ifh6R3T3smYVso6gyRJG8HVc6RdA,5533
 datahub/ingestion/source_report/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source_report/ingestion_stage.py,sha256=zijPKvJKnB298cMoG6j_w6QRva9t1RWqnmycmbsKdgs,1499
+datahub/ingestion/source_report/ingestion_stage.py,sha256=pJcJeLSjaixlLqQyQtE3bfUcvXEVwrSaWWtU4iU9UEo,1557
 datahub/ingestion/source_report/pulsar.py,sha256=iKhzy644AjoFTV-gxyqBoXKMLwSMPxJFxU-3WDQRww0,1037
 datahub/ingestion/source_report/time_window.py,sha256=9yI5l2S1DcF7ClvUHLeN8m62I5vlhV9k-aQqSZh2l7w,229
 datahub/ingestion/transformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -980,8 +981,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-0.15.0.1rc1.dist-info/METADATA,sha256=Ll6D3fw03bz2dVmCp9Mcez5IbFKIfXzUnMeSe6Ej4eM,173642
-acryl_datahub-0.15.0.1rc1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-acryl_datahub-0.15.0.1rc1.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
-acryl_datahub-0.15.0.1rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
-acryl_datahub-0.15.0.1rc1.dist-info/RECORD,,
+acryl_datahub-0.15.0.1rc3.dist-info/METADATA,sha256=JxWLTgCDIjuZb3Q_cRrG3nTw6mqsweOED2pg_izLT5I,173642
+acryl_datahub-0.15.0.1rc3.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+acryl_datahub-0.15.0.1rc3.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
+acryl_datahub-0.15.0.1rc3.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-0.15.0.1rc3.dist-info/RECORD,,

datahub/__init__.py CHANGED Viewed

@@ -3,7 +3,7 @@ import warnings
 # Published at https://pypi.org/project/acryl-datahub/.
 __package_name__ = "acryl-datahub"
-__version__ = "0.15.0.1rc1"
+__version__ = "0.15.0.1rc3"
 def is_dev_mode() -> bool:

datahub/emitter/rest_emitter.py CHANGED Viewed

@@ -291,6 +291,7 @@ class DataHubRestEmitter(Closeable, Emitter):
         mcps: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
         async_flag: Optional[bool] = None,
     ) -> int:
+        logger.debug("Attempting to emit batch mcps")
         url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
         for mcp in mcps:
             ensure_has_system_metadata(mcp)
@@ -303,15 +304,22 @@ class DataHubRestEmitter(Closeable, Emitter):
         current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
         for mcp_obj in mcp_objs:
             mcp_obj_size = len(json.dumps(mcp_obj))
+            logger.debug(
+                f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
+            )
             if (
                 mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
                 or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
             ):
+                logger.debug("Decided to create new chunk")
                 mcp_obj_chunks.append([])
                 current_chunk_size = 0
             mcp_obj_chunks[-1].append(mcp_obj)
             current_chunk_size += mcp_obj_size
+        logger.debug(
+            f"Decided to send {len(mcps)} mcps in {len(mcp_obj_chunks)} chunks"
+        )
         for mcp_obj_chunk in mcp_obj_chunks:
             # TODO: We're calling json.dumps on each MCP object twice, once to estimate
@@ -338,8 +346,15 @@ class DataHubRestEmitter(Closeable, Emitter):
     def _emit_generic(self, url: str, payload: str) -> None:
         curl_command = make_curl_command(self._session, "POST", url, payload)
+        payload_size = len(payload)
+        if payload_size > INGEST_MAX_PAYLOAD_BYTES:
+            # since we know total payload size here, we could simply avoid sending such payload at all and report a warning, with current approach we are going to cause whole ingestion to fail
+            logger.warning(
+                f"Apparent payload size exceeded {INGEST_MAX_PAYLOAD_BYTES}, might fail with an exception due to the size"
+            )
         logger.debug(
-            "Attempting to emit to DataHub GMS; using curl equivalent to:\n%s",
+            "Attempting to emit aspect (size: %s) to DataHub GMS; using curl equivalent to:\n%s",
+            payload_size,
             curl_command,
         )
         try:

datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py ADDED Viewed

@@ -0,0 +1,96 @@
+import json
+import logging
+from typing import Iterable, List
+from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
+from datahub.emitter.serialization_helper import pre_json_transform
+from datahub.ingestion.api.source import SourceReport
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.metadata.schema_classes import (
+    DatasetProfileClass,
+    SchemaFieldClass,
+    SchemaMetadataClass,
+)
+logger = logging.getLogger(__name__)
+class EnsureAspectSizeProcessor:
+    def __init__(
+        self, report: SourceReport, payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
+    ):
+        self.report = report
+        self.payload_constraint = payload_constraint
+    def ensure_dataset_profile_size(
+        self, dataset_urn: str, profile: DatasetProfileClass
+    ) -> None:
+        """
+        This is quite arbitrary approach to ensuring dataset profile aspect does not exceed allowed size, might be adjusted
+        in the future
+        """
+        sample_fields_size = 0
+        if profile.fieldProfiles:
+            logger.debug(f"Length of field profiles: {len(profile.fieldProfiles)}")
+            for field in profile.fieldProfiles:
+                if field.sampleValues:
+                    values_len = 0
+                    for value in field.sampleValues:
+                        if value:
+                            values_len += len(value)
+                    logger.debug(
+                        f"Field {field.fieldPath} has {len(field.sampleValues)} sample values, taking total bytes {values_len}"
+                    )
+                    if sample_fields_size + values_len > self.payload_constraint:
+                        field.sampleValues = []
+                        self.report.warning(
+                            title="Dataset profile truncated due to size constraint",
+                            message="Dataset profile contained too much data and would have caused ingestion to fail",
+                            context=f"Sample values for field {field.fieldPath} were removed from dataset profile for {dataset_urn} due to aspect size constraints",
+                        )
+                    else:
+                        sample_fields_size += values_len
+                else:
+                    logger.debug(f"Field {field.fieldPath} has no sample values")
+    def ensure_schema_metadata_size(
+        self, dataset_urn: str, schema: SchemaMetadataClass
+    ) -> None:
+        """
+        This is quite arbitrary approach to ensuring schema metadata aspect does not exceed allowed size, might be adjusted
+        in the future
+        """
+        total_fields_size = 0
+        logger.debug(f"Amount of schema fields: {len(schema.fields)}")
+        accepted_fields: List[SchemaFieldClass] = []
+        for field in schema.fields:
+            field_size = len(json.dumps(pre_json_transform(field.to_obj())))
+            logger.debug(f"Field {field.fieldPath} takes total {field_size}")
+            if total_fields_size + field_size < self.payload_constraint:
+                accepted_fields.append(field)
+                total_fields_size += field_size
+            else:
+                self.report.warning(
+                    title="Schema truncated due to size constraint",
+                    message="Dataset schema contained too much data and would have caused ingestion to fail",
+                    context=f"Field {field.fieldPath} was removed from schema for {dataset_urn} due to aspect size constraints",
+                )
+        schema.fields = accepted_fields
+    def ensure_aspect_size(
+        self,
+        stream: Iterable[MetadataWorkUnit],
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        We have hard limitation of aspect size being 16 MB. Some aspects can exceed that value causing an exception
+        on GMS side and failure of the entire ingestion. This processor will attempt to trim suspected aspects.
+        """
+        for wu in stream:
+            logger.debug(f"Ensuring size of workunit: {wu.id}")
+            if schema := wu.get_aspect_of_type(SchemaMetadataClass):
+                self.ensure_schema_metadata_size(wu.get_urn(), schema)
+            elif profile := wu.get_aspect_of_type(DatasetProfileClass):
+                self.ensure_dataset_profile_size(wu.get_urn(), profile)
+            yield wu

datahub/ingestion/source/aws/aws_common.py CHANGED Viewed

@@ -1,7 +1,12 @@
+import logging
+import os
 from datetime import datetime, timedelta, timezone
-from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union
+from enum import Enum
+from http import HTTPStatus
+from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
 import boto3
+import requests
 from boto3.session import Session
 from botocore.config import DEFAULT_TIMEOUT, Config
 from botocore.utils import fix_s3_host
@@ -14,6 +19,8 @@ from datahub.configuration.common import (
 )
 from datahub.configuration.source_common import EnvConfigMixin
+logger = logging.getLogger(__name__)
 if TYPE_CHECKING:
     from mypy_boto3_dynamodb import DynamoDBClient
     from mypy_boto3_glue import GlueClient
@@ -22,6 +29,26 @@ if TYPE_CHECKING:
     from mypy_boto3_sts import STSClient
+class AwsEnvironment(Enum):
+    EC2 = "EC2"
+    ECS = "ECS"
+    EKS = "EKS"
+    LAMBDA = "LAMBDA"
+    APP_RUNNER = "APP_RUNNER"
+    BEANSTALK = "ELASTIC_BEANSTALK"
+    CLOUD_FORMATION = "CLOUD_FORMATION"
+    UNKNOWN = "UNKNOWN"
+class AwsServicePrincipal(Enum):
+    LAMBDA = "lambda.amazonaws.com"
+    EKS = "eks.amazonaws.com"
+    APP_RUNNER = "apprunner.amazonaws.com"
+    ECS = "ecs.amazonaws.com"
+    ELASTIC_BEANSTALK = "elasticbeanstalk.amazonaws.com"
+    EC2 = "ec2.amazonaws.com"
 class AwsAssumeRoleConfig(PermissiveConfigModel):
     # Using the PermissiveConfigModel to allow the user to pass additional arguments.
@@ -34,6 +61,163 @@ class AwsAssumeRoleConfig(PermissiveConfigModel):
     )
+def get_instance_metadata_token() -> Optional[str]:
+    """Get IMDSv2 token"""
+    try:
+        response = requests.put(
+            "http://169.254.169.254/latest/api/token",
+            headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"},
+            timeout=1,
+        )
+        if response.status_code == HTTPStatus.OK:
+            return response.text
+    except requests.exceptions.RequestException:
+        logger.debug("Failed to get IMDSv2 token")
+    return None
+def is_running_on_ec2() -> bool:
+    """Check if code is running on EC2 using IMDSv2"""
+    token = get_instance_metadata_token()
+    if not token:
+        return False
+    try:
+        response = requests.get(
+            "http://169.254.169.254/latest/meta-data/instance-id",
+            headers={"X-aws-ec2-metadata-token": token},
+            timeout=1,
+        )
+        return response.status_code == HTTPStatus.OK
+    except requests.exceptions.RequestException:
+        return False
+def detect_aws_environment() -> AwsEnvironment:
+    """
+    Detect the AWS environment we're running in.
+    Order matters as some environments may have multiple indicators.
+    """
+    # Check Lambda first as it's most specific
+    if os.getenv("AWS_LAMBDA_FUNCTION_NAME"):
+        if os.getenv("AWS_EXECUTION_ENV", "").startswith("CloudFormation"):
+            return AwsEnvironment.CLOUD_FORMATION
+        return AwsEnvironment.LAMBDA
+    # Check EKS (IRSA)
+    if os.getenv("AWS_WEB_IDENTITY_TOKEN_FILE") and os.getenv("AWS_ROLE_ARN"):
+        return AwsEnvironment.EKS
+    # Check App Runner
+    if os.getenv("AWS_APP_RUNNER_SERVICE_ID"):
+        return AwsEnvironment.APP_RUNNER
+    # Check ECS
+    if os.getenv("ECS_CONTAINER_METADATA_URI_V4") or os.getenv(
+        "ECS_CONTAINER_METADATA_URI"
+    ):
+        return AwsEnvironment.ECS
+    # Check Elastic Beanstalk
+    if os.getenv("ELASTIC_BEANSTALK_ENVIRONMENT_NAME"):
+        return AwsEnvironment.BEANSTALK
+    if is_running_on_ec2():
+        return AwsEnvironment.EC2
+    return AwsEnvironment.UNKNOWN
+def get_instance_role_arn() -> Optional[str]:
+    """Get role ARN from EC2 instance metadata using IMDSv2"""
+    token = get_instance_metadata_token()
+    if not token:
+        return None
+    try:
+        response = requests.get(
+            "http://169.254.169.254/latest/meta-data/iam/security-credentials/",
+            headers={"X-aws-ec2-metadata-token": token},
+            timeout=1,
+        )
+        if response.status_code == 200:
+            role_name = response.text.strip()
+            if role_name:
+                sts = boto3.client("sts")
+                identity = sts.get_caller_identity()
+                return identity.get("Arn")
+    except Exception as e:
+        logger.debug(f"Failed to get instance role ARN: {e}")
+    return None
+def get_lambda_role_arn() -> Optional[str]:
+    """Get the Lambda function's role ARN"""
+    try:
+        function_name = os.getenv("AWS_LAMBDA_FUNCTION_NAME")
+        if not function_name:
+            return None
+        lambda_client = boto3.client("lambda")
+        function_config = lambda_client.get_function_configuration(
+            FunctionName=function_name
+        )
+        return function_config.get("Role")
+    except Exception as e:
+        logger.debug(f"Failed to get Lambda role ARN: {e}")
+        return None
+def get_current_identity() -> Tuple[Optional[str], Optional[str]]:
+    """
+    Get the current role ARN and source type based on the runtime environment.
+    Returns (role_arn, credential_source)
+    """
+    env = detect_aws_environment()
+    if env == AwsEnvironment.LAMBDA:
+        role_arn = get_lambda_role_arn()
+        return role_arn, AwsServicePrincipal.LAMBDA.value
+    elif env == AwsEnvironment.EKS:
+        role_arn = os.getenv("AWS_ROLE_ARN")
+        return role_arn, AwsServicePrincipal.EKS.value
+    elif env == AwsEnvironment.APP_RUNNER:
+        try:
+            sts = boto3.client("sts")
+            identity = sts.get_caller_identity()
+            return identity.get("Arn"), AwsServicePrincipal.APP_RUNNER.value
+        except Exception as e:
+            logger.debug(f"Failed to get App Runner role: {e}")
+    elif env == AwsEnvironment.ECS:
+        try:
+            metadata_uri = os.getenv("ECS_CONTAINER_METADATA_URI_V4") or os.getenv(
+                "ECS_CONTAINER_METADATA_URI"
+            )
+            if metadata_uri:
+                response = requests.get(f"{metadata_uri}/task", timeout=1)
+                if response.status_code == HTTPStatus.OK:
+                    task_metadata = response.json()
+                    if "TaskARN" in task_metadata:
+                        return (
+                            task_metadata.get("TaskARN"),
+                            AwsServicePrincipal.ECS.value,
+                        )
+        except Exception as e:
+            logger.debug(f"Failed to get ECS task role: {e}")
+    elif env == AwsEnvironment.BEANSTALK:
+        # Beanstalk uses EC2 instance metadata
+        return get_instance_role_arn(), AwsServicePrincipal.ELASTIC_BEANSTALK.value
+    elif env == AwsEnvironment.EC2:
+        return get_instance_role_arn(), AwsServicePrincipal.EC2.value
+    return None, None
 def assume_role(
     role: AwsAssumeRoleConfig,
     aws_region: Optional[str],
@@ -95,7 +279,7 @@ class AwsConnectionConfig(ConfigModel):
     )
     aws_profile: Optional[str] = Field(
         default=None,
-        description="Named AWS profile to use. Only used if access key / secret are unset. If not set the default will be used",
+        description="The [named profile](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-profiles.html) to use from AWS credentials. Falls back to default profile if not specified and no access keys provided. Profiles are configured in ~/.aws/credentials or ~/.aws/config.",
     )
     aws_region: Optional[str] = Field(None, description="AWS region code.")
@@ -145,6 +329,7 @@ class AwsConnectionConfig(ConfigModel):
     def get_session(self) -> Session:
         if self.aws_access_key_id and self.aws_secret_access_key:
+            # Explicit credentials take precedence
             session = Session(
                 aws_access_key_id=self.aws_access_key_id,
                 aws_secret_access_key=self.aws_secret_access_key,
@@ -152,38 +337,57 @@ class AwsConnectionConfig(ConfigModel):
                 region_name=self.aws_region,
             )
         elif self.aws_profile:
+            # Named profile is second priority
             session = Session(
                 region_name=self.aws_region, profile_name=self.aws_profile
             )
         else:
-            # Use boto3's credential autodetection.
+            # Use boto3's credential autodetection
             session = Session(region_name=self.aws_region)
-        if self._normalized_aws_roles():
-            # Use existing session credentials to start the chain of role assumption.
-            current_credentials = session.get_credentials()
-            credentials = {
-                "AccessKeyId": current_credentials.access_key,
-                "SecretAccessKey": current_credentials.secret_key,
-                "SessionToken": current_credentials.token,
-            }
-            for role in self._normalized_aws_roles():
-                if self._should_refresh_credentials():
-                    credentials = assume_role(
-                        role,
-                        self.aws_region,
-                        credentials=credentials,
+            target_roles = self._normalized_aws_roles()
+            if target_roles:
+                current_role_arn, credential_source = get_current_identity()
+                # Only assume role if:
+                # 1. We're not in a known AWS environment with a role, or
+                # 2. We need to assume a different role than our current one
+                should_assume_role = current_role_arn is None or any(
+                    role.RoleArn != current_role_arn for role in target_roles
+                )
+                if should_assume_role:
+                    env = detect_aws_environment()
+                    logger.debug(f"Assuming role(s) from {env.value} environment")
+                    current_credentials = session.get_credentials()
+                    if current_credentials is None:
+                        raise ValueError("No credentials available for role assumption")
+                    credentials = {
+                        "AccessKeyId": current_credentials.access_key,
+                        "SecretAccessKey": current_credentials.secret_key,
+                        "SessionToken": current_credentials.token,
+                    }
+                    for role in target_roles:
+                        if self._should_refresh_credentials():
+                            credentials = assume_role(
+                                role=role,
+                                aws_region=self.aws_region,
+                                credentials=credentials,
+                            )
+                            if isinstance(credentials["Expiration"], datetime):
+                                self._credentials_expiration = credentials["Expiration"]
+                    session = Session(
+                        aws_access_key_id=credentials["AccessKeyId"],
+                        aws_secret_access_key=credentials["SecretAccessKey"],
+                        aws_session_token=credentials["SessionToken"],
+                        region_name=self.aws_region,
                     )
-                    if isinstance(credentials["Expiration"], datetime):
-                        self._credentials_expiration = credentials["Expiration"]
-            session = Session(
-                aws_access_key_id=credentials["AccessKeyId"],
-                aws_secret_access_key=credentials["SecretAccessKey"],
-                aws_session_token=credentials["SessionToken"],
-                region_name=self.aws_region,
-            )
+                else:
+                    logger.debug(f"Using existing role from {credential_source}")
         return session

datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule CHANGED Viewed

@@ -21,6 +21,11 @@
 //      |   empty_string
 //      |   empty_string "," argument_list
 // - Added sql_string in any_literal
+// - Added WS_INLINE? in field expression
+//   Added to ignore any comments
+//    %ignore WS             // Ignore whitespace
+//    %ignore CPP_COMMENT    // Ignore single-line comments
+//    %ignore C_COMMENT      // Ignore multi-line comments
 lexical_unit:   lexical_elements?
@@ -245,6 +250,8 @@ operator_or_punctuator: ","
                     |   "=>"
                     |   ".."
                     |   "..."
+                    |   "{{"
+                    |   "}}"
 document:   section_document
       |     expression_document
@@ -275,6 +282,7 @@ expression: logical_or_expression
       |     if_expression
       |     error_raising_expression
       |     error_handling_expression
+      |     outer_expression
 logical_or_expression:  logical_and_expression
@@ -376,6 +384,8 @@ sql_content: /(?:[^\"\\]|\\[\"]|\"\"|\#\(lf\))+/
 sql_string: "\"" sql_content "\""
+outer_expression: "{{" expression "}}"
 argument_list:  WS_INLINE? expression
             |   WS_INLINE? expression WS_INLINE? "," WS_INLINE? argument_list
             |   WS_INLINE? sql_string
@@ -409,7 +419,7 @@ record_expression:  "[" field_list? "]"
 field_list: field
         |   field "," field_list
-field:  field_name WS_INLINE? "=" WS_INLINE? expression
+field:  WS_INLINE? field_name WS_INLINE? "=" WS_INLINE? expression
 field_name: generalized_identifier
         |   quoted_identifier
@@ -621,4 +631,8 @@ any_literal:      record_literal
 %import common.DIGIT
 %import common.LF
 %import common.CR
-%import common.ESCAPED_STRING
+%import common.ESCAPED_STRING
+%ignore WS            // Ignore whitespace
+%ignore CPP_COMMENT    // Ignore single-line comments
+%ignore C_COMMENT      // Ignore multi-line comments

acryl-datahub 0.15.0.1rc1__py3-none-any.whl → 0.15.0.1rc3__py3-none-any.whl

Potentially problematic release.

acryl-datahub 0.15.0.1rc1__py3-none-any.whl → 0.15.0.1rc3__py3-none-any.whl