acryl-datahub 0.15.0.1rc1__py3-none-any.whl → 0.15.0.1rc3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

@@ -1,4 +1,4 @@
1
- datahub/__init__.py,sha256=gK5aLEGMHMZfg-QUDI5T7mr1ej_5OFVKhCrqqoj_QGk,576
1
+ datahub/__init__.py,sha256=BqWrG3ZFOnsobExCu08Kaj1mOgrBZ-Jmo_XLdalPyI0,576
2
2
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
3
3
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
4
4
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -119,7 +119,7 @@ datahub/emitter/mcp.py,sha256=hAAYziDdkwjazQU0DtWMbQWY8wS09ACrKJbqxoWXdgc,9637
119
119
  datahub/emitter/mcp_builder.py,sha256=ju-1dZMKs5dlWcTi4zcNRVmhkfhmfX3JFULZSbgxSFs,9968
120
120
  datahub/emitter/mcp_patch_builder.py,sha256=W85q1maVUMpOIo5lwLRn82rLXRVoZ_gurl_a-pvVCpE,4291
121
121
  datahub/emitter/request_helper.py,sha256=33ORG3S3OVy97_jlWBRn7yUM5XCIkRN6WSdJvN7Ofcg,670
122
- datahub/emitter/rest_emitter.py,sha256=3kG_aPKy9pLibd4SJNtdJxn792c5TJliFjjCOw6NoUM,15533
122
+ datahub/emitter/rest_emitter.py,sha256=d5Zjo3GXDu9rUqlSsK9aOx-yEbHjDFZHelfq_ZFeb5M,16393
123
123
  datahub/emitter/serialization_helper.py,sha256=q12Avmf70Vy4ttQGMJoTKlE5EsybMKNg2w3MQeZiHvk,3652
124
124
  datahub/emitter/sql_parsing_builder.py,sha256=Cr5imZrm3dYDSCACt5MFscgHCtVbHTD6IjUmsvsKoEs,11991
125
125
  datahub/emitter/synchronized_file_emitter.py,sha256=s4ATuxalI4GDAkrZTaGSegxBdvvNPZ9jRSdtElU0kNs,1805
@@ -144,6 +144,7 @@ datahub/ingestion/api/transform.py,sha256=X0GpjMJzYkLuZx8MTWxH50cWGm9rGsnn3k188m
144
144
  datahub/ingestion/api/workunit.py,sha256=e8n8RfSjHZZm2R4ShNH0UuMtUkMjyqqM2j2t7oL74lo,6327
145
145
  datahub/ingestion/api/auto_work_units/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
146
146
  datahub/ingestion/api/auto_work_units/auto_dataset_properties_aspect.py,sha256=ID_6N3nWl2qohsSGizUCqo3d2MNyDeVbyWroQpSOSsc,5059
147
+ datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py,sha256=9mzg3vhCFPI9zHSi_j3FvJtK8kUvb8PmRSz-TM3tt6k,4323
147
148
  datahub/ingestion/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
148
149
  datahub/ingestion/extractor/extractor_registry.py,sha256=f7CLfW3pr29QZkXSHbp7HjUrsdw7ejQJmot-tiSPcqc,342
149
150
  datahub/ingestion/extractor/json_ref_patch.py,sha256=4g3ZWHn7rwS74jUvSXJiGpi-UKHhiSYKKgBeU4E5ukE,1448
@@ -215,7 +216,7 @@ datahub/ingestion/source/abs/profiling.py,sha256=yKNCKpr6w7qpCH-baeSkNE9VjkN6eBo
215
216
  datahub/ingestion/source/abs/report.py,sha256=fzkTdTewYlWrTk4f2Cyl-e8RV4qw9wEVtm0cdKD-Xgw,542
216
217
  datahub/ingestion/source/abs/source.py,sha256=pzxW-R_cWGKPneEhX8JWdTZiX2k1kAZOPKgMxp9mAEI,24533
217
218
  datahub/ingestion/source/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
218
- datahub/ingestion/source/aws/aws_common.py,sha256=BqDe19yqHdwj6_sjIryGZ9_5lsAJ0PZhfPfGqLZrCcE,10649
219
+ datahub/ingestion/source/aws/aws_common.py,sha256=DfdQgkJ_s2isFx8WvqKTlAcBk4KE8SgfpmA5BgC3fgY,17716
219
220
  datahub/ingestion/source/aws/glue.py,sha256=fX0dtaVVq174ZS0aBJvZFYK8ligfZX5EU3pdS3j1KQs,56215
220
221
  datahub/ingestion/source/aws/s3_boto_utils.py,sha256=Wyp9k9tapsCuw9dyH4FCXJr_wmeLaYFoCtKvrV6SEDk,3892
221
222
  datahub/ingestion/source/aws/s3_util.py,sha256=OFypcgmVC6jnZM90-gjcPpAMtTV1lbnreCaMhCzNlzs,2149
@@ -357,7 +358,7 @@ datahub/ingestion/source/neo4j/neo4j_source.py,sha256=L9WiZ5yZrIDMrgj3gYU9j6zz3T
357
358
  datahub/ingestion/source/powerbi/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
358
359
  datahub/ingestion/source/powerbi/config.py,sha256=9cgMrwp3DQqv14iTqulAONkbVe5nhycJ1ElkA_275go,22732
359
360
  datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py,sha256=-njW1kJOy-LY5JFwJLhVQ0bMBj9NQz5TZhQqsSi_KsM,2285
360
- datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule,sha256=PRWzuZMMhKdOVoAaE8csHvUFbZHxYe5meJHgrqlgiuw,19795
361
+ datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule,sha256=5df3qvalCS9hZ46DPXs6XDcw9-IofGf8Eol_rUC7LHI,20329
361
362
  datahub/ingestion/source/powerbi/powerbi.py,sha256=7UsAEqaFlkWONcXJdQ2hotUYYn46ks6Fe71KXEMh7lI,54495
362
363
  datahub/ingestion/source/powerbi/m_query/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
363
364
  datahub/ingestion/source/powerbi/m_query/data_classes.py,sha256=EbaEasEOGZ73jz0cQofH9ez65wSvRBof0R6GQaIVLnM,2009
@@ -431,26 +432,26 @@ datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81
431
432
  datahub/ingestion/source/snowflake/snowflake_config.py,sha256=LZqnTELtzRNf0vsKG-xXggXyt13S9RYvHOZEZHRjgNk,18851
432
433
  datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJY5rqKNNodXxzg3SS5DF7oA4WXArOA,17793
433
434
  datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
434
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=suMICPFPvoV6shkjD_14JunLc8jAZBINzlFk2mYldkU,23676
435
+ datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=uMGmMEl4hWEmN7GxMyDBdwlIPAW7WmOnu41kZ0dvCG4,21551
435
436
  datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
436
- datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=fu-8S9eADIXZcd_kHc6cBeMa-on9RF9qG3yqjJnS3DE,26085
437
+ datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=8QEihOfivalVR9vLo6vCUL-vnZfAGgMio0uhPYX0jTo,25883
437
438
  datahub/ingestion/source/snowflake/snowflake_query.py,sha256=yDu_1aTAG7eLEh1w1FGmn2-c6NJZURdslnI6fC_4B_0,38723
438
439
  datahub/ingestion/source/snowflake/snowflake_report.py,sha256=_-rD7Q4MzKY8fYzJHSBnGX4gurwujL3UoRzcP_TZURs,6468
439
440
  datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=z5ZPgh-TILAz0DeIwDxRCsj980CM2BbftXiFpM1dV_Y,21674
440
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=vof3mNImstnlL8kc0OkTHzMIqnbEkt9RmnYBX1JX0oE,40386
441
+ datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=K-KEr3OpwMHye08lXAy-5doUUGoGJP3b-ntJAGU_NBY,42472
441
442
  datahub/ingestion/source/snowflake/snowflake_shares.py,sha256=ud3Ah4qHrmSfpD8Od-gPdzwtON9dJa0eqHt-8Yr5h2Q,6366
442
443
  datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=kTmuCtRnvHqM8WBYhWeK4XafJq3ssFL9kcS03jEeWT4,5506
443
444
  datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=fyfWmFVz2WZrpTJWNIe9m0WpDHgeFrGPf8diORJZUwo,6212
444
445
  datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=PEmYNMXJRUvLQmVd8juVqjokfuSPuH9ppcM0ruXamxA,24807
445
446
  datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=YczNEupY89jeegjR2_1pT4bPi9wQ69EIhGpzyCe9Jdg,12600
446
- datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=TlS5d1lpEN74ZP0c8UzUhJZIeBMO3ZIUxRler1p7lnA,31998
447
+ datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=ecaTCJNAQ_IJOPInPGXA3jv1dE5lztSU82UhpBygiq0,31654
447
448
  datahub/ingestion/source/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
448
449
  datahub/ingestion/source/sql/athena.py,sha256=G3cIY8H_76lIUAzQWW2kLnZOEsfbakmojxbiHb3dYZ8,24059
449
450
  datahub/ingestion/source/sql/clickhouse.py,sha256=jzvaXP5Wr0SMhj2rtuvVE821xnfpKiXhO3cm0xblgHs,27299
450
451
  datahub/ingestion/source/sql/cockroachdb.py,sha256=XaD7eae34plU9ISRC6PzYX9q6RdT2qkzjH6CpTOgkx4,1443
451
452
  datahub/ingestion/source/sql/druid.py,sha256=lhO9CCOlHV-6LjBuAxAxtB9I1pvPtsGSdr63bz6_ilA,2837
452
453
  datahub/ingestion/source/sql/hana.py,sha256=0PIvcX0Rz59NyR7Ag5Bv1MBV_UbJwxl9UAopo_xe_CA,1342
453
- datahub/ingestion/source/sql/hive.py,sha256=AgEo94zyBL-NLZxR5-jQlNwq_R9FQ4AUOe8wpOox0Xs,8871
454
+ datahub/ingestion/source/sql/hive.py,sha256=Ueicc3EhKQSC77IDzl_xcTDDalbH1pvB8mox1oFI4Hw,29738
454
455
  datahub/ingestion/source/sql/hive_metastore.py,sha256=PisLrswev583xW0xDJ5yfKWCWm_ZTl1OeuaMcv_SvXc,35865
455
456
  datahub/ingestion/source/sql/mariadb.py,sha256=Hm102kmfs_1rd4lsTYhzVMZq5S3B6cyfvpHSzJjqvMw,737
456
457
  datahub/ingestion/source/sql/mysql.py,sha256=nDWK4YbqomcJgnit9b8geUGrp_3eix4bt0_k94o7g-0,3350
@@ -471,8 +472,8 @@ datahub/ingestion/source/sql/trino.py,sha256=FEn_BQ3pm23hKx94ek5kk5IXGNYcBqZEhll
471
472
  datahub/ingestion/source/sql/two_tier_sql_source.py,sha256=YDrGBb5WKVls6qv17QU5foKrf71SydzEltc3WsVAhQc,5732
472
473
  datahub/ingestion/source/sql/vertica.py,sha256=_9OgSgIgqBml0av063rb8nACiT3SAmzpw0ouyF91wv8,33382
473
474
  datahub/ingestion/source/sql/mssql/__init__.py,sha256=1agpl8S_uDW40olkhCX_W19dbr5GO9qgjS3R7pLRZSk,87
474
- datahub/ingestion/source/sql/mssql/job_models.py,sha256=ztXDrD4anhzwWvACIm9fucE2WhMDMKkJ4alMYOQOqWA,7083
475
- datahub/ingestion/source/sql/mssql/source.py,sha256=ODdsOIbDA3X0E7En6GT15mD49W6RW9sXLwRoUgw2a8I,30925
475
+ datahub/ingestion/source/sql/mssql/job_models.py,sha256=qMA4yzRiX-60ugsFu5ob3tOSgVs0uPTW_O8OHCECS_0,8002
476
+ datahub/ingestion/source/sql/mssql/source.py,sha256=0ivc6svImkYfGYuAPDX4ZsaYZTilohwaV4cg3KWOqNI,31237
476
477
  datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py,sha256=RpnvKPalAAaOD_eUg8bZ4VkGTSeLFWuy0mefwc4s3x8,2837
477
478
  datahub/ingestion/source/state/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
478
479
  datahub/ingestion/source/state/checkpoint.py,sha256=x9Xww-MIFXSKjeg1tOZXE72LehCm5OfKy3HfucgIRWM,8833
@@ -490,7 +491,7 @@ datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider
490
491
  datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py,sha256=xsH7Ao_05VTjqpkzLkhdf5B1ULMzFoD8vkJJIJU9w-U,4077
491
492
  datahub/ingestion/source/state_provider/state_provider_registry.py,sha256=SVq4mIyGNmLXE9OZx1taOiNPqDoQp03-Ot9rYnB5F3k,401
492
493
  datahub/ingestion/source/tableau/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
493
- datahub/ingestion/source/tableau/tableau.py,sha256=80m5a8g2q5jnm5pFtL4s_SmSeGPQIuZGqGe9RGn6OSI,138248
494
+ datahub/ingestion/source/tableau/tableau.py,sha256=5oyAoF7h92zUTh9NIaYipyRXUFUZSz1oo8hwn46RM2w,138661
494
495
  datahub/ingestion/source/tableau/tableau_common.py,sha256=9gQLq_3BlAsKll83uVlnWJRWaIDtFtREUyuimXF13Z0,26219
495
496
  datahub/ingestion/source/tableau/tableau_constant.py,sha256=ZcAeHsQUXVVL26ORly0ByZk_GJAFbxaKuJAlX_sYMac,2686
496
497
  datahub/ingestion/source/tableau/tableau_server_wrapper.py,sha256=nSyx9RzC6TCQDm-cTVJ657qT8iDwzk_8JMKpohhmOc4,1046
@@ -505,7 +506,7 @@ datahub/ingestion/source/unity/proxy.py,sha256=2-pYQ-3B9UVUwO1yB9iTdi3DqgqZ2JrpQ
505
506
  datahub/ingestion/source/unity/proxy_profiling.py,sha256=WLqvYP6MziaisA4LYL4T_GA-kPt6Xdde7bfaYsjYw40,9663
506
507
  datahub/ingestion/source/unity/proxy_types.py,sha256=qrvHiwPzl5cPX-KRvcIGGeJVdr0I8XUQmoAI6ErZ-v8,9371
507
508
  datahub/ingestion/source/unity/report.py,sha256=0Y-ciHVTI6ZKNCJ5zWoQh3Ze1c_GMqmTMKFwzXDuuOg,2788
508
- datahub/ingestion/source/unity/source.py,sha256=YdUPCMJtpmvYVnRNnpqb4BVowFobkLvSJ_K2gHwrvCI,41752
509
+ datahub/ingestion/source/unity/source.py,sha256=ydjqJ91q-Mir_aJSmEdo1umgPvQ5la67rBhC04IyV78,41938
509
510
  datahub/ingestion/source/unity/usage.py,sha256=igRxYg8usukTAA229uJWi-0y-Zd0yOq9dEBi2k9f15o,11436
510
511
  datahub/ingestion/source/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
511
512
  datahub/ingestion/source/usage/clickhouse_usage.py,sha256=8nQqNAPKqivywjzsvqH0-HWFwjd4gECpw_xahLXk5ek,9970
@@ -516,7 +517,7 @@ datahub/ingestion/source_config/csv_enricher.py,sha256=IROxxfFJA56dHkmmbjjhb7h1p
516
517
  datahub/ingestion/source_config/operation_config.py,sha256=Q0NlqiEh4s4DFIII5NsAp5hxWTVyyJz-ldcQmH-B47s,3504
517
518
  datahub/ingestion/source_config/pulsar.py,sha256=sklDkh62CrWV-i7Ifh6R3T3smYVso6gyRJG8HVc6RdA,5533
518
519
  datahub/ingestion/source_report/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
519
- datahub/ingestion/source_report/ingestion_stage.py,sha256=zijPKvJKnB298cMoG6j_w6QRva9t1RWqnmycmbsKdgs,1499
520
+ datahub/ingestion/source_report/ingestion_stage.py,sha256=pJcJeLSjaixlLqQyQtE3bfUcvXEVwrSaWWtU4iU9UEo,1557
520
521
  datahub/ingestion/source_report/pulsar.py,sha256=iKhzy644AjoFTV-gxyqBoXKMLwSMPxJFxU-3WDQRww0,1037
521
522
  datahub/ingestion/source_report/time_window.py,sha256=9yI5l2S1DcF7ClvUHLeN8m62I5vlhV9k-aQqSZh2l7w,229
522
523
  datahub/ingestion/transformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -980,8 +981,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
980
981
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
981
982
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
982
983
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
983
- acryl_datahub-0.15.0.1rc1.dist-info/METADATA,sha256=Ll6D3fw03bz2dVmCp9Mcez5IbFKIfXzUnMeSe6Ej4eM,173642
984
- acryl_datahub-0.15.0.1rc1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
985
- acryl_datahub-0.15.0.1rc1.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
986
- acryl_datahub-0.15.0.1rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
987
- acryl_datahub-0.15.0.1rc1.dist-info/RECORD,,
984
+ acryl_datahub-0.15.0.1rc3.dist-info/METADATA,sha256=JxWLTgCDIjuZb3Q_cRrG3nTw6mqsweOED2pg_izLT5I,173642
985
+ acryl_datahub-0.15.0.1rc3.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
986
+ acryl_datahub-0.15.0.1rc3.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
987
+ acryl_datahub-0.15.0.1rc3.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
988
+ acryl_datahub-0.15.0.1rc3.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
3
3
 
4
4
  # Published at https://pypi.org/project/acryl-datahub/.
5
5
  __package_name__ = "acryl-datahub"
6
- __version__ = "0.15.0.1rc1"
6
+ __version__ = "0.15.0.1rc3"
7
7
 
8
8
 
9
9
  def is_dev_mode() -> bool:
@@ -291,6 +291,7 @@ class DataHubRestEmitter(Closeable, Emitter):
291
291
  mcps: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
292
292
  async_flag: Optional[bool] = None,
293
293
  ) -> int:
294
+ logger.debug("Attempting to emit batch mcps")
294
295
  url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
295
296
  for mcp in mcps:
296
297
  ensure_has_system_metadata(mcp)
@@ -303,15 +304,22 @@ class DataHubRestEmitter(Closeable, Emitter):
303
304
  current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
304
305
  for mcp_obj in mcp_objs:
305
306
  mcp_obj_size = len(json.dumps(mcp_obj))
307
+ logger.debug(
308
+ f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
309
+ )
306
310
 
307
311
  if (
308
312
  mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
309
313
  or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
310
314
  ):
315
+ logger.debug("Decided to create new chunk")
311
316
  mcp_obj_chunks.append([])
312
317
  current_chunk_size = 0
313
318
  mcp_obj_chunks[-1].append(mcp_obj)
314
319
  current_chunk_size += mcp_obj_size
320
+ logger.debug(
321
+ f"Decided to send {len(mcps)} mcps in {len(mcp_obj_chunks)} chunks"
322
+ )
315
323
 
316
324
  for mcp_obj_chunk in mcp_obj_chunks:
317
325
  # TODO: We're calling json.dumps on each MCP object twice, once to estimate
@@ -338,8 +346,15 @@ class DataHubRestEmitter(Closeable, Emitter):
338
346
 
339
347
  def _emit_generic(self, url: str, payload: str) -> None:
340
348
  curl_command = make_curl_command(self._session, "POST", url, payload)
349
+ payload_size = len(payload)
350
+ if payload_size > INGEST_MAX_PAYLOAD_BYTES:
351
+ # since we know total payload size here, we could simply avoid sending such payload at all and report a warning, with current approach we are going to cause whole ingestion to fail
352
+ logger.warning(
353
+ f"Apparent payload size exceeded {INGEST_MAX_PAYLOAD_BYTES}, might fail with an exception due to the size"
354
+ )
341
355
  logger.debug(
342
- "Attempting to emit to DataHub GMS; using curl equivalent to:\n%s",
356
+ "Attempting to emit aspect (size: %s) to DataHub GMS; using curl equivalent to:\n%s",
357
+ payload_size,
343
358
  curl_command,
344
359
  )
345
360
  try:
@@ -0,0 +1,96 @@
1
+ import json
2
+ import logging
3
+ from typing import Iterable, List
4
+
5
+ from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
6
+ from datahub.emitter.serialization_helper import pre_json_transform
7
+ from datahub.ingestion.api.source import SourceReport
8
+ from datahub.ingestion.api.workunit import MetadataWorkUnit
9
+ from datahub.metadata.schema_classes import (
10
+ DatasetProfileClass,
11
+ SchemaFieldClass,
12
+ SchemaMetadataClass,
13
+ )
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class EnsureAspectSizeProcessor:
    """Trim oversized aspects in a work unit stream before they are emitted.

    Only schema metadata and dataset profile aspects are inspected. Anything
    that is removed is reported as a warning on the supplied source report.
    """

    def __init__(
        self, report: SourceReport, payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
    ):
        # Sink for the warnings raised whenever data gets dropped.
        self.report = report
        # Size budget that the accumulated aspect content is compared against.
        self.payload_constraint = payload_constraint

    def ensure_dataset_profile_size(
        self, dataset_urn: str, profile: DatasetProfileClass
    ) -> None:
        """Drop sample values from field profiles once the size budget is used up.

        The size of a field is the summed ``len()`` of its non-empty sample
        values. Fields are visited in order; a field whose samples would push
        the running total above ``payload_constraint`` has its sample values
        replaced by an empty list (in place) and a warning is reported. This is
        a rough heuristic and may be adjusted in the future.
        """
        if not profile.fieldProfiles:
            return

        logger.debug(f"Length of field profiles: {len(profile.fieldProfiles)}")
        accumulated_size = 0
        for field in profile.fieldProfiles:
            if not field.sampleValues:
                logger.debug(f"Field {field.fieldPath} has no sample values")
                continue

            # Empty/None samples contribute nothing to the estimate.
            values_len = sum(len(sample) for sample in field.sampleValues if sample)
            logger.debug(
                f"Field {field.fieldPath} has {len(field.sampleValues)} sample values, taking total bytes {values_len}"
            )

            if accumulated_size + values_len <= self.payload_constraint:
                accumulated_size += values_len
                continue

            # Over budget: keep the field profile itself, discard only its samples.
            field.sampleValues = []
            self.report.warning(
                title="Dataset profile truncated due to size constraint",
                message="Dataset profile contained too much data and would have caused ingestion to fail",
                context=f"Sample values for field {field.fieldPath} were removed from dataset profile for {dataset_urn} due to aspect size constraints",
            )

    def ensure_schema_metadata_size(
        self, dataset_urn: str, schema: SchemaMetadataClass
    ) -> None:
        """Remove schema fields that do not fit into the size budget.

        Each field is sized by the length of its JSON serialization. Fields are
        kept, in order, while the running total stays strictly below
        ``payload_constraint``; every field that does not fit is reported as a
        warning and left out. ``schema.fields`` is replaced with the kept
        fields. This is a rough heuristic and may be adjusted in the future.
        """
        logger.debug(f"Amount of schema fields: {len(schema.fields)}")
        kept_fields: List[SchemaFieldClass] = []
        kept_size = 0
        for field in schema.fields:
            field_size = len(json.dumps(pre_json_transform(field.to_obj())))
            logger.debug(f"Field {field.fieldPath} takes total {field_size}")

            if kept_size + field_size >= self.payload_constraint:
                self.report.warning(
                    title="Schema truncated due to size constraint",
                    message="Dataset schema contained too much data and would have caused ingestion to fail",
                    context=f"Field {field.fieldPath} was removed from schema for {dataset_urn} due to aspect size constraints",
                )
                continue

            kept_fields.append(field)
            kept_size += field_size

        schema.fields = kept_fields

    def ensure_aspect_size(
        self,
        stream: Iterable[MetadataWorkUnit],
    ) -> Iterable[MetadataWorkUnit]:
        """Yield every work unit of ``stream`` after trimming suspect aspects.

        We have hard limitation of aspect size being 16 MB. Some aspects can
        exceed that value, causing an exception on GMS side and failure of the
        entire ingestion. A work unit is checked for a schema metadata aspect
        first; the dataset profile check only runs when no schema metadata
        aspect was found.
        """
        for wu in stream:
            logger.debug(f"Ensuring size of workunit: {wu.id}")

            schema = wu.get_aspect_of_type(SchemaMetadataClass)
            if schema:
                self.ensure_schema_metadata_size(wu.get_urn(), schema)
            else:
                profile = wu.get_aspect_of_type(DatasetProfileClass)
                if profile:
                    self.ensure_dataset_profile_size(wu.get_urn(), profile)
            yield wu
@@ -1,7 +1,12 @@
1
+ import logging
2
+ import os
1
3
  from datetime import datetime, timedelta, timezone
2
- from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union
4
+ from enum import Enum
5
+ from http import HTTPStatus
6
+ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
3
7
 
4
8
  import boto3
9
+ import requests
5
10
  from boto3.session import Session
6
11
  from botocore.config import DEFAULT_TIMEOUT, Config
7
12
  from botocore.utils import fix_s3_host
@@ -14,6 +19,8 @@ from datahub.configuration.common import (
14
19
  )
15
20
  from datahub.configuration.source_common import EnvConfigMixin
16
21
 
22
+ logger = logging.getLogger(__name__)
23
+
17
24
  if TYPE_CHECKING:
18
25
  from mypy_boto3_dynamodb import DynamoDBClient
19
26
  from mypy_boto3_glue import GlueClient
@@ -22,6 +29,26 @@ if TYPE_CHECKING:
22
29
  from mypy_boto3_sts import STSClient
23
30
 
24
31
 
32
class AwsEnvironment(Enum):
    """AWS runtime environments that environment detection can report."""

    EC2 = "EC2"
    ECS = "ECS"
    EKS = "EKS"
    LAMBDA = "LAMBDA"
    APP_RUNNER = "APP_RUNNER"
    # Member name and value differ here: the value carries the full service name.
    BEANSTALK = "ELASTIC_BEANSTALK"
    CLOUD_FORMATION = "CLOUD_FORMATION"
    # Fallback when none of the known environments matched.
    UNKNOWN = "UNKNOWN"
41
+
42
+
43
class AwsServicePrincipal(Enum):
    """Service principal host names, one per supported AWS compute service."""

    LAMBDA = "lambda.amazonaws.com"
    EKS = "eks.amazonaws.com"
    APP_RUNNER = "apprunner.amazonaws.com"
    ECS = "ecs.amazonaws.com"
    ELASTIC_BEANSTALK = "elasticbeanstalk.amazonaws.com"
    EC2 = "ec2.amazonaws.com"
50
+
51
+
25
52
  class AwsAssumeRoleConfig(PermissiveConfigModel):
26
53
  # Using the PermissiveConfigModel to allow the user to pass additional arguments.
27
54
 
@@ -34,6 +61,163 @@ class AwsAssumeRoleConfig(PermissiveConfigModel):
34
61
  )
35
62
 
36
63
 
64
def get_instance_metadata_token() -> Optional[str]:
    """Request an IMDSv2 session token from the instance metadata endpoint.

    Returns:
        The token text when the endpoint answers with HTTP 200; ``None`` when
        the request raises a ``requests`` exception or answers with any other
        status.
    """
    token_endpoint = "http://169.254.169.254/latest/api/token"
    ttl_headers = {"X-aws-ec2-metadata-token-ttl-seconds": "21600"}

    try:
        # One-second timeout keeps the probe short when nothing answers.
        reply = requests.put(token_endpoint, headers=ttl_headers, timeout=1)
    except requests.exceptions.RequestException:
        logger.debug("Failed to get IMDSv2 token")
        return None

    return reply.text if reply.status_code == HTTPStatus.OK else None
77
+
78
+
79
def is_running_on_ec2() -> bool:
    """Report whether the instance metadata endpoint identifies an EC2 instance.

    Obtains an IMDSv2 token and then queries the ``instance-id`` metadata path
    with it.

    Returns:
        ``True`` only when a token was obtained and the ``instance-id`` request
        answered with HTTP 200; ``False`` when no token is available, the
        request raises a ``requests`` exception, or the status differs.
    """
    imds_token = get_instance_metadata_token()
    if not imds_token:
        return False

    try:
        reply = requests.get(
            "http://169.254.169.254/latest/meta-data/instance-id",
            headers={"X-aws-ec2-metadata-token": imds_token},
            timeout=1,
        )
    except requests.exceptions.RequestException:
        return False

    return reply.status_code == HTTPStatus.OK
94
+
95
+
96
def detect_aws_environment() -> AwsEnvironment:
    """Determine which AWS environment the process is running in.

    Environment variables are checked first, in a fixed order, because some
    environments may expose more than one indicator. The instance metadata
    probe for EC2 runs last, only when no environment variable matched.

    Returns:
        The first matching ``AwsEnvironment`` member, or
        ``AwsEnvironment.UNKNOWN`` when nothing matched.
    """
    environ = os.environ

    # Lambda is the most specific indicator, so it is checked first.
    if environ.get("AWS_LAMBDA_FUNCTION_NAME"):
        execution_env = environ.get("AWS_EXECUTION_ENV", "")
        if execution_env.startswith("CloudFormation"):
            return AwsEnvironment.CLOUD_FORMATION
        return AwsEnvironment.LAMBDA

    # EKS (IRSA): both the web identity token file and the role ARN must be set.
    if environ.get("AWS_WEB_IDENTITY_TOKEN_FILE") and environ.get("AWS_ROLE_ARN"):
        return AwsEnvironment.EKS

    # App Runner
    if environ.get("AWS_APP_RUNNER_SERVICE_ID"):
        return AwsEnvironment.APP_RUNNER

    # ECS: either metadata URI variable is sufficient.
    if environ.get("ECS_CONTAINER_METADATA_URI_V4") or environ.get(
        "ECS_CONTAINER_METADATA_URI"
    ):
        return AwsEnvironment.ECS

    # Elastic Beanstalk
    if environ.get("ELASTIC_BEANSTALK_ENVIRONMENT_NAME"):
        return AwsEnvironment.BEANSTALK

    # No environment variable matched; fall back to the metadata probe.
    return AwsEnvironment.EC2 if is_running_on_ec2() else AwsEnvironment.UNKNOWN
129
+
130
+
131
def get_instance_role_arn() -> Optional[str]:
    """Look up the ARN of the identity attached to the current EC2 instance.

    An IMDSv2 token is obtained first and used to query the instance's
    ``iam/security-credentials/`` metadata path. The response body is only
    checked for being non-empty; the value returned is the ``Arn`` reported by
    STS ``get_caller_identity`` for the default boto3 credentials.

    Returns:
        The caller identity ARN, or ``None`` when no token is available, the
        metadata request does not answer with HTTP 200, the response body is
        empty, or any step raises.
    """
    token = get_instance_metadata_token()
    if not token:
        return None

    try:
        response = requests.get(
            "http://169.254.169.254/latest/meta-data/iam/security-credentials/",
            headers={"X-aws-ec2-metadata-token": token},
            timeout=1,
        )
        # Use HTTPStatus.OK like the other metadata helpers instead of a bare 200.
        if response.status_code != HTTPStatus.OK:
            return None

        # An empty body is treated as "no role attached"; the body itself is
        # not used to build the ARN.
        if not response.text.strip():
            return None

        return boto3.client("sts").get_caller_identity().get("Arn")
    except Exception as e:
        # Best-effort lookup: any failure is logged at debug level and reported as None.
        logger.debug(f"Failed to get instance role ARN: {e}")
    return None
152
+
153
+
154
def get_lambda_role_arn() -> Optional[str]:
    """Return the ``Role`` of the Lambda function named in the environment.

    The function name is read from ``AWS_LAMBDA_FUNCTION_NAME`` and its
    configuration is fetched through the boto3 Lambda client.

    Returns:
        The ``Role`` entry of the function configuration, or ``None`` when the
        environment variable is unset/empty or the lookup raises.
    """
    lambda_name = os.getenv("AWS_LAMBDA_FUNCTION_NAME")
    if not lambda_name:
        return None

    try:
        configuration = boto3.client("lambda").get_function_configuration(
            FunctionName=lambda_name
        )
        return configuration.get("Role")
    except Exception as e:
        # Best-effort lookup: failures are logged at debug level only.
        logger.debug(f"Failed to get Lambda role ARN: {e}")
        return None
169
+
170
+
171
def get_current_identity() -> Tuple[Optional[str], Optional[str]]:
    """Resolve the current identity ARN and its credential source.

    The lookup strategy depends on the environment reported by
    ``detect_aws_environment()``.

    Returns:
        A ``(arn, credential_source)`` tuple, where ``credential_source`` is
        the matching ``AwsServicePrincipal`` value. The ARN part may be
        ``None`` when the per-environment lookup finds nothing. ``(None, None)``
        is returned for environments without a lookup strategy and when the
        App Runner or ECS lookup fails.
    """
    environment = detect_aws_environment()

    if environment is AwsEnvironment.LAMBDA:
        return get_lambda_role_arn(), AwsServicePrincipal.LAMBDA.value

    if environment is AwsEnvironment.EKS:
        return os.getenv("AWS_ROLE_ARN"), AwsServicePrincipal.EKS.value

    if environment is AwsEnvironment.BEANSTALK:
        # Beanstalk uses EC2 instance metadata
        return get_instance_role_arn(), AwsServicePrincipal.ELASTIC_BEANSTALK.value

    if environment is AwsEnvironment.EC2:
        return get_instance_role_arn(), AwsServicePrincipal.EC2.value

    if environment is AwsEnvironment.APP_RUNNER:
        try:
            caller = boto3.client("sts").get_caller_identity()
            return caller.get("Arn"), AwsServicePrincipal.APP_RUNNER.value
        except Exception as e:
            logger.debug(f"Failed to get App Runner role: {e}")

    elif environment is AwsEnvironment.ECS:
        try:
            metadata_uri = os.getenv("ECS_CONTAINER_METADATA_URI_V4") or os.getenv(
                "ECS_CONTAINER_METADATA_URI"
            )
            if metadata_uri:
                reply = requests.get(f"{metadata_uri}/task", timeout=1)
                if reply.status_code == HTTPStatus.OK:
                    task_metadata = reply.json()
                    # NOTE(review): this is the task ARN, not a role ARN — confirm
                    # that callers expect this value.
                    if "TaskARN" in task_metadata:
                        return (
                            task_metadata.get("TaskARN"),
                            AwsServicePrincipal.ECS.value,
                        )
        except Exception as e:
            logger.debug(f"Failed to get ECS task role: {e}")

    # Unhandled environment, or a lookup above fell through.
    return None, None
219
+
220
+
37
221
  def assume_role(
38
222
  role: AwsAssumeRoleConfig,
39
223
  aws_region: Optional[str],
@@ -95,7 +279,7 @@ class AwsConnectionConfig(ConfigModel):
95
279
  )
96
280
  aws_profile: Optional[str] = Field(
97
281
  default=None,
98
- description="Named AWS profile to use. Only used if access key / secret are unset. If not set the default will be used",
282
+ description="The [named profile](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-profiles.html) to use from AWS credentials. Falls back to default profile if not specified and no access keys provided. Profiles are configured in ~/.aws/credentials or ~/.aws/config.",
99
283
  )
100
284
  aws_region: Optional[str] = Field(None, description="AWS region code.")
101
285
 
@@ -145,6 +329,7 @@ class AwsConnectionConfig(ConfigModel):
145
329
 
146
330
  def get_session(self) -> Session:
147
331
  if self.aws_access_key_id and self.aws_secret_access_key:
332
+ # Explicit credentials take precedence
148
333
  session = Session(
149
334
  aws_access_key_id=self.aws_access_key_id,
150
335
  aws_secret_access_key=self.aws_secret_access_key,
@@ -152,38 +337,57 @@ class AwsConnectionConfig(ConfigModel):
152
337
  region_name=self.aws_region,
153
338
  )
154
339
  elif self.aws_profile:
340
+ # Named profile is second priority
155
341
  session = Session(
156
342
  region_name=self.aws_region, profile_name=self.aws_profile
157
343
  )
158
344
  else:
159
- # Use boto3's credential autodetection.
345
+ # Use boto3's credential autodetection
160
346
  session = Session(region_name=self.aws_region)
161
347
 
162
- if self._normalized_aws_roles():
163
- # Use existing session credentials to start the chain of role assumption.
164
- current_credentials = session.get_credentials()
165
- credentials = {
166
- "AccessKeyId": current_credentials.access_key,
167
- "SecretAccessKey": current_credentials.secret_key,
168
- "SessionToken": current_credentials.token,
169
- }
170
-
171
- for role in self._normalized_aws_roles():
172
- if self._should_refresh_credentials():
173
- credentials = assume_role(
174
- role,
175
- self.aws_region,
176
- credentials=credentials,
348
+ target_roles = self._normalized_aws_roles()
349
+ if target_roles:
350
+ current_role_arn, credential_source = get_current_identity()
351
+
352
+ # Only assume role if:
353
+ # 1. We're not in a known AWS environment with a role, or
354
+ # 2. We need to assume a different role than our current one
355
+ should_assume_role = current_role_arn is None or any(
356
+ role.RoleArn != current_role_arn for role in target_roles
357
+ )
358
+
359
+ if should_assume_role:
360
+ env = detect_aws_environment()
361
+ logger.debug(f"Assuming role(s) from {env.value} environment")
362
+
363
+ current_credentials = session.get_credentials()
364
+ if current_credentials is None:
365
+ raise ValueError("No credentials available for role assumption")
366
+
367
+ credentials = {
368
+ "AccessKeyId": current_credentials.access_key,
369
+ "SecretAccessKey": current_credentials.secret_key,
370
+ "SessionToken": current_credentials.token,
371
+ }
372
+
373
+ for role in target_roles:
374
+ if self._should_refresh_credentials():
375
+ credentials = assume_role(
376
+ role=role,
377
+ aws_region=self.aws_region,
378
+ credentials=credentials,
379
+ )
380
+ if isinstance(credentials["Expiration"], datetime):
381
+ self._credentials_expiration = credentials["Expiration"]
382
+
383
+ session = Session(
384
+ aws_access_key_id=credentials["AccessKeyId"],
385
+ aws_secret_access_key=credentials["SecretAccessKey"],
386
+ aws_session_token=credentials["SessionToken"],
387
+ region_name=self.aws_region,
177
388
  )
178
- if isinstance(credentials["Expiration"], datetime):
179
- self._credentials_expiration = credentials["Expiration"]
180
-
181
- session = Session(
182
- aws_access_key_id=credentials["AccessKeyId"],
183
- aws_secret_access_key=credentials["SecretAccessKey"],
184
- aws_session_token=credentials["SessionToken"],
185
- region_name=self.aws_region,
186
- )
389
+ else:
390
+ logger.debug(f"Using existing role from {credential_source}")
187
391
 
188
392
  return session
189
393
 
@@ -21,6 +21,11 @@
21
21
  // | empty_string
22
22
  // | empty_string "," argument_list
23
23
  // - Added sql_string in any_literal
24
+ // - Added WS_INLINE? in field expression
25
+ // Added to ignore any comments
26
+ // %ignore WS // Ignore whitespace
27
+ // %ignore CPP_COMMENT // Ignore single-line comments
28
+ // %ignore C_COMMENT // Ignore multi-line comments
24
29
 
25
30
  lexical_unit: lexical_elements?
26
31
 
@@ -245,6 +250,8 @@ operator_or_punctuator: ","
245
250
  | "=>"
246
251
  | ".."
247
252
  | "..."
253
+ | "{{"
254
+ | "}}"
248
255
 
249
256
  document: section_document
250
257
  | expression_document
@@ -275,6 +282,7 @@ expression: logical_or_expression
275
282
  | if_expression
276
283
  | error_raising_expression
277
284
  | error_handling_expression
285
+ | outer_expression
278
286
 
279
287
 
280
288
  logical_or_expression: logical_and_expression
@@ -376,6 +384,8 @@ sql_content: /(?:[^\"\\]|\\[\"]|\"\"|\#\(lf\))+/
376
384
 
377
385
  sql_string: "\"" sql_content "\""
378
386
 
387
+ outer_expression: "{{" expression "}}"
388
+
379
389
  argument_list: WS_INLINE? expression
380
390
  | WS_INLINE? expression WS_INLINE? "," WS_INLINE? argument_list
381
391
  | WS_INLINE? sql_string
@@ -409,7 +419,7 @@ record_expression: "[" field_list? "]"
409
419
  field_list: field
410
420
  | field "," field_list
411
421
 
412
- field: field_name WS_INLINE? "=" WS_INLINE? expression
422
+ field: WS_INLINE? field_name WS_INLINE? "=" WS_INLINE? expression
413
423
 
414
424
  field_name: generalized_identifier
415
425
  | quoted_identifier
@@ -621,4 +631,8 @@ any_literal: record_literal
621
631
  %import common.DIGIT
622
632
  %import common.LF
623
633
  %import common.CR
624
- %import common.ESCAPED_STRING
634
+ %import common.ESCAPED_STRING
635
+
636
+ %ignore WS // Ignore whitespace
637
+ %ignore CPP_COMMENT // Ignore single-line comments
638
+ %ignore C_COMMENT // Ignore multi-line comments