acryl-datahub 1.3.0.1rc4__py3-none-any.whl → 1.3.0.1rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (31)
  1. {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/METADATA +2637 -2633
  2. {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/RECORD +31 -28
  3. datahub/_version.py +1 -1
  4. datahub/ingestion/source/aws/aws_common.py +161 -0
  5. datahub/ingestion/source/bigquery_v2/bigquery.py +17 -1
  6. datahub/ingestion/source/bigquery_v2/bigquery_config.py +16 -0
  7. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +5 -3
  8. datahub/ingestion/source/bigquery_v2/queries_extractor.py +41 -4
  9. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  10. datahub/ingestion/source/redshift/usage.py +2 -2
  11. datahub/ingestion/source/snowflake/snowflake_config.py +16 -0
  12. datahub/ingestion/source/snowflake/snowflake_queries.py +46 -6
  13. datahub/ingestion/source/snowflake/snowflake_v2.py +14 -1
  14. datahub/ingestion/source/sql/mysql.py +101 -4
  15. datahub/ingestion/source/sql/postgres.py +81 -4
  16. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  17. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  18. datahub/ingestion/source/state/stateful_ingestion_base.py +30 -2
  19. datahub/metadata/_internal_schema_classes.py +772 -546
  20. datahub/metadata/_urns/urn_defs.py +1751 -1695
  21. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  22. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  23. datahub/metadata/schema.avsc +18450 -18242
  24. datahub/metadata/schemas/DataHubFileInfo.avsc +228 -0
  25. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  26. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +3 -1
  27. datahub/sql_parsing/sql_parsing_aggregator.py +18 -4
  28. {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/WHEEL +0 -0
  29. {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/entry_points.txt +0 -0
  30. {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/licenses/LICENSE +0 -0
  31. {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
1
- acryl_datahub-1.3.0.1rc4.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
1
+ acryl_datahub-1.3.0.1rc6.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
2
2
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
3
3
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
4
- datahub/_version.py,sha256=wuuyfFhYvCgNlVjy7dlOQ6sSJuFvaUAnTlhSVJzQ-fM,323
4
+ datahub/_version.py,sha256=PW9A4Uazfqf_qZ54rH_cG6i8GhrmicechIGtJYipA8Q,323
5
5
  datahub/entrypoints.py,sha256=VcbU6Z47b_JKW1zI-WJMYIngm05FSogKLiuvFNtyNcI,9088
6
6
  datahub/errors.py,sha256=p5rFAdAGVCk4Lqolol1YvthceadUSwpaCxLXRcyCCFQ,676
7
7
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -241,7 +241,7 @@ datahub/ingestion/source/abs/source.py,sha256=z86K5_P_gu8kTytLOAYyQqqD2g14JGSrv1
241
241
  datahub/ingestion/source/apply/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
242
242
  datahub/ingestion/source/apply/datahub_apply.py,sha256=xTD-Iq3UHhxcz61RwNuI2kJjRrnQEfZFSgvS1X6loV4,7703
243
243
  datahub/ingestion/source/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
244
- datahub/ingestion/source/aws/aws_common.py,sha256=1NgwpasOO3fpBGtZnxWEzjHFoHjEs5hpg_HkTvsbn0M,18147
244
+ datahub/ingestion/source/aws/aws_common.py,sha256=Va9uxo5aKsAR7qIC625VpRO3XDqzNIg4SfK_eFg25Rw,23781
245
245
  datahub/ingestion/source/aws/glue.py,sha256=dUaMWcI5Ed-TzbbSrF6suT4L1vcRHoHfFCdTvAINc4w,67423
246
246
  datahub/ingestion/source/aws/platform_resource_repository.py,sha256=0eUfGy1FbaBltCSNTtXyLrkrdqTc1KkTgDJB1Gd-Ydk,853
247
247
  datahub/ingestion/source/aws/s3_boto_utils.py,sha256=rGlWAkKZpkeA1_wMvcJvSDvobvduShszowU-KcrQudg,7011
@@ -260,10 +260,10 @@ datahub/ingestion/source/azure/abs_folder_utils.py,sha256=7skXus-4fSIoKpqCeU-GG0
260
260
  datahub/ingestion/source/azure/abs_utils.py,sha256=KdAlCK-PMrn35kFHxz5vrsjajyx2PD5GRgoBKdoRvcg,2075
261
261
  datahub/ingestion/source/azure/azure_common.py,sha256=DvPrLpjQSJ1USB_myGmg8lGkRW-WAl2GIZMcEkBFjOs,4063
262
262
  datahub/ingestion/source/bigquery_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
263
- datahub/ingestion/source/bigquery_v2/bigquery.py,sha256=u4-LLt6ZDe3hKqLWqEByYpc0z-UcEZf85uok9qNEFko,15321
263
+ datahub/ingestion/source/bigquery_v2/bigquery.py,sha256=Z5QsyvBNDWEf9kME_zBRw2oLIh3rD5zafpvuYB0p4ow,15972
264
264
  datahub/ingestion/source/bigquery_v2/bigquery_audit.py,sha256=kEwWhq3ch6WT4q4hcX8-fvQh28KgrNfspFwIytO3vQA,25103
265
265
  datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py,sha256=LuGJ6LgPViLIfDQfylxlQ3CA7fZYM5MDt8M-7sfzm84,5096
266
- datahub/ingestion/source/bigquery_v2/bigquery_config.py,sha256=3u5p6RM4-q651fXrDTDsB5jY29lK_usr6OWI9N1LjuQ,22977
266
+ datahub/ingestion/source/bigquery_v2/bigquery_config.py,sha256=A5pLaTm4WCTndhGmGBGrjc05LHtC7C5-FrE3vEMc1ik,23880
267
267
  datahub/ingestion/source/bigquery_v2/bigquery_connection.py,sha256=6XFCc0oxxU3R4IPyYHaf3YMETlMD4ztkNpkf4kf1Elw,3171
268
268
  datahub/ingestion/source/bigquery_v2/bigquery_data_reader.py,sha256=DeT3v_Z82__8En0FcZ0kavBAWQoRvSZ5Rppm9eeDAb8,2393
269
269
  datahub/ingestion/source/bigquery_v2/bigquery_helper.py,sha256=QER3gY8e_k1_eNVj7cBso7ZzrWl_vO5PYSa6CpvqNx8,1554
@@ -271,13 +271,13 @@ datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py,sha256
271
271
  datahub/ingestion/source/bigquery_v2/bigquery_queries.py,sha256=2syDMaRpYEbtGUVejVAK5d6g8HqM54ZyEM908uLJ55o,3393
272
272
  datahub/ingestion/source/bigquery_v2/bigquery_report.py,sha256=zlTkqOmt5zxnO40rVTYHF3fclj4OVlLtqUXwW5WIIcM,7855
273
273
  datahub/ingestion/source/bigquery_v2/bigquery_schema.py,sha256=zbYb1EYnCJxgvsU8oT_76l0q_BW1exVjMWM1GAgd1nc,32600
274
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py,sha256=_NLFRRXsrxMZ8Vjg2jVL4Pg1_NGt9hzn9EWBooJZ8so,51566
274
+ datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py,sha256=PbSCMj5ACwEu_HQNe29IHs4y1bn15_nnz6ZW1Yt17wI,51796
275
275
  datahub/ingestion/source/bigquery_v2/bigquery_test_connection.py,sha256=cATxwi5IPzj3BldRRAVcLqzSFmmYEPvqa7U0RFJbaAc,7645
276
276
  datahub/ingestion/source/bigquery_v2/common.py,sha256=IinOy-RO4UZGxSf5scaN02672BzZuNsjJZ56axti6iI,4016
277
277
  datahub/ingestion/source/bigquery_v2/lineage.py,sha256=jju14mJbAUMA_K3j2yq-TdZV202cjd5rBAsDPJGEVno,44900
278
278
  datahub/ingestion/source/bigquery_v2/profiler.py,sha256=oLf5jMjJf-ShNny9Dll2tCsOoPMF1DxAh7e7etpeLq4,10821
279
279
  datahub/ingestion/source/bigquery_v2/queries.py,sha256=gDvvgajptmNn5AiBglmDhGAC9LBh8fzw56_d8ewLbxA,20222
280
- datahub/ingestion/source/bigquery_v2/queries_extractor.py,sha256=7mObcHn6mpZRoO4QnJ0QuZ8AS_MsdPLwb-cLRyP-W6k,19531
280
+ datahub/ingestion/source/bigquery_v2/queries_extractor.py,sha256=bSYusyf-xnhs_1WURsQ2YmMxRn3J5HCp_UKChsxbWIw,21015
281
281
  datahub/ingestion/source/bigquery_v2/usage.py,sha256=A9c-ofclaRk0NSnc4IRaqJYqMPv6ecCld_TPy3V2qFs,40748
282
282
  datahub/ingestion/source/cassandra/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
283
283
  datahub/ingestion/source/cassandra/cassandra.py,sha256=pNy61Z4kTqL_wGcWIYee5fnZiuJDseDcRcQwsxeAssk,14487
@@ -456,9 +456,9 @@ datahub/ingestion/source/redshift/profile.py,sha256=H1Xtc2rXScUv4w0b2BbM7POjYEwq
456
456
  datahub/ingestion/source/redshift/query.py,sha256=HKobQ-0crARgT8Mkfe-WBqVR9ZadYCZ9DGaUoEHHHww,48234
457
457
  datahub/ingestion/source/redshift/redshift.py,sha256=RN8rao3j7nocnnD6oPcEju09-8mOZTE4vFkgy_13Az8,41293
458
458
  datahub/ingestion/source/redshift/redshift_data_reader.py,sha256=zc69jwXHdF-w8J4Hq-ZQ6BjHQ75Ij2iNDMpoRJlcmlU,1724
459
- datahub/ingestion/source/redshift/redshift_schema.py,sha256=7F-l_omOuKMuGE_rBWXVPG_GWXFKnCMzC4frNxZB9cs,24800
459
+ datahub/ingestion/source/redshift/redshift_schema.py,sha256=2U8IIPRJkL-HWUeWswOzvcT1hdTBQgPMhr6tYCDuqrM,25226
460
460
  datahub/ingestion/source/redshift/report.py,sha256=aCFDFUbz5xde8b_eRIHSBiELoo9LZFtDpp2lSadiPHU,2937
461
- datahub/ingestion/source/redshift/usage.py,sha256=Q7R-caJovLXv33uZepMGX5Cvm4DqQSLZdiL_s-p06wU,17473
461
+ datahub/ingestion/source/redshift/usage.py,sha256=szdg3cUw4UpZ8rXMZufIAVTq2iiI1VsmrFgjAEH8Dv4,17521
462
462
  datahub/ingestion/source/s3/__init__.py,sha256=HjqFPj11WtNFZM3kcVshlDb7kOsc19-l_3LM8PBjlJM,56
463
463
  datahub/ingestion/source/s3/config.py,sha256=lElFXgEpKDT9SVoiXvtx98wV6Gp880qP4pLQaOGJGOo,7828
464
464
  datahub/ingestion/source/s3/datalake_profiler_config.py,sha256=FfrcgK-JEF94vw-l3q6pN6FENXb-wZzW2w1VUZVkwW8,3620
@@ -496,12 +496,12 @@ datahub/ingestion/source/snowflake/constants.py,sha256=iDTamMozHwLYyglpRfqwTbxPx
496
496
  datahub/ingestion/source/snowflake/oauth_config.py,sha256=ol9D3RmruGStJAeL8PYSQguSqcD2HfkjPkMF2AB_eZs,1277
497
497
  datahub/ingestion/source/snowflake/oauth_generator.py,sha256=fu2VnREGuJXeTqIV2jx4TwieVnznf83HQkrE0h2DGGM,3423
498
498
  datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81xxdeizJn9nJCZ_nMIXgk9N6pEk5o,4803
499
- datahub/ingestion/source/snowflake/snowflake_config.py,sha256=tpNJvPZYUb6pZWSqh-fRgpIeSx20hkDCLYW_EJbkIlk,23536
499
+ datahub/ingestion/source/snowflake/snowflake_config.py,sha256=HoDzaG3TlP1ui5qY2PZUcu83wOjvJ96Z9fFYC4GmCko,24439
500
500
  datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=uSHdPqigRzjeNxtn0_m5i57X7X8LBZIpHzDcWIoovyA,19005
501
501
  datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
502
502
  datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=nam-bYV6wL9LfR-Tt50Qe_Kea61IuWS-lLu5__aDxk8,21853
503
503
  datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=PmQi-qDlRhdJ-PsJ7x-EScIiswWRAxDDOKHydvN3mTY,7404
504
- datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=lAMA--X3nbWFdNs1DTHNm7crctB3RilX_pB-zy47piI,45528
504
+ datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=AiLEbCs6zXbIoejFdLsbSupkvqHj5BOtPz9lubGKLv8,47142
505
505
  datahub/ingestion/source/snowflake/snowflake_query.py,sha256=wLDaYZrWJ0794KKn69rB_QF0_8Bzu5l_7L6mD77KVc4,40469
506
506
  datahub/ingestion/source/snowflake/snowflake_report.py,sha256=fA6C-p9wM-jyTsXE_suTbCtrE_lle-5LI52S7wFYf00,6701
507
507
  datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=a6avRJXbj2qwnu28oK1YotmJo68zEG-1S7vonsUUJy4,41473
@@ -511,7 +511,7 @@ datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=5Li4H8KuS4qBKR98L
511
511
  datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=eA9xh-G1Ydr1OwUUtrbXUWp26hE1jF0zvyKNky_i_nQ,8887
512
512
  datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=mM0v9b4PHRJAT-SdRids3wdzc5O96gWCCww3e42itV8,24982
513
513
  datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=1c1YNmAxxOwAKy8IEFqVdp6x-EvCYJkN6UZ_RwUUVv0,15062
514
- datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=c6wg_s97Hrckqi0BgAbmnnRQRDDda1-BHFLlnRx0xuw,35753
514
+ datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=Tpx_d8UuO4wD9sIL9nMDR4GtCvVc6KbF82nlBdFRtEI,36408
515
515
  datahub/ingestion/source/snowflake/stored_proc_lineage.py,sha256=rOb78iHiWiK8v8WdVs1xDwVut4Y0OHmszej6IopQfCo,5341
516
516
  datahub/ingestion/source/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
517
517
  datahub/ingestion/source/sql/athena.py,sha256=ZvWGuAPRUeUkE-7N6B3RyCkQp7JZKnLVXTnnR200gls,31532
@@ -523,9 +523,9 @@ datahub/ingestion/source/sql/hana.py,sha256=V6bGVLVjI1VL0deebg8VxIL8Ls-oxUvpSvX9
523
523
  datahub/ingestion/source/sql/hive.py,sha256=SPmAWlk63V-s-loBTU2hXsQA7xA4sa0iPK6pCbF-AJ8,31600
524
524
  datahub/ingestion/source/sql/hive_metastore.py,sha256=UBB7mV2eKuCxv3voi0F3tqF2MyRObSYxArAxETZfO4E,35997
525
525
  datahub/ingestion/source/sql/mariadb.py,sha256=om6QoG5UtDldt1N6AfIWp3T-HXNaaqFmpz2i0JAemfM,654
526
- datahub/ingestion/source/sql/mysql.py,sha256=_KhTODU7mqAoJOlrvRdPa7ihQkYLkgrZwaseQbasotM,5358
526
+ datahub/ingestion/source/sql/mysql.py,sha256=h0kv86-8SxBTmaEhmcyqGcKoaWQ4peUdiE9sfhrTuqY,9734
527
527
  datahub/ingestion/source/sql/oracle.py,sha256=nKMM1O67SkxCgT781eENl5xXpIR8_p5joTSdAYzQwHY,29988
528
- datahub/ingestion/source/sql/postgres.py,sha256=blkO6bI0eDKFK8UNwUYcYtm_ObrQuWVSy5GyfdhL5dg,14274
528
+ datahub/ingestion/source/sql/postgres.py,sha256=Vk1NVI0zJPzMj4SKJ9jfyGu4DY3bow724BDn10BaxzU,17478
529
529
  datahub/ingestion/source/sql/presto.py,sha256=58py4M3UYxkGpbBFA1o96H154eUhD2dBm1hpxxYlYYM,4256
530
530
  datahub/ingestion/source/sql/sql_common.py,sha256=EZGoeGlOYZoOrXOiKDI-S1mw-sPVV33PZQ_mPJlEvRc,57759
531
531
  datahub/ingestion/source/sql/sql_config.py,sha256=u3nGZYYl1WtaxfNsDU5bglgZ5Jq3Fxk9xei_CUIAXB0,8222
@@ -535,7 +535,7 @@ datahub/ingestion/source/sql/sql_report.py,sha256=gw-OPHSExp_b6DRjvwqE1U6Bpkwekx
535
535
  datahub/ingestion/source/sql/sql_types.py,sha256=AVeBBXw8aKB1_jw6Wtg58miu-YUfN_-7ZcXwSF-ESgA,16021
536
536
  datahub/ingestion/source/sql/sql_utils.py,sha256=q-Bsk6WxlsRtrw9RXBxvqI3zuaMTC_F25T2VrCziR9I,8418
537
537
  datahub/ingestion/source/sql/sqlalchemy_data_reader.py,sha256=FvHZ4JEK3aR2DYOBZiT_ZsAy12RjTu4t_KIR_92B11k,2644
538
- datahub/ingestion/source/sql/sqlalchemy_uri.py,sha256=u0ZvgdJjXZdo_vl7YIQfYuuWbGwpnH6OSozI2e8ZV4I,858
538
+ datahub/ingestion/source/sql/sqlalchemy_uri.py,sha256=wE_GX2TtkFEvscC_Epy5hUfBxtE6mUQoVPb7fUea0jk,1882
539
539
  datahub/ingestion/source/sql/sqlalchemy_uri_mapper.py,sha256=KOpbmDIE2h1hyYEsbVHJi2B7FlsyUMTXZx4diyzltQg,1826
540
540
  datahub/ingestion/source/sql/teradata.py,sha256=YydlPGndFGZcpvlmim3T-1yaAmsFt08TZVOTo1R3GLo,66871
541
541
  datahub/ingestion/source/sql/trino.py,sha256=o5hm84iwRHO59TD2LaEqYgF2LYIcSUIKmlgu1VudGBY,19254
@@ -552,10 +552,10 @@ datahub/ingestion/source/state/checkpoint.py,sha256=ob8wtC0zOgTdc_-cVAI3MUKcQaN4
552
552
  datahub/ingestion/source/state/entity_removal_state.py,sha256=dIG1HnueaRTsAu8kYjYiPFPpkZ4WwbraDF6oFDsts2E,6720
553
553
  datahub/ingestion/source/state/profiling_state.py,sha256=lsWu7oZhB9nSlqoklvjs-LjS4XF0p6BxSAcLY-xKRzM,512
554
554
  datahub/ingestion/source/state/profiling_state_handler.py,sha256=jDMiIrAq8k4GrYoh9Ymh0ZAmzejYFk8E1W7-kuw6lXg,4295
555
- datahub/ingestion/source/state/redundant_run_skip_handler.py,sha256=h28twxcsMNvI74bUjAKleRYid8kfIyWS7Y11aBldDlY,9435
555
+ datahub/ingestion/source/state/redundant_run_skip_handler.py,sha256=UENQDZ1Hacd8Hg3jC6okomG9V4EMLfL-Zz60aU6jzyc,10260
556
556
  datahub/ingestion/source/state/sql_common_state.py,sha256=OtJpJfMTBSgyR37dn3w-nnZwlc0nFNb2GoUzIWhnyAc,143
557
557
  datahub/ingestion/source/state/stale_entity_removal_handler.py,sha256=Lr2HYGx_b2FQ8A36s7s11tl-4-mGIM13bfy5JbQ3LtM,14890
558
- datahub/ingestion/source/state/stateful_ingestion_base.py,sha256=FusdOZBvx-Kdc0A8cxEDZ4RfxjmG6MN02PCQImjTyBg,17305
558
+ datahub/ingestion/source/state/stateful_ingestion_base.py,sha256=j78BN_uSBpOJRi19kosZGPsgolKm9i-40MmSzDGeaFs,18837
559
559
  datahub/ingestion/source/state/usage_common_state.py,sha256=TJyb0CpwibsduJYI854EFdtrwWnz7JC-IkzKUXVGDx0,983
560
560
  datahub/ingestion/source/state/use_case_handler.py,sha256=3g8ddTvGXHe0dCiyTkyFeNmR8a3bhwywtIt8EpK5oQs,1271
561
561
  datahub/ingestion/source/state_provider/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -646,12 +646,12 @@ datahub/lite/lite_registry.py,sha256=bpH0kasP-LtwwUFNA2QsOIehfekAYfJtN-AkQLmSWnw
646
646
  datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1949
647
647
  datahub/lite/lite_util.py,sha256=G0LQHKkyEb1pc_q183g6hflShclGx7kikgMaOxtVVcs,4545
648
648
  datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
649
- datahub/metadata/_internal_schema_classes.py,sha256=iJEKSTxDX1uH8YgUwxt0_HSd2AryK0-vUTBeN-7wG0Q,1076970
650
- datahub/metadata/schema.avsc,sha256=sn-1LEnMogcM9WhP7f8PWa_I7mgLALVeN75XtuCd4hY,812831
649
+ datahub/metadata/_internal_schema_classes.py,sha256=O8MG_weYzOYtGIZsOr7c6EO33NDo9liE8D8kyotoq8Q,1084754
650
+ datahub/metadata/schema.avsc,sha256=YMMQSOELeCC4ZptYnoEfMFHRL-DguCR9KmjZ6NYWMlQ,780967
651
651
  datahub/metadata/schema_classes.py,sha256=tPT8iHCak4IsZi_oL0nirbPpI8ETTPTZzapqLRpeKU4,1326
652
652
  datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
653
653
  datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
654
- datahub/metadata/_urns/urn_defs.py,sha256=q1vWaLDYps2cfLh5IYsWUrbFYx_N2txLAFIl9vKdq-M,143257
654
+ datahub/metadata/_urns/urn_defs.py,sha256=GRvs_XhEQ8Xc6abC7B7FwTIMb18R3VeXrjV1_sRi_DE,145429
655
655
  datahub/metadata/com/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
656
656
  datahub/metadata/com/linkedin/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
657
657
  datahub/metadata/com/linkedin/events/__init__.py,sha256=s_dR0plZF-rOxxIbE8ojekJqwiHzl2WYR-Z3kW6kKS0,298
@@ -688,6 +688,7 @@ datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.
688
688
  datahub/metadata/com/linkedin/pegasus2avro/events/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
689
689
  datahub/metadata/com/linkedin/pegasus2avro/events/metadata/__init__.py,sha256=a1FI_2VZ9Ejc9AIVztO-B5kLPR6VwlOgdFlv4PTCTYs,282
690
690
  datahub/metadata/com/linkedin/pegasus2avro/execution/__init__.py,sha256=O5XAXnGzDnWv8nbqRHxLPPXUbrIu_pn76WUK_hhkHmg,775
691
+ datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py,sha256=wS-GU8YjGghMQcQ42BYv7Rh_-8h-yPiS_QMzJI34yJM,507
691
692
  datahub/metadata/com/linkedin/pegasus2avro/form/__init__.py,sha256=rGDmWiKm6qpXiipZ5veCHqBJGSAryAqnSzRPlwcmLnA,845
692
693
  datahub/metadata/com/linkedin/pegasus2avro/glossary/__init__.py,sha256=fa1QNv08O3TqXqZ14bkJerGho_t-8DPHFdcWKiXkkUA,501
693
694
  datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py,sha256=EGxkzJgQMASL_aUmgjHE3bo8qRTSbAbM_8gUccZblX0,1603
@@ -695,7 +696,7 @@ datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py,sha256=LfB7ytT1u
695
696
  datahub/metadata/com/linkedin/pegasus2avro/ingestion/__init__.py,sha256=1bfG2naq4iS_pwU4J-BVer_gfL0hDbJbnH0gh1MPNgA,871
696
697
  datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py,sha256=7SHiR-KzV1CkAimFy94SkcY0Xg0RlsIlLTUTGmGAW_U,290
697
698
  datahub/metadata/com/linkedin/pegasus2avro/metadata/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
698
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py,sha256=G9CI1UqSXGzselvjnlOI7Obzjn5ZTQVzohRGBZHdnZk,5151
699
+ datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py,sha256=U7nFcTKOeiqyZVxD53p2vcKBj6tupuCJH9I3yz6hFDg,5241
699
700
  datahub/metadata/com/linkedin/pegasus2avro/metadata/query/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
700
701
  datahub/metadata/com/linkedin/pegasus2avro/metadata/query/filter/__init__.py,sha256=DBP_QtxkFmC5q_kuk4dGjb4uOKbB4xKgqTWXGxmNbBQ,532
701
702
  datahub/metadata/com/linkedin/pegasus2avro/metadata/snapshot/__init__.py,sha256=OPboF8SV11wGnjvWQB-rxtB0otMdCsE7Tcy7xkOUgz8,2358
@@ -778,11 +779,13 @@ datahub/metadata/schemas/DataHubAccessTokenKey.avsc,sha256=3EspNIxgb_I4WwV0a2o4N
778
779
  datahub/metadata/schemas/DataHubActionKey.avsc,sha256=bjiKcoyvUPQKaGUi2ICBMJ_ukwnt7dh0szJS4WBZE0A,448
779
780
  datahub/metadata/schemas/DataHubConnectionDetails.avsc,sha256=IvZj6OA7HRvy-ZIIn0UbXdJNnyt_oTn16XIe5ZlcqGk,1661
780
781
  datahub/metadata/schemas/DataHubConnectionKey.avsc,sha256=VwbamVFoEdp6epz1lJm_UShBl6ksBxoA7jAYuPI5u3M,522
782
+ datahub/metadata/schemas/DataHubFileInfo.avsc,sha256=Vt1t5L9QyaIkSh3GUkrQs4I81o2MYAJJu194h1M97tw,6314
783
+ datahub/metadata/schemas/DataHubFileKey.avsc,sha256=-qxMFdpRPSHpA88adIFIJ35PZrN5qXOEo8RK-xTzkSQ,422
781
784
  datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc,sha256=4wac7sluRIq-0ZjODE5SmuVKuQeW8ajLJNRpqEBRyio,4601
782
785
  datahub/metadata/schemas/DataHubIngestionSourceKey.avsc,sha256=TGmm9WEGTaABs7kt5Uc-N-kbc5Sd-2sQwx-JpfAptvw,545
783
786
  datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc,sha256=q6ZyMoxInwmrkrXkUgMe-i-WZzAxbjcvJ-EI99SnEp8,599
784
787
  datahub/metadata/schemas/DataHubPageModuleKey.avsc,sha256=NyFN8cVO6s6rtgoLGJJGfcPfpGr5PfmZlIhM6ajldfQ,460
785
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc,sha256=53Fj4ztBJqo9QMWuza2Kdtfpr2nTOTW0XuuXW77ugB8,10347
788
+ datahub/metadata/schemas/DataHubPageModuleProperties.avsc,sha256=hbIEkpjQxVxSZOQmTVIjOGnGTNeaA0r0oYNKPpwuddg,10443
786
789
  datahub/metadata/schemas/DataHubPageTemplateKey.avsc,sha256=0sVqwL97Rp8YHPytp2RqUP5hIW048hmT2hPNP5k6arc,472
787
790
  datahub/metadata/schemas/DataHubPageTemplateProperties.avsc,sha256=FyNcZIniQy9m6yN9DT4XsPkDrxUsU7tRTqmfdGoEtMU,8565
788
791
  datahub/metadata/schemas/DataHubPersonaInfo.avsc,sha256=OUvbTgPQsBtzkDDb9pxHXpQ6A7dkL77ZnCXZ-MLEG14,227
@@ -1017,7 +1020,7 @@ datahub/sql_parsing/fingerprint_utils.py,sha256=3hGiexaQXnE7eZLxo-t7hlTyVQz7womb
1017
1020
  datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
1018
1021
  datahub/sql_parsing/schema_resolver.py,sha256=ISuingLcQnOJZkNXBkc73uPwYUbbOtERAjgGhJajDiQ,10782
1019
1022
  datahub/sql_parsing/split_statements.py,sha256=doCACwQ_Fx6m1djo7t3BnU9ZHki4EV2KJUQkFMGv7lg,10101
1020
- datahub/sql_parsing/sql_parsing_aggregator.py,sha256=NTWrsScS4d8CYLlBdMYo0b6ecuFpMfiwHJ7kMAh9fQg,72685
1023
+ datahub/sql_parsing/sql_parsing_aggregator.py,sha256=chRTe5eZfxsOd2vP_IrO_j3KhFUGLe2sPt0WXQ6yt5M,73320
1021
1024
  datahub/sql_parsing/sql_parsing_common.py,sha256=cZ4WvVyHZuXDGjnBvKMX2_fz2EMextB5WQWcK0_saBo,3155
1022
1025
  datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
1023
1026
  datahub/sql_parsing/sqlglot_lineage.py,sha256=l4LZMiaeTARjJG76Uun_yNtFHdSj3yi8zO1XvAQtxl0,66944
@@ -1128,8 +1131,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
1128
1131
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
1129
1132
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
1130
1133
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
1131
- acryl_datahub-1.3.0.1rc4.dist-info/METADATA,sha256=cJb6Hz3UpjAA15FINI4riQLYt7C5Q-aUsoWz2pelBp0,184504
1132
- acryl_datahub-1.3.0.1rc4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
1133
- acryl_datahub-1.3.0.1rc4.dist-info/entry_points.txt,sha256=pzsBoTx-D-iTcmpX8oCGCyzlHP2112EygUMzZWz56M8,10105
1134
- acryl_datahub-1.3.0.1rc4.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
1135
- acryl_datahub-1.3.0.1rc4.dist-info/RECORD,,
1134
+ acryl_datahub-1.3.0.1rc6.dist-info/METADATA,sha256=JfkdfNc1P5L7jaGrlIUM-DqN9KcYjFuKfxwfg-IvTQI,184688
1135
+ acryl_datahub-1.3.0.1rc6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
1136
+ acryl_datahub-1.3.0.1rc6.dist-info/entry_points.txt,sha256=pzsBoTx-D-iTcmpX8oCGCyzlHP2112EygUMzZWz56M8,10105
1137
+ acryl_datahub-1.3.0.1rc6.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
1138
+ acryl_datahub-1.3.0.1rc6.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
1
1
  # Published at https://pypi.org/project/acryl-datahub/.
2
2
  __package_name__ = "acryl-datahub"
3
- __version__ = "1.3.0.1rc4"
3
+ __version__ = "1.3.0.1rc6"
4
4
 
5
5
 
6
6
  def is_dev_mode() -> bool:
@@ -3,11 +3,13 @@ from datetime import datetime, timedelta, timezone
3
3
  from enum import Enum
4
4
  from http import HTTPStatus
5
5
  from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
6
+ from urllib.parse import parse_qs, urlparse
6
7
 
7
8
  import boto3
8
9
  import requests
9
10
  from boto3.session import Session
10
11
  from botocore.config import DEFAULT_TIMEOUT, Config
12
+ from botocore.exceptions import ClientError, NoCredentialsError
11
13
  from botocore.utils import fix_s3_host
12
14
  from pydantic.fields import Field
13
15
 
@@ -465,6 +467,165 @@ class AwsConnectionConfig(ConfigModel):
465
467
  def get_lakeformation_client(self) -> "LakeFormationClient":
466
468
  return self.get_session().client("lakeformation", config=self._aws_config())
467
469
 
470
+ def get_rds_client(self):
471
+ """Get an RDS client for generating IAM auth tokens."""
472
+ return self.get_session().client("rds", config=self._aws_config())
473
+
474
+
475
+ def generate_rds_iam_token(
476
+ endpoint: str,
477
+ username: str,
478
+ port: int,
479
+ aws_config: AwsConnectionConfig,
480
+ ) -> str:
481
+ """
482
+ Generate an AWS RDS IAM authentication token.
483
+
484
+ boto3's generate_db_auth_token() returns a presigned URL in the format:
485
+ "hostname:port/?Action=connect&DBUser=username&X-Amz-Date=...&X-Amz-Expires=..."
486
+
487
+ This token should be used as-is by pymysql/psycopg2 drivers.
488
+
489
+ Args:
490
+ endpoint: RDS endpoint hostname
491
+ username: Database username for IAM authentication
492
+ port: Database port (5432 for PostgreSQL, 3306 for MySQL)
493
+ aws_config: AwsConnectionConfig for session management and credentials
494
+
495
+ Returns:
496
+ Authentication token (presigned URL format)
497
+
498
+ Raises:
499
+ ValueError: If AWS credentials are not found or token generation fails
500
+
501
+ """
502
+ try:
503
+ client = aws_config.get_rds_client()
504
+ token = client.generate_db_auth_token(
505
+ DBHostname=endpoint, Port=port, DBUsername=username
506
+ )
507
+ logger.debug(f"Generated RDS IAM token for {username}@{endpoint}:{port}")
508
+ return token
509
+ except NoCredentialsError as e:
510
+ raise ValueError("AWS credentials not found") from e
511
+ except ClientError as e:
512
+ raise ValueError(f"Failed to generate RDS IAM token: {e}") from e
513
+
514
+
515
+ class RDSIAMTokenManager:
516
+ """
517
+ Manages RDS IAM token lifecycle with automatic refresh.
518
+
519
+ RDS IAM tokens include expiration information in the URL parameters.
520
+ This manager parses the token expiry and refreshes before expiration
521
+ to ensure uninterrupted database access.
522
+ """
523
+
524
+ def __init__(
525
+ self,
526
+ endpoint: str,
527
+ username: str,
528
+ port: int,
529
+ aws_config: AwsConnectionConfig,
530
+ refresh_threshold_minutes: int = 5,
531
+ ):
532
+ """
533
+ Initialize the token manager.
534
+
535
+ Args:
536
+ endpoint: RDS endpoint hostname
537
+ username: Database username for IAM authentication
538
+ port: Database port
539
+ aws_config: AwsConnectionConfig for session management and credentials
540
+ refresh_threshold_minutes: Refresh token when this many minutes remain before expiry
541
+ """
542
+ self.endpoint = endpoint
543
+ self.username = username
544
+ self.port = port
545
+ self.aws_config = aws_config
546
+ self.refresh_threshold = timedelta(minutes=refresh_threshold_minutes)
547
+
548
+ self._current_token: Optional[str] = None
549
+ self._token_expires_at: Optional[datetime] = None
550
+
551
+ def get_token(self) -> str:
552
+ """
553
+ Get current token, refreshing if necessary.
554
+
555
+ Returns:
556
+ Valid authentication token
557
+
558
+ Raises:
559
+ RuntimeError: If token generation or refresh fails
560
+ """
561
+ if self._needs_refresh():
562
+ self._refresh_token()
563
+
564
+ assert self._current_token is not None
565
+ return self._current_token
566
+
567
+ def _needs_refresh(self) -> bool:
568
+ """Check if token needs to be refreshed."""
569
+ if self._current_token is None or self._token_expires_at is None:
570
+ return True
571
+
572
+ time_until_expiry = self._token_expires_at - datetime.now(timezone.utc)
573
+ return time_until_expiry <= self.refresh_threshold
574
+
575
+ def _parse_token_expiry(self, token: str) -> datetime:
576
+ """
577
+ Parse token expiry from X-Amz-Date and X-Amz-Expires URL parameters.
578
+
579
+ Args:
580
+ token: RDS IAM authentication token (presigned URL)
581
+
582
+ Returns:
583
+ Expiration datetime in UTC
584
+
585
+ Raises:
586
+ ValueError: If token URL format is invalid or missing required parameters
587
+ """
588
+ try:
589
+ parsed_url = urlparse(token)
590
+ query_params = parse_qs(parsed_url.query)
591
+
592
+ # Extract X-Amz-Date (ISO 8601 format: YYYYMMDDTHHMMSSZ)
593
+ amz_date_list = query_params.get("X-Amz-Date")
594
+ if not amz_date_list:
595
+ raise ValueError("Missing X-Amz-Date parameter in RDS IAM token")
596
+ amz_date_str = amz_date_list[0]
597
+
598
+ # Extract X-Amz-Expires (duration in seconds)
599
+ amz_expires_list = query_params.get("X-Amz-Expires")
600
+ if not amz_expires_list:
601
+ raise ValueError("Missing X-Amz-Expires parameter in RDS IAM token")
602
+ amz_expires_seconds = int(amz_expires_list[0])
603
+
604
+ # Parse X-Amz-Date to datetime
605
+ token_issued_at = datetime.strptime(amz_date_str, "%Y%m%dT%H%M%SZ").replace(
606
+ tzinfo=timezone.utc
607
+ )
608
+
609
+ # Calculate expiration
610
+ return token_issued_at + timedelta(seconds=amz_expires_seconds)
611
+
612
+ except (ValueError, KeyError, IndexError) as e:
613
+ raise ValueError(
614
+ f"Failed to parse RDS IAM token expiry: {e}. Token format may be invalid."
615
+ ) from e
616
+
617
+ def _refresh_token(self) -> None:
618
+ """Generate and store a new token with parsed expiry."""
619
+ logger.info("Refreshing RDS IAM authentication token")
620
+ self._current_token = generate_rds_iam_token(
621
+ endpoint=self.endpoint,
622
+ username=self.username,
623
+ port=self.port,
624
+ aws_config=self.aws_config,
625
+ )
626
+ self._token_expires_at = self._parse_token_expiry(self._current_token)
627
+ logger.debug(f"Token will expire at {self._token_expires_at}")
628
+
468
629
 
469
630
  class AwsSourceConfig(EnvConfigMixin, AwsConnectionConfig):
470
631
  """
@@ -49,6 +49,7 @@ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
49
49
  from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
50
50
  from datahub.ingestion.source.state.redundant_run_skip_handler import (
51
51
  RedundantLineageRunSkipHandler,
52
+ RedundantQueriesRunSkipHandler,
52
53
  RedundantUsageRunSkipHandler,
53
54
  )
54
55
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
@@ -145,7 +146,10 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
145
146
  redundant_lineage_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = (
146
147
  None
147
148
  )
148
- if self.config.enable_stateful_lineage_ingestion:
149
+ if (
150
+ self.config.enable_stateful_lineage_ingestion
151
+ and not self.config.use_queries_v2
152
+ ):
149
153
  redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler(
150
154
  source=self,
151
155
  config=self.config,
@@ -296,6 +300,17 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
296
300
  ):
297
301
  return
298
302
 
303
+ redundant_queries_run_skip_handler: Optional[
304
+ RedundantQueriesRunSkipHandler
305
+ ] = None
306
+ if self.config.enable_stateful_time_window:
307
+ redundant_queries_run_skip_handler = RedundantQueriesRunSkipHandler(
308
+ source=self,
309
+ config=self.config,
310
+ pipeline_name=self.ctx.pipeline_name,
311
+ run_id=self.ctx.run_id,
312
+ )
313
+
299
314
  with (
300
315
  self.report.new_stage(f"*: {QUERIES_EXTRACTION}"),
301
316
  BigQueryQueriesExtractor(
@@ -315,6 +330,7 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
315
330
  structured_report=self.report,
316
331
  filters=self.filters,
317
332
  identifiers=self.identifiers,
333
+ redundant_run_skip_handler=redundant_queries_run_skip_handler,
318
334
  schema_resolver=self.sql_parser_schema_resolver,
319
335
  discovered_tables=self.bq_schema_extractor.table_refs,
320
336
  ) as queries_extractor,
@@ -25,6 +25,7 @@ from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterCo
25
25
  from datahub.ingestion.source.state.stateful_ingestion_base import (
26
26
  StatefulLineageConfigMixin,
27
27
  StatefulProfilingConfigMixin,
28
+ StatefulTimeWindowConfigMixin,
28
29
  StatefulUsageConfigMixin,
29
30
  )
30
31
  from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
@@ -271,6 +272,7 @@ class BigQueryV2Config(
271
272
  SQLCommonConfig,
272
273
  StatefulUsageConfigMixin,
273
274
  StatefulLineageConfigMixin,
275
+ StatefulTimeWindowConfigMixin,
274
276
  StatefulProfilingConfigMixin,
275
277
  ClassificationSourceConfigMixin,
276
278
  ):
@@ -527,6 +529,20 @@ class BigQueryV2Config(
527
529
 
528
530
  return v
529
531
 
532
@root_validator(pre=False, skip_on_failure=True)
def validate_queries_v2_stateful_ingestion(cls, values: Dict) -> Dict:
    """Warn when legacy stateful flags are set together with ``use_queries_v2``.

    The validator never rejects or modifies the configuration: it only logs
    a warning when ``use_queries_v2`` is truthy and at least one of
    ``enable_stateful_lineage_ingestion`` /
    ``enable_stateful_usage_ingestion`` is truthy as well.

    Args:
        values: The field values handed to the root validator.

    Returns:
        The same ``values`` mapping, unchanged.
    """
    legacy_stateful_flags = (
        "enable_stateful_lineage_ingestion",
        "enable_stateful_usage_ingestion",
    )
    # Check use_queries_v2 first so the legacy flags are only inspected
    # when the queries v2 path is actually selected.
    if values.get("use_queries_v2") and any(
        values.get(flag) for flag in legacy_stateful_flags
    ):
        logger.warning(
            "enable_stateful_lineage_ingestion and enable_stateful_usage_ingestion are deprecated "
            "when using use_queries_v2=True. These configs only work with the legacy (non-queries v2) extraction path. "
            "For queries v2, use enable_stateful_time_window instead to enable stateful ingestion "
            "for the unified time window extraction (lineage + usage + operations + queries)."
        )
    return values
545
+
530
546
def get_table_pattern(self, pattern: List[str]) -> str:
    """Combine the given patterns into one ``|``-separated string.

    Args:
        pattern: The individual pattern strings to join.

    Returns:
        The patterns joined with ``|``, or an empty string when
        ``pattern`` is empty (or otherwise falsy).
    """
    if not pattern:
        return ""
    return "|".join(pattern)
532
548
 
@@ -449,10 +449,12 @@ class BigQuerySchemaGenerator:
449
449
  ):
450
450
  yield wu
451
451
  except Exception as e:
452
- if self.config.is_profiling_enabled():
453
- action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list permission, bigquery.tables.getData permission?"
452
+ # If configuration indicates we need table data access (for profiling or use_tables_list_query_v2),
453
+ # include bigquery.tables.getData in the error message since that's likely the missing permission
454
+ if self.config.have_table_data_read_permission:
455
+ action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list, bigquery.tables.getData permissions?"
454
456
  else:
455
- action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list permission?"
457
+ action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list permissions?"
456
458
 
457
459
  self.report.failure(
458
460
  title="Unable to get tables for dataset",
@@ -36,6 +36,9 @@ from datahub.ingestion.source.bigquery_v2.common import (
36
36
  BigQueryFilter,
37
37
  BigQueryIdentifierBuilder,
38
38
  )
39
+ from datahub.ingestion.source.state.redundant_run_skip_handler import (
40
+ RedundantQueriesRunSkipHandler,
41
+ )
39
42
  from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
40
43
  from datahub.metadata.urns import CorpUserUrn
41
44
  from datahub.sql_parsing.schema_resolver import SchemaResolver
@@ -135,6 +138,7 @@ class BigQueryQueriesExtractor(Closeable):
135
138
  structured_report: SourceReport,
136
139
  filters: BigQueryFilter,
137
140
  identifiers: BigQueryIdentifierBuilder,
141
+ redundant_run_skip_handler: Optional[RedundantQueriesRunSkipHandler] = None,
138
142
  graph: Optional[DataHubGraph] = None,
139
143
  schema_resolver: Optional[SchemaResolver] = None,
140
144
  discovered_tables: Optional[Collection[str]] = None,
@@ -158,6 +162,9 @@ class BigQueryQueriesExtractor(Closeable):
158
162
  )
159
163
 
160
164
  self.structured_report = structured_report
165
+ self.redundant_run_skip_handler = redundant_run_skip_handler
166
+
167
+ self.start_time, self.end_time = self._get_time_window()
161
168
 
162
169
  self.aggregator = SqlParsingAggregator(
163
170
  platform=self.identifiers.platform,
@@ -172,8 +179,8 @@ class BigQueryQueriesExtractor(Closeable):
172
179
  generate_query_usage_statistics=self.config.include_query_usage_statistics,
173
180
  usage_config=BaseUsageConfig(
174
181
  bucket_duration=self.config.window.bucket_duration,
175
- start_time=self.config.window.start_time,
176
- end_time=self.config.window.end_time,
182
+ start_time=self.start_time,
183
+ end_time=self.end_time,
177
184
  user_email_pattern=self.config.user_email_pattern,
178
185
  top_n_queries=self.config.top_n_queries,
179
186
  ),
@@ -199,6 +206,34 @@ class BigQueryQueriesExtractor(Closeable):
199
206
  logger.info(f"Using local temp path: {path}")
200
207
  return path
201
208
 
209
def _get_time_window(self) -> tuple[datetime, datetime]:
    """Return the ``(start_time, end_time)`` window to extract queries for.

    When a redundant-run skip handler is configured, the configured window
    is passed to its ``suggest_run_time_window`` and the suggested window
    is used; otherwise the configured window is used as-is.

    Returns:
        The start and end of the extraction window.
    """
    window = self.config.window
    skip_handler = self.redundant_run_skip_handler

    if skip_handler:
        start_time, end_time = skip_handler.suggest_run_time_window(
            window.start_time,
            window.end_time,
        )
    else:
        start_time, end_time = window.start_time, window.end_time

    # Usage statistics are aggregated per bucket, so the start is moved to
    # the beginning of its bucket to make sure only complete bucket periods
    # are included in the aggregated metrics.
    if self.config.include_usage_statistics:
        start_time = get_time_bucket(start_time, window.bucket_duration)

    return start_time, end_time
228
+
229
def _update_state(self) -> None:
    """Report the configured time window to the redundant-run skip handler.

    Does nothing when no skip handler is configured. Otherwise the
    configured window's start time, end time and bucket duration are
    forwarded to the handler's ``update_state``.
    """
    skip_handler = self.redundant_run_skip_handler
    if not skip_handler:
        return

    window = self.config.window
    skip_handler.update_state(
        window.start_time,
        window.end_time,
        window.bucket_duration,
    )
236
+
202
237
  def is_temp_table(self, name: str) -> bool:
203
238
  try:
204
239
  table = BigqueryTableIdentifier.from_string_name(name)
@@ -299,6 +334,8 @@ class BigQueryQueriesExtractor(Closeable):
299
334
  shared_connection.close()
300
335
  audit_log_file.unlink(missing_ok=True)
301
336
 
337
+ self._update_state()
338
+
302
339
  def deduplicate_queries(
303
340
  self, queries: FileBackedList[ObservedQuery]
304
341
  ) -> FileBackedDict[Dict[int, ObservedQuery]]:
@@ -355,8 +392,8 @@ class BigQueryQueriesExtractor(Closeable):
355
392
  query_log_query = _build_enriched_query_log_query(
356
393
  project_id=project.id,
357
394
  region=region,
358
- start_time=self.config.window.start_time,
359
- end_time=self.config.window.end_time,
395
+ start_time=self.start_time,
396
+ end_time=self.end_time,
360
397
  )
361
398
 
362
399
  logger.info(f"Fetching query log from BQ Project {project.id} for {region}")
@@ -15,6 +15,7 @@ from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable
15
15
  from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
16
16
  from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult
17
17
  from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
18
+ from datahub.utilities.perf_timer import PerfTimer
18
19
 
19
20
  logger: logging.Logger = logging.getLogger(__name__)
20
21
 
@@ -243,9 +244,13 @@ class RedshiftDataDictionary:
243
244
  conn: redshift_connector.Connection, query: str
244
245
  ) -> redshift_connector.Cursor:
245
246
  cursor: redshift_connector.Cursor = conn.cursor()
246
-
247
- logger.debug(f"Query : {query}")
248
- cursor.execute(query)
247
+ with PerfTimer() as timer:
248
+ query_hash_id = hash(query)
249
+ logger.info(f"Executing query [{query_hash_id}]\n{query}")
250
+ cursor.execute(query)
251
+ logger.info(
252
+ f"Time taken query [{query_hash_id}: {timer.elapsed_seconds():.3f} seconds"
253
+ )
249
254
  return cursor
250
255
 
251
256
  @staticmethod
@@ -545,8 +550,7 @@ class RedshiftDataDictionary:
545
550
  conn: redshift_connector.Connection,
546
551
  query: str,
547
552
  ) -> Iterable[LineageRow]:
548
- cursor = conn.cursor()
549
- cursor.execute(query)
553
+ cursor = RedshiftDataDictionary.get_query_result(conn=conn, query=query)
550
554
  field_names = [i[0] for i in cursor.description]
551
555
 
552
556
  rows = cursor.fetchmany()
@@ -603,9 +607,7 @@ class RedshiftDataDictionary:
603
607
  conn: redshift_connector.Connection,
604
608
  query: str,
605
609
  ) -> Iterable[TempTableRow]:
606
- cursor = conn.cursor()
607
-
608
- cursor.execute(query)
610
+ cursor = RedshiftDataDictionary.get_query_result(conn=conn, query=query)
609
611
 
610
612
  field_names = [i[0] for i in cursor.description]
611
613
 
@@ -662,8 +664,9 @@ class RedshiftDataDictionary:
662
664
  def get_outbound_datashares(
663
665
  conn: redshift_connector.Connection,
664
666
  ) -> Iterable[OutboundDatashare]:
665
- cursor = conn.cursor()
666
- cursor.execute(RedshiftCommonQuery.list_outbound_datashares())
667
+ cursor = RedshiftDataDictionary.get_query_result(
668
+ conn=conn, query=RedshiftCommonQuery.list_outbound_datashares()
669
+ )
667
670
  for item in cursor.fetchall():
668
671
  yield OutboundDatashare(
669
672
  share_name=item[1],
@@ -678,8 +681,10 @@ class RedshiftDataDictionary:
678
681
  conn: redshift_connector.Connection,
679
682
  database: str,
680
683
  ) -> Optional[InboundDatashare]:
681
- cursor = conn.cursor()
682
- cursor.execute(RedshiftCommonQuery.get_inbound_datashare(database))
684
+ cursor = RedshiftDataDictionary.get_query_result(
685
+ conn=conn,
686
+ query=RedshiftCommonQuery.get_inbound_datashare(database),
687
+ )
683
688
  item = cursor.fetchone()
684
689
  if item:
685
690
  return InboundDatashare(