acryl-datahub 1.0.0rc10__py3-none-any.whl → 1.0.0rc11__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (28)
  1. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/METADATA +2416 -2416
  2. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/RECORD +28 -27
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +4 -3
  5. datahub/ingestion/source/iceberg/iceberg_common.py +40 -1
  6. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  7. datahub/ingestion/source/redshift/config.py +4 -0
  8. datahub/ingestion/source/redshift/datashares.py +236 -0
  9. datahub/ingestion/source/redshift/lineage.py +6 -2
  10. datahub/ingestion/source/redshift/lineage_v2.py +7 -4
  11. datahub/ingestion/source/redshift/profile.py +1 -1
  12. datahub/ingestion/source/redshift/query.py +125 -33
  13. datahub/ingestion/source/redshift/redshift.py +41 -72
  14. datahub/ingestion/source/redshift/redshift_schema.py +166 -6
  15. datahub/ingestion/source/redshift/report.py +3 -0
  16. datahub/ingestion/source/sql/oracle.py +93 -63
  17. datahub/metadata/_schema_classes.py +5 -5
  18. datahub/metadata/schema.avsc +2 -1
  19. datahub/metadata/schemas/DomainKey.avsc +2 -1
  20. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  21. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  22. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
  23. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  24. datahub/sql_parsing/sql_parsing_common.py +7 -0
  25. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/LICENSE +0 -0
  26. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/WHEEL +0 -0
  27. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/entry_points.txt +0 -0
  28. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
2
2
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
3
- datahub/_version.py,sha256=FaV2BCpaCxlMvv5cXKYFF-MDOTi4jF-QgH3AQNNnof0,322
3
+ datahub/_version.py,sha256=m3vMOf1XXwW_i72T14wHeXSyYmTku5A-KQz7nxQXArM,322
4
4
  datahub/entrypoints.py,sha256=2TYgHhs3sCxJlojIHjqfxzt3_ImPwPzq4vBtsUuMqu4,8885
5
5
  datahub/errors.py,sha256=w6h8b27j9XlmPbTwqpu7-wgiTrXlHzcnUOnJ_iOrwzo,520
6
6
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -27,7 +27,7 @@ datahub/api/entities/assertion/sql_assertion.py,sha256=myJU-Wf8O-RbiyU_Xlbp2cacw
27
27
  datahub/api/entities/assertion/volume_assertion.py,sha256=37bNLGP-81MvcZj_cVHvrdw5I4aBxkER0xN0ZqyB3NU,3360
28
28
  datahub/api/entities/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
29
29
  datahub/api/entities/common/data_platform_instance.py,sha256=AVqQ-yactNZi_bislIEUcQZCGovaHY-gQi1EY7PVsT4,1065
30
- datahub/api/entities/common/serialized_value.py,sha256=pfw16vH2sG6U5kgBsmuEFBxJ0vL9CZpOQ5HZGOFbbQ0,5538
30
+ datahub/api/entities/common/serialized_value.py,sha256=DFPK7p4OwqRTOnH8luEWzqH_4vQHZSNxFIL63x_o2ok,5565
31
31
  datahub/api/entities/corpgroup/__init__.py,sha256=Uf3SxsZUSY-yZ2Kx3-1dWwz600D1C4Ds_z_nG7hwanA,63
32
32
  datahub/api/entities/corpgroup/corpgroup.py,sha256=XSrGHCwl7lMNtzWviMzZbw8VDdesXC2HLZP5kpHt2fQ,8878
33
33
  datahub/api/entities/corpuser/__init__.py,sha256=RspO1ceu6q2zUqYqZqRRY_MPcP7PNdd2lQoZn-KfeQE,60
@@ -324,7 +324,7 @@ datahub/ingestion/source/grafana/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
324
324
  datahub/ingestion/source/grafana/grafana_source.py,sha256=3pU3xodPgS5lmnjuQ_u7F0XPzD_Y8MnPlMxRJ86qz4g,4960
325
325
  datahub/ingestion/source/iceberg/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
326
326
  datahub/ingestion/source/iceberg/iceberg.py,sha256=pMWQtn88XAYwZsRNkICX1GlQOqOnyuWdLpkcjVQEon0,29039
327
- datahub/ingestion/source/iceberg/iceberg_common.py,sha256=krt-41r90t0CkNeJXsiwO-p5zJIulI-tyq3xaU2yw_c,10645
327
+ datahub/ingestion/source/iceberg/iceberg_common.py,sha256=VGosqYPmn_j6GETSnDHZ8Ay1BVOedmx2x5LHxw16I3A,12278
328
328
  datahub/ingestion/source/iceberg/iceberg_profiler.py,sha256=CkBB5fryMVoqqCM6eLSIeb4yP85ABHONNRm0QqZKrnw,9977
329
329
  datahub/ingestion/source/identity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
330
330
  datahub/ingestion/source/identity/azure_ad.py,sha256=9Hrvm4CSfc02yjnPUsCYSY4Qw9fXPnDFWLexab0mcpc,28559
@@ -360,7 +360,7 @@ datahub/ingestion/source/looker/str_functions.py,sha256=zceEX2ka_4WaWwWgEdyknUSz
360
360
  datahub/ingestion/source/looker/urn_functions.py,sha256=4VvqEfGvIMq3rNHHps0-HlPurMPnpqdxNtDAOOHIZww,528
361
361
  datahub/ingestion/source/looker/view_upstream.py,sha256=4FCjZaU6p2G7npB2RJpP4Gv2yLjbvbsYWEbAg55IvjY,26110
362
362
  datahub/ingestion/source/metadata/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
363
- datahub/ingestion/source/metadata/business_glossary.py,sha256=yySwJp2SCUQp8hRwN2lQuSqvOQowIhCKDKj9syhlTZA,18210
363
+ datahub/ingestion/source/metadata/business_glossary.py,sha256=T_RJHst6iQRghJNmLLPeSBMEDsbEKf3yBldOAgMcGuo,19666
364
364
  datahub/ingestion/source/metadata/lineage.py,sha256=2iK-hsORWm7NSvMZcG4D5hb8_PH57g-u6LWbu_f7HM4,9521
365
365
  datahub/ingestion/source/neo4j/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
366
366
  datahub/ingestion/source/neo4j/neo4j_source.py,sha256=O3jjdnsx7IyYPBLbxowL85Qo4zs4H-maMOH4-6ZNCk4,13063
@@ -396,16 +396,17 @@ datahub/ingestion/source/qlik_sense/qlik_api.py,sha256=KoBaD1VowYrbaRg1rjDP1_mmP
396
396
  datahub/ingestion/source/qlik_sense/qlik_sense.py,sha256=bmhmOgSXzC6g-uqO1ljFLRNz2oo6Xjn400UQnWdMA1Y,22530
397
397
  datahub/ingestion/source/qlik_sense/websocket_connection.py,sha256=jp39OInvjCN9BtnKsHU_aa1B3X9hVHqSmD25stXuqHk,1940
398
398
  datahub/ingestion/source/redshift/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
399
- datahub/ingestion/source/redshift/config.py,sha256=S0yQ9wZKdGjWeeziWHpPECJ3wGYWBIsdhS3YJ5oAX_Y,8853
399
+ datahub/ingestion/source/redshift/config.py,sha256=l_hlgsCjvlcgcFQpd5WMKlW8nqQUhaMGec8FnUbSl6Y,8997
400
+ datahub/ingestion/source/redshift/datashares.py,sha256=kH3YkoenOa59XZU12XeUf283lOOAITYD9jOXpy8R06E,9227
400
401
  datahub/ingestion/source/redshift/exception.py,sha256=dxzYUIv5B_FAWhOuzG2u5We7FX-ar4jhOXPXAlEIvgM,2055
401
- datahub/ingestion/source/redshift/lineage.py,sha256=bUy0uJowrqSc33Z50fIxFlJkyhe-OPM_qgPh-smSTgM,43983
402
- datahub/ingestion/source/redshift/lineage_v2.py,sha256=OcVW_27sSaZOYZPTd2j-LS9SzFQ1kXz6cMzM2ZDWhJQ,16751
403
- datahub/ingestion/source/redshift/profile.py,sha256=T4H79ycq2tPobLM1tTLRtu581Qa8LlKxEok49m0AirU,4294
404
- datahub/ingestion/source/redshift/query.py,sha256=X0KlDPzM68j0SYKXhq50DkLbFUIbGuPmGCYYmr8E0v0,44353
405
- datahub/ingestion/source/redshift/redshift.py,sha256=x9dKocJdGPaNs2fRdaddaBtZNxmTJFwYDhXY5nl_5zM,44444
402
+ datahub/ingestion/source/redshift/lineage.py,sha256=Gk2dNuRBEipZkY5W1sArlfRbFR7mBKutCFHHTrn3yX4,44096
403
+ datahub/ingestion/source/redshift/lineage_v2.py,sha256=H6Qky5dLeZEICdDWyH-My78NoKlXpExHg3m-6d5lbgo,16891
404
+ datahub/ingestion/source/redshift/profile.py,sha256=jqFQUSg_qzSYi1yIAq24NFwHW8yIcSDSSh-vgJ4nl6M,4287
405
+ datahub/ingestion/source/redshift/query.py,sha256=6Fw3I8qFLflySDu6WY5D9NjXnRnDIw0yxKisSpaHh0A,47526
406
+ datahub/ingestion/source/redshift/redshift.py,sha256=IZqeQws3mvDdu9K-ixPGZNalDcRRRse-l_TTwQI7B-4,43407
406
407
  datahub/ingestion/source/redshift/redshift_data_reader.py,sha256=zc69jwXHdF-w8J4Hq-ZQ6BjHQ75Ij2iNDMpoRJlcmlU,1724
407
- datahub/ingestion/source/redshift/redshift_schema.py,sha256=9IYeUsnISenq3eVB3k-s7zK8nInWDAYViFnDrNjtkb0,19149
408
- datahub/ingestion/source/redshift/report.py,sha256=M19aUHBkd9n-BVBX4fRhyRNdVkN2b9Es6ZqInRx5ZGI,2958
408
+ datahub/ingestion/source/redshift/redshift_schema.py,sha256=WTc-j4_PYlFgaJZ3hEorGIBWKruTX57E7V_5JaUe8mU,24045
409
+ datahub/ingestion/source/redshift/report.py,sha256=O3QFozHlmMbH9b7KxbqhgTgr_0tCryj6FIzMiN6kRxw,3044
409
410
  datahub/ingestion/source/redshift/usage.py,sha256=eSdB1MYZeQokkQOwl9LPdpo-oCBJSwxJBotSpJ9XjBc,17473
410
411
  datahub/ingestion/source/s3/__init__.py,sha256=HjqFPj11WtNFZM3kcVshlDb7kOsc19-l_3LM8PBjlJM,56
411
412
  datahub/ingestion/source/s3/config.py,sha256=lElFXgEpKDT9SVoiXvtx98wV6Gp880qP4pLQaOGJGOo,7828
@@ -464,7 +465,7 @@ datahub/ingestion/source/sql/hive.py,sha256=NRUrEWnR1JN5U0q4CHlRacdKzxJhS4unFXnX
464
465
  datahub/ingestion/source/sql/hive_metastore.py,sha256=fH7bAcljapYqmF8cQE7humoufFe2RVFRYOcyavMg9yo,36103
465
466
  datahub/ingestion/source/sql/mariadb.py,sha256=Hm102kmfs_1rd4lsTYhzVMZq5S3B6cyfvpHSzJjqvMw,737
466
467
  datahub/ingestion/source/sql/mysql.py,sha256=nDWK4YbqomcJgnit9b8geUGrp_3eix4bt0_k94o7g-0,3350
467
- datahub/ingestion/source/sql/oracle.py,sha256=pQeeQarUZpC7q09zL0LlPZB0aCwHU3QBRSzxyLHGIKY,26222
468
+ datahub/ingestion/source/sql/oracle.py,sha256=it9qhUkGRHTq_F5DoEsCBLYnB02divzxDlBvXACH4Pk,27712
468
469
  datahub/ingestion/source/sql/postgres.py,sha256=uC1kYEI8VdxiZ1Y9IxMWzwmg11wtMqYN0e2fkok1rxo,11972
469
470
  datahub/ingestion/source/sql/presto.py,sha256=PB-CS5MX2dSRFRHjlxfkLHGXLZXFNCsVAAyRBtY6HMg,3611
470
471
  datahub/ingestion/source/sql/sql_common.py,sha256=r75Cd06Qwe2fqTDRZKWnIf7kpnR0BSxZ9PYBOgY0I6k,48785
@@ -575,8 +576,8 @@ datahub/lite/lite_registry.py,sha256=bpH0kasP-LtwwUFNA2QsOIehfekAYfJtN-AkQLmSWnw
575
576
  datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1949
576
577
  datahub/lite/lite_util.py,sha256=Cm6trMTeo0X1fv4nSsW9lC0jqce7Jt-05GhOtIGzsVc,4559
577
578
  datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
578
- datahub/metadata/_schema_classes.py,sha256=lQPz3jGfTOCBidsC_Ap62XqRD9A4AKPqhHa3CV9jyL0,993316
579
- datahub/metadata/schema.avsc,sha256=iQJaPYHy4xrGQBEbRgn-RF4kGC1iNPhZawHTAYTyfW0,741430
579
+ datahub/metadata/_schema_classes.py,sha256=uafVvWsnAqPranXzeC9CrSAu7I1-XJOogtiBPhxmn-k,993397
580
+ datahub/metadata/schema.avsc,sha256=uPWX2Rx9A12b-p4ef4zrsjbtQPSIH8w67l3B6pq6zE0,741459
580
581
  datahub/metadata/schema_classes.py,sha256=X5Jl5EaSxyHdXOQv14pJ5WkQALun4MRpJ4q12wVFE18,1299
581
582
  datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
582
583
  datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
@@ -745,7 +746,7 @@ datahub/metadata/schemas/DatasetUsageStatistics.avsc,sha256=JKNy_KlUqr3kt7o1Cu2D
745
746
  datahub/metadata/schemas/Deprecation.avsc,sha256=SmbTlMB9fujdMBjYEQkzaU4XJzwM1gD6E8L2zoL1b4Q,1280
746
747
  datahub/metadata/schemas/DisplayProperties.avsc,sha256=MTa_g2s0roxNFFggWU8rslUH3UFe3xe11uUXyh0Go_I,1732
747
748
  datahub/metadata/schemas/Documentation.avsc,sha256=9vIJG9B08FFrC3y5c1XVaT5U3c-b5sOAc5foUxMnyCs,4836
748
- datahub/metadata/schemas/DomainKey.avsc,sha256=1_kbsMTsO2ebB3zW7KpB71QfkGGR0mAgpNOKRoWHsJU,649
749
+ datahub/metadata/schemas/DomainKey.avsc,sha256=TYCcJRWqwbxbQuR5E68pvdeAmfVdYsJuMNhTxVphbqg,676
749
750
  datahub/metadata/schemas/DomainProperties.avsc,sha256=6do6wZ9G6gyt1QowQyi1xldqgdTXspb05FaqWpKJ6eM,3843
750
751
  datahub/metadata/schemas/Domains.avsc,sha256=5mRQcba6Zmp6Y1srbxhOjETutg0I_ZG4ikuS2r9fkR0,804
751
752
  datahub/metadata/schemas/DynamicFormAssignment.avsc,sha256=SXRL5D6kIYWdGl3zLQYxPnkQX71JXQOKrjQNavFqVp0,7339
@@ -783,7 +784,7 @@ datahub/metadata/schemas/GlobalSettingsInfo.avsc,sha256=OVMM6FwhHhufHkezYcVePK0z
783
784
  datahub/metadata/schemas/GlobalSettingsKey.avsc,sha256=Yj8s5IdM9yF7xrhJcLGCPCXBWqSsrPbufBaQjlZ3JlU,563
784
785
  datahub/metadata/schemas/GlobalTags.avsc,sha256=-SurkodMqTDnPpkRV6qYqmpNWjQNvynUiPZX7EhL5uc,4624
785
786
  datahub/metadata/schemas/GlossaryNodeInfo.avsc,sha256=G1Cb-w9VxIAEhNqyiEsDL_ABRO9QxyTpUANKU6DQrFw,1888
786
- datahub/metadata/schemas/GlossaryNodeKey.avsc,sha256=XzzrUEpRCxNjWfdjXBTuYrmeMWK_-eCXw49K_eOswuw,637
787
+ datahub/metadata/schemas/GlossaryNodeKey.avsc,sha256=hT8ny4TL1WvgFvnaVBjuw6AWDiPDjpkh20f83ZT-UZ8,664
787
788
  datahub/metadata/schemas/GlossaryRelatedTerms.avsc,sha256=ZTP0mrFD4y-C6JekRy8IVuHvICUkJib-ZAYD93Gv1tA,2763
788
789
  datahub/metadata/schemas/GlossaryTermInfo.avsc,sha256=j4s9NCyMOIF03HfaXoQEIkiMTRaCy_-euhenptfu7IA,2935
789
790
  datahub/metadata/schemas/GlossaryTermKey.avsc,sha256=mkyrzmOX_BGRHbcj2ccUALbrPVJNdQbItU-VyKN7P98,836
@@ -806,12 +807,12 @@ datahub/metadata/schemas/MLFeatureTableKey.avsc,sha256=6_typ7K0Bz8x62T31IYqf9XS9
806
807
  datahub/metadata/schemas/MLFeatureTableProperties.avsc,sha256=BtrqcsxoQXObPZXSGRNYtIBJCoeHkMK_Zr_imBWF2Zk,2008
807
808
  datahub/metadata/schemas/MLHyperParam.avsc,sha256=dE6i5r6LTYMNrQe9yy-jKoP09GOJUf__1bO69ldpydc,833
808
809
  datahub/metadata/schemas/MLMetric.avsc,sha256=y8WPVVwjhu3YGtqpFFJYNYK8w778RRL_d2sHG1Dc7uM,804
809
- datahub/metadata/schemas/MLModelDeploymentKey.avsc,sha256=gmXaUYxII8BVLnXOFdlPmyhD1rUhrw455R_hL77foSU,2406
810
+ datahub/metadata/schemas/MLModelDeploymentKey.avsc,sha256=vt04jFF_ZHSvWhqLoxC8C_KspiRLkvNNIXJI0aKPF1Q,2425
810
811
  datahub/metadata/schemas/MLModelDeploymentProperties.avsc,sha256=I3v-uNOeYxO4hooPHOjafWWHuVyeGvG90oma0tzpNFg,5409
811
812
  datahub/metadata/schemas/MLModelFactorPrompts.avsc,sha256=8kX-P4F4mVLFT980z3MwIautt1_6uA-c_Z87nYNDK-k,2712
812
- datahub/metadata/schemas/MLModelGroupKey.avsc,sha256=CL3DS8BfC2hOPmuUPaLcur0IUFg9Cnexc7bQ2lRgBfI,2478
813
+ datahub/metadata/schemas/MLModelGroupKey.avsc,sha256=3LoMWejMfCwdoqz3PFinRbY1_Yy4Kypw7pwg3tL42Jg,2497
813
814
  datahub/metadata/schemas/MLModelGroupProperties.avsc,sha256=zMl6ab6zfcYJmt31f-AUrrfeqfLoaSZQpfB3_S9JFFQ,6534
814
- datahub/metadata/schemas/MLModelKey.avsc,sha256=0D6IECBL-Y5FHMdUpMEnd3e0JjoQ1YCtNTjoUovJuHU,2847
815
+ datahub/metadata/schemas/MLModelKey.avsc,sha256=pRntMhcpgTJL2T2nGK6Sf9_q2vJOqHELYFh59VMXqv0,2866
815
816
  datahub/metadata/schemas/MLModelProperties.avsc,sha256=hDCBHxGe-cmCBeU1k0ANuQlKjtZsDcTfl2X_jWmtFqo,12355
816
817
  datahub/metadata/schemas/MLPrimaryKeyKey.avsc,sha256=Kq2Q9WxZ6nQ8wR4P6wpPCI-J7FwXQyoa10s6BvXtkm8,1110
817
818
  datahub/metadata/schemas/MLPrimaryKeyProperties.avsc,sha256=URIuOpS93RVk8MZVcbZ-dmTwu_cN3KSOKxSR8fm-eTo,6744
@@ -913,7 +914,7 @@ datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGy
913
914
  datahub/sql_parsing/schema_resolver.py,sha256=ISuingLcQnOJZkNXBkc73uPwYUbbOtERAjgGhJajDiQ,10782
914
915
  datahub/sql_parsing/split_statements.py,sha256=6KUoIPG7H8Rja3lrPjSrSfhFfwW4oqgfoNQeTbbOWNg,8953
915
916
  datahub/sql_parsing/sql_parsing_aggregator.py,sha256=XNZWjeaRhzaT92mzsJZGJfYaxJENsyp5dSHTmL81RIc,70130
916
- datahub/sql_parsing/sql_parsing_common.py,sha256=h_V_m54hJ9EUh5kczq7cYOIeNeo4bgf0Px0H-Nq-UIg,2602
917
+ datahub/sql_parsing/sql_parsing_common.py,sha256=cZ4WvVyHZuXDGjnBvKMX2_fz2EMextB5WQWcK0_saBo,3155
917
918
  datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
918
919
  datahub/sql_parsing/sqlglot_lineage.py,sha256=l0kT8MuRIg96X7BNJaboMznF54b-yvM2nMTLyF2d0Nw,47446
919
920
  datahub/sql_parsing/sqlglot_utils.py,sha256=6W6MQ5Yh0xXT9_h0jd19yoGWMdXicyRBDD_FwV7nj04,14701
@@ -1021,9 +1022,9 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
1021
1022
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
1022
1023
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
1023
1024
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
1024
- acryl_datahub-1.0.0rc10.dist-info/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
1025
- acryl_datahub-1.0.0rc10.dist-info/METADATA,sha256=KibwklUZSYWViDayJ8MX8bAVCp7PPGAQsqfyBdWoSHM,175337
1026
- acryl_datahub-1.0.0rc10.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
1027
- acryl_datahub-1.0.0rc10.dist-info/entry_points.txt,sha256=U1e5ZwqPX1OaIbvGrwvozcdB8SbzFYXQM7plpdLKKeo,9592
1028
- acryl_datahub-1.0.0rc10.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
1029
- acryl_datahub-1.0.0rc10.dist-info/RECORD,,
1025
+ acryl_datahub-1.0.0rc11.dist-info/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
1026
+ acryl_datahub-1.0.0rc11.dist-info/METADATA,sha256=hZCrduEZ7Qqkr76OUpdPLHm7AApR7AQHEaKKYq9uJZE,175337
1027
+ acryl_datahub-1.0.0rc11.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
1028
+ acryl_datahub-1.0.0rc11.dist-info/entry_points.txt,sha256=U1e5ZwqPX1OaIbvGrwvozcdB8SbzFYXQM7plpdLKKeo,9592
1029
+ acryl_datahub-1.0.0rc11.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
1030
+ acryl_datahub-1.0.0rc11.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
1
1
  # Published at https://pypi.org/project/acryl-datahub/.
2
2
  __package_name__ = "acryl-datahub"
3
- __version__ = "1.0.0rc10"
3
+ __version__ = "1.0.0rc11"
4
4
 
5
5
 
6
6
  def is_dev_mode() -> bool:
datahub/api/entities/common/serialized_value.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import json
2
2
  import logging
3
- from typing import Dict, Optional, Type, Union
3
+ from typing import Dict, Optional, Type, TypeVar, Union
4
4
 
5
5
  from avrogen.dict_wrapper import DictWrapper
6
6
  from pydantic import BaseModel
@@ -13,6 +13,7 @@ logger = logging.getLogger(__name__)
13
13
  _REMAPPED_SCHEMA_TYPES = {
14
14
  k.replace("pegasus2avro.", ""): v for k, v in SCHEMA_TYPES.items()
15
15
  }
16
+ T = TypeVar("T", bound=BaseModel)
16
17
 
17
18
 
18
19
  class SerializedResourceValue(BaseModel):
@@ -83,8 +84,8 @@ class SerializedResourceValue(BaseModel):
83
84
  )
84
85
 
85
86
  def as_pydantic_object(
86
- self, model_type: Type[BaseModel], validate_schema_ref: bool = False
87
- ) -> BaseModel:
87
+ self, model_type: Type[T], validate_schema_ref: bool = False
88
+ ) -> T:
88
89
  """
89
90
  Parse the blob into a Pydantic-defined Python object based on the schema type and schema
90
91
  ref.
datahub/ingestion/source/iceberg/iceberg_common.py CHANGED
@@ -6,7 +6,10 @@ from typing import Any, Dict, Optional
6
6
  from humanfriendly import format_timespan
7
7
  from pydantic import Field, validator
8
8
  from pyiceberg.catalog import Catalog, load_catalog
9
+ from pyiceberg.catalog.rest import RestCatalog
10
+ from requests.adapters import HTTPAdapter
9
11
  from sortedcontainers import SortedList
12
+ from urllib3.util import Retry
10
13
 
11
14
  from datahub.configuration.common import AllowDenyPattern, ConfigModel
12
15
  from datahub.configuration.source_common import DatasetSourceConfigMixin
@@ -26,6 +29,23 @@ from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
26
29
 
27
30
  logger = logging.getLogger(__name__)
28
31
 
32
+ DEFAULT_REST_TIMEOUT = 120
33
+ DEFAULT_REST_RETRY_POLICY = {"total": 3, "backoff_factor": 0.1}
34
+
35
+
36
+ class TimeoutHTTPAdapter(HTTPAdapter):
37
+ def __init__(self, *args, **kwargs):
38
+ if "timeout" in kwargs:
39
+ self.timeout = kwargs["timeout"]
40
+ del kwargs["timeout"]
41
+ super().__init__(*args, **kwargs)
42
+
43
+ def send(self, request, **kwargs):
44
+ timeout = kwargs.get("timeout")
45
+ if timeout is None and hasattr(self, "timeout"):
46
+ kwargs["timeout"] = self.timeout
47
+ return super().send(request, **kwargs)
48
+
29
49
 
30
50
  class IcebergProfilingConfig(ConfigModel):
31
51
  enabled: bool = Field(
@@ -146,7 +166,26 @@ class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin)
146
166
  logger.debug(
147
167
  "Initializing the catalog %s with config: %s", catalog_name, catalog_config
148
168
  )
149
- return load_catalog(name=catalog_name, **catalog_config)
169
+ catalog = load_catalog(name=catalog_name, **catalog_config)
170
+ if isinstance(catalog, RestCatalog):
171
+ logger.debug(
172
+ "Recognized REST catalog type being configured, attempting to configure HTTP Adapter for the session"
173
+ )
174
+ retry_policy: Dict[str, Any] = DEFAULT_REST_RETRY_POLICY.copy()
175
+ retry_policy.update(catalog_config.get("connection", {}).get("retry", {}))
176
+ retries = Retry(**retry_policy)
177
+ logger.debug(f"Retry policy to be set: {retry_policy}")
178
+ timeout = catalog_config.get("connection", {}).get(
179
+ "timeout", DEFAULT_REST_TIMEOUT
180
+ )
181
+ logger.debug(f"Timeout to be set: {timeout}")
182
+ catalog._session.mount(
183
+ "http://", TimeoutHTTPAdapter(timeout=timeout, max_retries=retries)
184
+ )
185
+ catalog._session.mount(
186
+ "https://", TimeoutHTTPAdapter(timeout=timeout, max_retries=retries)
187
+ )
188
+ return catalog
150
189
 
151
190
 
152
191
  class TopTableTimings:
datahub/ingestion/source/metadata/business_glossary.py CHANGED
@@ -1,5 +1,6 @@
1
1
  import logging
2
2
  import pathlib
3
+ import re
3
4
  import time
4
5
  from dataclasses import dataclass, field
5
6
  from typing import Any, Dict, Iterable, List, Optional, TypeVar, Union
@@ -118,17 +119,58 @@ class BusinessGlossaryConfig(DefaultConfig):
118
119
  return v
119
120
 
120
121
 
122
+ def clean_url(text: str) -> str:
123
+ """
124
+ Clean text for use in URLs by:
125
+ 1. Replacing spaces with hyphens
126
+ 2. Removing special characters (preserving hyphens and periods)
127
+ 3. Collapsing multiple hyphens and periods into single ones
128
+ """
129
+ # Replace spaces with hyphens
130
+ text = text.replace(" ", "-")
131
+ # Remove special characters except hyphens and periods
132
+ text = re.sub(r"[^a-zA-Z0-9\-.]", "", text)
133
+ # Collapse multiple hyphens into one
134
+ text = re.sub(r"-+", "-", text)
135
+ # Collapse multiple periods into one
136
+ text = re.sub(r"\.+", ".", text)
137
+ # Remove leading/trailing hyphens and periods
138
+ text = text.strip("-.")
139
+ return text
140
+
141
+
121
142
  def create_id(path: List[str], default_id: Optional[str], enable_auto_id: bool) -> str:
143
+ """
144
+ Create an ID for a glossary node or term.
145
+
146
+ Args:
147
+ path: List of path components leading to this node/term
148
+ default_id: Optional manually specified ID
149
+ enable_auto_id: Whether to generate GUIDs
150
+ """
122
151
  if default_id is not None:
123
- return default_id # No need to create id from path as default_id is provided
152
+ return default_id # Use explicitly provided ID
124
153
 
125
154
  id_: str = ".".join(path)
126
155
 
127
- if UrnEncoder.contains_extended_reserved_char(id_):
128
- enable_auto_id = True
156
+ # Check for non-ASCII characters before cleaning
157
+ if any(ord(c) > 127 for c in id_):
158
+ return datahub_guid({"path": id_})
129
159
 
130
160
  if enable_auto_id:
161
+ # Generate GUID for auto_id mode
131
162
  id_ = datahub_guid({"path": id_})
163
+ else:
164
+ # Clean the URL for better readability when not using auto_id
165
+ id_ = clean_url(id_)
166
+
167
+ # Force auto_id if the cleaned URL still contains problematic characters
168
+ if UrnEncoder.contains_extended_reserved_char(id_):
169
+ logger.warning(
170
+ f"ID '{id_}' contains problematic characters after URL cleaning. Falling back to GUID generation for stability."
171
+ )
172
+ id_ = datahub_guid({"path": id_})
173
+
132
174
  return id_
133
175
 
134
176
 
datahub/ingestion/source/redshift/config.py CHANGED
@@ -128,6 +128,10 @@ class RedshiftConfig(
128
128
  default=True,
129
129
  description="Whether lineage should be collected from copy commands",
130
130
  )
131
+ include_share_lineage: bool = Field(
132
+ default=True,
133
+ description="Whether lineage should be collected from datashares",
134
+ )
131
135
 
132
136
  include_usage_statistics: bool = Field(
133
137
  default=False,
datahub/ingestion/source/redshift/datashares.py ADDED
@@ -0,0 +1,236 @@
1
+ from typing import Dict, Iterable, List, Optional, Union
2
+
3
+ from pydantic import BaseModel
4
+
5
+ from datahub.api.entities.platformresource.platform_resource import (
6
+ ElasticPlatformResourceQuery,
7
+ PlatformResource,
8
+ PlatformResourceKey,
9
+ PlatformResourceSearchFields,
10
+ )
11
+ from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
12
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
13
+ from datahub.ingestion.graph.client import DataHubGraph
14
+ from datahub.ingestion.source.redshift.config import RedshiftConfig
15
+ from datahub.ingestion.source.redshift.redshift_schema import (
16
+ InboundDatashare,
17
+ OutboundDatashare,
18
+ PartialInboundDatashare,
19
+ RedshiftTable,
20
+ RedshiftView,
21
+ )
22
+ from datahub.ingestion.source.redshift.report import RedshiftReport
23
+ from datahub.sql_parsing.sql_parsing_aggregator import KnownLineageMapping
24
+ from datahub.utilities.search_utils import LogicalOperator
25
+
26
+
27
+ class OutboundSharePlatformResource(BaseModel):
28
+ namespace: str
29
+ platform_instance: Optional[str]
30
+ env: str
31
+ source_database: str
32
+ share_name: str
33
+
34
+ def get_key(self) -> str:
35
+ return f"{self.namespace}.{self.share_name}"
36
+
37
+
38
+ PLATFORM_RESOURCE_TYPE = "OUTBOUND_DATASHARE"
39
+
40
+
41
+ class RedshiftDatasharesHelper:
42
+ """
43
+ Redshift datashares lineage generation relies on PlatformResource entity
44
+ to identify the producer namespace and its platform_instance and env
45
+
46
+ Ingestion of any database in namespace will
47
+ A. generate PlatformResource entity for all outbound shares in namespace.
48
+ B. generate lineage with upstream tables from another namespace, if the database
49
+ is created from an inbound share
50
+
51
+ """
52
+
53
+ def __init__(
54
+ self,
55
+ config: RedshiftConfig,
56
+ report: RedshiftReport,
57
+ graph: Optional[DataHubGraph],
58
+ ):
59
+ self.platform = "redshift"
60
+ self.config = config
61
+ self.report = report
62
+ self.graph = graph
63
+
64
+ def to_platform_resource(
65
+ self, shares: List[OutboundDatashare]
66
+ ) -> Iterable[MetadataChangeProposalWrapper]:
67
+ if not shares:
68
+ self.report.outbound_shares_count = 0
69
+ return
70
+
71
+ self.report.outbound_shares_count = len(shares)
72
+ # Producer namespace will be current namespace for all
73
+ # outbound data shares
74
+
75
+ for share in shares:
76
+ producer_namespace = share.producer_namespace
77
+ try:
78
+ platform_resource_key = PlatformResourceKey(
79
+ platform=self.platform,
80
+ platform_instance=self.config.platform_instance,
81
+ resource_type=PLATFORM_RESOURCE_TYPE,
82
+ primary_key=share.get_key(),
83
+ )
84
+
85
+ value = OutboundSharePlatformResource(
86
+ namespace=producer_namespace,
87
+ platform_instance=self.config.platform_instance,
88
+ env=self.config.env,
89
+ source_database=share.source_database,
90
+ share_name=share.share_name,
91
+ )
92
+
93
+ platform_resource = PlatformResource.create(
94
+ key=platform_resource_key,
95
+ value=value,
96
+ secondary_keys=[share.share_name, share.producer_namespace],
97
+ )
98
+
99
+ yield from platform_resource.to_mcps()
100
+
101
+ except Exception as exc:
102
+ self.report.warning(
103
+ title="Downstream lineage to outbound datashare may not work",
104
+ message="Failed to generate platform resource for outbound datashares",
105
+ context=f"Namespace {share.producer_namespace} Share {share.share_name}",
106
+ exc=exc,
107
+ )
108
+
109
+ def generate_lineage(
110
+ self,
111
+ share: Union[InboundDatashare, PartialInboundDatashare],
112
+ tables: Dict[str, List[Union[RedshiftTable, RedshiftView]]],
113
+ ) -> Iterable[KnownLineageMapping]:
114
+ upstream_share = self.find_upstream_share(share)
115
+
116
+ if not upstream_share:
117
+ return
118
+
119
+ for schema in tables:
120
+ for table in tables[schema]:
121
+ dataset_urn = self.gen_dataset_urn(
122
+ f"{share.consumer_database}.{schema}.{table.name}",
123
+ self.config.platform_instance,
124
+ self.config.env,
125
+ )
126
+
127
+ upstream_dataset_urn = self.gen_dataset_urn(
128
+ f"{upstream_share.source_database}.{schema}.{table.name}",
129
+ upstream_share.platform_instance,
130
+ upstream_share.env,
131
+ )
132
+
133
+ yield KnownLineageMapping(
134
+ upstream_urn=upstream_dataset_urn, downstream_urn=dataset_urn
135
+ )
136
+
137
+ def find_upstream_share(
138
+ self, share: Union[InboundDatashare, PartialInboundDatashare]
139
+ ) -> Optional[OutboundSharePlatformResource]:
140
+ if not self.graph:
141
+ self.report.warning(
142
+ title="Upstream lineage of inbound datashare will be missing",
143
+ message="Missing datahub graph. Either use the datahub-rest sink or "
144
+ "set the top-level datahub_api config in the recipe",
145
+ )
146
+ else:
147
+ resources = self.get_platform_resources(self.graph, share)
148
+
149
+ if len(resources) == 0 or (
150
+ not any(
151
+ [
152
+ resource.resource_info is not None
153
+ and resource.resource_info.resource_type
154
+ == PLATFORM_RESOURCE_TYPE
155
+ for resource in resources
156
+ ]
157
+ )
158
+ ):
159
+ self.report.info(
160
+ title="Upstream lineage of inbound datashare will be missing",
161
+ message="Missing platform resource for share. "
162
+ "Setup redshift ingestion for namespace if not already done. If ingestion is setup, "
163
+ "check whether ingestion user has ALTER/SHARE permission to share.",
164
+ context=share.get_description(),
165
+ )
166
+ else:
167
+ # Ideally we should get only one resource as primary key is namespace+share
168
+ # and type is "OUTBOUND_DATASHARE"
169
+ for resource in resources:
170
+ try:
171
+ assert (
172
+ resource.resource_info is not None
173
+ and resource.resource_info.value is not None
174
+ )
175
+ return resource.resource_info.value.as_pydantic_object(
176
+ OutboundSharePlatformResource, True
177
+ )
178
+ except Exception as e:
179
+ self.report.warning(
180
+ title="Upstream lineage of inbound datashare will be missing",
181
+ message="Failed to parse platform resource for outbound datashare",
182
+ context=share.get_description(),
183
+ exc=e,
184
+ )
185
+
186
+ return None
187
+
188
+ def get_platform_resources(
189
+ self,
190
+ graph: DataHubGraph,
191
+ share: Union[InboundDatashare, PartialInboundDatashare],
192
+ ) -> List[PlatformResource]:
193
+ # NOTE: ideally we receive InboundDatashare and not PartialInboundDatashare.
194
+ # however due to varchar(128) type of database table that captures datashare options
195
+ # we may receive only partial information about inbound share
196
+ # Alternate option to get InboundDatashare using svv_datashares requires superuser
197
+ if isinstance(share, PartialInboundDatashare):
198
+ return list(
199
+ PlatformResource.search_by_filters(
200
+ graph,
201
+ ElasticPlatformResourceQuery.create_from()
202
+ .group(LogicalOperator.AND)
203
+ .add_field_match(
204
+ PlatformResourceSearchFields.RESOURCE_TYPE,
205
+ PLATFORM_RESOURCE_TYPE,
206
+ )
207
+ .add_field_match(
208
+ PlatformResourceSearchFields.PLATFORM, self.platform
209
+ )
210
+ .add_field_match(
211
+ PlatformResourceSearchFields.SECONDARY_KEYS,
212
+ share.share_name,
213
+ )
214
+ .add_wildcard(
215
+ PlatformResourceSearchFields.SECONDARY_KEYS.field_name,
216
+ f"{share.producer_namespace_prefix}*",
217
+ )
218
+ .end(),
219
+ )
220
+ )
221
+ return list(
222
+ PlatformResource.search_by_key(
223
+ graph, key=share.get_key(), primary=True, is_exact=True
224
+ )
225
+ )
226
+
227
+ # TODO: Refactor and move to new RedshiftIdentifierBuilder class
228
+ def gen_dataset_urn(
229
+ self, datahub_dataset_name: str, platform_instance: Optional[str], env: str
230
+ ) -> str:
231
+ return make_dataset_urn_with_platform_instance(
232
+ platform=self.platform,
233
+ name=datahub_dataset_name,
234
+ platform_instance=platform_instance,
235
+ env=env,
236
+ )
datahub/ingestion/source/redshift/lineage.py CHANGED
@@ -813,9 +813,13 @@ class RedshiftLineageExtractor:
813
813
  )
814
814
 
815
815
  tablename = table.name
816
- if table.type == "EXTERNAL_TABLE":
816
+ if (
817
+ table.is_external_table
818
+ and schema.is_external_schema
819
+ and schema.external_platform
820
+ ):
817
821
  # external_db_params = schema.option
818
- upstream_platform = schema.type.lower()
822
+ upstream_platform = schema.external_platform.lower()
819
823
  catalog_upstream = UpstreamClass(
820
824
  mce_builder.make_dataset_urn_with_platform_instance(
821
825
  upstream_platform,
datahub/ingestion/source/redshift/lineage_v2.py CHANGED
@@ -401,11 +401,14 @@ class RedshiftSqlLineageV2(Closeable):
401
401
  ) -> None:
402
402
  for schema_name, tables in all_tables[self.database].items():
403
403
  for table in tables:
404
- if table.type == "EXTERNAL_TABLE":
405
- schema = db_schemas[self.database][schema_name]
406
-
404
+ schema = db_schemas[self.database][schema_name]
405
+ if (
406
+ table.is_external_table
407
+ and schema.is_external_schema
408
+ and schema.external_platform
409
+ ):
407
410
  # external_db_params = schema.option
408
- upstream_platform = schema.type.lower()
411
+ upstream_platform = schema.external_platform.lower()
409
412
 
410
413
  table_urn = mce_builder.make_dataset_urn_with_platform_instance(
411
414
  self.platform,
datahub/ingestion/source/redshift/profile.py CHANGED
@@ -48,7 +48,7 @@ class RedshiftProfiler(GenericProfiler):
48
48
  if not self.config.schema_pattern.allowed(schema):
49
49
  continue
50
50
  for table in tables[db].get(schema, {}):
51
- if table.type == "EXTERNAL_TABLE":
51
+ if table.is_external_table:
52
52
  if not self.config.profiling.profile_external_tables:
53
53
  # Case 1: If user did not tell us to profile external tables, simply log this.
54
54
  self.report.profiling_skipped_other[schema] += 1