acryl-datahub 0.15.0.3__py3-none-any.whl → 0.15.0.4rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-0.15.0.3.dist-info → acryl_datahub-0.15.0.4rc1.dist-info}/METADATA +2379 -2379
- {acryl_datahub-0.15.0.3.dist-info → acryl_datahub-0.15.0.4rc1.dist-info}/RECORD +20 -18
- datahub/__init__.py +1 -1
- datahub/cli/container_cli.py +89 -0
- datahub/entrypoints.py +2 -0
- datahub/ingestion/source/aws/s3_util.py +1 -24
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -2
- datahub/ingestion/source/dbt/dbt_common.py +2 -2
- datahub/ingestion/source/s3/source.py +6 -2
- datahub/ingestion/source/sql/hive_metastore.py +3 -3
- datahub/ingestion/source/sql/teradata.py +2 -2
- datahub/ingestion/source/tableau/tableau.py +23 -10
- datahub/metadata/_schema_classes.py +401 -401
- datahub/metadata/_urns/urn_defs.py +1395 -1395
- datahub/metadata/schema.avsc +16624 -16266
- datahub/sql_parsing/sql_parsing_aggregator.py +3 -3
- datahub/utilities/groupby.py +17 -0
- {acryl_datahub-0.15.0.3.dist-info → acryl_datahub-0.15.0.4rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.3.dist-info → acryl_datahub-0.15.0.4rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.3.dist-info → acryl_datahub-0.15.0.4rc1.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
datahub/__init__.py,sha256=
|
|
1
|
+
datahub/__init__.py,sha256=xAoAiT2wj9RiduZiLAlc28hjzEtCWW6b2I4AOY-rBkc,576
|
|
2
2
|
datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
|
|
3
|
-
datahub/entrypoints.py,sha256=
|
|
3
|
+
datahub/entrypoints.py,sha256=vbkUx_jVIkr_V4wtoQhOpledna-pD_tco1mloRnb7QY,8029
|
|
4
4
|
datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
5
|
datahub/_codegen/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
6
|
datahub/_codegen/aspect.py,sha256=PJRa-Z4ouXHq3OkulfyWhwZn-fFUBDK_UPvmqaWdbWk,1063
|
|
@@ -61,6 +61,7 @@ datahub/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
61
61
|
datahub/cli/check_cli.py,sha256=9dXNyzZayHeoFjwFjLkMVyx6DiCZfeESyI-sYtGA6bE,12850
|
|
62
62
|
datahub/cli/cli_utils.py,sha256=onbG7z9hIm0zCAm0a2ulTOsHC_NVkdIsbg__EMj02DQ,13540
|
|
63
63
|
datahub/cli/config_utils.py,sha256=yuXw7RzpRY5x_-MAoqWbv46qUkIeRNAJL4_OeJpYdBE,4879
|
|
64
|
+
datahub/cli/container_cli.py,sha256=t5hEKqmTEi9LMAs6DTuHtIJBSU5LAXND3JyCLAdJOO8,2828
|
|
64
65
|
datahub/cli/delete_cli.py,sha256=oQ4Yy6hxZHcl67MYJiQumLs_8QmFEj7SPZFzxFXvDk8,23481
|
|
65
66
|
datahub/cli/docker_check.py,sha256=rED4wHXqxcQ_qNFyIgFEZ85BHT9ZTE5YC-oUKqbRqi0,9432
|
|
66
67
|
datahub/cli/docker_cli.py,sha256=w9ZQMRVlHwfJI2XDe7mO0lwnT7-dZoK6tPadSMgwEM8,36493
|
|
@@ -219,7 +220,7 @@ datahub/ingestion/source/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
|
|
|
219
220
|
datahub/ingestion/source/aws/aws_common.py,sha256=DfdQgkJ_s2isFx8WvqKTlAcBk4KE8SgfpmA5BgC3fgY,17716
|
|
220
221
|
datahub/ingestion/source/aws/glue.py,sha256=9KYv53loNa-keVwCRLDb-5II_AjeHVRf1Fb2HXqtZXk,57653
|
|
221
222
|
datahub/ingestion/source/aws/s3_boto_utils.py,sha256=Y54jlLV5gLcuZ4Zs57kIW5dYHD89RSFfsVNlFbRnSkQ,3901
|
|
222
|
-
datahub/ingestion/source/aws/s3_util.py,sha256=
|
|
223
|
+
datahub/ingestion/source/aws/s3_util.py,sha256=OFypcgmVC6jnZM90-gjcPpAMtTV1lbnreCaMhCzNlzs,2149
|
|
223
224
|
datahub/ingestion/source/aws/sagemaker.py,sha256=Bl2tkBYnrindgx61VHYgNovUF_Kp_fXNcivQn28vC2w,5254
|
|
224
225
|
datahub/ingestion/source/aws/sagemaker_processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
225
226
|
datahub/ingestion/source/aws/sagemaker_processors/common.py,sha256=NvYfI8LHgDvhEZE7qp6qF1NSZ0_SQKhg3ivtdjsdpFg,2172
|
|
@@ -243,7 +244,7 @@ datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py,sha256
|
|
|
243
244
|
datahub/ingestion/source/bigquery_v2/bigquery_queries.py,sha256=EoHo9twb0_QdX7Nvd1HJC1Yn0rqtrfR52EVk7Hu3XOQ,3296
|
|
244
245
|
datahub/ingestion/source/bigquery_v2/bigquery_report.py,sha256=qH8k8wyMlUVzUTVhSd3FgOMGCK1D5NYuC0KF8tez_Ys,7957
|
|
245
246
|
datahub/ingestion/source/bigquery_v2/bigquery_schema.py,sha256=E5GOx4NWjyZM0xzdpBlNXbvDdKNfW9UtS64XtCYFpzI,31809
|
|
246
|
-
datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py,sha256=
|
|
247
|
+
datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py,sha256=c1hlsgat7l27fQN8GwvHkdme7rQ4LqIQKFwwA8z7kqw,50824
|
|
247
248
|
datahub/ingestion/source/bigquery_v2/bigquery_test_connection.py,sha256=cATxwi5IPzj3BldRRAVcLqzSFmmYEPvqa7U0RFJbaAc,7645
|
|
248
249
|
datahub/ingestion/source/bigquery_v2/common.py,sha256=Cxjf1a8ibkL_YRQeS0BqsjlyMgFJpaZ3iq_d7e8T8MQ,4030
|
|
249
250
|
datahub/ingestion/source/bigquery_v2/lineage.py,sha256=Dkig1SEfPxw6zZDeSulUYnqsu4WGCVPXypGPEUVriyU,44907
|
|
@@ -274,7 +275,7 @@ datahub/ingestion/source/datahub/report.py,sha256=VHBfCbwFRzdLdB7hQG9ST4EiZxl_vB
|
|
|
274
275
|
datahub/ingestion/source/datahub/state.py,sha256=PZoT7sSK1wadVf5vN6phrgr7I6LL7ePP-EJjP1OO0bQ,3507
|
|
275
276
|
datahub/ingestion/source/dbt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
276
277
|
datahub/ingestion/source/dbt/dbt_cloud.py,sha256=tNpSHbPlLq-oFGbJsdkWY9kIaWmpjcZLWhj1CSewGGY,17981
|
|
277
|
-
datahub/ingestion/source/dbt/dbt_common.py,sha256=
|
|
278
|
+
datahub/ingestion/source/dbt/dbt_common.py,sha256=y4VINaQQ-WhEf-rICGLGi1U88nKmRdVQPmh88OJROWg,80536
|
|
278
279
|
datahub/ingestion/source/dbt/dbt_core.py,sha256=m6cA9vVd4Nh2arc-T2_xeQoxvreRbMhTDIJuYsx3wHc,22722
|
|
279
280
|
datahub/ingestion/source/dbt/dbt_tests.py,sha256=Q5KISW_AOOWqyxmyOgJQquyX7xlfOqKu9WhrHoLKC0M,9881
|
|
280
281
|
datahub/ingestion/source/delta_lake/__init__.py,sha256=u5oqUeus81ONAtdl6o9Puw33ODSMun-0wLIamrZ4BUM,71
|
|
@@ -403,7 +404,7 @@ datahub/ingestion/source/s3/config.py,sha256=Zs1nrBZKLImteZreIcSMMRLj8vBGgxakNDs
|
|
|
403
404
|
datahub/ingestion/source/s3/datalake_profiler_config.py,sha256=FfrcgK-JEF94vw-l3q6pN6FENXb-wZzW2w1VUZVkwW8,3620
|
|
404
405
|
datahub/ingestion/source/s3/profiling.py,sha256=yKNCKpr6w7qpCH-baeSkNE9VjkN6eBot_weD-2_Jxzk,17579
|
|
405
406
|
datahub/ingestion/source/s3/report.py,sha256=fzkTdTewYlWrTk4f2Cyl-e8RV4qw9wEVtm0cdKD-Xgw,542
|
|
406
|
-
datahub/ingestion/source/s3/source.py,sha256=
|
|
407
|
+
datahub/ingestion/source/s3/source.py,sha256=IE_K_HE_S7w8fpGPT8OptU5-VmwapntsI5PePv_wUQA,47412
|
|
407
408
|
datahub/ingestion/source/sac/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
408
409
|
datahub/ingestion/source/sac/sac.py,sha256=zPSO9ukuyhvNaaVzeAYpA-_sFma_XMcCQMPaGvDWuTk,30226
|
|
409
410
|
datahub/ingestion/source/sac/sac_common.py,sha256=-xQTDBtgH56AnpRXWGDnlmQqUuLRx-7wF1U1kQFWtX8,998
|
|
@@ -452,7 +453,7 @@ datahub/ingestion/source/sql/cockroachdb.py,sha256=XaD7eae34plU9ISRC6PzYX9q6RdT2
|
|
|
452
453
|
datahub/ingestion/source/sql/druid.py,sha256=lhO9CCOlHV-6LjBuAxAxtB9I1pvPtsGSdr63bz6_ilA,2837
|
|
453
454
|
datahub/ingestion/source/sql/hana.py,sha256=0PIvcX0Rz59NyR7Ag5Bv1MBV_UbJwxl9UAopo_xe_CA,1342
|
|
454
455
|
datahub/ingestion/source/sql/hive.py,sha256=NRUrEWnR1JN5U0q4CHlRacdKzxJhS4unFXnXYZT7vZE,30306
|
|
455
|
-
datahub/ingestion/source/sql/hive_metastore.py,sha256=
|
|
456
|
+
datahub/ingestion/source/sql/hive_metastore.py,sha256=65DI0PeJMpGOEhTfo6cygeybgaFqi93yGnLLRy58ATo,36117
|
|
456
457
|
datahub/ingestion/source/sql/mariadb.py,sha256=Hm102kmfs_1rd4lsTYhzVMZq5S3B6cyfvpHSzJjqvMw,737
|
|
457
458
|
datahub/ingestion/source/sql/mysql.py,sha256=nDWK4YbqomcJgnit9b8geUGrp_3eix4bt0_k94o7g-0,3350
|
|
458
459
|
datahub/ingestion/source/sql/oracle.py,sha256=tVP3AiZO97psM8O8UzBb9C7__s8y4fkyQbXBv3m1LU4,24503
|
|
@@ -467,7 +468,7 @@ datahub/ingestion/source/sql/sql_types.py,sha256=uuU3taVe4oCTXkqg1wSMGzTwVleRyUR
|
|
|
467
468
|
datahub/ingestion/source/sql/sql_utils.py,sha256=q-Bsk6WxlsRtrw9RXBxvqI3zuaMTC_F25T2VrCziR9I,8418
|
|
468
469
|
datahub/ingestion/source/sql/sqlalchemy_data_reader.py,sha256=FvHZ4JEK3aR2DYOBZiT_ZsAy12RjTu4t_KIR_92B11k,2644
|
|
469
470
|
datahub/ingestion/source/sql/sqlalchemy_uri_mapper.py,sha256=KOpbmDIE2h1hyYEsbVHJi2B7FlsyUMTXZx4diyzltQg,1826
|
|
470
|
-
datahub/ingestion/source/sql/teradata.py,sha256=
|
|
471
|
+
datahub/ingestion/source/sql/teradata.py,sha256=5lTNMOOOmrG71fTAyTs7iYFroeTiGIdATwXQmH6sWJg,32741
|
|
471
472
|
datahub/ingestion/source/sql/trino.py,sha256=FEn_BQ3pm23hKx94ek5kk5IXGNYcBqZEhllRJFUzfU8,17895
|
|
472
473
|
datahub/ingestion/source/sql/two_tier_sql_source.py,sha256=YDrGBb5WKVls6qv17QU5foKrf71SydzEltc3WsVAhQc,5732
|
|
473
474
|
datahub/ingestion/source/sql/vertica.py,sha256=_9OgSgIgqBml0av063rb8nACiT3SAmzpw0ouyF91wv8,33382
|
|
@@ -491,7 +492,7 @@ datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider
|
|
|
491
492
|
datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py,sha256=DziD57PbHn2Tcy51tYXCG-GQgyTGMUxnkuzVS_xihFY,4079
|
|
492
493
|
datahub/ingestion/source/state_provider/state_provider_registry.py,sha256=SVq4mIyGNmLXE9OZx1taOiNPqDoQp03-Ot9rYnB5F3k,401
|
|
493
494
|
datahub/ingestion/source/tableau/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
494
|
-
datahub/ingestion/source/tableau/tableau.py,sha256=
|
|
495
|
+
datahub/ingestion/source/tableau/tableau.py,sha256=YjWyzYZ7hGeMxlqAUQNNJ9LJXlHrc5fHqf7lBWkr1aE,153184
|
|
495
496
|
datahub/ingestion/source/tableau/tableau_common.py,sha256=3AUgXxTGOKM609xvcDrRItGXhUfuNYku2LFaj8z2Hg4,26936
|
|
496
497
|
datahub/ingestion/source/tableau/tableau_constant.py,sha256=ZcAeHsQUXVVL26ORly0ByZk_GJAFbxaKuJAlX_sYMac,2686
|
|
497
498
|
datahub/ingestion/source/tableau/tableau_server_wrapper.py,sha256=nSyx9RzC6TCQDm-cTVJ657qT8iDwzk_8JMKpohhmOc4,1046
|
|
@@ -566,12 +567,12 @@ datahub/lite/lite_registry.py,sha256=bpH0kasP-LtwwUFNA2QsOIehfekAYfJtN-AkQLmSWnw
|
|
|
566
567
|
datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1949
|
|
567
568
|
datahub/lite/lite_util.py,sha256=pgBpT3vTO1YCQ2njZRNyicSkHYeEmQCt41BaXU8WvMo,4503
|
|
568
569
|
datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
|
|
569
|
-
datahub/metadata/_schema_classes.py,sha256=
|
|
570
|
-
datahub/metadata/schema.avsc,sha256=
|
|
570
|
+
datahub/metadata/_schema_classes.py,sha256=GMLN7Ov0m39EWaXlziVgINqxhihZDzNy2BztBIR9YM8,975061
|
|
571
|
+
datahub/metadata/schema.avsc,sha256=sAPtgHSNJ1a126Vz7OjVIMKFjrrIG9f4cvRH6SkJ0jc,640786
|
|
571
572
|
datahub/metadata/schema_classes.py,sha256=X5Jl5EaSxyHdXOQv14pJ5WkQALun4MRpJ4q12wVFE18,1299
|
|
572
573
|
datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
|
|
573
574
|
datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
|
|
574
|
-
datahub/metadata/_urns/urn_defs.py,sha256=
|
|
575
|
+
datahub/metadata/_urns/urn_defs.py,sha256=MdAOrpRL4CL5VKSBk1I_DTVYz2_rYuIQ9tAEmAdIK4I,109984
|
|
575
576
|
datahub/metadata/com/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
|
|
576
577
|
datahub/metadata/com/linkedin/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
|
|
577
578
|
datahub/metadata/com/linkedin/events/__init__.py,sha256=s_dR0plZF-rOxxIbE8ojekJqwiHzl2WYR-Z3kW6kKS0,298
|
|
@@ -885,7 +886,7 @@ datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn
|
|
|
885
886
|
datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
|
|
886
887
|
datahub/sql_parsing/schema_resolver.py,sha256=8dYz6pC3Y35pXBn41grOE2dKkSiSeLHOz-N138uWQg4,10796
|
|
887
888
|
datahub/sql_parsing/split_statements.py,sha256=uZhAXLaRxDfmK0lPBW2oM_YVdJfSMhdgndnfd9iIXuA,5001
|
|
888
|
-
datahub/sql_parsing/sql_parsing_aggregator.py,sha256=
|
|
889
|
+
datahub/sql_parsing/sql_parsing_aggregator.py,sha256=XNZWjeaRhzaT92mzsJZGJfYaxJENsyp5dSHTmL81RIc,70130
|
|
889
890
|
datahub/sql_parsing/sql_parsing_common.py,sha256=h_V_m54hJ9EUh5kczq7cYOIeNeo4bgf0Px0H-Nq-UIg,2602
|
|
890
891
|
datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
|
|
891
892
|
datahub/sql_parsing/sqlglot_lineage.py,sha256=42n8yCmCt25bOR8fmq4n_nNubn5kLuw_Mx36SFC9Nj0,47460
|
|
@@ -918,6 +919,7 @@ datahub/utilities/delta.py,sha256=hkpF8W7Lvg2gUJBQR3mmIzOxsRQ6i5cchRPFlAVoV10,11
|
|
|
918
919
|
datahub/utilities/docs_build.py,sha256=uFMK3z1d4BExpsrvguHunidbEDAzQ8hoOP7iQ0A_IVw,211
|
|
919
920
|
datahub/utilities/file_backed_collections.py,sha256=B3gQS0isgbCM9cH3DEBzpA4PVixtSwr5vJoNGmEG-fg,21960
|
|
920
921
|
datahub/utilities/global_warning_util.py,sha256=adrEl3WhetQ-bymrPINjd976ZFndhbvk3QosUYGsos8,261
|
|
922
|
+
datahub/utilities/groupby.py,sha256=pe6rP4ZCttYB98yjbs0Aey8C32aLb7rq-NJ_BFky0H4,524
|
|
921
923
|
datahub/utilities/hive_schema_to_avro.py,sha256=1MP0a6FFVEYxLg_4lKF7hPxbHJJy0uRQYkML5zRwV3Q,11622
|
|
922
924
|
datahub/utilities/is_pytest.py,sha256=2m9T4S9IIKhI5RfTqrB2ZmumzHocdxBHpM1HroWj2XQ,138
|
|
923
925
|
datahub/utilities/logging_manager.py,sha256=bc-x5VZGvFUHT0HD-TF3Uz_nzw3dpKdJSbz6kjpAqAQ,10073
|
|
@@ -990,8 +992,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
|
|
|
990
992
|
datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
|
|
991
993
|
datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
|
|
992
994
|
datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
|
|
993
|
-
acryl_datahub-0.15.0.
|
|
994
|
-
acryl_datahub-0.15.0.
|
|
995
|
-
acryl_datahub-0.15.0.
|
|
996
|
-
acryl_datahub-0.15.0.
|
|
997
|
-
acryl_datahub-0.15.0.
|
|
995
|
+
acryl_datahub-0.15.0.4rc1.dist-info/METADATA,sha256=78vUNhiirHpceObDU05VhkgjPwGVXn0TOFHunHTcnV0,173250
|
|
996
|
+
acryl_datahub-0.15.0.4rc1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
997
|
+
acryl_datahub-0.15.0.4rc1.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
|
|
998
|
+
acryl_datahub-0.15.0.4rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
|
|
999
|
+
acryl_datahub-0.15.0.4rc1.dist-info/RECORD,,
|
datahub/__init__.py
CHANGED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import List
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from datahub.ingestion.graph.client import get_default_graph
|
|
7
|
+
from datahub.metadata.schema_classes import (
|
|
8
|
+
GlossaryTermAssociationClass,
|
|
9
|
+
OwnerClass,
|
|
10
|
+
OwnershipTypeClass,
|
|
11
|
+
TagAssociationClass,
|
|
12
|
+
)
|
|
13
|
+
from datahub.specific.dataset import DatasetPatchBuilder
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@click.group()
|
|
19
|
+
def container() -> None:
|
|
20
|
+
"""A group of commands to interact with containers in DataHub."""
|
|
21
|
+
pass
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def apply_association_to_container(
|
|
25
|
+
container_urn: str,
|
|
26
|
+
association_urn: str,
|
|
27
|
+
association_type: str,
|
|
28
|
+
) -> None:
|
|
29
|
+
"""
|
|
30
|
+
Common function to add either tags, terms, or owners to child datasets (for now).
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
container_urn: The URN of the container
|
|
34
|
+
association_urn: The URN of the tag, term, or user to apply
|
|
35
|
+
association_type: One of 'tag', 'term', or 'owner'
|
|
36
|
+
"""
|
|
37
|
+
urns: List[str] = []
|
|
38
|
+
graph = get_default_graph()
|
|
39
|
+
logger.info(f"Using {graph}")
|
|
40
|
+
urns.extend(
|
|
41
|
+
graph.get_urns_by_filter(
|
|
42
|
+
container=container_urn, batch_size=1000, entity_types=["dataset"]
|
|
43
|
+
)
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
for urn in urns:
|
|
47
|
+
logger.info(f"Adding {association_type} {association_urn} to {urn}")
|
|
48
|
+
builder = DatasetPatchBuilder(urn)
|
|
49
|
+
|
|
50
|
+
if association_type == "tag":
|
|
51
|
+
patches = builder.add_tag(TagAssociationClass(association_urn)).build()
|
|
52
|
+
elif association_type == "term":
|
|
53
|
+
patches = builder.add_term(
|
|
54
|
+
GlossaryTermAssociationClass(association_urn)
|
|
55
|
+
).build()
|
|
56
|
+
elif association_type == "owner":
|
|
57
|
+
patches = builder.add_owner(
|
|
58
|
+
OwnerClass(
|
|
59
|
+
owner=association_urn,
|
|
60
|
+
type=OwnershipTypeClass.TECHNICAL_OWNER,
|
|
61
|
+
)
|
|
62
|
+
).build()
|
|
63
|
+
|
|
64
|
+
for mcp in patches:
|
|
65
|
+
graph.emit(mcp)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@container.command()
|
|
69
|
+
@click.option("--container-urn", required=True, type=str)
|
|
70
|
+
@click.option("--tag-urn", required=True, type=str)
|
|
71
|
+
def tag(container_urn: str, tag_urn: str) -> None:
|
|
72
|
+
"""Add patch to add a tag to all datasets in a container"""
|
|
73
|
+
apply_association_to_container(container_urn, tag_urn, "tag")
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@container.command()
|
|
77
|
+
@click.option("--container-urn", required=True, type=str)
|
|
78
|
+
@click.option("--term-urn", required=True, type=str)
|
|
79
|
+
def term(container_urn: str, term_urn: str) -> None:
|
|
80
|
+
"""Add patch to add a term to all datasets in a container"""
|
|
81
|
+
apply_association_to_container(container_urn, term_urn, "term")
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@container.command()
|
|
85
|
+
@click.option("--container-urn", required=True, type=str)
|
|
86
|
+
@click.option("--owner-id", required=True, type=str)
|
|
87
|
+
def owner(container_urn: str, owner_id: str) -> None:
|
|
88
|
+
"""Add patch to add a owner to all datasets in a container"""
|
|
89
|
+
apply_association_to_container(container_urn, owner_id, "owner")
|
datahub/entrypoints.py
CHANGED
|
@@ -14,6 +14,7 @@ from datahub.cli.cli_utils import (
|
|
|
14
14
|
make_shim_command,
|
|
15
15
|
)
|
|
16
16
|
from datahub.cli.config_utils import DATAHUB_CONFIG_PATH, write_gms_config
|
|
17
|
+
from datahub.cli.container_cli import container
|
|
17
18
|
from datahub.cli.delete_cli import delete
|
|
18
19
|
from datahub.cli.docker_cli import docker
|
|
19
20
|
from datahub.cli.env_utils import get_boolean_env_variable
|
|
@@ -180,6 +181,7 @@ datahub.add_command(properties)
|
|
|
180
181
|
datahub.add_command(forms)
|
|
181
182
|
datahub.add_command(datacontract)
|
|
182
183
|
datahub.add_command(assertions)
|
|
184
|
+
datahub.add_command(container)
|
|
183
185
|
|
|
184
186
|
try:
|
|
185
187
|
from datahub.cli.lite_cli import lite
|
|
@@ -1,11 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import os
|
|
3
|
-
from
|
|
4
|
-
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional
|
|
5
|
-
|
|
6
|
-
if TYPE_CHECKING:
|
|
7
|
-
from mypy_boto3_s3.service_resource import ObjectSummary
|
|
8
|
-
|
|
3
|
+
from typing import Optional
|
|
9
4
|
|
|
10
5
|
S3_PREFIXES = ["s3://", "s3n://", "s3a://"]
|
|
11
6
|
|
|
@@ -73,21 +68,3 @@ def get_key_prefix(s3_uri: str) -> str:
|
|
|
73
68
|
f"Not an S3 URI. Must start with one of the following prefixes: {str(S3_PREFIXES)}"
|
|
74
69
|
)
|
|
75
70
|
return strip_s3_prefix(s3_uri).split("/", maxsplit=1)[1]
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
def group_s3_objects_by_dirname(
|
|
79
|
-
s3_objects: Iterable["ObjectSummary"],
|
|
80
|
-
) -> Dict[str, List["ObjectSummary"]]:
|
|
81
|
-
"""
|
|
82
|
-
Groups S3 objects by their directory name.
|
|
83
|
-
|
|
84
|
-
If a s3_object in the root directory (i.e., s3://bucket/file.txt), it is grouped under '/'.
|
|
85
|
-
"""
|
|
86
|
-
grouped_s3_objs = defaultdict(list)
|
|
87
|
-
for obj in s3_objects:
|
|
88
|
-
if "/" in obj.key:
|
|
89
|
-
dirname = obj.key.rsplit("/", 1)[0]
|
|
90
|
-
else:
|
|
91
|
-
dirname = "/"
|
|
92
|
-
grouped_s3_objs[dirname].append(obj)
|
|
93
|
-
return grouped_s3_objs
|
|
@@ -2,7 +2,6 @@ import logging
|
|
|
2
2
|
import re
|
|
3
3
|
from base64 import b32decode
|
|
4
4
|
from collections import defaultdict
|
|
5
|
-
from itertools import groupby
|
|
6
5
|
from typing import Dict, Iterable, List, Optional, Set, Type, Union, cast
|
|
7
6
|
|
|
8
7
|
from google.cloud.bigquery.table import TableListItem
|
|
@@ -101,6 +100,7 @@ from datahub.metadata.schema_classes import (
|
|
|
101
100
|
from datahub.metadata.urns import TagUrn
|
|
102
101
|
from datahub.sql_parsing.schema_resolver import SchemaResolver
|
|
103
102
|
from datahub.utilities.file_backed_collections import FileBackedDict
|
|
103
|
+
from datahub.utilities.groupby import groupby_unsorted
|
|
104
104
|
from datahub.utilities.hive_schema_to_avro import (
|
|
105
105
|
HiveColumnToAvroConverter,
|
|
106
106
|
get_schema_fields_for_hive_column,
|
|
@@ -730,7 +730,7 @@ class BigQuerySchemaGenerator:
|
|
|
730
730
|
foreign_keys: List[BigqueryTableConstraint] = list(
|
|
731
731
|
filter(lambda x: x.type == "FOREIGN KEY", table.constraints)
|
|
732
732
|
)
|
|
733
|
-
for key, group in
|
|
733
|
+
for key, group in groupby_unsorted(
|
|
734
734
|
foreign_keys,
|
|
735
735
|
lambda x: f"{x.referenced_project_id}.{x.referenced_dataset}.{x.referenced_table_name}",
|
|
736
736
|
):
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import itertools
|
|
2
1
|
import logging
|
|
3
2
|
import re
|
|
4
3
|
from abc import abstractmethod
|
|
@@ -111,6 +110,7 @@ from datahub.sql_parsing.sqlglot_utils import (
|
|
|
111
110
|
parse_statements_and_pick,
|
|
112
111
|
try_format_query,
|
|
113
112
|
)
|
|
113
|
+
from datahub.utilities.groupby import groupby_unsorted
|
|
114
114
|
from datahub.utilities.lossy_collections import LossyList
|
|
115
115
|
from datahub.utilities.mapping import Constants, OperationProcessor
|
|
116
116
|
from datahub.utilities.time import datetime_to_ts_millis
|
|
@@ -1929,7 +1929,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
|
|
1929
1929
|
else None
|
|
1930
1930
|
),
|
|
1931
1931
|
)
|
|
1932
|
-
for downstream, upstreams in
|
|
1932
|
+
for downstream, upstreams in groupby_unsorted(
|
|
1933
1933
|
node.upstream_cll, lambda x: x.downstream_col
|
|
1934
1934
|
)
|
|
1935
1935
|
]
|
|
@@ -40,7 +40,6 @@ from datahub.ingestion.source.aws.s3_util import (
|
|
|
40
40
|
get_bucket_name,
|
|
41
41
|
get_bucket_relative_path,
|
|
42
42
|
get_key_prefix,
|
|
43
|
-
group_s3_objects_by_dirname,
|
|
44
43
|
strip_s3_prefix,
|
|
45
44
|
)
|
|
46
45
|
from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
|
|
@@ -73,6 +72,7 @@ from datahub.metadata.schema_classes import (
|
|
|
73
72
|
_Aspect,
|
|
74
73
|
)
|
|
75
74
|
from datahub.telemetry import stats, telemetry
|
|
75
|
+
from datahub.utilities.groupby import groupby_unsorted
|
|
76
76
|
from datahub.utilities.perf_timer import PerfTimer
|
|
77
77
|
|
|
78
78
|
if TYPE_CHECKING:
|
|
@@ -868,7 +868,11 @@ class S3Source(StatefulIngestionSourceBase):
|
|
|
868
868
|
"""
|
|
869
869
|
partitions: List[Folder] = []
|
|
870
870
|
s3_objects = bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
|
|
871
|
-
|
|
871
|
+
grouped_s3_objects_by_dirname = groupby_unsorted(
|
|
872
|
+
s3_objects,
|
|
873
|
+
key=lambda obj: obj.key.rsplit("/", 1)[0],
|
|
874
|
+
)
|
|
875
|
+
for key, group in grouped_s3_objects_by_dirname:
|
|
872
876
|
file_size = 0
|
|
873
877
|
creation_time = None
|
|
874
878
|
modification_time = None
|
|
@@ -2,7 +2,6 @@ import base64
|
|
|
2
2
|
import json
|
|
3
3
|
import logging
|
|
4
4
|
from collections import namedtuple
|
|
5
|
-
from itertools import groupby
|
|
6
5
|
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
|
|
7
6
|
|
|
8
7
|
from pydantic.dataclasses import dataclass
|
|
@@ -58,6 +57,7 @@ from datahub.metadata.schema_classes import (
|
|
|
58
57
|
SubTypesClass,
|
|
59
58
|
ViewPropertiesClass,
|
|
60
59
|
)
|
|
60
|
+
from datahub.utilities.groupby import groupby_unsorted
|
|
61
61
|
from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
|
|
62
62
|
from datahub.utilities.str_enum import StrEnum
|
|
63
63
|
|
|
@@ -490,7 +490,7 @@ class HiveMetastoreSource(SQLAlchemySource):
|
|
|
490
490
|
|
|
491
491
|
iter_res = self._alchemy_client.execute_query(statement)
|
|
492
492
|
|
|
493
|
-
for key, group in
|
|
493
|
+
for key, group in groupby_unsorted(iter_res, self._get_table_key):
|
|
494
494
|
schema_name = (
|
|
495
495
|
f"{db_name}.{key.schema}"
|
|
496
496
|
if self.config.include_catalog_name_in_ids
|
|
@@ -647,7 +647,7 @@ class HiveMetastoreSource(SQLAlchemySource):
|
|
|
647
647
|
)
|
|
648
648
|
|
|
649
649
|
iter_res = self._alchemy_client.execute_query(statement)
|
|
650
|
-
for key, group in
|
|
650
|
+
for key, group in groupby_unsorted(iter_res, self._get_table_key):
|
|
651
651
|
db_name = self.get_db_name(inspector)
|
|
652
652
|
|
|
653
653
|
schema_name = (
|
|
@@ -3,7 +3,6 @@ from collections import defaultdict
|
|
|
3
3
|
from dataclasses import dataclass
|
|
4
4
|
from datetime import datetime
|
|
5
5
|
from functools import lru_cache
|
|
6
|
-
from itertools import groupby
|
|
7
6
|
from typing import (
|
|
8
7
|
Any,
|
|
9
8
|
Dict,
|
|
@@ -59,6 +58,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
|
|
|
59
58
|
from datahub.metadata.schema_classes import SchemaMetadataClass
|
|
60
59
|
from datahub.sql_parsing.schema_resolver import SchemaResolver
|
|
61
60
|
from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage
|
|
61
|
+
from datahub.utilities.groupby import groupby_unsorted
|
|
62
62
|
|
|
63
63
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
64
64
|
|
|
@@ -286,7 +286,7 @@ def optimized_get_foreign_keys(self, connection, table_name, schema=None, **kw):
|
|
|
286
286
|
|
|
287
287
|
# TODO: Check if there's a better way
|
|
288
288
|
fk_dicts = list()
|
|
289
|
-
for constraint_info, constraint_cols in
|
|
289
|
+
for constraint_info, constraint_cols in groupby_unsorted(res, grouper):
|
|
290
290
|
fk_dict = {
|
|
291
291
|
"name": str(constraint_info["name"]),
|
|
292
292
|
"constrained_columns": list(),
|
|
@@ -1147,23 +1147,36 @@ class TableauSiteSource:
|
|
|
1147
1147
|
)
|
|
1148
1148
|
# Set parent project name
|
|
1149
1149
|
for _project_id, project in all_project_map.items():
|
|
1150
|
-
if
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1150
|
+
if project.parent_id is None:
|
|
1151
|
+
continue
|
|
1152
|
+
|
|
1153
|
+
if project.parent_id in all_project_map:
|
|
1154
1154
|
project.parent_name = all_project_map[project.parent_id].name
|
|
1155
|
+
else:
|
|
1156
|
+
self.report.warning(
|
|
1157
|
+
title="Incomplete project hierarchy",
|
|
1158
|
+
message="Project details missing. Child projects will be ingested without reference to their parent project. We generally need Site Administrator Explorer permissions to extract the complete project hierarchy.",
|
|
1159
|
+
context=f"Missing {project.parent_id}, referenced by {project.id} {project.project_name}",
|
|
1160
|
+
)
|
|
1161
|
+
project.parent_id = None
|
|
1162
|
+
|
|
1163
|
+
# Post-condition
|
|
1164
|
+
assert all(
|
|
1165
|
+
[
|
|
1166
|
+
((project.parent_id is None) == (project.parent_name is None))
|
|
1167
|
+
and (
|
|
1168
|
+
project.parent_id is None
|
|
1169
|
+
or project.parent_id in all_project_map
|
|
1170
|
+
)
|
|
1171
|
+
for project in all_project_map.values()
|
|
1172
|
+
]
|
|
1173
|
+
), "Parent project id and name should be consistent"
|
|
1155
1174
|
|
|
1156
1175
|
def set_project_path():
|
|
1157
1176
|
def form_path(project_id: str) -> List[str]:
|
|
1158
1177
|
cur_proj = all_project_map[project_id]
|
|
1159
1178
|
ancestors = [cur_proj.name]
|
|
1160
1179
|
while cur_proj.parent_id is not None:
|
|
1161
|
-
if cur_proj.parent_id not in all_project_map:
|
|
1162
|
-
self.report.warning(
|
|
1163
|
-
"project-issue",
|
|
1164
|
-
f"Parent project {cur_proj.parent_id} not found. We need Site Administrator Explorer permissions.",
|
|
1165
|
-
)
|
|
1166
|
-
break
|
|
1167
1180
|
cur_proj = all_project_map[cur_proj.parent_id]
|
|
1168
1181
|
ancestors = [cur_proj.name, *ancestors]
|
|
1169
1182
|
return ancestors
|