acryl-datahub 1.3.0.1rc5__py3-none-any.whl → 1.3.0.1rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/METADATA +2439 -2439
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/RECORD +23 -20
- datahub/_version.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +17 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +16 -0
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +41 -4
- datahub/ingestion/source/snowflake/snowflake_config.py +16 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +46 -6
- datahub/ingestion/source/snowflake/snowflake_v2.py +14 -1
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +30 -2
- datahub/metadata/_internal_schema_classes.py +223 -0
- datahub/metadata/_urns/urn_defs.py +56 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +206 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +228 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +18 -4
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
acryl_datahub-1.3.0.1rc5.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
|
|
1
|
+
acryl_datahub-1.3.0.1rc6.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
|
|
2
2
|
datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
|
|
3
3
|
datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
|
|
4
|
-
datahub/_version.py,sha256=
|
|
4
|
+
datahub/_version.py,sha256=PW9A4Uazfqf_qZ54rH_cG6i8GhrmicechIGtJYipA8Q,323
|
|
5
5
|
datahub/entrypoints.py,sha256=VcbU6Z47b_JKW1zI-WJMYIngm05FSogKLiuvFNtyNcI,9088
|
|
6
6
|
datahub/errors.py,sha256=p5rFAdAGVCk4Lqolol1YvthceadUSwpaCxLXRcyCCFQ,676
|
|
7
7
|
datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -260,10 +260,10 @@ datahub/ingestion/source/azure/abs_folder_utils.py,sha256=7skXus-4fSIoKpqCeU-GG0
|
|
|
260
260
|
datahub/ingestion/source/azure/abs_utils.py,sha256=KdAlCK-PMrn35kFHxz5vrsjajyx2PD5GRgoBKdoRvcg,2075
|
|
261
261
|
datahub/ingestion/source/azure/azure_common.py,sha256=DvPrLpjQSJ1USB_myGmg8lGkRW-WAl2GIZMcEkBFjOs,4063
|
|
262
262
|
datahub/ingestion/source/bigquery_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
263
|
-
datahub/ingestion/source/bigquery_v2/bigquery.py,sha256=
|
|
263
|
+
datahub/ingestion/source/bigquery_v2/bigquery.py,sha256=Z5QsyvBNDWEf9kME_zBRw2oLIh3rD5zafpvuYB0p4ow,15972
|
|
264
264
|
datahub/ingestion/source/bigquery_v2/bigquery_audit.py,sha256=kEwWhq3ch6WT4q4hcX8-fvQh28KgrNfspFwIytO3vQA,25103
|
|
265
265
|
datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py,sha256=LuGJ6LgPViLIfDQfylxlQ3CA7fZYM5MDt8M-7sfzm84,5096
|
|
266
|
-
datahub/ingestion/source/bigquery_v2/bigquery_config.py,sha256=
|
|
266
|
+
datahub/ingestion/source/bigquery_v2/bigquery_config.py,sha256=A5pLaTm4WCTndhGmGBGrjc05LHtC7C5-FrE3vEMc1ik,23880
|
|
267
267
|
datahub/ingestion/source/bigquery_v2/bigquery_connection.py,sha256=6XFCc0oxxU3R4IPyYHaf3YMETlMD4ztkNpkf4kf1Elw,3171
|
|
268
268
|
datahub/ingestion/source/bigquery_v2/bigquery_data_reader.py,sha256=DeT3v_Z82__8En0FcZ0kavBAWQoRvSZ5Rppm9eeDAb8,2393
|
|
269
269
|
datahub/ingestion/source/bigquery_v2/bigquery_helper.py,sha256=QER3gY8e_k1_eNVj7cBso7ZzrWl_vO5PYSa6CpvqNx8,1554
|
|
@@ -277,7 +277,7 @@ datahub/ingestion/source/bigquery_v2/common.py,sha256=IinOy-RO4UZGxSf5scaN02672B
|
|
|
277
277
|
datahub/ingestion/source/bigquery_v2/lineage.py,sha256=jju14mJbAUMA_K3j2yq-TdZV202cjd5rBAsDPJGEVno,44900
|
|
278
278
|
datahub/ingestion/source/bigquery_v2/profiler.py,sha256=oLf5jMjJf-ShNny9Dll2tCsOoPMF1DxAh7e7etpeLq4,10821
|
|
279
279
|
datahub/ingestion/source/bigquery_v2/queries.py,sha256=gDvvgajptmNn5AiBglmDhGAC9LBh8fzw56_d8ewLbxA,20222
|
|
280
|
-
datahub/ingestion/source/bigquery_v2/queries_extractor.py,sha256=
|
|
280
|
+
datahub/ingestion/source/bigquery_v2/queries_extractor.py,sha256=bSYusyf-xnhs_1WURsQ2YmMxRn3J5HCp_UKChsxbWIw,21015
|
|
281
281
|
datahub/ingestion/source/bigquery_v2/usage.py,sha256=A9c-ofclaRk0NSnc4IRaqJYqMPv6ecCld_TPy3V2qFs,40748
|
|
282
282
|
datahub/ingestion/source/cassandra/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
283
283
|
datahub/ingestion/source/cassandra/cassandra.py,sha256=pNy61Z4kTqL_wGcWIYee5fnZiuJDseDcRcQwsxeAssk,14487
|
|
@@ -496,12 +496,12 @@ datahub/ingestion/source/snowflake/constants.py,sha256=iDTamMozHwLYyglpRfqwTbxPx
|
|
|
496
496
|
datahub/ingestion/source/snowflake/oauth_config.py,sha256=ol9D3RmruGStJAeL8PYSQguSqcD2HfkjPkMF2AB_eZs,1277
|
|
497
497
|
datahub/ingestion/source/snowflake/oauth_generator.py,sha256=fu2VnREGuJXeTqIV2jx4TwieVnznf83HQkrE0h2DGGM,3423
|
|
498
498
|
datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81xxdeizJn9nJCZ_nMIXgk9N6pEk5o,4803
|
|
499
|
-
datahub/ingestion/source/snowflake/snowflake_config.py,sha256=
|
|
499
|
+
datahub/ingestion/source/snowflake/snowflake_config.py,sha256=HoDzaG3TlP1ui5qY2PZUcu83wOjvJ96Z9fFYC4GmCko,24439
|
|
500
500
|
datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=uSHdPqigRzjeNxtn0_m5i57X7X8LBZIpHzDcWIoovyA,19005
|
|
501
501
|
datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
|
|
502
502
|
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=nam-bYV6wL9LfR-Tt50Qe_Kea61IuWS-lLu5__aDxk8,21853
|
|
503
503
|
datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=PmQi-qDlRhdJ-PsJ7x-EScIiswWRAxDDOKHydvN3mTY,7404
|
|
504
|
-
datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=
|
|
504
|
+
datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=AiLEbCs6zXbIoejFdLsbSupkvqHj5BOtPz9lubGKLv8,47142
|
|
505
505
|
datahub/ingestion/source/snowflake/snowflake_query.py,sha256=wLDaYZrWJ0794KKn69rB_QF0_8Bzu5l_7L6mD77KVc4,40469
|
|
506
506
|
datahub/ingestion/source/snowflake/snowflake_report.py,sha256=fA6C-p9wM-jyTsXE_suTbCtrE_lle-5LI52S7wFYf00,6701
|
|
507
507
|
datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=a6avRJXbj2qwnu28oK1YotmJo68zEG-1S7vonsUUJy4,41473
|
|
@@ -511,7 +511,7 @@ datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=5Li4H8KuS4qBKR98L
|
|
|
511
511
|
datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=eA9xh-G1Ydr1OwUUtrbXUWp26hE1jF0zvyKNky_i_nQ,8887
|
|
512
512
|
datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=mM0v9b4PHRJAT-SdRids3wdzc5O96gWCCww3e42itV8,24982
|
|
513
513
|
datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=1c1YNmAxxOwAKy8IEFqVdp6x-EvCYJkN6UZ_RwUUVv0,15062
|
|
514
|
-
datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=
|
|
514
|
+
datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=Tpx_d8UuO4wD9sIL9nMDR4GtCvVc6KbF82nlBdFRtEI,36408
|
|
515
515
|
datahub/ingestion/source/snowflake/stored_proc_lineage.py,sha256=rOb78iHiWiK8v8WdVs1xDwVut4Y0OHmszej6IopQfCo,5341
|
|
516
516
|
datahub/ingestion/source/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
517
517
|
datahub/ingestion/source/sql/athena.py,sha256=ZvWGuAPRUeUkE-7N6B3RyCkQp7JZKnLVXTnnR200gls,31532
|
|
@@ -552,10 +552,10 @@ datahub/ingestion/source/state/checkpoint.py,sha256=ob8wtC0zOgTdc_-cVAI3MUKcQaN4
|
|
|
552
552
|
datahub/ingestion/source/state/entity_removal_state.py,sha256=dIG1HnueaRTsAu8kYjYiPFPpkZ4WwbraDF6oFDsts2E,6720
|
|
553
553
|
datahub/ingestion/source/state/profiling_state.py,sha256=lsWu7oZhB9nSlqoklvjs-LjS4XF0p6BxSAcLY-xKRzM,512
|
|
554
554
|
datahub/ingestion/source/state/profiling_state_handler.py,sha256=jDMiIrAq8k4GrYoh9Ymh0ZAmzejYFk8E1W7-kuw6lXg,4295
|
|
555
|
-
datahub/ingestion/source/state/redundant_run_skip_handler.py,sha256=
|
|
555
|
+
datahub/ingestion/source/state/redundant_run_skip_handler.py,sha256=UENQDZ1Hacd8Hg3jC6okomG9V4EMLfL-Zz60aU6jzyc,10260
|
|
556
556
|
datahub/ingestion/source/state/sql_common_state.py,sha256=OtJpJfMTBSgyR37dn3w-nnZwlc0nFNb2GoUzIWhnyAc,143
|
|
557
557
|
datahub/ingestion/source/state/stale_entity_removal_handler.py,sha256=Lr2HYGx_b2FQ8A36s7s11tl-4-mGIM13bfy5JbQ3LtM,14890
|
|
558
|
-
datahub/ingestion/source/state/stateful_ingestion_base.py,sha256=
|
|
558
|
+
datahub/ingestion/source/state/stateful_ingestion_base.py,sha256=j78BN_uSBpOJRi19kosZGPsgolKm9i-40MmSzDGeaFs,18837
|
|
559
559
|
datahub/ingestion/source/state/usage_common_state.py,sha256=TJyb0CpwibsduJYI854EFdtrwWnz7JC-IkzKUXVGDx0,983
|
|
560
560
|
datahub/ingestion/source/state/use_case_handler.py,sha256=3g8ddTvGXHe0dCiyTkyFeNmR8a3bhwywtIt8EpK5oQs,1271
|
|
561
561
|
datahub/ingestion/source/state_provider/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -646,12 +646,12 @@ datahub/lite/lite_registry.py,sha256=bpH0kasP-LtwwUFNA2QsOIehfekAYfJtN-AkQLmSWnw
|
|
|
646
646
|
datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1949
|
|
647
647
|
datahub/lite/lite_util.py,sha256=G0LQHKkyEb1pc_q183g6hflShclGx7kikgMaOxtVVcs,4545
|
|
648
648
|
datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
|
|
649
|
-
datahub/metadata/_internal_schema_classes.py,sha256=
|
|
650
|
-
datahub/metadata/schema.avsc,sha256=
|
|
649
|
+
datahub/metadata/_internal_schema_classes.py,sha256=O8MG_weYzOYtGIZsOr7c6EO33NDo9liE8D8kyotoq8Q,1084754
|
|
650
|
+
datahub/metadata/schema.avsc,sha256=YMMQSOELeCC4ZptYnoEfMFHRL-DguCR9KmjZ6NYWMlQ,780967
|
|
651
651
|
datahub/metadata/schema_classes.py,sha256=tPT8iHCak4IsZi_oL0nirbPpI8ETTPTZzapqLRpeKU4,1326
|
|
652
652
|
datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
|
|
653
653
|
datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
|
|
654
|
-
datahub/metadata/_urns/urn_defs.py,sha256=
|
|
654
|
+
datahub/metadata/_urns/urn_defs.py,sha256=GRvs_XhEQ8Xc6abC7B7FwTIMb18R3VeXrjV1_sRi_DE,145429
|
|
655
655
|
datahub/metadata/com/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
|
|
656
656
|
datahub/metadata/com/linkedin/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
|
|
657
657
|
datahub/metadata/com/linkedin/events/__init__.py,sha256=s_dR0plZF-rOxxIbE8ojekJqwiHzl2WYR-Z3kW6kKS0,298
|
|
@@ -688,6 +688,7 @@ datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.
|
|
|
688
688
|
datahub/metadata/com/linkedin/pegasus2avro/events/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
|
|
689
689
|
datahub/metadata/com/linkedin/pegasus2avro/events/metadata/__init__.py,sha256=a1FI_2VZ9Ejc9AIVztO-B5kLPR6VwlOgdFlv4PTCTYs,282
|
|
690
690
|
datahub/metadata/com/linkedin/pegasus2avro/execution/__init__.py,sha256=O5XAXnGzDnWv8nbqRHxLPPXUbrIu_pn76WUK_hhkHmg,775
|
|
691
|
+
datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py,sha256=wS-GU8YjGghMQcQ42BYv7Rh_-8h-yPiS_QMzJI34yJM,507
|
|
691
692
|
datahub/metadata/com/linkedin/pegasus2avro/form/__init__.py,sha256=rGDmWiKm6qpXiipZ5veCHqBJGSAryAqnSzRPlwcmLnA,845
|
|
692
693
|
datahub/metadata/com/linkedin/pegasus2avro/glossary/__init__.py,sha256=fa1QNv08O3TqXqZ14bkJerGho_t-8DPHFdcWKiXkkUA,501
|
|
693
694
|
datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py,sha256=EGxkzJgQMASL_aUmgjHE3bo8qRTSbAbM_8gUccZblX0,1603
|
|
@@ -695,7 +696,7 @@ datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py,sha256=LfB7ytT1u
|
|
|
695
696
|
datahub/metadata/com/linkedin/pegasus2avro/ingestion/__init__.py,sha256=1bfG2naq4iS_pwU4J-BVer_gfL0hDbJbnH0gh1MPNgA,871
|
|
696
697
|
datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py,sha256=7SHiR-KzV1CkAimFy94SkcY0Xg0RlsIlLTUTGmGAW_U,290
|
|
697
698
|
datahub/metadata/com/linkedin/pegasus2avro/metadata/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
|
|
698
|
-
datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py,sha256=
|
|
699
|
+
datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py,sha256=U7nFcTKOeiqyZVxD53p2vcKBj6tupuCJH9I3yz6hFDg,5241
|
|
699
700
|
datahub/metadata/com/linkedin/pegasus2avro/metadata/query/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
|
|
700
701
|
datahub/metadata/com/linkedin/pegasus2avro/metadata/query/filter/__init__.py,sha256=DBP_QtxkFmC5q_kuk4dGjb4uOKbB4xKgqTWXGxmNbBQ,532
|
|
701
702
|
datahub/metadata/com/linkedin/pegasus2avro/metadata/snapshot/__init__.py,sha256=OPboF8SV11wGnjvWQB-rxtB0otMdCsE7Tcy7xkOUgz8,2358
|
|
@@ -778,6 +779,8 @@ datahub/metadata/schemas/DataHubAccessTokenKey.avsc,sha256=3EspNIxgb_I4WwV0a2o4N
|
|
|
778
779
|
datahub/metadata/schemas/DataHubActionKey.avsc,sha256=bjiKcoyvUPQKaGUi2ICBMJ_ukwnt7dh0szJS4WBZE0A,448
|
|
779
780
|
datahub/metadata/schemas/DataHubConnectionDetails.avsc,sha256=IvZj6OA7HRvy-ZIIn0UbXdJNnyt_oTn16XIe5ZlcqGk,1661
|
|
780
781
|
datahub/metadata/schemas/DataHubConnectionKey.avsc,sha256=VwbamVFoEdp6epz1lJm_UShBl6ksBxoA7jAYuPI5u3M,522
|
|
782
|
+
datahub/metadata/schemas/DataHubFileInfo.avsc,sha256=Vt1t5L9QyaIkSh3GUkrQs4I81o2MYAJJu194h1M97tw,6314
|
|
783
|
+
datahub/metadata/schemas/DataHubFileKey.avsc,sha256=-qxMFdpRPSHpA88adIFIJ35PZrN5qXOEo8RK-xTzkSQ,422
|
|
781
784
|
datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc,sha256=4wac7sluRIq-0ZjODE5SmuVKuQeW8ajLJNRpqEBRyio,4601
|
|
782
785
|
datahub/metadata/schemas/DataHubIngestionSourceKey.avsc,sha256=TGmm9WEGTaABs7kt5Uc-N-kbc5Sd-2sQwx-JpfAptvw,545
|
|
783
786
|
datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc,sha256=q6ZyMoxInwmrkrXkUgMe-i-WZzAxbjcvJ-EI99SnEp8,599
|
|
@@ -1017,7 +1020,7 @@ datahub/sql_parsing/fingerprint_utils.py,sha256=3hGiexaQXnE7eZLxo-t7hlTyVQz7womb
|
|
|
1017
1020
|
datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
|
|
1018
1021
|
datahub/sql_parsing/schema_resolver.py,sha256=ISuingLcQnOJZkNXBkc73uPwYUbbOtERAjgGhJajDiQ,10782
|
|
1019
1022
|
datahub/sql_parsing/split_statements.py,sha256=doCACwQ_Fx6m1djo7t3BnU9ZHki4EV2KJUQkFMGv7lg,10101
|
|
1020
|
-
datahub/sql_parsing/sql_parsing_aggregator.py,sha256=
|
|
1023
|
+
datahub/sql_parsing/sql_parsing_aggregator.py,sha256=chRTe5eZfxsOd2vP_IrO_j3KhFUGLe2sPt0WXQ6yt5M,73320
|
|
1021
1024
|
datahub/sql_parsing/sql_parsing_common.py,sha256=cZ4WvVyHZuXDGjnBvKMX2_fz2EMextB5WQWcK0_saBo,3155
|
|
1022
1025
|
datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
|
|
1023
1026
|
datahub/sql_parsing/sqlglot_lineage.py,sha256=l4LZMiaeTARjJG76Uun_yNtFHdSj3yi8zO1XvAQtxl0,66944
|
|
@@ -1128,8 +1131,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
|
|
|
1128
1131
|
datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
|
|
1129
1132
|
datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
|
|
1130
1133
|
datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
|
|
1131
|
-
acryl_datahub-1.3.0.
|
|
1132
|
-
acryl_datahub-1.3.0.
|
|
1133
|
-
acryl_datahub-1.3.0.
|
|
1134
|
-
acryl_datahub-1.3.0.
|
|
1135
|
-
acryl_datahub-1.3.0.
|
|
1134
|
+
acryl_datahub-1.3.0.1rc6.dist-info/METADATA,sha256=JfkdfNc1P5L7jaGrlIUM-DqN9KcYjFuKfxwfg-IvTQI,184688
|
|
1135
|
+
acryl_datahub-1.3.0.1rc6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
1136
|
+
acryl_datahub-1.3.0.1rc6.dist-info/entry_points.txt,sha256=pzsBoTx-D-iTcmpX8oCGCyzlHP2112EygUMzZWz56M8,10105
|
|
1137
|
+
acryl_datahub-1.3.0.1rc6.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
|
|
1138
|
+
acryl_datahub-1.3.0.1rc6.dist-info/RECORD,,
|
datahub/_version.py
CHANGED
|
@@ -49,6 +49,7 @@ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
|
|
|
49
49
|
from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
|
|
50
50
|
from datahub.ingestion.source.state.redundant_run_skip_handler import (
|
|
51
51
|
RedundantLineageRunSkipHandler,
|
|
52
|
+
RedundantQueriesRunSkipHandler,
|
|
52
53
|
RedundantUsageRunSkipHandler,
|
|
53
54
|
)
|
|
54
55
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
@@ -145,7 +146,10 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
|
|
|
145
146
|
redundant_lineage_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = (
|
|
146
147
|
None
|
|
147
148
|
)
|
|
148
|
-
if self.config.enable_stateful_lineage_ingestion:
|
|
149
|
+
if (
|
|
150
|
+
self.config.enable_stateful_lineage_ingestion
|
|
151
|
+
and not self.config.use_queries_v2
|
|
152
|
+
):
|
|
149
153
|
redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler(
|
|
150
154
|
source=self,
|
|
151
155
|
config=self.config,
|
|
@@ -296,6 +300,17 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
|
|
|
296
300
|
):
|
|
297
301
|
return
|
|
298
302
|
|
|
303
|
+
redundant_queries_run_skip_handler: Optional[
|
|
304
|
+
RedundantQueriesRunSkipHandler
|
|
305
|
+
] = None
|
|
306
|
+
if self.config.enable_stateful_time_window:
|
|
307
|
+
redundant_queries_run_skip_handler = RedundantQueriesRunSkipHandler(
|
|
308
|
+
source=self,
|
|
309
|
+
config=self.config,
|
|
310
|
+
pipeline_name=self.ctx.pipeline_name,
|
|
311
|
+
run_id=self.ctx.run_id,
|
|
312
|
+
)
|
|
313
|
+
|
|
299
314
|
with (
|
|
300
315
|
self.report.new_stage(f"*: {QUERIES_EXTRACTION}"),
|
|
301
316
|
BigQueryQueriesExtractor(
|
|
@@ -315,6 +330,7 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
|
|
|
315
330
|
structured_report=self.report,
|
|
316
331
|
filters=self.filters,
|
|
317
332
|
identifiers=self.identifiers,
|
|
333
|
+
redundant_run_skip_handler=redundant_queries_run_skip_handler,
|
|
318
334
|
schema_resolver=self.sql_parser_schema_resolver,
|
|
319
335
|
discovered_tables=self.bq_schema_extractor.table_refs,
|
|
320
336
|
) as queries_extractor,
|
|
@@ -25,6 +25,7 @@ from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterCo
|
|
|
25
25
|
from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
26
26
|
StatefulLineageConfigMixin,
|
|
27
27
|
StatefulProfilingConfigMixin,
|
|
28
|
+
StatefulTimeWindowConfigMixin,
|
|
28
29
|
StatefulUsageConfigMixin,
|
|
29
30
|
)
|
|
30
31
|
from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
|
|
@@ -271,6 +272,7 @@ class BigQueryV2Config(
|
|
|
271
272
|
SQLCommonConfig,
|
|
272
273
|
StatefulUsageConfigMixin,
|
|
273
274
|
StatefulLineageConfigMixin,
|
|
275
|
+
StatefulTimeWindowConfigMixin,
|
|
274
276
|
StatefulProfilingConfigMixin,
|
|
275
277
|
ClassificationSourceConfigMixin,
|
|
276
278
|
):
|
|
@@ -527,6 +529,20 @@ class BigQueryV2Config(
|
|
|
527
529
|
|
|
528
530
|
return v
|
|
529
531
|
|
|
532
|
+
@root_validator(pre=False, skip_on_failure=True)
|
|
533
|
+
def validate_queries_v2_stateful_ingestion(cls, values: Dict) -> Dict:
|
|
534
|
+
if values.get("use_queries_v2"):
|
|
535
|
+
if values.get("enable_stateful_lineage_ingestion") or values.get(
|
|
536
|
+
"enable_stateful_usage_ingestion"
|
|
537
|
+
):
|
|
538
|
+
logger.warning(
|
|
539
|
+
"enable_stateful_lineage_ingestion and enable_stateful_usage_ingestion are deprecated "
|
|
540
|
+
"when using use_queries_v2=True. These configs only work with the legacy (non-queries v2) extraction path. "
|
|
541
|
+
"For queries v2, use enable_stateful_time_window instead to enable stateful ingestion "
|
|
542
|
+
"for the unified time window extraction (lineage + usage + operations + queries)."
|
|
543
|
+
)
|
|
544
|
+
return values
|
|
545
|
+
|
|
530
546
|
def get_table_pattern(self, pattern: List[str]) -> str:
|
|
531
547
|
return "|".join(pattern) if pattern else ""
|
|
532
548
|
|
|
@@ -36,6 +36,9 @@ from datahub.ingestion.source.bigquery_v2.common import (
|
|
|
36
36
|
BigQueryFilter,
|
|
37
37
|
BigQueryIdentifierBuilder,
|
|
38
38
|
)
|
|
39
|
+
from datahub.ingestion.source.state.redundant_run_skip_handler import (
|
|
40
|
+
RedundantQueriesRunSkipHandler,
|
|
41
|
+
)
|
|
39
42
|
from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
|
|
40
43
|
from datahub.metadata.urns import CorpUserUrn
|
|
41
44
|
from datahub.sql_parsing.schema_resolver import SchemaResolver
|
|
@@ -135,6 +138,7 @@ class BigQueryQueriesExtractor(Closeable):
|
|
|
135
138
|
structured_report: SourceReport,
|
|
136
139
|
filters: BigQueryFilter,
|
|
137
140
|
identifiers: BigQueryIdentifierBuilder,
|
|
141
|
+
redundant_run_skip_handler: Optional[RedundantQueriesRunSkipHandler] = None,
|
|
138
142
|
graph: Optional[DataHubGraph] = None,
|
|
139
143
|
schema_resolver: Optional[SchemaResolver] = None,
|
|
140
144
|
discovered_tables: Optional[Collection[str]] = None,
|
|
@@ -158,6 +162,9 @@ class BigQueryQueriesExtractor(Closeable):
|
|
|
158
162
|
)
|
|
159
163
|
|
|
160
164
|
self.structured_report = structured_report
|
|
165
|
+
self.redundant_run_skip_handler = redundant_run_skip_handler
|
|
166
|
+
|
|
167
|
+
self.start_time, self.end_time = self._get_time_window()
|
|
161
168
|
|
|
162
169
|
self.aggregator = SqlParsingAggregator(
|
|
163
170
|
platform=self.identifiers.platform,
|
|
@@ -172,8 +179,8 @@ class BigQueryQueriesExtractor(Closeable):
|
|
|
172
179
|
generate_query_usage_statistics=self.config.include_query_usage_statistics,
|
|
173
180
|
usage_config=BaseUsageConfig(
|
|
174
181
|
bucket_duration=self.config.window.bucket_duration,
|
|
175
|
-
start_time=self.config.window.start_time,
|
|
176
|
-
end_time=self.config.window.end_time,
|
|
182
|
+
start_time=self.start_time,
|
|
183
|
+
end_time=self.end_time,
|
|
177
184
|
user_email_pattern=self.config.user_email_pattern,
|
|
178
185
|
top_n_queries=self.config.top_n_queries,
|
|
179
186
|
),
|
|
@@ -199,6 +206,34 @@ class BigQueryQueriesExtractor(Closeable):
|
|
|
199
206
|
logger.info(f"Using local temp path: {path}")
|
|
200
207
|
return path
|
|
201
208
|
|
|
209
|
+
def _get_time_window(self) -> tuple[datetime, datetime]:
|
|
210
|
+
if self.redundant_run_skip_handler:
|
|
211
|
+
start_time, end_time = (
|
|
212
|
+
self.redundant_run_skip_handler.suggest_run_time_window(
|
|
213
|
+
self.config.window.start_time,
|
|
214
|
+
self.config.window.end_time,
|
|
215
|
+
)
|
|
216
|
+
)
|
|
217
|
+
else:
|
|
218
|
+
start_time = self.config.window.start_time
|
|
219
|
+
end_time = self.config.window.end_time
|
|
220
|
+
|
|
221
|
+
# Usage statistics are aggregated per bucket (typically per day).
|
|
222
|
+
# To ensure accurate aggregated metrics, we need to align the start_time
|
|
223
|
+
# to the beginning of a bucket so that we include complete bucket periods.
|
|
224
|
+
if self.config.include_usage_statistics:
|
|
225
|
+
start_time = get_time_bucket(start_time, self.config.window.bucket_duration)
|
|
226
|
+
|
|
227
|
+
return start_time, end_time
|
|
228
|
+
|
|
229
|
+
def _update_state(self) -> None:
|
|
230
|
+
if self.redundant_run_skip_handler:
|
|
231
|
+
self.redundant_run_skip_handler.update_state(
|
|
232
|
+
self.config.window.start_time,
|
|
233
|
+
self.config.window.end_time,
|
|
234
|
+
self.config.window.bucket_duration,
|
|
235
|
+
)
|
|
236
|
+
|
|
202
237
|
def is_temp_table(self, name: str) -> bool:
|
|
203
238
|
try:
|
|
204
239
|
table = BigqueryTableIdentifier.from_string_name(name)
|
|
@@ -299,6 +334,8 @@ class BigQueryQueriesExtractor(Closeable):
|
|
|
299
334
|
shared_connection.close()
|
|
300
335
|
audit_log_file.unlink(missing_ok=True)
|
|
301
336
|
|
|
337
|
+
self._update_state()
|
|
338
|
+
|
|
302
339
|
def deduplicate_queries(
|
|
303
340
|
self, queries: FileBackedList[ObservedQuery]
|
|
304
341
|
) -> FileBackedDict[Dict[int, ObservedQuery]]:
|
|
@@ -355,8 +392,8 @@ class BigQueryQueriesExtractor(Closeable):
|
|
|
355
392
|
query_log_query = _build_enriched_query_log_query(
|
|
356
393
|
project_id=project.id,
|
|
357
394
|
region=region,
|
|
358
|
-
start_time=self.config.window.start_time,
|
|
359
|
-
end_time=self.config.window.end_time,
|
|
395
|
+
start_time=self.start_time,
|
|
396
|
+
end_time=self.end_time,
|
|
360
397
|
)
|
|
361
398
|
|
|
362
399
|
logger.info(f"Fetching query log from BQ Project {project.id} for {region}")
|
|
@@ -31,6 +31,7 @@ from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterCo
|
|
|
31
31
|
from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
32
32
|
StatefulLineageConfigMixin,
|
|
33
33
|
StatefulProfilingConfigMixin,
|
|
34
|
+
StatefulTimeWindowConfigMixin,
|
|
34
35
|
StatefulUsageConfigMixin,
|
|
35
36
|
)
|
|
36
37
|
from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
|
|
@@ -199,6 +200,7 @@ class SnowflakeV2Config(
|
|
|
199
200
|
SnowflakeUsageConfig,
|
|
200
201
|
StatefulLineageConfigMixin,
|
|
201
202
|
StatefulUsageConfigMixin,
|
|
203
|
+
StatefulTimeWindowConfigMixin,
|
|
202
204
|
StatefulProfilingConfigMixin,
|
|
203
205
|
ClassificationSourceConfigMixin,
|
|
204
206
|
IncrementalPropertiesConfigMixin,
|
|
@@ -477,6 +479,20 @@ class SnowflakeV2Config(
|
|
|
477
479
|
|
|
478
480
|
return shares
|
|
479
481
|
|
|
482
|
+
@root_validator(pre=False, skip_on_failure=True)
|
|
483
|
+
def validate_queries_v2_stateful_ingestion(cls, values: Dict) -> Dict:
|
|
484
|
+
if values.get("use_queries_v2"):
|
|
485
|
+
if values.get("enable_stateful_lineage_ingestion") or values.get(
|
|
486
|
+
"enable_stateful_usage_ingestion"
|
|
487
|
+
):
|
|
488
|
+
logger.warning(
|
|
489
|
+
"enable_stateful_lineage_ingestion and enable_stateful_usage_ingestion are deprecated "
|
|
490
|
+
"when using use_queries_v2=True. These configs only work with the legacy (non-queries v2) extraction path. "
|
|
491
|
+
"For queries v2, use enable_stateful_time_window instead to enable stateful ingestion "
|
|
492
|
+
"for the unified time window extraction (lineage + usage + operations + queries)."
|
|
493
|
+
)
|
|
494
|
+
return values
|
|
495
|
+
|
|
480
496
|
def outbounds(self) -> Dict[str, Set[DatabaseId]]:
|
|
481
497
|
"""
|
|
482
498
|
Returns mapping of
|
|
@@ -17,6 +17,7 @@ from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFr
|
|
|
17
17
|
from datahub.configuration.time_window_config import (
|
|
18
18
|
BaseTimeWindowConfig,
|
|
19
19
|
BucketDuration,
|
|
20
|
+
get_time_bucket,
|
|
20
21
|
)
|
|
21
22
|
from datahub.ingestion.api.closeable import Closeable
|
|
22
23
|
from datahub.ingestion.api.common import PipelineContext
|
|
@@ -50,6 +51,9 @@ from datahub.ingestion.source.snowflake.stored_proc_lineage import (
|
|
|
50
51
|
StoredProcLineageReport,
|
|
51
52
|
StoredProcLineageTracker,
|
|
52
53
|
)
|
|
54
|
+
from datahub.ingestion.source.state.redundant_run_skip_handler import (
|
|
55
|
+
RedundantQueriesRunSkipHandler,
|
|
56
|
+
)
|
|
53
57
|
from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
|
|
54
58
|
from datahub.metadata.urns import CorpUserUrn
|
|
55
59
|
from datahub.sql_parsing.schema_resolver import SchemaResolver
|
|
@@ -180,6 +184,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
180
184
|
structured_report: SourceReport,
|
|
181
185
|
filters: SnowflakeFilter,
|
|
182
186
|
identifiers: SnowflakeIdentifierBuilder,
|
|
187
|
+
redundant_run_skip_handler: Optional[RedundantQueriesRunSkipHandler] = None,
|
|
183
188
|
graph: Optional[DataHubGraph] = None,
|
|
184
189
|
schema_resolver: Optional[SchemaResolver] = None,
|
|
185
190
|
discovered_tables: Optional[List[str]] = None,
|
|
@@ -191,9 +196,13 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
191
196
|
self.filters = filters
|
|
192
197
|
self.identifiers = identifiers
|
|
193
198
|
self.discovered_tables = set(discovered_tables) if discovered_tables else None
|
|
199
|
+
self.redundant_run_skip_handler = redundant_run_skip_handler
|
|
194
200
|
|
|
195
201
|
self._structured_report = structured_report
|
|
196
202
|
|
|
203
|
+
# Adjust time window based on stateful ingestion state
|
|
204
|
+
self.start_time, self.end_time = self._get_time_window()
|
|
205
|
+
|
|
197
206
|
# The exit stack helps ensure that we close all the resources we open.
|
|
198
207
|
self._exit_stack = contextlib.ExitStack()
|
|
199
208
|
|
|
@@ -211,8 +220,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
211
220
|
generate_query_usage_statistics=self.config.include_query_usage_statistics,
|
|
212
221
|
usage_config=BaseUsageConfig(
|
|
213
222
|
bucket_duration=self.config.window.bucket_duration,
|
|
214
|
-
start_time=self.config.window.start_time,
|
|
215
|
-
end_time=self.config.window.end_time,
|
|
223
|
+
start_time=self.start_time,
|
|
224
|
+
end_time=self.end_time,
|
|
216
225
|
user_email_pattern=self.config.user_email_pattern,
|
|
217
226
|
# TODO make the rest of the fields configurable
|
|
218
227
|
),
|
|
@@ -228,6 +237,34 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
228
237
|
def structured_reporter(self) -> SourceReport:
|
|
229
238
|
return self._structured_report
|
|
230
239
|
|
|
240
|
+
def _get_time_window(self) -> tuple[datetime, datetime]:
|
|
241
|
+
if self.redundant_run_skip_handler:
|
|
242
|
+
start_time, end_time = (
|
|
243
|
+
self.redundant_run_skip_handler.suggest_run_time_window(
|
|
244
|
+
self.config.window.start_time,
|
|
245
|
+
self.config.window.end_time,
|
|
246
|
+
)
|
|
247
|
+
)
|
|
248
|
+
else:
|
|
249
|
+
start_time = self.config.window.start_time
|
|
250
|
+
end_time = self.config.window.end_time
|
|
251
|
+
|
|
252
|
+
# Usage statistics are aggregated per bucket (typically per day).
|
|
253
|
+
# To ensure accurate aggregated metrics, we need to align the start_time
|
|
254
|
+
# to the beginning of a bucket so that we include complete bucket periods.
|
|
255
|
+
if self.config.include_usage_statistics:
|
|
256
|
+
start_time = get_time_bucket(start_time, self.config.window.bucket_duration)
|
|
257
|
+
|
|
258
|
+
return start_time, end_time
|
|
259
|
+
|
|
260
|
+
def _update_state(self) -> None:
|
|
261
|
+
if self.redundant_run_skip_handler:
|
|
262
|
+
self.redundant_run_skip_handler.update_state(
|
|
263
|
+
self.config.window.start_time,
|
|
264
|
+
self.config.window.end_time,
|
|
265
|
+
self.config.window.bucket_duration,
|
|
266
|
+
)
|
|
267
|
+
|
|
231
268
|
@functools.cached_property
|
|
232
269
|
def local_temp_path(self) -> pathlib.Path:
|
|
233
270
|
if self.config.local_temp_path:
|
|
@@ -355,6 +392,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
355
392
|
with self.report.aggregator_generate_timer:
|
|
356
393
|
yield from auto_workunit(self.aggregator.gen_metadata())
|
|
357
394
|
|
|
395
|
+
# Update the stateful ingestion state after successful extraction
|
|
396
|
+
self._update_state()
|
|
397
|
+
|
|
358
398
|
def fetch_users(self) -> UsersMapping:
|
|
359
399
|
users: UsersMapping = dict()
|
|
360
400
|
with self.structured_reporter.report_exc("Error fetching users from Snowflake"):
|
|
@@ -378,8 +418,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
378
418
|
# Derived from _populate_external_lineage_from_copy_history.
|
|
379
419
|
|
|
380
420
|
query: str = SnowflakeQuery.copy_lineage_history(
|
|
381
|
-
start_time_millis=int(self.
|
|
382
|
-
end_time_millis=int(self.
|
|
421
|
+
start_time_millis=int(self.start_time.timestamp() * 1000),
|
|
422
|
+
end_time_millis=int(self.end_time.timestamp() * 1000),
|
|
383
423
|
downstreams_deny_pattern=self.config.temporary_tables_pattern,
|
|
384
424
|
)
|
|
385
425
|
|
|
@@ -414,8 +454,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
414
454
|
Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery, StoredProcCall]
|
|
415
455
|
]:
|
|
416
456
|
query_log_query = QueryLogQueryBuilder(
|
|
417
|
-
start_time=self.
|
|
418
|
-
end_time=self.
|
|
457
|
+
start_time=self.start_time,
|
|
458
|
+
end_time=self.end_time,
|
|
419
459
|
bucket_duration=self.config.window.bucket_duration,
|
|
420
460
|
deny_usernames=self.config.pushdown_deny_usernames,
|
|
421
461
|
allow_usernames=self.config.pushdown_allow_usernames,
|
|
@@ -73,6 +73,7 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
|
|
|
73
73
|
from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
|
|
74
74
|
from datahub.ingestion.source.state.redundant_run_skip_handler import (
|
|
75
75
|
RedundantLineageRunSkipHandler,
|
|
76
|
+
RedundantQueriesRunSkipHandler,
|
|
76
77
|
RedundantUsageRunSkipHandler,
|
|
77
78
|
)
|
|
78
79
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
@@ -207,7 +208,7 @@ class SnowflakeV2Source(
|
|
|
207
208
|
)
|
|
208
209
|
self.report.sql_aggregator = self.aggregator.report
|
|
209
210
|
|
|
210
|
-
if self.config.include_table_lineage:
|
|
211
|
+
if self.config.include_table_lineage and not self.config.use_queries_v2:
|
|
211
212
|
redundant_lineage_run_skip_handler: Optional[
|
|
212
213
|
RedundantLineageRunSkipHandler
|
|
213
214
|
] = None
|
|
@@ -589,6 +590,17 @@ class SnowflakeV2Source(
|
|
|
589
590
|
with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
|
|
590
591
|
schema_resolver = self.aggregator._schema_resolver
|
|
591
592
|
|
|
593
|
+
redundant_queries_run_skip_handler: Optional[
|
|
594
|
+
RedundantQueriesRunSkipHandler
|
|
595
|
+
] = None
|
|
596
|
+
if self.config.enable_stateful_time_window:
|
|
597
|
+
redundant_queries_run_skip_handler = RedundantQueriesRunSkipHandler(
|
|
598
|
+
source=self,
|
|
599
|
+
config=self.config,
|
|
600
|
+
pipeline_name=self.ctx.pipeline_name,
|
|
601
|
+
run_id=self.ctx.run_id,
|
|
602
|
+
)
|
|
603
|
+
|
|
592
604
|
queries_extractor = SnowflakeQueriesExtractor(
|
|
593
605
|
connection=self.connection,
|
|
594
606
|
# TODO: this should be its own section in main recipe
|
|
@@ -614,6 +626,7 @@ class SnowflakeV2Source(
|
|
|
614
626
|
structured_report=self.report,
|
|
615
627
|
filters=self.filters,
|
|
616
628
|
identifiers=self.identifiers,
|
|
629
|
+
redundant_run_skip_handler=redundant_queries_run_skip_handler,
|
|
617
630
|
schema_resolver=schema_resolver,
|
|
618
631
|
discovered_tables=self.discovered_datasets,
|
|
619
632
|
graph=self.ctx.graph,
|
|
@@ -244,3 +244,24 @@ class RedundantUsageRunSkipHandler(RedundantRunSkipHandler):
|
|
|
244
244
|
cur_state.begin_timestamp_millis = datetime_to_ts_millis(start_time)
|
|
245
245
|
cur_state.end_timestamp_millis = datetime_to_ts_millis(end_time)
|
|
246
246
|
cur_state.bucket_duration = bucket_duration
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
class RedundantQueriesRunSkipHandler(RedundantRunSkipHandler):
    """
    Stateful ingestion handler for the queries v2 extraction path.

    Tracks the time window of the audit log extraction, which covers
    lineage, usage, operations, and queries together.
    """

    def get_job_name_suffix(self):
        """Return the suffix used for this handler's job name."""
        return "_audit_window"

    def update_state(
        self, start_time: datetime, end_time: datetime, bucket_duration: BucketDuration
    ) -> None:
        """Write the given window into the current checkpoint's state.

        Does nothing when there is no current checkpoint.
        """
        checkpoint = self.get_current_checkpoint()
        if not checkpoint:
            return

        window_state = cast(BaseTimeWindowCheckpointState, checkpoint.state)
        window_state.begin_timestamp_millis = datetime_to_ts_millis(start_time)
        window_state.end_timestamp_millis = datetime_to_ts_millis(end_time)
        window_state.bucket_duration = bucket_duration
|
|
@@ -101,7 +101,9 @@ class StatefulLineageConfigMixin(ConfigModel):
|
|
|
101
101
|
default=True,
|
|
102
102
|
description="Enable stateful lineage ingestion."
|
|
103
103
|
" This will store lineage window timestamps after successful lineage ingestion. "
|
|
104
|
-
"and will not run lineage ingestion for same timestamps in subsequent run. "
|
|
104
|
+
"and will not run lineage ingestion for same timestamps in subsequent run. "
|
|
105
|
+
"NOTE: This only works with use_queries_v2=False (legacy extraction path). "
|
|
106
|
+
"For queries v2, use enable_stateful_time_window instead.",
|
|
105
107
|
)
|
|
106
108
|
|
|
107
109
|
_store_last_lineage_extraction_timestamp = pydantic_renamed_field(
|
|
@@ -150,7 +152,9 @@ class StatefulUsageConfigMixin(BaseTimeWindowConfig):
|
|
|
150
152
|
default=True,
|
|
151
153
|
description="Enable stateful lineage ingestion."
|
|
152
154
|
" This will store usage window timestamps after successful usage ingestion. "
|
|
153
|
-
"and will not run usage ingestion for same timestamps in subsequent run. "
|
|
155
|
+
"and will not run usage ingestion for same timestamps in subsequent run. "
|
|
156
|
+
"NOTE: This only works with use_queries_v2=False (legacy extraction path). "
|
|
157
|
+
"For queries v2, use enable_stateful_time_window instead.",
|
|
154
158
|
)
|
|
155
159
|
|
|
156
160
|
_store_last_usage_extraction_timestamp = pydantic_renamed_field(
|
|
@@ -169,6 +173,30 @@ class StatefulUsageConfigMixin(BaseTimeWindowConfig):
|
|
|
169
173
|
return values
|
|
170
174
|
|
|
171
175
|
|
|
176
|
+
class StatefulTimeWindowConfigMixin(BaseTimeWindowConfig):
    # Opt-in flag for stateful time window tracking on the queries v2 path;
    # off by default.
    enable_stateful_time_window: bool = Field(
        description="Enable stateful time window tracking."
        " This will store the time window after successful extraction "
        "and adjust the time window in subsequent runs to avoid reprocessing. "
        "NOTE: This is ONLY applicable when using queries v2 (use_queries_v2=True). "
        "This replaces enable_stateful_lineage_ingestion and enable_stateful_usage_ingestion "
        "for the queries v2 extraction path, since queries v2 extracts lineage, usage, operations, "
        "and queries together from a single audit log and uses a unified time window.",
        default=False,
    )

    @root_validator(skip_on_failure=True)
    def time_window_stateful_option_validator(cls, values: Dict) -> Dict:
        """Turn off ``enable_stateful_time_window`` when stateful ingestion is off.

        If ``stateful_ingestion`` is missing or not enabled while the flag is
        set, a warning is logged and the flag is reset to ``False``.
        """
        stateful_cfg = values.get("stateful_ingestion")
        if stateful_cfg and stateful_cfg.enabled:
            return values

        # NOTE(review): indentation was not recoverable in the reviewed view;
        # the reset is assumed to sit inside this check — confirm.
        if values.get("enable_stateful_time_window"):
            logger.warning(
                "Stateful ingestion is disabled, disabling enable_stateful_time_window config option as well"
            )
            values["enable_stateful_time_window"] = False
        return values
|
|
198
|
+
|
|
199
|
+
|
|
172
200
|
@dataclass
|
|
173
201
|
class StatefulIngestionReport(SourceReport):
|
|
174
202
|
pass
|