acryl-datahub 0.15.0.1rc16__py3-none-any.whl → 0.15.0.2rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2rc1.dist-info}/METADATA +2324 -2324
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2rc1.dist-info}/RECORD +25 -25
- datahub/__init__.py +1 -1
- datahub/cli/cli_utils.py +12 -1
- datahub/emitter/rest_emitter.py +140 -92
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +5 -3
- datahub/ingestion/api/source.py +4 -0
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/graph/client.py +14 -11
- datahub/ingestion/graph/config.py +1 -1
- datahub/ingestion/source/aws/glue.py +52 -35
- datahub/ingestion/source/bigquery_v2/bigquery.py +2 -0
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +8 -0
- datahub/ingestion/source/datahub/config.py +10 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +11 -7
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -0
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_v2.py +2 -0
- datahub/ingestion/source/unity/source.py +0 -4
- datahub/sql_parsing/sql_parsing_aggregator.py +8 -5
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2rc1.dist-info}/top_level.txt +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
datahub/__init__.py,sha256=
|
|
1
|
+
datahub/__init__.py,sha256=ubEB5EHYmuiGuDtVdhhKbNxGtfw-kzV0eBW81uVifQU,576
|
|
2
2
|
datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
|
|
3
3
|
datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
|
|
4
4
|
datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -59,7 +59,7 @@ datahub/api/graphql/base.py,sha256=9q637r6v-RGOd8Mk8HW2g0vt9zpqFexsQ5R6TPEHVbs,1
|
|
|
59
59
|
datahub/api/graphql/operation.py,sha256=h7OXbVRrpJgoth1X4cgeIFhD5JY1MGKg2KjVlQK1gqE,5116
|
|
60
60
|
datahub/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
61
61
|
datahub/cli/check_cli.py,sha256=9dXNyzZayHeoFjwFjLkMVyx6DiCZfeESyI-sYtGA6bE,12850
|
|
62
|
-
datahub/cli/cli_utils.py,sha256=
|
|
62
|
+
datahub/cli/cli_utils.py,sha256=d_Q9vPZTPxO7XyyghD-i1Nkr4DX0M8cs2IWrMUQAu0c,13539
|
|
63
63
|
datahub/cli/config_utils.py,sha256=yuXw7RzpRY5x_-MAoqWbv46qUkIeRNAJL4_OeJpYdBE,4879
|
|
64
64
|
datahub/cli/delete_cli.py,sha256=VLeHi7MLFCtTk7MI4y8r_k_7aLcCUZIglU2MNLsXU6M,23051
|
|
65
65
|
datahub/cli/docker_check.py,sha256=rED4wHXqxcQ_qNFyIgFEZ85BHT9ZTE5YC-oUKqbRqi0,9432
|
|
@@ -119,7 +119,7 @@ datahub/emitter/mcp.py,sha256=hAAYziDdkwjazQU0DtWMbQWY8wS09ACrKJbqxoWXdgc,9637
|
|
|
119
119
|
datahub/emitter/mcp_builder.py,sha256=eOcuz41c4a3oTkNk39yYl9bTxpksxqATPHLcqyhPGT0,9856
|
|
120
120
|
datahub/emitter/mcp_patch_builder.py,sha256=oonC8iGOvDzqj890CxOjWlBdDEF1RnwvbSZy1sivlTY,4572
|
|
121
121
|
datahub/emitter/request_helper.py,sha256=33ORG3S3OVy97_jlWBRn7yUM5XCIkRN6WSdJvN7Ofcg,670
|
|
122
|
-
datahub/emitter/rest_emitter.py,sha256=
|
|
122
|
+
datahub/emitter/rest_emitter.py,sha256=O9IJ7r-AXL4Pi892pEFOygvUKTbD8V6ey8KObuqHqgk,17876
|
|
123
123
|
datahub/emitter/serialization_helper.py,sha256=q12Avmf70Vy4ttQGMJoTKlE5EsybMKNg2w3MQeZiHvk,3652
|
|
124
124
|
datahub/emitter/sql_parsing_builder.py,sha256=Cr5imZrm3dYDSCACt5MFscgHCtVbHTD6IjUmsvsKoEs,11991
|
|
125
125
|
datahub/emitter/synchronized_file_emitter.py,sha256=s4ATuxalI4GDAkrZTaGSegxBdvvNPZ9jRSdtElU0kNs,1805
|
|
@@ -138,13 +138,13 @@ datahub/ingestion/api/registry.py,sha256=LGElUdzhNQoEr-k2SN23mJaIYnA1PYfF97LQxBm
|
|
|
138
138
|
datahub/ingestion/api/report.py,sha256=zb5Y_9ogmWm00KqX7_64sIMT24Wfpk7txRwEfKacw5I,4652
|
|
139
139
|
datahub/ingestion/api/report_helpers.py,sha256=WbUC1kQeaKqIagGV3XzfPmPs7slAT1mfNY4og2BH2A8,994
|
|
140
140
|
datahub/ingestion/api/sink.py,sha256=3jw7-x9gXGreOPwn49wG5fT3C8pYhaNMQITdMN6kbag,4478
|
|
141
|
-
datahub/ingestion/api/source.py,sha256=
|
|
141
|
+
datahub/ingestion/api/source.py,sha256=kSQ6AKDvLdFOIxaz9nPCmCSUsIMDdXHiOxzFiMdYN14,19001
|
|
142
142
|
datahub/ingestion/api/source_helpers.py,sha256=AVO0ogiCKgYmX1ubJaSs6L30TCCgOIalp6awXPF5XM0,19643
|
|
143
143
|
datahub/ingestion/api/transform.py,sha256=X0GpjMJzYkLuZx8MTWxH50cWGm9rGsnn3k188mmC8J8,582
|
|
144
144
|
datahub/ingestion/api/workunit.py,sha256=e8n8RfSjHZZm2R4ShNH0UuMtUkMjyqqM2j2t7oL74lo,6327
|
|
145
145
|
datahub/ingestion/api/auto_work_units/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
146
146
|
datahub/ingestion/api/auto_work_units/auto_dataset_properties_aspect.py,sha256=ID_6N3nWl2qohsSGizUCqo3d2MNyDeVbyWroQpSOSsc,5059
|
|
147
|
-
datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py,sha256=
|
|
147
|
+
datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py,sha256=5jrl7cEyonce-YdWe1Iw6y3Okw5smJosqwOm5e-nvqM,4363
|
|
148
148
|
datahub/ingestion/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
149
149
|
datahub/ingestion/extractor/extractor_registry.py,sha256=f7CLfW3pr29QZkXSHbp7HjUrsdw7ejQJmot-tiSPcqc,342
|
|
150
150
|
datahub/ingestion/extractor/json_ref_patch.py,sha256=4g3ZWHn7rwS74jUvSXJiGpi-UKHhiSYKKgBeU4E5ukE,1448
|
|
@@ -160,12 +160,12 @@ datahub/ingestion/fs/local_fs.py,sha256=oWf-PZsl5sI-9eHWGeKlfKYagbQaSZ9fGfNbxcFj
|
|
|
160
160
|
datahub/ingestion/fs/s3_fs.py,sha256=FM6UK9A48UdOjkAO-gh1rAa4N7FTXz0Wutmp8TeX7kY,3199
|
|
161
161
|
datahub/ingestion/glossary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
162
162
|
datahub/ingestion/glossary/classification_mixin.py,sha256=pkb0Rv2SQH7VwAV5DPLoJLJwkDwTjIhOhg4mbXiz9CI,13332
|
|
163
|
-
datahub/ingestion/glossary/classifier.py,sha256=
|
|
163
|
+
datahub/ingestion/glossary/classifier.py,sha256=zp8Fe3he80H5Zz1EwymKjThUPkTpw6PgEJQvlmqrJmQ,3006
|
|
164
164
|
datahub/ingestion/glossary/classifier_registry.py,sha256=yFOYLQhDgCLqXYMG3L1BquXafeLcZDcmp8meyw6k9ts,307
|
|
165
165
|
datahub/ingestion/glossary/datahub_classifier.py,sha256=8VhwuLDhyOqqOr0jqAPIgorb4eAOnvTr4m13Y2Wy1-E,7515
|
|
166
166
|
datahub/ingestion/graph/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
167
|
-
datahub/ingestion/graph/client.py,sha256=
|
|
168
|
-
datahub/ingestion/graph/config.py,sha256=
|
|
167
|
+
datahub/ingestion/graph/client.py,sha256=R50K7NmE3TYgVXvdLnvLZn7N0fkiCXOK0MoJz9ueglA,64963
|
|
168
|
+
datahub/ingestion/graph/config.py,sha256=_oha8Je7P80ZmrkZUAaRHyYbdMmTkMI5JkYjEP2Ri1Q,751
|
|
169
169
|
datahub/ingestion/graph/connections.py,sha256=9462L0ZWGKURyypAln25eMPhK3pcufBar9tNDoqspXs,741
|
|
170
170
|
datahub/ingestion/graph/filters.py,sha256=UeUZQHoimavIYx-jXLA0WGkOUe10TaO8uEZkfa-QgNE,6188
|
|
171
171
|
datahub/ingestion/reporting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -217,7 +217,7 @@ datahub/ingestion/source/abs/report.py,sha256=fzkTdTewYlWrTk4f2Cyl-e8RV4qw9wEVtm
|
|
|
217
217
|
datahub/ingestion/source/abs/source.py,sha256=pzxW-R_cWGKPneEhX8JWdTZiX2k1kAZOPKgMxp9mAEI,24533
|
|
218
218
|
datahub/ingestion/source/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
219
219
|
datahub/ingestion/source/aws/aws_common.py,sha256=DfdQgkJ_s2isFx8WvqKTlAcBk4KE8SgfpmA5BgC3fgY,17716
|
|
220
|
-
datahub/ingestion/source/aws/glue.py,sha256=
|
|
220
|
+
datahub/ingestion/source/aws/glue.py,sha256=lJW3QHHz1_SWqLEB-vUSTxSuL0EgUQ0ptdQns_NLNds,57343
|
|
221
221
|
datahub/ingestion/source/aws/s3_boto_utils.py,sha256=Wyp9k9tapsCuw9dyH4FCXJr_wmeLaYFoCtKvrV6SEDk,3892
|
|
222
222
|
datahub/ingestion/source/aws/s3_util.py,sha256=OFypcgmVC6jnZM90-gjcPpAMtTV1lbnreCaMhCzNlzs,2149
|
|
223
223
|
datahub/ingestion/source/aws/sagemaker.py,sha256=Bl2tkBYnrindgx61VHYgNovUF_Kp_fXNcivQn28vC2w,5254
|
|
@@ -233,10 +233,10 @@ datahub/ingestion/source/azure/abs_folder_utils.py,sha256=7skXus-4fSIoKpqCeU-GG0
|
|
|
233
233
|
datahub/ingestion/source/azure/abs_utils.py,sha256=KdAlCK-PMrn35kFHxz5vrsjajyx2PD5GRgoBKdoRvcg,2075
|
|
234
234
|
datahub/ingestion/source/azure/azure_common.py,sha256=Zl0pPuE6L3QcM5B1P0LsPthZmD0h7fUUS0kg2okl6IY,4053
|
|
235
235
|
datahub/ingestion/source/bigquery_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
236
|
-
datahub/ingestion/source/bigquery_v2/bigquery.py,sha256=
|
|
236
|
+
datahub/ingestion/source/bigquery_v2/bigquery.py,sha256=c7g8sWuDIMhCSAX0D76P2arxZgTmzd-e0qlO7yt_zJY,13841
|
|
237
237
|
datahub/ingestion/source/bigquery_v2/bigquery_audit.py,sha256=IlbHA8a-gNJvnubgBfxVHpUk8rFNIG80gk5HWXa2lyE,25108
|
|
238
238
|
datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py,sha256=LuGJ6LgPViLIfDQfylxlQ3CA7fZYM5MDt8M-7sfzm84,5096
|
|
239
|
-
datahub/ingestion/source/bigquery_v2/bigquery_config.py,sha256=
|
|
239
|
+
datahub/ingestion/source/bigquery_v2/bigquery_config.py,sha256=sjCW997-Su14cVgWd1ZVx1E67yqfTIV5Wjp9Me0hfOw,26289
|
|
240
240
|
datahub/ingestion/source/bigquery_v2/bigquery_data_reader.py,sha256=DeT3v_Z82__8En0FcZ0kavBAWQoRvSZ5Rppm9eeDAb8,2393
|
|
241
241
|
datahub/ingestion/source/bigquery_v2/bigquery_helper.py,sha256=QER3gY8e_k1_eNVj7cBso7ZzrWl_vO5PYSa6CpvqNx8,1554
|
|
242
242
|
datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py,sha256=8nuQ8hMuJEswWDZtV2RjbK8RvDJUzT_S74dnyPpGFdQ,4857
|
|
@@ -265,11 +265,11 @@ datahub/ingestion/source/data_lake_common/config.py,sha256=qUk83B01hjuBKHvVz8SmX
|
|
|
265
265
|
datahub/ingestion/source/data_lake_common/data_lake_utils.py,sha256=nxu7osuzqxScPFc-1ODA2M1c_xPNPpRH_SMMU7zKOIE,6212
|
|
266
266
|
datahub/ingestion/source/data_lake_common/path_spec.py,sha256=u3u2eMe70V5vur-j8mYtupZdoeA2hSeK262Whdsc2YU,23506
|
|
267
267
|
datahub/ingestion/source/datahub/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
268
|
-
datahub/ingestion/source/datahub/config.py,sha256=
|
|
268
|
+
datahub/ingestion/source/datahub/config.py,sha256=xBAZJpcw25aMI2zHi2wXi21sAfdy1rlmbBq9tY3adV0,4304
|
|
269
269
|
datahub/ingestion/source/datahub/datahub_api_reader.py,sha256=hlKADVEPoTFiRGKqRsMF5mL4fSu_IrIW8Nx7LpEzvkM,2134
|
|
270
|
-
datahub/ingestion/source/datahub/datahub_database_reader.py,sha256=
|
|
270
|
+
datahub/ingestion/source/datahub/datahub_database_reader.py,sha256=Rd61iHFhvrNmgzIk0jDDYxjxQUnEckbn1SKedoR5qic,8972
|
|
271
271
|
datahub/ingestion/source/datahub/datahub_kafka_reader.py,sha256=gnxhhlK-jrfnHqD_4eVmfcdtBNW6pi1N_qkDZ7uSb3o,4187
|
|
272
|
-
datahub/ingestion/source/datahub/datahub_source.py,sha256=
|
|
272
|
+
datahub/ingestion/source/datahub/datahub_source.py,sha256=5qGg_T0KJaO5WcvrsM0KM8_eTOjy0NvlMI4DUdIAiDo,8482
|
|
273
273
|
datahub/ingestion/source/datahub/report.py,sha256=VHBfCbwFRzdLdB7hQG9ST4EiZxl_vBCU0XxGcZR6Xxs,940
|
|
274
274
|
datahub/ingestion/source/datahub/state.py,sha256=PZoT7sSK1wadVf5vN6phrgr7I6LL7ePP-EJjP1OO0bQ,3507
|
|
275
275
|
datahub/ingestion/source/dbt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -305,7 +305,7 @@ datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMp
|
|
|
305
305
|
datahub/ingestion/source/gc/datahub_gc.py,sha256=W6uoeV7B4WIXdxT4tOEdDksdJm656WwwvkH79L7f_8Q,12969
|
|
306
306
|
datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=86Tm3NNWMf0xM4TklNIEeNOjEingKpYy-XvCPeaAb4k,17125
|
|
307
307
|
datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=9jsyCIspWSSYSAVPHjKHr05885rXxM6FCH7KzTBceic,10139
|
|
308
|
-
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=
|
|
308
|
+
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=zRtgC_AcZui4qGf9jBASI3R-CrYZxNe3Pm-gNSLT3rw,11420
|
|
309
309
|
datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
310
310
|
datahub/ingestion/source/gcs/gcs_source.py,sha256=iwvj4JwjyVWRP1Vq106sUtQhh0GuOYVSu9zCa1wCZN0,6189
|
|
311
311
|
datahub/ingestion/source/gcs/gcs_utils.py,sha256=_78KM863XXgkVLmZLtYGF5PJNnZas1go-XRtOq-79lo,1047
|
|
@@ -429,10 +429,10 @@ datahub/ingestion/source/snowflake/constants.py,sha256=22n-0r04nuy-ImxWFFpmbrt_G
|
|
|
429
429
|
datahub/ingestion/source/snowflake/oauth_config.py,sha256=ol9D3RmruGStJAeL8PYSQguSqcD2HfkjPkMF2AB_eZs,1277
|
|
430
430
|
datahub/ingestion/source/snowflake/oauth_generator.py,sha256=fu2VnREGuJXeTqIV2jx4TwieVnznf83HQkrE0h2DGGM,3423
|
|
431
431
|
datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81xxdeizJn9nJCZ_nMIXgk9N6pEk5o,4803
|
|
432
|
-
datahub/ingestion/source/snowflake/snowflake_config.py,sha256=
|
|
432
|
+
datahub/ingestion/source/snowflake/snowflake_config.py,sha256=UehWUvqTXRsWmE5bBS53IoLjUL06-wJq6K4O2MTT2R8,18374
|
|
433
433
|
datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJY5rqKNNodXxzg3SS5DF7oA4WXArOA,17793
|
|
434
434
|
datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
|
|
435
|
-
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=
|
|
435
|
+
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=FBmiONx4EGHWV8RNJT6zHZyntKinPFFyd2oKbTUIbhE,21319
|
|
436
436
|
datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
|
|
437
437
|
datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=jTpnFWRqqFId6DKJvvAbNuFPxyNi1oQxxDUyMvh1iu4,26968
|
|
438
438
|
datahub/ingestion/source/snowflake/snowflake_query.py,sha256=5po2FWz41UVowykJYbTFGxsltbmlHBCPcHG20VOhdOE,38469
|
|
@@ -444,7 +444,7 @@ datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=kTmuCtRnvHqM8WBYh
|
|
|
444
444
|
datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=fyfWmFVz2WZrpTJWNIe9m0WpDHgeFrGPf8diORJZUwo,6212
|
|
445
445
|
datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=0rXgz8bvRiI9SYVMa0UGLeg_DcjqBy6kQsdq0Uq0HVk,24685
|
|
446
446
|
datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=MoI8-DR9tuMuHMBQcpDo4GFjvcoQZWLNkdFZsTkgK-M,12786
|
|
447
|
-
datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=
|
|
447
|
+
datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=0doZaPPMO64Qi9uN4w8ZYe3gKkkieGJKI5xntF7vS6w,32020
|
|
448
448
|
datahub/ingestion/source/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
449
449
|
datahub/ingestion/source/sql/athena.py,sha256=G3cIY8H_76lIUAzQWW2kLnZOEsfbakmojxbiHb3dYZ8,24059
|
|
450
450
|
datahub/ingestion/source/sql/clickhouse.py,sha256=jzvaXP5Wr0SMhj2rtuvVE821xnfpKiXhO3cm0xblgHs,27299
|
|
@@ -506,7 +506,7 @@ datahub/ingestion/source/unity/proxy.py,sha256=_6kCI7M4-26pZ9ZMGJUh6LwYmbGAZlnvc
|
|
|
506
506
|
datahub/ingestion/source/unity/proxy_profiling.py,sha256=WLqvYP6MziaisA4LYL4T_GA-kPt6Xdde7bfaYsjYw40,9663
|
|
507
507
|
datahub/ingestion/source/unity/proxy_types.py,sha256=qrvHiwPzl5cPX-KRvcIGGeJVdr0I8XUQmoAI6ErZ-v8,9371
|
|
508
508
|
datahub/ingestion/source/unity/report.py,sha256=0Y-ciHVTI6ZKNCJ5zWoQh3Ze1c_GMqmTMKFwzXDuuOg,2788
|
|
509
|
-
datahub/ingestion/source/unity/source.py,sha256=
|
|
509
|
+
datahub/ingestion/source/unity/source.py,sha256=YdUPCMJtpmvYVnRNnpqb4BVowFobkLvSJ_K2gHwrvCI,41752
|
|
510
510
|
datahub/ingestion/source/unity/usage.py,sha256=igRxYg8usukTAA229uJWi-0y-Zd0yOq9dEBi2k9f15o,11436
|
|
511
511
|
datahub/ingestion/source/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
512
512
|
datahub/ingestion/source/usage/clickhouse_usage.py,sha256=8nQqNAPKqivywjzsvqH0-HWFwjd4gECpw_xahLXk5ek,9970
|
|
@@ -881,7 +881,7 @@ datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn
|
|
|
881
881
|
datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
|
|
882
882
|
datahub/sql_parsing/schema_resolver.py,sha256=9INZWdxA2dMSLK6RXaVqjbjyLY_VKMhCkQv_Xd6Ln3I,10848
|
|
883
883
|
datahub/sql_parsing/split_statements.py,sha256=uZhAXLaRxDfmK0lPBW2oM_YVdJfSMhdgndnfd9iIXuA,5001
|
|
884
|
-
datahub/sql_parsing/sql_parsing_aggregator.py,sha256=
|
|
884
|
+
datahub/sql_parsing/sql_parsing_aggregator.py,sha256=ULvLZygN_LtZQg_DKLQ2lDzz3YsEhZBvZUx3wmYeP_Q,69976
|
|
885
885
|
datahub/sql_parsing/sql_parsing_common.py,sha256=h_V_m54hJ9EUh5kczq7cYOIeNeo4bgf0Px0H-Nq-UIg,2602
|
|
886
886
|
datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
|
|
887
887
|
datahub/sql_parsing/sqlglot_lineage.py,sha256=gUVq3NwZUzQByJs43JZXz8lZf0ZVzVt0FzaW5wZOwK4,47460
|
|
@@ -986,8 +986,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
|
|
|
986
986
|
datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
|
|
987
987
|
datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
|
|
988
988
|
datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
|
|
989
|
-
acryl_datahub-0.15.0.
|
|
990
|
-
acryl_datahub-0.15.0.
|
|
991
|
-
acryl_datahub-0.15.0.
|
|
992
|
-
acryl_datahub-0.15.0.
|
|
993
|
-
acryl_datahub-0.15.0.
|
|
989
|
+
acryl_datahub-0.15.0.2rc1.dist-info/METADATA,sha256=cLoMIIavfob5z2bJbsj69AoG_J7tp489qvVvBf3W_Yo,173441
|
|
990
|
+
acryl_datahub-0.15.0.2rc1.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
|
|
991
|
+
acryl_datahub-0.15.0.2rc1.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
|
|
992
|
+
acryl_datahub-0.15.0.2rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
|
|
993
|
+
acryl_datahub-0.15.0.2rc1.dist-info/RECORD,,
|
datahub/__init__.py
CHANGED
datahub/cli/cli_utils.py
CHANGED
|
@@ -3,7 +3,7 @@ import logging
|
|
|
3
3
|
import time
|
|
4
4
|
import typing
|
|
5
5
|
from datetime import datetime
|
|
6
|
-
from typing import Any, Dict, List, Optional, Tuple, Type, Union
|
|
6
|
+
from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
|
|
7
7
|
|
|
8
8
|
import click
|
|
9
9
|
import requests
|
|
@@ -33,6 +33,15 @@ def first_non_null(ls: List[Optional[str]]) -> Optional[str]:
|
|
|
33
33
|
return next((el for el in ls if el is not None and el.strip() != ""), None)
|
|
34
34
|
|
|
35
35
|
|
|
36
|
+
_T = TypeVar("_T")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def get_or_else(value: Optional[_T], default: _T) -> _T:
|
|
40
|
+
# Normally we'd use `value or default`. However, that runs into issues
|
|
41
|
+
# when value is falsey but not None.
|
|
42
|
+
return value if value is not None else default
|
|
43
|
+
|
|
44
|
+
|
|
36
45
|
def parse_run_restli_response(response: requests.Response) -> dict:
|
|
37
46
|
response_json = response.json()
|
|
38
47
|
if response.status_code != 200:
|
|
@@ -321,6 +330,8 @@ def get_frontend_session_login_as(
|
|
|
321
330
|
def _ensure_valid_gms_url_acryl_cloud(url: str) -> str:
|
|
322
331
|
if "acryl.io" not in url:
|
|
323
332
|
return url
|
|
333
|
+
if url.endswith(":8080"):
|
|
334
|
+
url = url.replace(":8080", "")
|
|
324
335
|
if url.startswith("http://"):
|
|
325
336
|
url = url.replace("http://", "https://")
|
|
326
337
|
if url.endswith("acryl.io"):
|
datahub/emitter/rest_emitter.py
CHANGED
|
@@ -1,9 +1,21 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import functools
|
|
2
4
|
import json
|
|
3
5
|
import logging
|
|
4
6
|
import os
|
|
5
7
|
from json.decoder import JSONDecodeError
|
|
6
|
-
from typing import
|
|
8
|
+
from typing import (
|
|
9
|
+
TYPE_CHECKING,
|
|
10
|
+
Any,
|
|
11
|
+
Callable,
|
|
12
|
+
Dict,
|
|
13
|
+
List,
|
|
14
|
+
Optional,
|
|
15
|
+
Sequence,
|
|
16
|
+
Tuple,
|
|
17
|
+
Union,
|
|
18
|
+
)
|
|
7
19
|
|
|
8
20
|
import requests
|
|
9
21
|
from deprecated import deprecated
|
|
@@ -12,8 +24,13 @@ from requests.exceptions import HTTPError, RequestException
|
|
|
12
24
|
|
|
13
25
|
from datahub import nice_version_name
|
|
14
26
|
from datahub.cli import config_utils
|
|
15
|
-
from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url
|
|
16
|
-
from datahub.
|
|
27
|
+
from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url, get_or_else
|
|
28
|
+
from datahub.cli.env_utils import get_boolean_env_variable
|
|
29
|
+
from datahub.configuration.common import (
|
|
30
|
+
ConfigModel,
|
|
31
|
+
ConfigurationError,
|
|
32
|
+
OperationalError,
|
|
33
|
+
)
|
|
17
34
|
from datahub.emitter.generic_emitter import Emitter
|
|
18
35
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
19
36
|
from datahub.emitter.request_helper import make_curl_command
|
|
@@ -30,10 +47,8 @@ if TYPE_CHECKING:
|
|
|
30
47
|
|
|
31
48
|
logger = logging.getLogger(__name__)
|
|
32
49
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
30 # Any ingest call taking longer than 30 seconds should be abandoned
|
|
36
|
-
)
|
|
50
|
+
_DEFAULT_TIMEOUT_SEC = 30 # 30 seconds should be plenty to connect
|
|
51
|
+
_TIMEOUT_LOWER_BOUND_SEC = 1 # if below this, we log a warning
|
|
37
52
|
_DEFAULT_RETRY_STATUS_CODES = [ # Additional status codes to retry on
|
|
38
53
|
429,
|
|
39
54
|
500,
|
|
@@ -46,6 +61,8 @@ _DEFAULT_RETRY_MAX_TIMES = int(
|
|
|
46
61
|
os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
|
|
47
62
|
)
|
|
48
63
|
|
|
64
|
+
_DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)
|
|
65
|
+
|
|
49
66
|
# The limit is 16mb. We will use a max of 15mb to have some space
|
|
50
67
|
# for overhead like request headers.
|
|
51
68
|
# This applies to pretty much all calls to GMS.
|
|
@@ -60,15 +77,76 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
|
|
|
60
77
|
)
|
|
61
78
|
|
|
62
79
|
|
|
80
|
+
class RequestsSessionConfig(ConfigModel):
|
|
81
|
+
timeout: Union[float, Tuple[float, float], None] = _DEFAULT_TIMEOUT_SEC
|
|
82
|
+
|
|
83
|
+
retry_status_codes: List[int] = _DEFAULT_RETRY_STATUS_CODES
|
|
84
|
+
retry_methods: List[str] = _DEFAULT_RETRY_METHODS
|
|
85
|
+
retry_max_times: int = _DEFAULT_RETRY_MAX_TIMES
|
|
86
|
+
|
|
87
|
+
extra_headers: Dict[str, str] = {}
|
|
88
|
+
|
|
89
|
+
ca_certificate_path: Optional[str] = None
|
|
90
|
+
client_certificate_path: Optional[str] = None
|
|
91
|
+
disable_ssl_verification: bool = False
|
|
92
|
+
|
|
93
|
+
def build_session(self) -> requests.Session:
|
|
94
|
+
session = requests.Session()
|
|
95
|
+
|
|
96
|
+
if self.extra_headers:
|
|
97
|
+
session.headers.update(self.extra_headers)
|
|
98
|
+
|
|
99
|
+
if self.client_certificate_path:
|
|
100
|
+
session.cert = self.client_certificate_path
|
|
101
|
+
|
|
102
|
+
if self.ca_certificate_path:
|
|
103
|
+
session.verify = self.ca_certificate_path
|
|
104
|
+
|
|
105
|
+
if self.disable_ssl_verification:
|
|
106
|
+
session.verify = False
|
|
107
|
+
|
|
108
|
+
try:
|
|
109
|
+
# Set raise_on_status to False to propagate errors:
|
|
110
|
+
# https://stackoverflow.com/questions/70189330/determine-status-code-from-python-retry-exception
|
|
111
|
+
# Must call `raise_for_status` after making a request, which we do
|
|
112
|
+
retry_strategy = Retry(
|
|
113
|
+
total=self.retry_max_times,
|
|
114
|
+
status_forcelist=self.retry_status_codes,
|
|
115
|
+
backoff_factor=2,
|
|
116
|
+
allowed_methods=self.retry_methods,
|
|
117
|
+
raise_on_status=False,
|
|
118
|
+
)
|
|
119
|
+
except TypeError:
|
|
120
|
+
# Prior to urllib3 1.26, the Retry class used `method_whitelist` instead of `allowed_methods`.
|
|
121
|
+
retry_strategy = Retry(
|
|
122
|
+
total=self.retry_max_times,
|
|
123
|
+
status_forcelist=self.retry_status_codes,
|
|
124
|
+
backoff_factor=2,
|
|
125
|
+
method_whitelist=self.retry_methods,
|
|
126
|
+
raise_on_status=False,
|
|
127
|
+
)
|
|
128
|
+
|
|
129
|
+
adapter = HTTPAdapter(
|
|
130
|
+
pool_connections=100, pool_maxsize=100, max_retries=retry_strategy
|
|
131
|
+
)
|
|
132
|
+
session.mount("http://", adapter)
|
|
133
|
+
session.mount("https://", adapter)
|
|
134
|
+
|
|
135
|
+
if self.timeout is not None:
|
|
136
|
+
# Shim session.request to apply default timeout values.
|
|
137
|
+
# Via https://stackoverflow.com/a/59317604.
|
|
138
|
+
session.request = functools.partial( # type: ignore
|
|
139
|
+
session.request,
|
|
140
|
+
timeout=self.timeout,
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
return session
|
|
144
|
+
|
|
145
|
+
|
|
63
146
|
class DataHubRestEmitter(Closeable, Emitter):
|
|
64
147
|
_gms_server: str
|
|
65
148
|
_token: Optional[str]
|
|
66
149
|
_session: requests.Session
|
|
67
|
-
_connect_timeout_sec: float = _DEFAULT_CONNECT_TIMEOUT_SEC
|
|
68
|
-
_read_timeout_sec: float = _DEFAULT_READ_TIMEOUT_SEC
|
|
69
|
-
_retry_status_codes: List[int] = _DEFAULT_RETRY_STATUS_CODES
|
|
70
|
-
_retry_methods: List[str] = _DEFAULT_RETRY_METHODS
|
|
71
|
-
_retry_max_times: int = _DEFAULT_RETRY_MAX_TIMES
|
|
72
150
|
|
|
73
151
|
def __init__(
|
|
74
152
|
self,
|
|
@@ -99,15 +177,13 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
99
177
|
|
|
100
178
|
self._session = requests.Session()
|
|
101
179
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
}
|
|
108
|
-
)
|
|
180
|
+
headers = {
|
|
181
|
+
"X-RestLi-Protocol-Version": "2.0.0",
|
|
182
|
+
"X-DataHub-Py-Cli-Version": nice_version_name(),
|
|
183
|
+
"Content-Type": "application/json",
|
|
184
|
+
}
|
|
109
185
|
if token:
|
|
110
|
-
|
|
186
|
+
headers["Authorization"] = f"Bearer {token}"
|
|
111
187
|
else:
|
|
112
188
|
# HACK: When no token is provided but system auth env variables are set, we use them.
|
|
113
189
|
# Ideally this should simply get passed in as config, instead of being sneakily injected
|
|
@@ -116,75 +192,43 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
116
192
|
# rest emitter, and the rest sink uses the rest emitter under the hood.
|
|
117
193
|
system_auth = config_utils.get_system_auth()
|
|
118
194
|
if system_auth is not None:
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
if extra_headers:
|
|
122
|
-
self._session.headers.update(extra_headers)
|
|
123
|
-
|
|
124
|
-
if client_certificate_path:
|
|
125
|
-
self._session.cert = client_certificate_path
|
|
195
|
+
headers["Authorization"] = system_auth
|
|
126
196
|
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
self._connect_timeout_sec = (
|
|
134
|
-
connect_timeout_sec or timeout_sec or _DEFAULT_CONNECT_TIMEOUT_SEC
|
|
135
|
-
)
|
|
136
|
-
self._read_timeout_sec = (
|
|
137
|
-
read_timeout_sec or timeout_sec or _DEFAULT_READ_TIMEOUT_SEC
|
|
138
|
-
)
|
|
139
|
-
|
|
140
|
-
if self._connect_timeout_sec < 1 or self._read_timeout_sec < 1:
|
|
141
|
-
logger.warning(
|
|
142
|
-
f"Setting timeout values lower than 1 second is not recommended. Your configuration is connect_timeout:{self._connect_timeout_sec}s, read_timeout:{self._read_timeout_sec}s"
|
|
143
|
-
)
|
|
144
|
-
|
|
145
|
-
if retry_status_codes is not None: # Only if missing. Empty list is allowed
|
|
146
|
-
self._retry_status_codes = retry_status_codes
|
|
147
|
-
|
|
148
|
-
if retry_methods is not None:
|
|
149
|
-
self._retry_methods = retry_methods
|
|
150
|
-
|
|
151
|
-
if retry_max_times:
|
|
152
|
-
self._retry_max_times = retry_max_times
|
|
153
|
-
|
|
154
|
-
try:
|
|
155
|
-
# Set raise_on_status to False to propagate errors:
|
|
156
|
-
# https://stackoverflow.com/questions/70189330/determine-status-code-from-python-retry-exception
|
|
157
|
-
# Must call `raise_for_status` after making a request, which we do
|
|
158
|
-
retry_strategy = Retry(
|
|
159
|
-
total=self._retry_max_times,
|
|
160
|
-
status_forcelist=self._retry_status_codes,
|
|
161
|
-
backoff_factor=2,
|
|
162
|
-
allowed_methods=self._retry_methods,
|
|
163
|
-
raise_on_status=False,
|
|
164
|
-
)
|
|
165
|
-
except TypeError:
|
|
166
|
-
# Prior to urllib3 1.26, the Retry class used `method_whitelist` instead of `allowed_methods`.
|
|
167
|
-
retry_strategy = Retry(
|
|
168
|
-
total=self._retry_max_times,
|
|
169
|
-
status_forcelist=self._retry_status_codes,
|
|
170
|
-
backoff_factor=2,
|
|
171
|
-
method_whitelist=self._retry_methods,
|
|
172
|
-
raise_on_status=False,
|
|
197
|
+
timeout: float | tuple[float, float]
|
|
198
|
+
if connect_timeout_sec is not None or read_timeout_sec is not None:
|
|
199
|
+
timeout = (
|
|
200
|
+
connect_timeout_sec or timeout_sec or _DEFAULT_TIMEOUT_SEC,
|
|
201
|
+
read_timeout_sec or timeout_sec or _DEFAULT_TIMEOUT_SEC,
|
|
173
202
|
)
|
|
203
|
+
if (
|
|
204
|
+
timeout[0] < _TIMEOUT_LOWER_BOUND_SEC
|
|
205
|
+
or timeout[1] < _TIMEOUT_LOWER_BOUND_SEC
|
|
206
|
+
):
|
|
207
|
+
logger.warning(
|
|
208
|
+
f"Setting timeout values lower than {_TIMEOUT_LOWER_BOUND_SEC} second is not recommended. Your configuration is (connect_timeout, read_timeout) = {timeout} seconds"
|
|
209
|
+
)
|
|
210
|
+
else:
|
|
211
|
+
timeout = get_or_else(timeout_sec, _DEFAULT_TIMEOUT_SEC)
|
|
212
|
+
if timeout < _TIMEOUT_LOWER_BOUND_SEC:
|
|
213
|
+
logger.warning(
|
|
214
|
+
f"Setting timeout values lower than {_TIMEOUT_LOWER_BOUND_SEC} second is not recommended. Your configuration is timeout = {timeout} seconds"
|
|
215
|
+
)
|
|
174
216
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
217
|
+
self._session_config = RequestsSessionConfig(
|
|
218
|
+
timeout=timeout,
|
|
219
|
+
retry_status_codes=get_or_else(
|
|
220
|
+
retry_status_codes, _DEFAULT_RETRY_STATUS_CODES
|
|
221
|
+
),
|
|
222
|
+
retry_methods=get_or_else(retry_methods, _DEFAULT_RETRY_METHODS),
|
|
223
|
+
retry_max_times=get_or_else(retry_max_times, _DEFAULT_RETRY_MAX_TIMES),
|
|
224
|
+
extra_headers={**headers, **(extra_headers or {})},
|
|
225
|
+
ca_certificate_path=ca_certificate_path,
|
|
226
|
+
client_certificate_path=client_certificate_path,
|
|
227
|
+
disable_ssl_verification=disable_ssl_verification,
|
|
186
228
|
)
|
|
187
229
|
|
|
230
|
+
self._session = self._session_config.build_session()
|
|
231
|
+
|
|
188
232
|
def test_connection(self) -> None:
|
|
189
233
|
url = f"{self._gms_server}/config"
|
|
190
234
|
response = self._session.get(url)
|
|
@@ -291,7 +335,8 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
291
335
|
mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
|
|
292
336
|
async_flag: Optional[bool] = None,
|
|
293
337
|
) -> int:
|
|
294
|
-
|
|
338
|
+
if _DATAHUB_EMITTER_TRACE:
|
|
339
|
+
logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
|
|
295
340
|
url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
|
|
296
341
|
for mcp in mcps:
|
|
297
342
|
ensure_has_system_metadata(mcp)
|
|
@@ -304,22 +349,25 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
304
349
|
current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
|
|
305
350
|
for mcp_obj in mcp_objs:
|
|
306
351
|
mcp_obj_size = len(json.dumps(mcp_obj))
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
352
|
+
if _DATAHUB_EMITTER_TRACE:
|
|
353
|
+
logger.debug(
|
|
354
|
+
f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
|
|
355
|
+
)
|
|
310
356
|
|
|
311
357
|
if (
|
|
312
358
|
mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
|
|
313
359
|
or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
|
|
314
360
|
):
|
|
315
|
-
|
|
361
|
+
if _DATAHUB_EMITTER_TRACE:
|
|
362
|
+
logger.debug("Decided to create new chunk")
|
|
316
363
|
mcp_obj_chunks.append([])
|
|
317
364
|
current_chunk_size = 0
|
|
318
365
|
mcp_obj_chunks[-1].append(mcp_obj)
|
|
319
366
|
current_chunk_size += mcp_obj_size
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
367
|
+
if len(mcp_obj_chunks) > 0:
|
|
368
|
+
logger.debug(
|
|
369
|
+
f"Decided to send {len(mcps)} MCP batch in {len(mcp_obj_chunks)} chunks"
|
|
370
|
+
)
|
|
323
371
|
|
|
324
372
|
for mcp_obj_chunk in mcp_obj_chunks:
|
|
325
373
|
# TODO: We're calling json.dumps on each MCP object twice, once to estimate
|
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import logging
|
|
3
|
-
from typing import Iterable, List
|
|
3
|
+
from typing import TYPE_CHECKING, Iterable, List
|
|
4
4
|
|
|
5
5
|
from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
|
|
6
6
|
from datahub.emitter.serialization_helper import pre_json_transform
|
|
7
|
-
from datahub.ingestion.api.source import SourceReport
|
|
8
7
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
9
8
|
from datahub.metadata.schema_classes import (
|
|
10
9
|
DatasetProfileClass,
|
|
@@ -12,12 +11,15 @@ from datahub.metadata.schema_classes import (
|
|
|
12
11
|
SchemaMetadataClass,
|
|
13
12
|
)
|
|
14
13
|
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from datahub.ingestion.api.source import SourceReport
|
|
16
|
+
|
|
15
17
|
logger = logging.getLogger(__name__)
|
|
16
18
|
|
|
17
19
|
|
|
18
20
|
class EnsureAspectSizeProcessor:
|
|
19
21
|
def __init__(
|
|
20
|
-
self, report: SourceReport, payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
|
|
22
|
+
self, report: "SourceReport", payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
|
|
21
23
|
):
|
|
22
24
|
self.report = report
|
|
23
25
|
self.payload_constraint = payload_constraint
|
datahub/ingestion/api/source.py
CHANGED
|
@@ -31,6 +31,9 @@ from datahub.emitter.mcp_builder import mcps_from_mce
|
|
|
31
31
|
from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
|
|
32
32
|
auto_patch_last_modified,
|
|
33
33
|
)
|
|
34
|
+
from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
|
|
35
|
+
EnsureAspectSizeProcessor,
|
|
36
|
+
)
|
|
34
37
|
from datahub.ingestion.api.closeable import Closeable
|
|
35
38
|
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
|
|
36
39
|
from datahub.ingestion.api.report import Report
|
|
@@ -450,6 +453,7 @@ class Source(Closeable, metaclass=ABCMeta):
|
|
|
450
453
|
browse_path_processor,
|
|
451
454
|
partial(auto_workunit_reporter, self.get_report()),
|
|
452
455
|
auto_patch_last_modified,
|
|
456
|
+
EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
|
|
453
457
|
]
|
|
454
458
|
|
|
455
459
|
@staticmethod
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import os
|
|
2
1
|
from abc import ABCMeta, abstractmethod
|
|
3
2
|
from dataclasses import dataclass
|
|
4
3
|
from typing import Any, Dict, List, Optional
|
|
@@ -38,8 +37,8 @@ class ClassificationConfig(ConfigModel):
|
|
|
38
37
|
)
|
|
39
38
|
|
|
40
39
|
max_workers: int = Field(
|
|
41
|
-
default=
|
|
42
|
-
description="Number of worker processes to use for classification. Set to 1 to disable.",
|
|
40
|
+
default=1,
|
|
41
|
+
description="Number of worker processes to use for classification. Note that any number above 1 might lead to a deadlock. Set to 1 to disable.",
|
|
43
42
|
)
|
|
44
43
|
|
|
45
44
|
table_pattern: AllowDenyPattern = Field(
|
|
@@ -179,21 +179,24 @@ class DataHubGraph(DatahubRestEmitter):
|
|
|
179
179
|
|
|
180
180
|
@classmethod
|
|
181
181
|
def from_emitter(cls, emitter: DatahubRestEmitter) -> "DataHubGraph":
|
|
182
|
+
session_config = emitter._session_config
|
|
183
|
+
if isinstance(session_config.timeout, tuple):
|
|
184
|
+
# TODO: This is slightly lossy. Eventually, we want to modify the emitter
|
|
185
|
+
# to accept a tuple for timeout_sec, and then we'll be able to remove this.
|
|
186
|
+
timeout_sec: Optional[float] = session_config.timeout[0]
|
|
187
|
+
else:
|
|
188
|
+
timeout_sec = session_config.timeout
|
|
182
189
|
return cls(
|
|
183
190
|
DatahubClientConfig(
|
|
184
191
|
server=emitter._gms_server,
|
|
185
192
|
token=emitter._token,
|
|
186
|
-
timeout_sec=
|
|
187
|
-
retry_status_codes=
|
|
188
|
-
retry_max_times=
|
|
189
|
-
extra_headers=
|
|
190
|
-
disable_ssl_verification=
|
|
191
|
-
ca_certificate_path=
|
|
192
|
-
|
|
193
|
-
if isinstance(emitter._session.verify, str)
|
|
194
|
-
else None
|
|
195
|
-
),
|
|
196
|
-
client_certificate_path=emitter._session.cert,
|
|
193
|
+
timeout_sec=timeout_sec,
|
|
194
|
+
retry_status_codes=session_config.retry_status_codes,
|
|
195
|
+
retry_max_times=session_config.retry_max_times,
|
|
196
|
+
extra_headers=session_config.extra_headers,
|
|
197
|
+
disable_ssl_verification=session_config.disable_ssl_verification,
|
|
198
|
+
ca_certificate_path=session_config.ca_certificate_path,
|
|
199
|
+
client_certificate_path=session_config.client_certificate_path,
|
|
197
200
|
)
|
|
198
201
|
)
|
|
199
202
|
|
|
@@ -10,7 +10,7 @@ class DatahubClientConfig(ConfigModel):
|
|
|
10
10
|
# by callers / the CLI, but the actual client should not have any magic.
|
|
11
11
|
server: str
|
|
12
12
|
token: Optional[str] = None
|
|
13
|
-
timeout_sec: Optional[
|
|
13
|
+
timeout_sec: Optional[float] = None
|
|
14
14
|
retry_status_codes: Optional[List[int]] = None
|
|
15
15
|
retry_max_times: Optional[int] = None
|
|
16
16
|
extra_headers: Optional[Dict[str, str]] = None
|