acryl-datahub 0.15.0.1rc17__py3-none-any.whl → 0.15.0.2rc2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Note: this version of acryl-datahub has been flagged as a potentially problematic release.

acryl_datahub-0.15.0.1rc17.dist-info/RECORD → acryl_datahub-0.15.0.2rc2.dist-info/RECORD
@@ -1,4 +1,4 @@
- datahub/__init__.py,sha256=GvEPomdTJt9ZrIZUgZuwaVdBYiJA2qKcUyUKxDy3owo,577
+ datahub/__init__.py,sha256=RIZACzdo76HDJUSxMhtInCcOAu85CL1Blpo_mAXUajA,576
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -59,7 +59,7 @@ datahub/api/graphql/base.py,sha256=9q637r6v-RGOd8Mk8HW2g0vt9zpqFexsQ5R6TPEHVbs,1
  datahub/api/graphql/operation.py,sha256=h7OXbVRrpJgoth1X4cgeIFhD5JY1MGKg2KjVlQK1gqE,5116
  datahub/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/cli/check_cli.py,sha256=9dXNyzZayHeoFjwFjLkMVyx6DiCZfeESyI-sYtGA6bE,12850
- datahub/cli/cli_utils.py,sha256=gFmcOGAT6IdrTwmpRFSwaqzGmoqS4dbWrxILB1uvlGk,13214
+ datahub/cli/cli_utils.py,sha256=d_Q9vPZTPxO7XyyghD-i1Nkr4DX0M8cs2IWrMUQAu0c,13539
  datahub/cli/config_utils.py,sha256=yuXw7RzpRY5x_-MAoqWbv46qUkIeRNAJL4_OeJpYdBE,4879
  datahub/cli/delete_cli.py,sha256=VLeHi7MLFCtTk7MI4y8r_k_7aLcCUZIglU2MNLsXU6M,23051
  datahub/cli/docker_check.py,sha256=rED4wHXqxcQ_qNFyIgFEZ85BHT9ZTE5YC-oUKqbRqi0,9432
@@ -119,7 +119,7 @@ datahub/emitter/mcp.py,sha256=hAAYziDdkwjazQU0DtWMbQWY8wS09ACrKJbqxoWXdgc,9637
  datahub/emitter/mcp_builder.py,sha256=eOcuz41c4a3oTkNk39yYl9bTxpksxqATPHLcqyhPGT0,9856
  datahub/emitter/mcp_patch_builder.py,sha256=oonC8iGOvDzqj890CxOjWlBdDEF1RnwvbSZy1sivlTY,4572
  datahub/emitter/request_helper.py,sha256=33ORG3S3OVy97_jlWBRn7yUM5XCIkRN6WSdJvN7Ofcg,670
- datahub/emitter/rest_emitter.py,sha256=YpRQEyuDBq31Iw7bZtOe5arm4YviCQLpvaObzVwheBY,16759
+ datahub/emitter/rest_emitter.py,sha256=O9IJ7r-AXL4Pi892pEFOygvUKTbD8V6ey8KObuqHqgk,17876
  datahub/emitter/serialization_helper.py,sha256=q12Avmf70Vy4ttQGMJoTKlE5EsybMKNg2w3MQeZiHvk,3652
  datahub/emitter/sql_parsing_builder.py,sha256=Cr5imZrm3dYDSCACt5MFscgHCtVbHTD6IjUmsvsKoEs,11991
  datahub/emitter/synchronized_file_emitter.py,sha256=s4ATuxalI4GDAkrZTaGSegxBdvvNPZ9jRSdtElU0kNs,1805
@@ -164,8 +164,8 @@ datahub/ingestion/glossary/classifier.py,sha256=zp8Fe3he80H5Zz1EwymKjThUPkTpw6Pg
  datahub/ingestion/glossary/classifier_registry.py,sha256=yFOYLQhDgCLqXYMG3L1BquXafeLcZDcmp8meyw6k9ts,307
  datahub/ingestion/glossary/datahub_classifier.py,sha256=8VhwuLDhyOqqOr0jqAPIgorb4eAOnvTr4m13Y2Wy1-E,7515
  datahub/ingestion/graph/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/graph/client.py,sha256=AYDFwP9a_M-fCZv-PcWMSr5tc53XWJl372SWKwdu37E,64651
- datahub/ingestion/graph/config.py,sha256=3b_Gxa5wcBnphP63bBiAFdWS7PJhUHRE1WZL_q4Cw8k,749
+ datahub/ingestion/graph/client.py,sha256=R50K7NmE3TYgVXvdLnvLZn7N0fkiCXOK0MoJz9ueglA,64963
+ datahub/ingestion/graph/config.py,sha256=_oha8Je7P80ZmrkZUAaRHyYbdMmTkMI5JkYjEP2Ri1Q,751
  datahub/ingestion/graph/connections.py,sha256=9462L0ZWGKURyypAln25eMPhK3pcufBar9tNDoqspXs,741
  datahub/ingestion/graph/filters.py,sha256=UeUZQHoimavIYx-jXLA0WGkOUe10TaO8uEZkfa-QgNE,6188
  datahub/ingestion/reporting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -217,7 +217,7 @@ datahub/ingestion/source/abs/report.py,sha256=fzkTdTewYlWrTk4f2Cyl-e8RV4qw9wEVtm
  datahub/ingestion/source/abs/source.py,sha256=pzxW-R_cWGKPneEhX8JWdTZiX2k1kAZOPKgMxp9mAEI,24533
  datahub/ingestion/source/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/aws/aws_common.py,sha256=DfdQgkJ_s2isFx8WvqKTlAcBk4KE8SgfpmA5BgC3fgY,17716
- datahub/ingestion/source/aws/glue.py,sha256=r7y1MPDK__BKX_mrJjVa_CEmSXM3Pa02gt19o0sSLE8,56815
+ datahub/ingestion/source/aws/glue.py,sha256=lJW3QHHz1_SWqLEB-vUSTxSuL0EgUQ0ptdQns_NLNds,57343
  datahub/ingestion/source/aws/s3_boto_utils.py,sha256=Wyp9k9tapsCuw9dyH4FCXJr_wmeLaYFoCtKvrV6SEDk,3892
  datahub/ingestion/source/aws/s3_util.py,sha256=OFypcgmVC6jnZM90-gjcPpAMtTV1lbnreCaMhCzNlzs,2149
  datahub/ingestion/source/aws/sagemaker.py,sha256=Bl2tkBYnrindgx61VHYgNovUF_Kp_fXNcivQn28vC2w,5254
@@ -233,10 +233,10 @@ datahub/ingestion/source/azure/abs_folder_utils.py,sha256=7skXus-4fSIoKpqCeU-GG0
  datahub/ingestion/source/azure/abs_utils.py,sha256=KdAlCK-PMrn35kFHxz5vrsjajyx2PD5GRgoBKdoRvcg,2075
  datahub/ingestion/source/azure/azure_common.py,sha256=Zl0pPuE6L3QcM5B1P0LsPthZmD0h7fUUS0kg2okl6IY,4053
  datahub/ingestion/source/bigquery_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/bigquery_v2/bigquery.py,sha256=YMsyj6s7fggzisWfDdbT4w1MKJ3eRdNERsCShnu0Zqo,13681
+ datahub/ingestion/source/bigquery_v2/bigquery.py,sha256=c7g8sWuDIMhCSAX0D76P2arxZgTmzd-e0qlO7yt_zJY,13841
  datahub/ingestion/source/bigquery_v2/bigquery_audit.py,sha256=IlbHA8a-gNJvnubgBfxVHpUk8rFNIG80gk5HWXa2lyE,25108
  datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py,sha256=LuGJ6LgPViLIfDQfylxlQ3CA7fZYM5MDt8M-7sfzm84,5096
- datahub/ingestion/source/bigquery_v2/bigquery_config.py,sha256=PqbYNqM4-KukCU1meuvsk0qbiWa7UFh5hqHrHsvOSWQ,25889
+ datahub/ingestion/source/bigquery_v2/bigquery_config.py,sha256=sjCW997-Su14cVgWd1ZVx1E67yqfTIV5Wjp9Me0hfOw,26289
  datahub/ingestion/source/bigquery_v2/bigquery_data_reader.py,sha256=DeT3v_Z82__8En0FcZ0kavBAWQoRvSZ5Rppm9eeDAb8,2393
  datahub/ingestion/source/bigquery_v2/bigquery_helper.py,sha256=QER3gY8e_k1_eNVj7cBso7ZzrWl_vO5PYSa6CpvqNx8,1554
  datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py,sha256=8nuQ8hMuJEswWDZtV2RjbK8RvDJUzT_S74dnyPpGFdQ,4857
@@ -305,7 +305,7 @@ datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMp
  datahub/ingestion/source/gc/datahub_gc.py,sha256=W6uoeV7B4WIXdxT4tOEdDksdJm656WwwvkH79L7f_8Q,12969
  datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=86Tm3NNWMf0xM4TklNIEeNOjEingKpYy-XvCPeaAb4k,17125
  datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=9jsyCIspWSSYSAVPHjKHr05885rXxM6FCH7KzTBceic,10139
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=2JpESfsqoJRdLskV3AHYU8nRj_NvNtIaLZ4_RRNIod4,11229
+ datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=zRtgC_AcZui4qGf9jBASI3R-CrYZxNe3Pm-gNSLT3rw,11420
  datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/gcs/gcs_source.py,sha256=iwvj4JwjyVWRP1Vq106sUtQhh0GuOYVSu9zCa1wCZN0,6189
  datahub/ingestion/source/gcs/gcs_utils.py,sha256=_78KM863XXgkVLmZLtYGF5PJNnZas1go-XRtOq-79lo,1047
@@ -429,7 +429,7 @@ datahub/ingestion/source/snowflake/constants.py,sha256=22n-0r04nuy-ImxWFFpmbrt_G
  datahub/ingestion/source/snowflake/oauth_config.py,sha256=ol9D3RmruGStJAeL8PYSQguSqcD2HfkjPkMF2AB_eZs,1277
  datahub/ingestion/source/snowflake/oauth_generator.py,sha256=fu2VnREGuJXeTqIV2jx4TwieVnznf83HQkrE0h2DGGM,3423
  datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81xxdeizJn9nJCZ_nMIXgk9N6pEk5o,4803
- datahub/ingestion/source/snowflake/snowflake_config.py,sha256=jQGSa7ZQs3EsXB9ANShZ4xv9RqrhRfVHRSLeFiDwwxc,17974
+ datahub/ingestion/source/snowflake/snowflake_config.py,sha256=UehWUvqTXRsWmE5bBS53IoLjUL06-wJq6K4O2MTT2R8,18374
  datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJY5rqKNNodXxzg3SS5DF7oA4WXArOA,17793
  datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
  datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=FBmiONx4EGHWV8RNJT6zHZyntKinPFFyd2oKbTUIbhE,21319
@@ -444,7 +444,7 @@ datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=kTmuCtRnvHqM8WBYh
  datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=fyfWmFVz2WZrpTJWNIe9m0WpDHgeFrGPf8diORJZUwo,6212
  datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=0rXgz8bvRiI9SYVMa0UGLeg_DcjqBy6kQsdq0Uq0HVk,24685
  datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=MoI8-DR9tuMuHMBQcpDo4GFjvcoQZWLNkdFZsTkgK-M,12786
- datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=hIWtzlxuSQ_3w48o4AF2l9CQOcWIe6AmD07I89sH2B0,31860
+ datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=0doZaPPMO64Qi9uN4w8ZYe3gKkkieGJKI5xntF7vS6w,32020
  datahub/ingestion/source/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/sql/athena.py,sha256=G3cIY8H_76lIUAzQWW2kLnZOEsfbakmojxbiHb3dYZ8,24059
  datahub/ingestion/source/sql/clickhouse.py,sha256=jzvaXP5Wr0SMhj2rtuvVE821xnfpKiXhO3cm0xblgHs,27299
@@ -912,7 +912,7 @@ datahub/utilities/dedup_list.py,sha256=dUSpe1AajfuwlHVJKNv-CzDXSCkaw0HgSMOsxqUkQ
  datahub/utilities/delayed_iter.py,sha256=XlsI0DCXkVVejFKOW_uMT0E8DTqqOHQN3Ooak4EcULE,645
  datahub/utilities/delta.py,sha256=hkpF8W7Lvg2gUJBQR3mmIzOxsRQ6i5cchRPFlAVoV10,1128
  datahub/utilities/docs_build.py,sha256=uFMK3z1d4BExpsrvguHunidbEDAzQ8hoOP7iQ0A_IVw,211
- datahub/utilities/file_backed_collections.py,sha256=I2GxSYtVzfo38pQpv2FyoBeWISiKD4zUi0t34jPCNrU,21957
+ datahub/utilities/file_backed_collections.py,sha256=T0Ck2QQFmws9XhPXA36UBT5d0Qw7INOBph-cqUDd9A0,21958
  datahub/utilities/global_warning_util.py,sha256=adrEl3WhetQ-bymrPINjd976ZFndhbvk3QosUYGsos8,261
  datahub/utilities/hive_schema_to_avro.py,sha256=2-7NI9haCAYbyUmHTb-QPxjn4WbmnDDKAIGmHJ11-1E,11622
  datahub/utilities/is_pytest.py,sha256=2m9T4S9IIKhI5RfTqrB2ZmumzHocdxBHpM1HroWj2XQ,138
@@ -986,8 +986,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-0.15.0.1rc17.dist-info/METADATA,sha256=dV3uL4e_h1lxLVGl_a9Cmnm5pBi8kgHTbR56ypsqjC8,173444
- acryl_datahub-0.15.0.1rc17.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
- acryl_datahub-0.15.0.1rc17.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
- acryl_datahub-0.15.0.1rc17.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-0.15.0.1rc17.dist-info/RECORD,,
+ acryl_datahub-0.15.0.2rc2.dist-info/METADATA,sha256=z37zis6V6zPyE7Sz_yUducbyoInH_TNOhVozZ0hNpOQ,173662
+ acryl_datahub-0.15.0.2rc2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ acryl_datahub-0.15.0.2rc2.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
+ acryl_datahub-0.15.0.2rc2.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-0.15.0.2rc2.dist-info/RECORD,,
acryl_datahub-0.15.0.1rc17.dist-info/WHEEL → acryl_datahub-0.15.0.2rc2.dist-info/WHEEL
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.7.0)
+ Generator: setuptools (75.8.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
 
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
 
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "0.15.0.1rc17"
+ __version__ = "0.15.0.2rc2"
 
 
  def is_dev_mode() -> bool:
datahub/cli/cli_utils.py CHANGED
@@ -3,7 +3,7 @@ import logging
  import time
  import typing
  from datetime import datetime
- from typing import Any, Dict, List, Optional, Tuple, Type, Union
+ from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
 
  import click
  import requests
@@ -33,6 +33,15 @@ def first_non_null(ls: List[Optional[str]]) -> Optional[str]:
      return next((el for el in ls if el is not None and el.strip() != ""), None)
 
 
+ _T = TypeVar("_T")
+
+
+ def get_or_else(value: Optional[_T], default: _T) -> _T:
+     # Normally we'd use `value or default`. However, that runs into issues
+     # when value is falsey but not None.
+     return value if value is not None else default
+
+
  def parse_run_restli_response(response: requests.Response) -> dict:
      response_json = response.json()
      if response.status_code != 200:
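
The new get_or_else helper is used instead of the usual `value or default` idiom because `or` also discards legitimate falsey values such as 0 or an empty list. A standalone sketch of the difference (not part of the package, purely illustrative):

    from typing import Optional, TypeVar

    _T = TypeVar("_T")


    def get_or_else(value: Optional[_T], default: _T) -> _T:
        # Fall back to the default only when the value is actually None.
        return value if value is not None else default


    # `or` silently replaces a valid falsey value; get_or_else keeps it.
    assert (0 or 30) == 30
    assert get_or_else(0, 30) == 0
    assert get_or_else(None, 30) == 30
    assert get_or_else([], ["GET"]) == []
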
@@ -321,6 +330,8 @@ def get_frontend_session_login_as(
  def _ensure_valid_gms_url_acryl_cloud(url: str) -> str:
      if "acryl.io" not in url:
          return url
+     if url.endswith(":8080"):
+         url = url.replace(":8080", "")
      if url.startswith("http://"):
          url = url.replace("http://", "https://")
      if url.endswith("acryl.io"):
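
The added lines strip an explicit :8080 port from Acryl Cloud URLs before the existing scheme and suffix fixups run. A rough sketch of just the behavior visible in this hunk (illustrative function name; the real _ensure_valid_gms_url_acryl_cloud performs further normalization not shown here):

    def _normalize_acryl_url(url: str) -> str:
        # Illustrative only: mirrors the port and scheme fixups from the hunk above.
        if "acryl.io" not in url:
            return url
        if url.endswith(":8080"):
            url = url.replace(":8080", "")
        if url.startswith("http://"):
            url = url.replace("http://", "https://")
        return url


    assert _normalize_acryl_url("http://customer.acryl.io:8080") == "https://customer.acryl.io"
    assert _normalize_acryl_url("http://localhost:8080") == "http://localhost:8080"  # untouched
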
datahub/emitter/rest_emitter.py CHANGED
@@ -1,9 +1,21 @@
+ from __future__ import annotations
+
  import functools
  import json
  import logging
  import os
  from json.decoder import JSONDecodeError
- from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence, Union
+ from typing import (
+     TYPE_CHECKING,
+     Any,
+     Callable,
+     Dict,
+     List,
+     Optional,
+     Sequence,
+     Tuple,
+     Union,
+ )
 
  import requests
  from deprecated import deprecated
@@ -12,9 +24,13 @@ from requests.exceptions import HTTPError, RequestException
 
  from datahub import nice_version_name
  from datahub.cli import config_utils
- from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url
+ from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url, get_or_else
  from datahub.cli.env_utils import get_boolean_env_variable
- from datahub.configuration.common import ConfigurationError, OperationalError
+ from datahub.configuration.common import (
+     ConfigModel,
+     ConfigurationError,
+     OperationalError,
+ )
  from datahub.emitter.generic_emitter import Emitter
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
  from datahub.emitter.request_helper import make_curl_command
@@ -31,10 +47,8 @@ if TYPE_CHECKING:
 
  logger = logging.getLogger(__name__)
 
- _DEFAULT_CONNECT_TIMEOUT_SEC = 30  # 30 seconds should be plenty to connect
- _DEFAULT_READ_TIMEOUT_SEC = (
-     30  # Any ingest call taking longer than 30 seconds should be abandoned
- )
+ _DEFAULT_TIMEOUT_SEC = 30  # 30 seconds should be plenty to connect
+ _TIMEOUT_LOWER_BOUND_SEC = 1  # if below this, we log a warning
  _DEFAULT_RETRY_STATUS_CODES = [  # Additional status codes to retry on
      429,
      500,
@@ -63,15 +77,76 @@ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
  )
 
 
+ class RequestsSessionConfig(ConfigModel):
+     timeout: Union[float, Tuple[float, float], None] = _DEFAULT_TIMEOUT_SEC
+
+     retry_status_codes: List[int] = _DEFAULT_RETRY_STATUS_CODES
+     retry_methods: List[str] = _DEFAULT_RETRY_METHODS
+     retry_max_times: int = _DEFAULT_RETRY_MAX_TIMES
+
+     extra_headers: Dict[str, str] = {}
+
+     ca_certificate_path: Optional[str] = None
+     client_certificate_path: Optional[str] = None
+     disable_ssl_verification: bool = False
+
+     def build_session(self) -> requests.Session:
+         session = requests.Session()
+
+         if self.extra_headers:
+             session.headers.update(self.extra_headers)
+
+         if self.client_certificate_path:
+             session.cert = self.client_certificate_path
+
+         if self.ca_certificate_path:
+             session.verify = self.ca_certificate_path
+
+         if self.disable_ssl_verification:
+             session.verify = False
+
+         try:
+             # Set raise_on_status to False to propagate errors:
+             # https://stackoverflow.com/questions/70189330/determine-status-code-from-python-retry-exception
+             # Must call `raise_for_status` after making a request, which we do
+             retry_strategy = Retry(
+                 total=self.retry_max_times,
+                 status_forcelist=self.retry_status_codes,
+                 backoff_factor=2,
+                 allowed_methods=self.retry_methods,
+                 raise_on_status=False,
+             )
+         except TypeError:
+             # Prior to urllib3 1.26, the Retry class used `method_whitelist` instead of `allowed_methods`.
+             retry_strategy = Retry(
+                 total=self.retry_max_times,
+                 status_forcelist=self.retry_status_codes,
+                 backoff_factor=2,
+                 method_whitelist=self.retry_methods,
+                 raise_on_status=False,
+             )
+
+         adapter = HTTPAdapter(
+             pool_connections=100, pool_maxsize=100, max_retries=retry_strategy
+         )
+         session.mount("http://", adapter)
+         session.mount("https://", adapter)
+
+         if self.timeout is not None:
+             # Shim session.request to apply default timeout values.
+             # Via https://stackoverflow.com/a/59317604.
+             session.request = functools.partial(  # type: ignore
+                 session.request,
+                 timeout=self.timeout,
+             )
+
+         return session
+
+
  class DataHubRestEmitter(Closeable, Emitter):
      _gms_server: str
      _token: Optional[str]
      _session: requests.Session
-     _connect_timeout_sec: float = _DEFAULT_CONNECT_TIMEOUT_SEC
-     _read_timeout_sec: float = _DEFAULT_READ_TIMEOUT_SEC
-     _retry_status_codes: List[int] = _DEFAULT_RETRY_STATUS_CODES
-     _retry_methods: List[str] = _DEFAULT_RETRY_METHODS
-     _retry_max_times: int = _DEFAULT_RETRY_MAX_TIMES
 
      def __init__(
          self,
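
The retry, certificate, and timeout plumbing that previously lived inline in DataHubRestEmitter.__init__ is now factored into the reusable RequestsSessionConfig model shown above. A hedged usage sketch, assuming the class remains importable from datahub.emitter.rest_emitter in this release (the server URL is a placeholder):

    from datahub.emitter.rest_emitter import RequestsSessionConfig

    # Build a pre-configured requests.Session: (connect, read) timeouts,
    # a smaller retry budget, and an extra header on every request.
    session = RequestsSessionConfig(
        timeout=(5.0, 30.0),
        retry_max_times=2,
        extra_headers={"X-Example-Header": "demo"},
    ).build_session()

    response = session.get("https://your-datahub-gms.example.com/config")  # placeholder URL
    response.raise_for_status()
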
@@ -102,15 +177,13 @@ class DataHubRestEmitter(Closeable, Emitter):
 
          self._session = requests.Session()
 
-         self._session.headers.update(
-             {
-                 "X-RestLi-Protocol-Version": "2.0.0",
-                 "X-DataHub-Py-Cli-Version": nice_version_name(),
-                 "Content-Type": "application/json",
-             }
-         )
+         headers = {
+             "X-RestLi-Protocol-Version": "2.0.0",
+             "X-DataHub-Py-Cli-Version": nice_version_name(),
+             "Content-Type": "application/json",
+         }
          if token:
-             self._session.headers.update({"Authorization": f"Bearer {token}"})
+             headers["Authorization"] = f"Bearer {token}"
          else:
              # HACK: When no token is provided but system auth env variables are set, we use them.
              # Ideally this should simply get passed in as config, instead of being sneakily injected
@@ -119,75 +192,43 @@ class DataHubRestEmitter(Closeable, Emitter):
              # rest emitter, and the rest sink uses the rest emitter under the hood.
              system_auth = config_utils.get_system_auth()
              if system_auth is not None:
-                 self._session.headers.update({"Authorization": system_auth})
-
-         if extra_headers:
-             self._session.headers.update(extra_headers)
-
-         if client_certificate_path:
-             self._session.cert = client_certificate_path
-
-         if ca_certificate_path:
-             self._session.verify = ca_certificate_path
-
-         if disable_ssl_verification:
-             self._session.verify = False
-
-         self._connect_timeout_sec = (
-             connect_timeout_sec or timeout_sec or _DEFAULT_CONNECT_TIMEOUT_SEC
-         )
-         self._read_timeout_sec = (
-             read_timeout_sec or timeout_sec or _DEFAULT_READ_TIMEOUT_SEC
-         )
-
-         if self._connect_timeout_sec < 1 or self._read_timeout_sec < 1:
-             logger.warning(
-                 f"Setting timeout values lower than 1 second is not recommended. Your configuration is connect_timeout:{self._connect_timeout_sec}s, read_timeout:{self._read_timeout_sec}s"
-             )
-
-         if retry_status_codes is not None:  # Only if missing. Empty list is allowed
-             self._retry_status_codes = retry_status_codes
-
-         if retry_methods is not None:
-             self._retry_methods = retry_methods
-
-         if retry_max_times:
-             self._retry_max_times = retry_max_times
+                 headers["Authorization"] = system_auth
 
-         try:
-             # Set raise_on_status to False to propagate errors:
-             # https://stackoverflow.com/questions/70189330/determine-status-code-from-python-retry-exception
-             # Must call `raise_for_status` after making a request, which we do
-             retry_strategy = Retry(
-                 total=self._retry_max_times,
-                 status_forcelist=self._retry_status_codes,
-                 backoff_factor=2,
-                 allowed_methods=self._retry_methods,
-                 raise_on_status=False,
-             )
-         except TypeError:
-             # Prior to urllib3 1.26, the Retry class used `method_whitelist` instead of `allowed_methods`.
-             retry_strategy = Retry(
-                 total=self._retry_max_times,
-                 status_forcelist=self._retry_status_codes,
-                 backoff_factor=2,
-                 method_whitelist=self._retry_methods,
-                 raise_on_status=False,
+         timeout: float | tuple[float, float]
+         if connect_timeout_sec is not None or read_timeout_sec is not None:
+             timeout = (
+                 connect_timeout_sec or timeout_sec or _DEFAULT_TIMEOUT_SEC,
+                 read_timeout_sec or timeout_sec or _DEFAULT_TIMEOUT_SEC,
              )
+             if (
+                 timeout[0] < _TIMEOUT_LOWER_BOUND_SEC
+                 or timeout[1] < _TIMEOUT_LOWER_BOUND_SEC
+             ):
+                 logger.warning(
+                     f"Setting timeout values lower than {_TIMEOUT_LOWER_BOUND_SEC} second is not recommended. Your configuration is (connect_timeout, read_timeout) = {timeout} seconds"
+                 )
+         else:
+             timeout = get_or_else(timeout_sec, _DEFAULT_TIMEOUT_SEC)
+             if timeout < _TIMEOUT_LOWER_BOUND_SEC:
+                 logger.warning(
+                     f"Setting timeout values lower than {_TIMEOUT_LOWER_BOUND_SEC} second is not recommended. Your configuration is timeout = {timeout} seconds"
+                 )
 
-         adapter = HTTPAdapter(
-             pool_connections=100, pool_maxsize=100, max_retries=retry_strategy
-         )
-         self._session.mount("http://", adapter)
-         self._session.mount("https://", adapter)
-
-         # Shim session.request to apply default timeout values.
-         # Via https://stackoverflow.com/a/59317604.
-         self._session.request = functools.partial(  # type: ignore
-             self._session.request,
-             timeout=(self._connect_timeout_sec, self._read_timeout_sec),
+         self._session_config = RequestsSessionConfig(
+             timeout=timeout,
+             retry_status_codes=get_or_else(
+                 retry_status_codes, _DEFAULT_RETRY_STATUS_CODES
+             ),
+             retry_methods=get_or_else(retry_methods, _DEFAULT_RETRY_METHODS),
+             retry_max_times=get_or_else(retry_max_times, _DEFAULT_RETRY_MAX_TIMES),
+             extra_headers={**headers, **(extra_headers or {})},
+             ca_certificate_path=ca_certificate_path,
+             client_certificate_path=client_certificate_path,
+             disable_ssl_verification=disable_ssl_verification,
          )
 
+         self._session = self._session_config.build_session()
+
      def test_connection(self) -> None:
          url = f"{self._gms_server}/config"
datahub/ingestion/graph/client.py CHANGED
@@ -179,21 +179,24 @@ class DataHubGraph(DatahubRestEmitter):
 
      @classmethod
      def from_emitter(cls, emitter: DatahubRestEmitter) -> "DataHubGraph":
+         session_config = emitter._session_config
+         if isinstance(session_config.timeout, tuple):
+             # TODO: This is slightly lossy. Eventually, we want to modify the emitter
+             # to accept a tuple for timeout_sec, and then we'll be able to remove this.
+             timeout_sec: Optional[float] = session_config.timeout[0]
+         else:
+             timeout_sec = session_config.timeout
          return cls(
              DatahubClientConfig(
                  server=emitter._gms_server,
                  token=emitter._token,
-                 timeout_sec=emitter._read_timeout_sec,
-                 retry_status_codes=emitter._retry_status_codes,
-                 retry_max_times=emitter._retry_max_times,
-                 extra_headers=emitter._session.headers,
-                 disable_ssl_verification=emitter._session.verify is False,
-                 ca_certificate_path=(
-                     emitter._session.verify
-                     if isinstance(emitter._session.verify, str)
-                     else None
-                 ),
-                 client_certificate_path=emitter._session.cert,
+                 timeout_sec=timeout_sec,
+                 retry_status_codes=session_config.retry_status_codes,
+                 retry_max_times=session_config.retry_max_times,
+                 extra_headers=session_config.extra_headers,
+                 disable_ssl_verification=session_config.disable_ssl_verification,
+                 ca_certificate_path=session_config.ca_certificate_path,
+                 client_certificate_path=session_config.client_certificate_path,
              )
          )
 
datahub/ingestion/graph/config.py CHANGED
@@ -10,7 +10,7 @@ class DatahubClientConfig(ConfigModel):
      # by callers / the CLI, but the actual client should not have any magic.
      server: str
      token: Optional[str] = None
-     timeout_sec: Optional[int] = None
+     timeout_sec: Optional[float] = None
      retry_status_codes: Optional[List[int]] = None
      retry_max_times: Optional[int] = None
      extra_headers: Optional[Dict[str, str]] = None
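
DataHubGraph.from_emitter now derives its settings from the emitter's _session_config (collapsing a (connect, read) tuple to its connect value, as the TODO above notes), and DatahubClientConfig.timeout_sec is widened from Optional[int] to Optional[float] so fractional timeouts no longer fail validation. A hedged sketch of both, with placeholder server and token values:

    from datahub.emitter.rest_emitter import DatahubRestEmitter
    from datahub.ingestion.graph.client import DataHubGraph
    from datahub.ingestion.graph.config import DatahubClientConfig

    # Fractional timeouts are now valid on the client config.
    config = DatahubClientConfig(
        server="https://your-datahub-gms.example.com",  # placeholder
        token="<personal-access-token>",                # placeholder
        timeout_sec=2.5,
        retry_max_times=1,
    )

    # Deriving a graph client from an existing emitter; with connect/read timeouts
    # of (5, 60), the resulting timeout_sec is 5 (the connect value), per the hunk above.
    emitter = DatahubRestEmitter(
        gms_server="https://your-datahub-gms.example.com",  # placeholder
        token="<personal-access-token>",                    # placeholder
        connect_timeout_sec=5,
        read_timeout_sec=60,
    )
    graph = DataHubGraph.from_emitter(emitter)  # requires a reachable server
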
datahub/ingestion/source/aws/glue.py CHANGED
@@ -1054,49 +1054,66 @@ class GlueSource(StatefulIngestionSourceBase):
              yield from self.gen_database_containers(database)
 
          for table in tables:
-             database_name = table["DatabaseName"]
              table_name = table["Name"]
-             full_table_name = f"{database_name}.{table_name}"
-             self.report.report_table_scanned()
-             if not self.source_config.database_pattern.allowed(
-                 database_name
-             ) or not self.source_config.table_pattern.allowed(full_table_name):
-                 self.report.report_table_dropped(full_table_name)
-                 continue
+             try:
+                 yield from self._gen_table_wu(table=table)
+             except KeyError as e:
+                 self.report.report_failure(
+                     message="Failed to extract workunit for table",
+                     context=f"Table: {table_name}",
+                     exc=e,
+                 )
+         if self.extract_transforms:
+             yield from self._transform_extraction()
 
-             dataset_urn = make_dataset_urn_with_platform_instance(
-                 platform=self.platform,
-                 name=full_table_name,
-                 env=self.env,
-                 platform_instance=self.source_config.platform_instance,
-             )
+     def _gen_table_wu(self, table: Dict) -> Iterable[MetadataWorkUnit]:
+         database_name = table["DatabaseName"]
+         table_name = table["Name"]
+         full_table_name = f"{database_name}.{table_name}"
+         self.report.report_table_scanned()
+         if not self.source_config.database_pattern.allowed(
+             database_name
+         ) or not self.source_config.table_pattern.allowed(full_table_name):
+             self.report.report_table_dropped(full_table_name)
+             return
+
+         dataset_urn = make_dataset_urn_with_platform_instance(
+             platform=self.platform,
+             name=full_table_name,
+             env=self.env,
+             platform_instance=self.source_config.platform_instance,
+         )
 
-             mce = self._extract_record(dataset_urn, table, full_table_name)
-             yield MetadataWorkUnit(full_table_name, mce=mce)
+         mce = self._extract_record(dataset_urn, table, full_table_name)
+         yield MetadataWorkUnit(full_table_name, mce=mce)
 
-             # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not
-             # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp.
-             yield MetadataChangeProposalWrapper(
-                 entityUrn=dataset_urn,
-                 aspect=SubTypes(typeNames=[DatasetSubTypes.TABLE]),
-             ).as_workunit()
+         # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not
+         # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp.
+         yield MetadataChangeProposalWrapper(
+             entityUrn=dataset_urn,
+             aspect=SubTypes(typeNames=[DatasetSubTypes.TABLE]),
+         ).as_workunit()
 
-             yield from self._get_domain_wu(
-                 dataset_name=full_table_name,
-                 entity_urn=dataset_urn,
-             )
-             yield from self.add_table_to_database_container(
-                 dataset_urn=dataset_urn, db_name=database_name
-             )
+         yield from self._get_domain_wu(
+             dataset_name=full_table_name,
+             entity_urn=dataset_urn,
+         )
+         yield from self.add_table_to_database_container(
+             dataset_urn=dataset_urn, db_name=database_name
+         )
 
-             wu = self.get_lineage_if_enabled(mce)
-             if wu:
-                 yield wu
+         wu = self.get_lineage_if_enabled(mce)
+         if wu:
+             yield wu
 
+         try:
              yield from self.get_profile_if_enabled(mce, database_name, table_name)
-
-             if self.extract_transforms:
-                 yield from self._transform_extraction()
+         except KeyError as e:
+             self.report.report_failure(
+                 message="Failed to extract profile for table",
+                 context=f"Table: {dataset_urn}",
+                 exc=e,
+             )
 
      def _transform_extraction(self) -> Iterable[MetadataWorkUnit]:
          dags: Dict[str, Optional[Dict[str, Any]]] = {}
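
The Glue change moves per-table processing into _gen_table_wu and wraps each table (and its optional profile) in try/except KeyError, so a single table with a missing field is reported as a failure instead of aborting the whole run; it also relocates the extract_transforms call so it runs once after the table loop. The error-isolation pattern, reduced to a self-contained sketch with illustrative names:

    from typing import Dict, Iterable, List


    def process_one_table(table: Dict) -> Iterable[str]:
        # Raises KeyError when the API response lacks an expected field.
        yield f"{table['DatabaseName']}.{table['Name']}"


    def process_tables(tables: List[Dict]) -> Iterable[str]:
        # One malformed table is recorded and skipped; the rest still get processed.
        for table in tables:
            try:
                yield from process_one_table(table)
            except KeyError as e:
                print(f"skipping table {table.get('Name', '<unknown>')}: missing key {e}")


    tables = [{"DatabaseName": "db", "Name": "orders"}, {"Name": "no_database"}]
    print(list(process_tables(tables)))  # ['db.orders'] -- the bad record is skipped
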
datahub/ingestion/source/bigquery_v2/bigquery.py CHANGED
@@ -281,6 +281,8 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
                  include_lineage=self.config.include_table_lineage,
                  include_usage_statistics=self.config.include_usage_statistics,
                  include_operations=self.config.usage.include_operational_stats,
+                 include_queries=self.config.include_queries,
+                 include_query_usage_statistics=self.config.include_query_usage_statistics,
                  top_n_queries=self.config.usage.top_n_queries,
                  region_qualifiers=self.config.region_qualifiers,
              ),
datahub/ingestion/source/bigquery_v2/bigquery_config.py CHANGED
@@ -447,6 +447,14 @@ class BigQueryV2Config(
          default=False,
          description="If enabled, uses the new queries extractor to extract queries from bigquery.",
      )
+     include_queries: bool = Field(
+         default=True,
+         description="If enabled, generate query entities associated with lineage edges. Only applicable if `use_queries_v2` is enabled.",
+     )
+     include_query_usage_statistics: bool = Field(
+         default=True,
+         description="If enabled, generate query popularity statistics. Only applicable if `use_queries_v2` is enabled.",
+     )
 
      @property
      def have_table_data_read_permission(self) -> bool:
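
Both new flags default to True and, per their descriptions, only take effect when use_queries_v2 is enabled; the bigquery.py hunk above passes them through to the queries extractor. A hedged, recipe-style fragment showing how they might be toggled (dict form; everything else a real BigQuery source config needs is omitted):

    # Fragment of a BigQuery source config: keep query entities, skip popularity stats.
    bigquery_source_config = {
        "use_queries_v2": True,
        "include_queries": True,
        "include_query_usage_statistics": False,
    }
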