acryl-datahub 1.3.0.1rc3__py3-none-any.whl → 1.3.0.1rc5__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.

This version of acryl-datahub was flagged as potentially problematic.

@@ -1,7 +1,7 @@
- acryl_datahub-1.3.0.1rc3.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+ acryl_datahub-1.3.0.1rc5.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
- datahub/_version.py,sha256=em6A_AK1PEbN80kEFbwFQP_4ZhxiWZZekBVZKx4EpV4,323
+ datahub/_version.py,sha256=GFe5nZs9PKs-LLDaT1H1D9udtmOwDf_NsyGlgBGOywE,323
  datahub/entrypoints.py,sha256=VcbU6Z47b_JKW1zI-WJMYIngm05FSogKLiuvFNtyNcI,9088
  datahub/errors.py,sha256=p5rFAdAGVCk4Lqolol1YvthceadUSwpaCxLXRcyCCFQ,676
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -241,7 +241,7 @@ datahub/ingestion/source/abs/source.py,sha256=z86K5_P_gu8kTytLOAYyQqqD2g14JGSrv1
  datahub/ingestion/source/apply/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/apply/datahub_apply.py,sha256=xTD-Iq3UHhxcz61RwNuI2kJjRrnQEfZFSgvS1X6loV4,7703
  datahub/ingestion/source/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/aws/aws_common.py,sha256=1NgwpasOO3fpBGtZnxWEzjHFoHjEs5hpg_HkTvsbn0M,18147
+ datahub/ingestion/source/aws/aws_common.py,sha256=Va9uxo5aKsAR7qIC625VpRO3XDqzNIg4SfK_eFg25Rw,23781
  datahub/ingestion/source/aws/glue.py,sha256=dUaMWcI5Ed-TzbbSrF6suT4L1vcRHoHfFCdTvAINc4w,67423
  datahub/ingestion/source/aws/platform_resource_repository.py,sha256=0eUfGy1FbaBltCSNTtXyLrkrdqTc1KkTgDJB1Gd-Ydk,853
  datahub/ingestion/source/aws/s3_boto_utils.py,sha256=rGlWAkKZpkeA1_wMvcJvSDvobvduShszowU-KcrQudg,7011
@@ -271,7 +271,7 @@ datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py,sha256
  datahub/ingestion/source/bigquery_v2/bigquery_queries.py,sha256=2syDMaRpYEbtGUVejVAK5d6g8HqM54ZyEM908uLJ55o,3393
  datahub/ingestion/source/bigquery_v2/bigquery_report.py,sha256=zlTkqOmt5zxnO40rVTYHF3fclj4OVlLtqUXwW5WIIcM,7855
  datahub/ingestion/source/bigquery_v2/bigquery_schema.py,sha256=zbYb1EYnCJxgvsU8oT_76l0q_BW1exVjMWM1GAgd1nc,32600
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py,sha256=_NLFRRXsrxMZ8Vjg2jVL4Pg1_NGt9hzn9EWBooJZ8so,51566
+ datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py,sha256=PbSCMj5ACwEu_HQNe29IHs4y1bn15_nnz6ZW1Yt17wI,51796
  datahub/ingestion/source/bigquery_v2/bigquery_test_connection.py,sha256=cATxwi5IPzj3BldRRAVcLqzSFmmYEPvqa7U0RFJbaAc,7645
  datahub/ingestion/source/bigquery_v2/common.py,sha256=IinOy-RO4UZGxSf5scaN02672BzZuNsjJZ56axti6iI,4016
  datahub/ingestion/source/bigquery_v2/lineage.py,sha256=jju14mJbAUMA_K3j2yq-TdZV202cjd5rBAsDPJGEVno,44900
@@ -456,9 +456,9 @@ datahub/ingestion/source/redshift/profile.py,sha256=H1Xtc2rXScUv4w0b2BbM7POjYEwq
  datahub/ingestion/source/redshift/query.py,sha256=HKobQ-0crARgT8Mkfe-WBqVR9ZadYCZ9DGaUoEHHHww,48234
  datahub/ingestion/source/redshift/redshift.py,sha256=RN8rao3j7nocnnD6oPcEju09-8mOZTE4vFkgy_13Az8,41293
  datahub/ingestion/source/redshift/redshift_data_reader.py,sha256=zc69jwXHdF-w8J4Hq-ZQ6BjHQ75Ij2iNDMpoRJlcmlU,1724
- datahub/ingestion/source/redshift/redshift_schema.py,sha256=7F-l_omOuKMuGE_rBWXVPG_GWXFKnCMzC4frNxZB9cs,24800
+ datahub/ingestion/source/redshift/redshift_schema.py,sha256=2U8IIPRJkL-HWUeWswOzvcT1hdTBQgPMhr6tYCDuqrM,25226
  datahub/ingestion/source/redshift/report.py,sha256=aCFDFUbz5xde8b_eRIHSBiELoo9LZFtDpp2lSadiPHU,2937
- datahub/ingestion/source/redshift/usage.py,sha256=Q7R-caJovLXv33uZepMGX5Cvm4DqQSLZdiL_s-p06wU,17473
+ datahub/ingestion/source/redshift/usage.py,sha256=szdg3cUw4UpZ8rXMZufIAVTq2iiI1VsmrFgjAEH8Dv4,17521
  datahub/ingestion/source/s3/__init__.py,sha256=HjqFPj11WtNFZM3kcVshlDb7kOsc19-l_3LM8PBjlJM,56
  datahub/ingestion/source/s3/config.py,sha256=lElFXgEpKDT9SVoiXvtx98wV6Gp880qP4pLQaOGJGOo,7828
  datahub/ingestion/source/s3/datalake_profiler_config.py,sha256=FfrcgK-JEF94vw-l3q6pN6FENXb-wZzW2w1VUZVkwW8,3620
@@ -476,7 +476,7 @@ datahub/ingestion/source/schema_inference/avro.py,sha256=aaqCMhLU2nxMJYPSNZv0o0A
  datahub/ingestion/source/schema_inference/base.py,sha256=dI98TOieCqqA1SdB6729EAReanGX2AC7UgSDkPls8Sg,379
  datahub/ingestion/source/schema_inference/csv_tsv.py,sha256=ypuBZEAf8Hx2Efrvu1nMWDdqVH_lg4i7N68YCwi8NiU,2259
  datahub/ingestion/source/schema_inference/json.py,sha256=p5S-3idn65V2uad5T8txs1UakA4cfllcrxfN-6qltss,2577
- datahub/ingestion/source/schema_inference/object.py,sha256=dhSOtxVJHbTDY0hWeHwdLYHnOsW07Omk7Y4DPeztie0,5847
+ datahub/ingestion/source/schema_inference/object.py,sha256=ERR0XdiGE_qBWbNvt1oEWPYeB7ZNAsCnTZTF3ngn4F8,6582
  datahub/ingestion/source/schema_inference/parquet.py,sha256=CdqsNuiabLLCulWbuPMssijeFmKLv3M5MKFIhlatpWA,3456
  datahub/ingestion/source/sigma/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/sigma/config.py,sha256=ztZf0YisGSXKgKeqP9ipDlRKLXU-Y-XABqm7HCJ8pvA,6265
@@ -523,9 +523,9 @@ datahub/ingestion/source/sql/hana.py,sha256=V6bGVLVjI1VL0deebg8VxIL8Ls-oxUvpSvX9
  datahub/ingestion/source/sql/hive.py,sha256=SPmAWlk63V-s-loBTU2hXsQA7xA4sa0iPK6pCbF-AJ8,31600
  datahub/ingestion/source/sql/hive_metastore.py,sha256=UBB7mV2eKuCxv3voi0F3tqF2MyRObSYxArAxETZfO4E,35997
  datahub/ingestion/source/sql/mariadb.py,sha256=om6QoG5UtDldt1N6AfIWp3T-HXNaaqFmpz2i0JAemfM,654
- datahub/ingestion/source/sql/mysql.py,sha256=_KhTODU7mqAoJOlrvRdPa7ihQkYLkgrZwaseQbasotM,5358
+ datahub/ingestion/source/sql/mysql.py,sha256=h0kv86-8SxBTmaEhmcyqGcKoaWQ4peUdiE9sfhrTuqY,9734
  datahub/ingestion/source/sql/oracle.py,sha256=nKMM1O67SkxCgT781eENl5xXpIR8_p5joTSdAYzQwHY,29988
- datahub/ingestion/source/sql/postgres.py,sha256=blkO6bI0eDKFK8UNwUYcYtm_ObrQuWVSy5GyfdhL5dg,14274
+ datahub/ingestion/source/sql/postgres.py,sha256=Vk1NVI0zJPzMj4SKJ9jfyGu4DY3bow724BDn10BaxzU,17478
  datahub/ingestion/source/sql/presto.py,sha256=58py4M3UYxkGpbBFA1o96H154eUhD2dBm1hpxxYlYYM,4256
  datahub/ingestion/source/sql/sql_common.py,sha256=EZGoeGlOYZoOrXOiKDI-S1mw-sPVV33PZQ_mPJlEvRc,57759
  datahub/ingestion/source/sql/sql_config.py,sha256=u3nGZYYl1WtaxfNsDU5bglgZ5Jq3Fxk9xei_CUIAXB0,8222
@@ -535,7 +535,7 @@ datahub/ingestion/source/sql/sql_report.py,sha256=gw-OPHSExp_b6DRjvwqE1U6Bpkwekx
  datahub/ingestion/source/sql/sql_types.py,sha256=AVeBBXw8aKB1_jw6Wtg58miu-YUfN_-7ZcXwSF-ESgA,16021
  datahub/ingestion/source/sql/sql_utils.py,sha256=q-Bsk6WxlsRtrw9RXBxvqI3zuaMTC_F25T2VrCziR9I,8418
  datahub/ingestion/source/sql/sqlalchemy_data_reader.py,sha256=FvHZ4JEK3aR2DYOBZiT_ZsAy12RjTu4t_KIR_92B11k,2644
- datahub/ingestion/source/sql/sqlalchemy_uri.py,sha256=u0ZvgdJjXZdo_vl7YIQfYuuWbGwpnH6OSozI2e8ZV4I,858
+ datahub/ingestion/source/sql/sqlalchemy_uri.py,sha256=wE_GX2TtkFEvscC_Epy5hUfBxtE6mUQoVPb7fUea0jk,1882
  datahub/ingestion/source/sql/sqlalchemy_uri_mapper.py,sha256=KOpbmDIE2h1hyYEsbVHJi2B7FlsyUMTXZx4diyzltQg,1826
  datahub/ingestion/source/sql/teradata.py,sha256=YydlPGndFGZcpvlmim3T-1yaAmsFt08TZVOTo1R3GLo,66871
  datahub/ingestion/source/sql/trino.py,sha256=o5hm84iwRHO59TD2LaEqYgF2LYIcSUIKmlgu1VudGBY,19254
@@ -646,8 +646,8 @@ datahub/lite/lite_registry.py,sha256=bpH0kasP-LtwwUFNA2QsOIehfekAYfJtN-AkQLmSWnw
  datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1949
  datahub/lite/lite_util.py,sha256=G0LQHKkyEb1pc_q183g6hflShclGx7kikgMaOxtVVcs,4545
  datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
- datahub/metadata/_internal_schema_classes.py,sha256=1UZsNj9XmThYFXbG39BVKlaTTFywzayhVVon6svD3kM,1076970
- datahub/metadata/schema.avsc,sha256=P6j7fiukfv03ZW8gis3m3mVKGlSV2JhgMcmrtf5sU7Q,775491
+ datahub/metadata/_internal_schema_classes.py,sha256=M3j1TDrK43RYlhKUsaqeHgjbw5ERkAzKxdC4pD4dLEg,1077060
+ datahub/metadata/schema.avsc,sha256=mUP8XLRosJg0WlBymK0-EAlGqrC-j7bQIDbOVtODrR8,775591
  datahub/metadata/schema_classes.py,sha256=tPT8iHCak4IsZi_oL0nirbPpI8ETTPTZzapqLRpeKU4,1326
  datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
  datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
@@ -782,7 +782,7 @@ datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc,sha256=4wac7sluRIq-0ZjO
  datahub/metadata/schemas/DataHubIngestionSourceKey.avsc,sha256=TGmm9WEGTaABs7kt5Uc-N-kbc5Sd-2sQwx-JpfAptvw,545
  datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc,sha256=q6ZyMoxInwmrkrXkUgMe-i-WZzAxbjcvJ-EI99SnEp8,599
  datahub/metadata/schemas/DataHubPageModuleKey.avsc,sha256=NyFN8cVO6s6rtgoLGJJGfcPfpGr5PfmZlIhM6ajldfQ,460
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc,sha256=53Fj4ztBJqo9QMWuza2Kdtfpr2nTOTW0XuuXW77ugB8,10347
+ datahub/metadata/schemas/DataHubPageModuleProperties.avsc,sha256=hbIEkpjQxVxSZOQmTVIjOGnGTNeaA0r0oYNKPpwuddg,10443
  datahub/metadata/schemas/DataHubPageTemplateKey.avsc,sha256=0sVqwL97Rp8YHPytp2RqUP5hIW048hmT2hPNP5k6arc,472
  datahub/metadata/schemas/DataHubPageTemplateProperties.avsc,sha256=FyNcZIniQy9m6yN9DT4XsPkDrxUsU7tRTqmfdGoEtMU,8565
  datahub/metadata/schemas/DataHubPersonaInfo.avsc,sha256=OUvbTgPQsBtzkDDb9pxHXpQ6A7dkL77ZnCXZ-MLEG14,227
@@ -1128,8 +1128,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-1.3.0.1rc3.dist-info/METADATA,sha256=Q32VN8kEGo-T0nge3wOkp_EmXJQQZKtZYl9SnsCu3PY,184504
- acryl_datahub-1.3.0.1rc3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- acryl_datahub-1.3.0.1rc3.dist-info/entry_points.txt,sha256=pzsBoTx-D-iTcmpX8oCGCyzlHP2112EygUMzZWz56M8,10105
- acryl_datahub-1.3.0.1rc3.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-1.3.0.1rc3.dist-info/RECORD,,
+ acryl_datahub-1.3.0.1rc5.dist-info/METADATA,sha256=ckjfgTlPEUgZH1sYvZm3MLupBZvBbsC-zokD3Q2ekno,184688
+ acryl_datahub-1.3.0.1rc5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ acryl_datahub-1.3.0.1rc5.dist-info/entry_points.txt,sha256=pzsBoTx-D-iTcmpX8oCGCyzlHP2112EygUMzZWz56M8,10105
+ acryl_datahub-1.3.0.1rc5.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-1.3.0.1rc5.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "1.3.0.1rc3"
+ __version__ = "1.3.0.1rc5"
 
 
  def is_dev_mode() -> bool:
datahub/ingestion/source/aws/aws_common.py CHANGED
@@ -3,11 +3,13 @@ from datetime import datetime, timedelta, timezone
  from enum import Enum
  from http import HTTPStatus
  from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
+ from urllib.parse import parse_qs, urlparse
 
  import boto3
  import requests
  from boto3.session import Session
  from botocore.config import DEFAULT_TIMEOUT, Config
+ from botocore.exceptions import ClientError, NoCredentialsError
  from botocore.utils import fix_s3_host
  from pydantic.fields import Field
 
@@ -465,6 +467,165 @@ class AwsConnectionConfig(ConfigModel):
      def get_lakeformation_client(self) -> "LakeFormationClient":
          return self.get_session().client("lakeformation", config=self._aws_config())
 
+     def get_rds_client(self):
+         """Get an RDS client for generating IAM auth tokens."""
+         return self.get_session().client("rds", config=self._aws_config())
+
+
+ def generate_rds_iam_token(
+     endpoint: str,
+     username: str,
+     port: int,
+     aws_config: AwsConnectionConfig,
+ ) -> str:
+     """
+     Generate an AWS RDS IAM authentication token.
+
+     boto3's generate_db_auth_token() returns a presigned URL in the format:
+     "hostname:port/?Action=connect&DBUser=username&X-Amz-Date=...&X-Amz-Expires=..."
+
+     This token should be used as-is by pymysql/psycopg2 drivers.
+
+     Args:
+         endpoint: RDS endpoint hostname
+         username: Database username for IAM authentication
+         port: Database port (5432 for PostgreSQL, 3306 for MySQL)
+         aws_config: AwsConnectionConfig for session management and credentials
+
+     Returns:
+         Authentication token (presigned URL format)
+
+     Raises:
+         ValueError: If AWS credentials are not found or token generation fails
+     """
+     try:
+         client = aws_config.get_rds_client()
+         token = client.generate_db_auth_token(
+             DBHostname=endpoint, Port=port, DBUsername=username
+         )
+         logger.debug(f"Generated RDS IAM token for {username}@{endpoint}:{port}")
+         return token
+     except NoCredentialsError as e:
+         raise ValueError("AWS credentials not found") from e
+     except ClientError as e:
+         raise ValueError(f"Failed to generate RDS IAM token: {e}") from e
+
+
+ class RDSIAMTokenManager:
+     """
+     Manages RDS IAM token lifecycle with automatic refresh.
+
+     RDS IAM tokens include expiration information in the URL parameters.
+     This manager parses the token expiry and refreshes before expiration
+     to ensure uninterrupted database access.
+     """
+
+     def __init__(
+         self,
+         endpoint: str,
+         username: str,
+         port: int,
+         aws_config: AwsConnectionConfig,
+         refresh_threshold_minutes: int = 5,
+     ):
+         """
+         Initialize the token manager.
+
+         Args:
+             endpoint: RDS endpoint hostname
+             username: Database username for IAM authentication
+             port: Database port
+             aws_config: AwsConnectionConfig for session management and credentials
+             refresh_threshold_minutes: Refresh token when this many minutes remain before expiry
+         """
+         self.endpoint = endpoint
+         self.username = username
+         self.port = port
+         self.aws_config = aws_config
+         self.refresh_threshold = timedelta(minutes=refresh_threshold_minutes)
+
+         self._current_token: Optional[str] = None
+         self._token_expires_at: Optional[datetime] = None
+
+     def get_token(self) -> str:
+         """
+         Get current token, refreshing if necessary.
+
+         Returns:
+             Valid authentication token
+
+         Raises:
+             RuntimeError: If token generation or refresh fails
+         """
+         if self._needs_refresh():
+             self._refresh_token()
+
+         assert self._current_token is not None
+         return self._current_token
+
+     def _needs_refresh(self) -> bool:
+         """Check if token needs to be refreshed."""
+         if self._current_token is None or self._token_expires_at is None:
+             return True
+
+         time_until_expiry = self._token_expires_at - datetime.now(timezone.utc)
+         return time_until_expiry <= self.refresh_threshold
+
+     def _parse_token_expiry(self, token: str) -> datetime:
+         """
+         Parse token expiry from X-Amz-Date and X-Amz-Expires URL parameters.
+
+         Args:
+             token: RDS IAM authentication token (presigned URL)
+
+         Returns:
+             Expiration datetime in UTC
+
+         Raises:
+             ValueError: If token URL format is invalid or missing required parameters
+         """
+         try:
+             parsed_url = urlparse(token)
+             query_params = parse_qs(parsed_url.query)
+
+             # Extract X-Amz-Date (ISO 8601 format: YYYYMMDDTHHMMSSZ)
+             amz_date_list = query_params.get("X-Amz-Date")
+             if not amz_date_list:
+                 raise ValueError("Missing X-Amz-Date parameter in RDS IAM token")
+             amz_date_str = amz_date_list[0]
+
+             # Extract X-Amz-Expires (duration in seconds)
+             amz_expires_list = query_params.get("X-Amz-Expires")
+             if not amz_expires_list:
+                 raise ValueError("Missing X-Amz-Expires parameter in RDS IAM token")
+             amz_expires_seconds = int(amz_expires_list[0])
+
+             # Parse X-Amz-Date to datetime
+             token_issued_at = datetime.strptime(amz_date_str, "%Y%m%dT%H%M%SZ").replace(
+                 tzinfo=timezone.utc
+             )
+
+             # Calculate expiration
+             return token_issued_at + timedelta(seconds=amz_expires_seconds)
+
+         except (ValueError, KeyError, IndexError) as e:
+             raise ValueError(
+                 f"Failed to parse RDS IAM token expiry: {e}. Token format may be invalid."
+             ) from e
+
+     def _refresh_token(self) -> None:
+         """Generate and store a new token with parsed expiry."""
+         logger.info("Refreshing RDS IAM authentication token")
+         self._current_token = generate_rds_iam_token(
+             endpoint=self.endpoint,
+             username=self.username,
+             port=self.port,
+             aws_config=self.aws_config,
+         )
+         self._token_expires_at = self._parse_token_expiry(self._current_token)
+         logger.debug(f"Token will expire at {self._token_expires_at}")
+
 
  class AwsSourceConfig(EnvConfigMixin, AwsConnectionConfig):
      """
datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py CHANGED
@@ -449,10 +449,12 @@ class BigQuerySchemaGenerator:
              ):
                  yield wu
          except Exception as e:
-             if self.config.is_profiling_enabled():
-                 action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list permission, bigquery.tables.getData permission?"
+             # If configuration indicates we need table data access (for profiling or use_tables_list_query_v2),
+             # include bigquery.tables.getData in the error message since that's likely the missing permission
+             if self.config.have_table_data_read_permission:
+                 action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list, bigquery.tables.getData permissions?"
              else:
-                 action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list permission?"
+                 action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list permissions?"
 
              self.report.failure(
                  title="Unable to get tables for dataset",
datahub/ingestion/source/redshift/redshift_schema.py CHANGED
@@ -15,6 +15,7 @@ from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable
  from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
  from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult
  from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
+ from datahub.utilities.perf_timer import PerfTimer
 
  logger: logging.Logger = logging.getLogger(__name__)
 
@@ -243,9 +244,13 @@
          conn: redshift_connector.Connection, query: str
      ) -> redshift_connector.Cursor:
          cursor: redshift_connector.Cursor = conn.cursor()
-
-         logger.debug(f"Query : {query}")
-         cursor.execute(query)
+         with PerfTimer() as timer:
+             query_hash_id = hash(query)
+             logger.info(f"Executing query [{query_hash_id}]\n{query}")
+             cursor.execute(query)
+             logger.info(
+                 f"Time taken query [{query_hash_id}: {timer.elapsed_seconds():.3f} seconds"
+             )
          return cursor
 
      @staticmethod
@@ -545,8 +550,7 @@
          conn: redshift_connector.Connection,
          query: str,
      ) -> Iterable[LineageRow]:
-         cursor = conn.cursor()
-         cursor.execute(query)
+         cursor = RedshiftDataDictionary.get_query_result(conn=conn, query=query)
          field_names = [i[0] for i in cursor.description]
 
          rows = cursor.fetchmany()
@@ -603,9 +607,7 @@
          conn: redshift_connector.Connection,
          query: str,
      ) -> Iterable[TempTableRow]:
-         cursor = conn.cursor()
-
-         cursor.execute(query)
+         cursor = RedshiftDataDictionary.get_query_result(conn=conn, query=query)
 
          field_names = [i[0] for i in cursor.description]
 
@@ -662,8 +664,9 @@
      def get_outbound_datashares(
          conn: redshift_connector.Connection,
      ) -> Iterable[OutboundDatashare]:
-         cursor = conn.cursor()
-         cursor.execute(RedshiftCommonQuery.list_outbound_datashares())
+         cursor = RedshiftDataDictionary.get_query_result(
+             conn=conn, query=RedshiftCommonQuery.list_outbound_datashares()
+         )
          for item in cursor.fetchall():
              yield OutboundDatashare(
                  share_name=item[1],
@@ -678,8 +681,10 @@
          conn: redshift_connector.Connection,
          database: str,
      ) -> Optional[InboundDatashare]:
-         cursor = conn.cursor()
-         cursor.execute(RedshiftCommonQuery.get_inbound_datashare(database))
+         cursor = RedshiftDataDictionary.get_query_result(
+             conn=conn,
+             query=RedshiftCommonQuery.get_inbound_datashare(database),
+         )
          item = cursor.fetchone()
          if item:
              return InboundDatashare(
datahub/ingestion/source/redshift/usage.py CHANGED
@@ -25,6 +25,7 @@ from datahub.ingestion.source.redshift.query import (
      RedshiftServerlessQuery,
  )
  from datahub.ingestion.source.redshift.redshift_schema import (
+     RedshiftDataDictionary,
      RedshiftTable,
      RedshiftView,
  )
@@ -263,8 +264,7 @@ class RedshiftUsageExtractor:
          connection: redshift_connector.Connection,
          all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
      ) -> Iterable[RedshiftAccessEvent]:
-         cursor = connection.cursor()
-         cursor.execute(query)
+         cursor = RedshiftDataDictionary.get_query_result(conn=connection, query=query)
          results = cursor.fetchmany()
          field_names = [i[0] for i in cursor.description]
          while results:
datahub/ingestion/source/schema_inference/object.py CHANGED
@@ -1,4 +1,4 @@
- from collections import Counter
+ from collections import Counter, defaultdict
  from typing import Any, Counter as CounterType, Dict, Sequence, Tuple, Union
 
  from typing_extensions import TypedDict
@@ -84,7 +84,7 @@ def is_nullable_collection(
 
 
  def construct_schema(
-     collection: Sequence[Dict[str, Any]], delimiter: str
+     collection: Sequence[Dict[str, Any]], delimiter: str = "."
  ) -> Dict[Tuple[str, ...], SchemaDescription]:
      """
      Construct (infer) a schema from a collection of documents.
@@ -104,9 +104,11 @@ def construct_schema(
          string to concatenate field names by
      """
 
-     schema: Dict[Tuple[str, ...], BasicSchemaDescription] = {}
+     schema: Dict[Tuple[str, ...], BasicSchemaDescription] = defaultdict(
+         lambda: {"types": Counter(), "count": 0}
+     )
 
-     def append_to_schema(doc: Dict[str, Any], parent_prefix: Tuple[str, ...]) -> None:
+     def append_to_schema(doc: Dict[str, Any], parent_prefix: Tuple[str, ...]) -> int:
          """
          Recursively update the schema with a document, which may/may not contain nested fields.
 
@@ -118,18 +120,24 @@
              prefix of fields that the document is under, pass an empty tuple when initializing
          """
 
+         # we want to make sure that parents of nested structures are included first, before their children, so that
+         # they are displayed properly in the UI, also in the event of trimming the list (which happens, for example,
+         # in mongodb ingestor)
+         max_count = 0
          for key, value in doc.items():
              new_parent_prefix = parent_prefix + (key,)
 
              # if nested value, look at the types within
              if isinstance(value, dict):
-                 append_to_schema(value, new_parent_prefix)
+                 max_count = max(append_to_schema(value, new_parent_prefix), max_count)
              # if array of values, check what types are within
              if isinstance(value, list):
                  for item in value:
                      # if dictionary, add it as a nested object
                      if isinstance(item, dict):
-                         append_to_schema(item, new_parent_prefix)
+                         max_count = max(
+                             append_to_schema(item, new_parent_prefix), max_count
+                         )
 
              # don't record None values (counted towards nullable)
              if value is not None:
@@ -143,6 +151,14 @@
              # update the type count
              schema[new_parent_prefix]["types"].update({type(value): 1})
              schema[new_parent_prefix]["count"] += 1
+             max_count = max(schema[new_parent_prefix]["count"], max_count)
+
+         if parent_prefix != ():
+             schema[parent_prefix]["count"] = max(
+                 schema[parent_prefix]["count"], max_count
+             )
+
+         return max_count
 
      for document in collection:
          append_to_schema(document, ())
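
A small sketch (assumed behavior, inferred from the diff above) of what the construct_schema change means in practice: the delimiter now defaults to ".", and a parent's recorded count is propagated to be at least the maximum count seen among its nested fields, so count-based trimming of the inferred field list can no longer drop a parent while keeping its children.

from datahub.ingestion.source.schema_inference.object import construct_schema

docs = [{"items": [{"id": 1}, {"id": 2}]}]
schema = construct_schema(docs)  # delimiter now defaults to "."

# "items" appears once per document, but its nested "id" field appears twice;
# the new max-count propagation keeps the parent's count >= the child's.
assert schema[("items",)]["count"] >= schema[("items", "id")]["count"]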
datahub/ingestion/source/sql/mysql.py CHANGED
@@ -1,14 +1,17 @@
  # This import verifies that the dependencies are available.
-
- from typing import List
+ import logging
+ from typing import TYPE_CHECKING, Any, List, Optional
 
  import pymysql # noqa: F401
  from pydantic.fields import Field
- from sqlalchemy import util
+ from sqlalchemy import create_engine, event, inspect, util
  from sqlalchemy.dialects.mysql import BIT, base
  from sqlalchemy.dialects.mysql.enumerated import SET
  from sqlalchemy.engine.reflection import Inspector
 
+ if TYPE_CHECKING:
+     from sqlalchemy.engine import Engine
+
  from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
  from datahub.ingestion.api.decorators import (
      SourceCapability,
@@ -18,11 +21,16 @@ from datahub.ingestion.api.decorators import (
      platform_name,
      support_status,
  )
+ from datahub.ingestion.source.aws.aws_common import (
+     AwsConnectionConfig,
+     RDSIAMTokenManager,
+ )
  from datahub.ingestion.source.sql.sql_common import (
      make_sqlalchemy_type,
      register_custom_type,
  )
  from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
+ from datahub.ingestion.source.sql.sqlalchemy_uri import parse_host_port
  from datahub.ingestion.source.sql.stored_procedures.base import (
      BaseProcedure,
  )
@@ -31,6 +39,9 @@ from datahub.ingestion.source.sql.two_tier_sql_source import (
      TwoTierSQLAlchemySource,
  )
  from datahub.metadata.schema_classes import BytesTypeClass
+ from datahub.utilities.str_enum import StrEnum
+
+ logger = logging.getLogger(__name__)
 
  SET.__repr__ = util.generic_repr # type:ignore
 
@@ -54,11 +65,33 @@ base.ischema_names["polygon"] = POLYGON
  base.ischema_names["decimal128"] = DECIMAL128
 
 
+ class MySQLAuthMode(StrEnum):
+     """Authentication mode for MySQL connection."""
+
+     PASSWORD = "PASSWORD"
+     AWS_IAM = "AWS_IAM"
+
+
  class MySQLConnectionConfig(SQLAlchemyConnectionConfig):
      # defaults
      host_port: str = Field(default="localhost:3306", description="MySQL host URL.")
      scheme: HiddenFromDocs[str] = "mysql+pymysql"
 
+     # Authentication configuration
+     auth_mode: MySQLAuthMode = Field(
+         default=MySQLAuthMode.PASSWORD,
+         description="Authentication mode to use for the MySQL connection. "
+         "Options are 'PASSWORD' (default) for standard username/password authentication, "
+         "or 'AWS_IAM' for AWS RDS IAM authentication.",
+     )
+     aws_config: AwsConnectionConfig = Field(
+         default_factory=AwsConnectionConfig,
+         description="AWS configuration for RDS IAM authentication (only used when auth_mode is AWS_IAM). "
+         "Provides full control over AWS credentials, region, profiles, role assumption, retry logic, and proxy settings. "
+         "If not explicitly configured, boto3 will automatically use the default credential chain and region from "
+         "environment variables (AWS_DEFAULT_REGION, AWS_REGION), AWS config files (~/.aws/config), or IAM role metadata.",
+     )
+
 
  class MySQLConfig(MySQLConnectionConfig, TwoTierSQLAlchemyConfig):
      def get_identifier(self, *, schema: str, table: str) -> str:
@@ -91,9 +124,27 @@ class MySQLSource(TwoTierSQLAlchemySource):
      Table, row, and column statistics via optional SQL profiling
      """
 
-     def __init__(self, config, ctx):
+     config: MySQLConfig
+
+     def __init__(self, config: MySQLConfig, ctx: Any):
          super().__init__(config, ctx, self.get_platform())
 
+         self._rds_iam_token_manager: Optional[RDSIAMTokenManager] = None
+         if config.auth_mode == MySQLAuthMode.AWS_IAM:
+             hostname, port = parse_host_port(config.host_port, default_port=3306)
+             if port is None:
+                 raise ValueError("Port must be specified for RDS IAM authentication")
+
+             if not config.username:
+                 raise ValueError("username is required for RDS IAM authentication")
+
+             self._rds_iam_token_manager = RDSIAMTokenManager(
+                 endpoint=hostname,
+                 username=config.username,
+                 port=port,
+                 aws_config=config.aws_config,
+             )
+
      def get_platform(self):
          return "mysql"
 
@@ -102,6 +153,52 @@
          config = MySQLConfig.parse_obj(config_dict)
          return cls(config, ctx)
 
+     def _setup_rds_iam_event_listener(
+         self, engine: "Engine", database_name: Optional[str] = None
+     ) -> None:
+         """Setup SQLAlchemy event listener to inject RDS IAM tokens."""
+         if not (
+             self.config.auth_mode == MySQLAuthMode.AWS_IAM
+             and self._rds_iam_token_manager
+         ):
+             return
+
+         def do_connect_listener(_dialect, _conn_rec, _cargs, cparams):
+             if not self._rds_iam_token_manager:
+                 raise RuntimeError("RDS IAM Token Manager is not initialized")
+             cparams["password"] = self._rds_iam_token_manager.get_token()
+             # PyMySQL requires SSL to be enabled for RDS IAM authentication.
+             # Preserve any existing SSL configuration, otherwise enable with default settings.
+             # The {"ssl": True} dict is a workaround to make PyMySQL recognize that SSL
+             # should be enabled, since the library requires a truthy value in the ssl parameter.
+             # See https://pymysql.readthedocs.io/en/latest/modules/connections.html#pymysql.connections.Connection
+             cparams["ssl"] = cparams.get("ssl") or {"ssl": True}
+
+         event.listen(engine, "do_connect", do_connect_listener) # type: ignore[misc]
+
+     def get_inspectors(self):
+         url = self.config.get_sql_alchemy_url()
+         logger.debug(f"sql_alchemy_url={url}")
+
+         engine = create_engine(url, **self.config.options)
+         self._setup_rds_iam_event_listener(engine)
+
+         with engine.connect() as conn:
+             inspector = inspect(conn)
+             if self.config.database and self.config.database != "":
+                 databases = [self.config.database]
+             else:
+                 databases = inspector.get_schema_names()
+             for db in databases:
+                 if self.config.database_pattern.allowed(db):
+                     url = self.config.get_sql_alchemy_url(current_db=db)
+                     db_engine = create_engine(url, **self.config.options)
+                     self._setup_rds_iam_event_listener(db_engine, database_name=db)
+
+                     with db_engine.connect() as conn:
+                         inspector = inspect(conn)
+                         yield inspector
+
      def add_profile_metadata(self, inspector: Inspector) -> None:
          if not self.config.is_profiling_enabled():
              return
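
A sketch of how the new MySQL auth mode might be configured. The host, username, and region are made up; the aws_config fields come from AwsConnectionConfig and can be omitted entirely to fall back to boto3's default credential chain.

from datahub.ingestion.source.sql.mysql import MySQLConfig

config = MySQLConfig.parse_obj(
    {
        "host_port": "mydb.abc123.us-east-1.rds.amazonaws.com:3306",
        "username": "datahub_reader",  # IAM-enabled DB user; no password needed
        "auth_mode": "AWS_IAM",
        "aws_config": {"aws_region": "us-east-1"},
    }
)
# MySQLSource then builds an RDSIAMTokenManager from host_port/username and,
# via the "do_connect" listener above, injects a fresh token as the password
# (with SSL forced on) each time a new connection is opened.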