acryl-datahub 1.3.0.1rc3__py3-none-any.whl → 1.3.0.1rc5__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  import logging
2
2
  from collections import defaultdict
3
- from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
3
+ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union
4
4
 
5
5
  # This import verifies that the dependencies are available.
6
6
  import psycopg2 # noqa: F401
@@ -14,9 +14,12 @@ import sqlalchemy.dialects.postgresql as custom_types
14
14
  from geoalchemy2 import Geometry # noqa: F401
15
15
  from pydantic import BaseModel
16
16
  from pydantic.fields import Field
17
- from sqlalchemy import create_engine, inspect
17
+ from sqlalchemy import create_engine, event, inspect
18
18
  from sqlalchemy.engine.reflection import Inspector
19
19
 
20
+ if TYPE_CHECKING:
21
+ from sqlalchemy.engine import Engine
22
+
20
23
  from datahub.configuration.common import AllowDenyPattern
21
24
  from datahub.emitter import mce_builder
22
25
  from datahub.emitter.mcp_builder import mcps_from_mce
@@ -30,12 +33,17 @@ from datahub.ingestion.api.decorators import (
30
33
  support_status,
31
34
  )
32
35
  from datahub.ingestion.api.workunit import MetadataWorkUnit
36
+ from datahub.ingestion.source.aws.aws_common import (
37
+ AwsConnectionConfig,
38
+ RDSIAMTokenManager,
39
+ )
33
40
  from datahub.ingestion.source.sql.sql_common import (
34
41
  SQLAlchemySource,
35
42
  SqlWorkUnit,
36
43
  register_custom_type,
37
44
  )
38
45
  from datahub.ingestion.source.sql.sql_config import BasicSQLAlchemyConfig
46
+ from datahub.ingestion.source.sql.sqlalchemy_uri import parse_host_port
39
47
  from datahub.ingestion.source.sql.stored_procedures.base import (
40
48
  BaseProcedure,
41
49
  )
@@ -44,6 +52,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
44
52
  BytesTypeClass,
45
53
  MapTypeClass,
46
54
  )
55
+ from datahub.utilities.str_enum import StrEnum
47
56
 
48
57
  logger: logging.Logger = logging.getLogger(__name__)
49
58
 
@@ -100,12 +109,34 @@ class ViewLineageEntry(BaseModel):
100
109
  dependent_schema: str
101
110
 
102
111
 
112
+ class PostgresAuthMode(StrEnum):
113
+ """Authentication mode for PostgreSQL connection."""
114
+
115
+ PASSWORD = "PASSWORD"
116
+ AWS_IAM = "AWS_IAM"
117
+
118
+
103
119
  class BasePostgresConfig(BasicSQLAlchemyConfig):
104
120
  scheme: str = Field(default="postgresql+psycopg2", description="database scheme")
105
121
  schema_pattern: AllowDenyPattern = Field(
106
122
  default=AllowDenyPattern(deny=["information_schema"])
107
123
  )
108
124
 
125
+ # Authentication configuration
126
+ auth_mode: PostgresAuthMode = Field(
127
+ default=PostgresAuthMode.PASSWORD,
128
+ description="Authentication mode to use for the PostgreSQL connection. "
129
+ "Options are 'PASSWORD' (default) for standard username/password authentication, "
130
+ "or 'AWS_IAM' for AWS RDS IAM authentication.",
131
+ )
132
+ aws_config: AwsConnectionConfig = Field(
133
+ default_factory=AwsConnectionConfig,
134
+ description="AWS configuration for RDS IAM authentication (only used when auth_mode is AWS_IAM). "
135
+ "Provides full control over AWS credentials, region, profiles, role assumption, retry logic, and proxy settings. "
136
+ "If not explicitly configured, boto3 will automatically use the default credential chain and region from "
137
+ "environment variables (AWS_DEFAULT_REGION, AWS_REGION), AWS config files (~/.aws/config), or IAM role metadata.",
138
+ )
139
+
109
140
 
110
141
  class PostgresConfig(BasePostgresConfig):
111
142
  database_pattern: AllowDenyPattern = Field(
@@ -160,6 +191,22 @@ class PostgresSource(SQLAlchemySource):
160
191
  def __init__(self, config: PostgresConfig, ctx: PipelineContext):
161
192
  super().__init__(config, ctx, self.get_platform())
162
193
 
194
+ self._rds_iam_token_manager: Optional[RDSIAMTokenManager] = None
195
+ if config.auth_mode == PostgresAuthMode.AWS_IAM:
196
+ hostname, port = parse_host_port(config.host_port, default_port=5432)
197
+ if port is None:
198
+ raise ValueError("Port must be specified for RDS IAM authentication")
199
+
200
+ if not config.username:
201
+ raise ValueError("username is required for RDS IAM authentication")
202
+
203
+ self._rds_iam_token_manager = RDSIAMTokenManager(
204
+ endpoint=hostname,
205
+ username=config.username,
206
+ port=port,
207
+ aws_config=config.aws_config,
208
+ )
209
+
163
210
  def get_platform(self):
164
211
  return "postgres"
165
212
 
@@ -168,13 +215,36 @@ class PostgresSource(SQLAlchemySource):
168
215
  config = PostgresConfig.parse_obj(config_dict)
169
216
  return cls(config, ctx)
170
217
 
218
+ def _setup_rds_iam_event_listener(
219
+ self, engine: "Engine", database_name: Optional[str] = None
220
+ ) -> None:
221
+ """Setup SQLAlchemy event listener to inject RDS IAM tokens."""
222
+ if not (
223
+ self.config.auth_mode == PostgresAuthMode.AWS_IAM
224
+ and self._rds_iam_token_manager
225
+ ):
226
+ return
227
+
228
+ def do_connect_listener(_dialect, _conn_rec, _cargs, cparams):
229
+ if not self._rds_iam_token_manager:
230
+ raise RuntimeError("RDS IAM Token Manager is not initialized")
231
+ cparams["password"] = self._rds_iam_token_manager.get_token()
232
+ if cparams.get("sslmode") not in ("require", "verify-ca", "verify-full"):
233
+ cparams["sslmode"] = "require"
234
+
235
+ event.listen(engine, "do_connect", do_connect_listener) # type: ignore[misc]
236
+
171
237
  def get_inspectors(self) -> Iterable[Inspector]:
172
238
  # Note: get_sql_alchemy_url will choose `sqlalchemy_uri` over the passed in database
173
239
  url = self.config.get_sql_alchemy_url(
174
240
  database=self.config.database or self.config.initial_database
175
241
  )
242
+
176
243
  logger.debug(f"sql_alchemy_url={url}")
244
+
177
245
  engine = create_engine(url, **self.config.options)
246
+ self._setup_rds_iam_event_listener(engine)
247
+
178
248
  with engine.connect() as conn:
179
249
  if self.config.database or self.config.sqlalchemy_uri:
180
250
  inspector = inspect(conn)
@@ -182,14 +252,21 @@ class PostgresSource(SQLAlchemySource):
182
252
  else:
183
253
  # pg_database catalog - https://www.postgresql.org/docs/current/catalog-pg-database.html
184
254
  # exclude template databases - https://www.postgresql.org/docs/current/manage-ag-templatedbs.html
255
+ # exclude rdsadmin - AWS RDS administrative database
185
256
  databases = conn.execute(
186
- "SELECT datname from pg_database where datname not in ('template0', 'template1')"
257
+ "SELECT datname from pg_database where datname not in ('template0', 'template1', 'rdsadmin')"
187
258
  )
188
259
  for db in databases:
189
260
  if not self.config.database_pattern.allowed(db["datname"]):
190
261
  continue
262
+
191
263
  url = self.config.get_sql_alchemy_url(database=db["datname"])
192
- with create_engine(url, **self.config.options).connect() as conn:
264
+ db_engine = create_engine(url, **self.config.options)
265
+ self._setup_rds_iam_event_listener(
266
+ db_engine, database_name=db["datname"]
267
+ )
268
+
269
+ with db_engine.connect() as conn:
193
270
  inspector = inspect(conn)
194
271
  yield inspector
195
272
 
@@ -1,8 +1,45 @@
1
- from typing import Any, Dict, Optional
1
+ from typing import Any, Dict, Optional, Tuple
2
2
 
3
3
  from sqlalchemy.engine import URL
4
4
 
5
5
 
6
+ def parse_host_port(
7
+ host_port: str, default_port: Optional[int] = None
8
+ ) -> Tuple[str, Optional[int]]:
9
+ """
10
+ Parse a host:port string into separate host and port components.
11
+
12
+ Args:
13
+ host_port: String in format "host:port" or just "host"
14
+ default_port: Optional default port to use if not specified in host_port
15
+
16
+ Returns:
17
+ Tuple of (hostname, port) where port may be None if not specified
18
+
19
+ Examples:
20
+ >>> parse_host_port("localhost:3306")
21
+ ('localhost', 3306)
22
+ >>> parse_host_port("localhost")
23
+ ('localhost', None)
24
+ >>> parse_host_port("localhost", 5432)
25
+ ('localhost', 5432)
26
+ >>> parse_host_port("db.example.com:invalid", 3306)
27
+ ('db.example.com', 3306)
28
+ """
29
+ try:
30
+ host, port_str = host_port.rsplit(":", 1)
31
+ port: Optional[int]
32
+ try:
33
+ port = int(port_str)
34
+ except ValueError:
35
+ # Port is not a valid integer
36
+ port = default_port
37
+ return host, port
38
+ except ValueError:
39
+ # No colon found, entire string is the hostname
40
+ return host_port, default_port
41
+
42
+
6
43
  def make_sqlalchemy_uri(
7
44
  scheme: str,
8
45
  username: Optional[str],
@@ -14,12 +51,7 @@ def make_sqlalchemy_uri(
14
51
  host: Optional[str] = None
15
52
  port: Optional[int] = None
16
53
  if at:
17
- try:
18
- host, port_str = at.rsplit(":", 1)
19
- port = int(port_str)
20
- except ValueError:
21
- host = at
22
- port = None
54
+ host, port = parse_host_port(at)
23
55
  if uri_opts:
24
56
  uri_opts = {k: v for k, v in uri_opts.items() if v is not None}
25
57
 
@@ -20228,6 +20228,9 @@ class DataHubPageModuleTypeClass(object):
20228
20228
  RELATED_TERMS = "RELATED_TERMS"
20229
20229
  """Module displaying the related terms of a given glossary term"""
20230
20230
 
20231
+ PLATFORMS = "PLATFORMS"
20232
+ """Module displaying the platforms in an instance"""
20233
+
20231
20234
 
20232
20235
 
20233
20236
  class DataHubPageModuleVisibilityClass(DictWrapper):
@@ -4856,6 +4856,7 @@
4856
4856
  "HIERARCHY": "A module displaying a hierarchy to navigate",
4857
4857
  "LINK": "Link type module",
4858
4858
  "OWNED_ASSETS": "Module displaying assets owned by a user",
4859
+ "PLATFORMS": "Module displaying the platforms in an instance",
4859
4860
  "RELATED_TERMS": "Module displaying the related terms of a given glossary term",
4860
4861
  "RICH_TEXT": "Module containing rich text to be rendered"
4861
4862
  },
@@ -4871,7 +4872,8 @@
4871
4872
  "ASSETS",
4872
4873
  "CHILD_HIERARCHY",
4873
4874
  "DATA_PRODUCTS",
4874
- "RELATED_TERMS"
4875
+ "RELATED_TERMS",
4876
+ "PLATFORMS"
4875
4877
  ],
4876
4878
  "doc": "Enum containing the types of page modules that there are"
4877
4879
  },
@@ -26,6 +26,7 @@
26
26
  "HIERARCHY": "A module displaying a hierarchy to navigate",
27
27
  "LINK": "Link type module",
28
28
  "OWNED_ASSETS": "Module displaying assets owned by a user",
29
+ "PLATFORMS": "Module displaying the platforms in an instance",
29
30
  "RELATED_TERMS": "Module displaying the related terms of a given glossary term",
30
31
  "RICH_TEXT": "Module containing rich text to be rendered"
31
32
  },
@@ -41,7 +42,8 @@
41
42
  "ASSETS",
42
43
  "CHILD_HIERARCHY",
43
44
  "DATA_PRODUCTS",
44
- "RELATED_TERMS"
45
+ "RELATED_TERMS",
46
+ "PLATFORMS"
45
47
  ],
46
48
  "doc": "Enum containing the types of page modules that there are"
47
49
  },