acryl-datahub 1.3.0.1rc4__py3-none-any.whl → 1.3.0.1rc5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc5.dist-info}/METADATA +2396 -2392
- {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc5.dist-info}/RECORD +18 -18
- datahub/_version.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +161 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +5 -3
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/usage.py +2 -2
- datahub/ingestion/source/sql/mysql.py +101 -4
- datahub/ingestion/source/sql/postgres.py +81 -4
- datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
- datahub/metadata/_internal_schema_classes.py +547 -544
- datahub/metadata/_urns/urn_defs.py +1729 -1729
- datahub/metadata/schema.avsc +18384 -18382
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +3 -1
- {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc5.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc5.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc5.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/postgres.py

```diff
@@ -1,6 +1,6 @@
 import logging
 from collections import defaultdict
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union
 
 # This import verifies that the dependencies are available.
 import psycopg2  # noqa: F401
@@ -14,9 +14,12 @@ import sqlalchemy.dialects.postgresql as custom_types
 from geoalchemy2 import Geometry  # noqa: F401
 from pydantic import BaseModel
 from pydantic.fields import Field
-from sqlalchemy import create_engine, inspect
+from sqlalchemy import create_engine, event, inspect
 from sqlalchemy.engine.reflection import Inspector
 
+if TYPE_CHECKING:
+    from sqlalchemy.engine import Engine
+
 from datahub.configuration.common import AllowDenyPattern
 from datahub.emitter import mce_builder
 from datahub.emitter.mcp_builder import mcps_from_mce
@@ -30,12 +33,17 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.aws.aws_common import (
+    AwsConnectionConfig,
+    RDSIAMTokenManager,
+)
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     SqlWorkUnit,
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import BasicSQLAlchemyConfig
+from datahub.ingestion.source.sql.sqlalchemy_uri import parse_host_port
 from datahub.ingestion.source.sql.stored_procedures.base import (
     BaseProcedure,
 )
@@ -44,6 +52,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     BytesTypeClass,
     MapTypeClass,
 )
+from datahub.utilities.str_enum import StrEnum
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -100,12 +109,34 @@ class ViewLineageEntry(BaseModel):
     dependent_schema: str
 
 
+class PostgresAuthMode(StrEnum):
+    """Authentication mode for PostgreSQL connection."""
+
+    PASSWORD = "PASSWORD"
+    AWS_IAM = "AWS_IAM"
+
+
 class BasePostgresConfig(BasicSQLAlchemyConfig):
     scheme: str = Field(default="postgresql+psycopg2", description="database scheme")
     schema_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern(deny=["information_schema"])
     )
 
+    # Authentication configuration
+    auth_mode: PostgresAuthMode = Field(
+        default=PostgresAuthMode.PASSWORD,
+        description="Authentication mode to use for the PostgreSQL connection. "
+        "Options are 'PASSWORD' (default) for standard username/password authentication, "
+        "or 'AWS_IAM' for AWS RDS IAM authentication.",
+    )
+    aws_config: AwsConnectionConfig = Field(
+        default_factory=AwsConnectionConfig,
+        description="AWS configuration for RDS IAM authentication (only used when auth_mode is AWS_IAM). "
+        "Provides full control over AWS credentials, region, profiles, role assumption, retry logic, and proxy settings. "
+        "If not explicitly configured, boto3 will automatically use the default credential chain and region from "
+        "environment variables (AWS_DEFAULT_REGION, AWS_REGION), AWS config files (~/.aws/config), or IAM role metadata.",
+    )
+
 
 class PostgresConfig(BasePostgresConfig):
     database_pattern: AllowDenyPattern = Field(
```
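The new `auth_mode` and `aws_config` fields above are plain pydantic fields on `BasePostgresConfig`. For reference, a minimal sketch of how they might be supplied when building the config programmatically; this example is not part of the diff, the endpoint, user, and database names are hypothetical, and `aws_region` is assumed to be an `AwsConnectionConfig` field:

```python
from datahub.ingestion.source.sql.postgres import PostgresConfig

config = PostgresConfig.parse_obj(
    {
        # Hypothetical RDS endpoint and IAM-enabled database user.
        "host_port": "mydb.abc123.us-east-1.rds.amazonaws.com:5432",
        "username": "datahub_reader",
        "database": "analytics",
        # New in this release: switch from the default PASSWORD mode to AWS_IAM.
        "auth_mode": "AWS_IAM",
        # aws_region is assumed here; if aws_config is omitted entirely, boto3
        # falls back to its default credential/region chain per the field docs.
        "aws_config": {"aws_region": "us-east-1"},
    }
)
```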
datahub/ingestion/source/sql/postgres.py (continued)

```diff
@@ -160,6 +191,22 @@ class PostgresSource(SQLAlchemySource):
     def __init__(self, config: PostgresConfig, ctx: PipelineContext):
         super().__init__(config, ctx, self.get_platform())
 
+        self._rds_iam_token_manager: Optional[RDSIAMTokenManager] = None
+        if config.auth_mode == PostgresAuthMode.AWS_IAM:
+            hostname, port = parse_host_port(config.host_port, default_port=5432)
+            if port is None:
+                raise ValueError("Port must be specified for RDS IAM authentication")
+
+            if not config.username:
+                raise ValueError("username is required for RDS IAM authentication")
+
+            self._rds_iam_token_manager = RDSIAMTokenManager(
+                endpoint=hostname,
+                username=config.username,
+                port=port,
+                aws_config=config.aws_config,
+            )
+
     def get_platform(self):
         return "postgres"
 
@@ -168,13 +215,36 @@ class PostgresSource(SQLAlchemySource):
         config = PostgresConfig.parse_obj(config_dict)
         return cls(config, ctx)
 
+    def _setup_rds_iam_event_listener(
+        self, engine: "Engine", database_name: Optional[str] = None
+    ) -> None:
+        """Setup SQLAlchemy event listener to inject RDS IAM tokens."""
+        if not (
+            self.config.auth_mode == PostgresAuthMode.AWS_IAM
+            and self._rds_iam_token_manager
+        ):
+            return
+
+        def do_connect_listener(_dialect, _conn_rec, _cargs, cparams):
+            if not self._rds_iam_token_manager:
+                raise RuntimeError("RDS IAM Token Manager is not initialized")
+            cparams["password"] = self._rds_iam_token_manager.get_token()
+            if cparams.get("sslmode") not in ("require", "verify-ca", "verify-full"):
+                cparams["sslmode"] = "require"
+
+        event.listen(engine, "do_connect", do_connect_listener)  # type: ignore[misc]
+
     def get_inspectors(self) -> Iterable[Inspector]:
         # Note: get_sql_alchemy_url will choose `sqlalchemy_uri` over the passed in database
         url = self.config.get_sql_alchemy_url(
             database=self.config.database or self.config.initial_database
         )
+
         logger.debug(f"sql_alchemy_url={url}")
+
         engine = create_engine(url, **self.config.options)
+        self._setup_rds_iam_event_listener(engine)
+
         with engine.connect() as conn:
             if self.config.database or self.config.sqlalchemy_uri:
                 inspector = inspect(conn)
@@ -182,14 +252,21 @@ class PostgresSource(SQLAlchemySource):
             else:
                 # pg_database catalog - https://www.postgresql.org/docs/current/catalog-pg-database.html
                 # exclude template databases - https://www.postgresql.org/docs/current/manage-ag-templatedbs.html
+                # exclude rdsadmin - AWS RDS administrative database
                 databases = conn.execute(
-                    "SELECT datname from pg_database where datname not in ('template0', 'template1')"
+                    "SELECT datname from pg_database where datname not in ('template0', 'template1', 'rdsadmin')"
                 )
                 for db in databases:
                     if not self.config.database_pattern.allowed(db["datname"]):
                         continue
+
                     url = self.config.get_sql_alchemy_url(database=db["datname"])
-                    with create_engine(url, **self.config.options).connect() as conn:
+                    db_engine = create_engine(url, **self.config.options)
+                    self._setup_rds_iam_event_listener(
+                        db_engine, database_name=db["datname"]
+                    )
+
+                    with db_engine.connect() as conn:
                         inspector = inspect(conn)
                         yield inspector
 
```
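The `RDSIAMTokenManager` used above lives in datahub/ingestion/source/aws/aws_common.py (161 new lines, not shown in this excerpt). A minimal sketch of the boto3 call such a manager presumably wraps; the function name and structure here are illustrative, not the actual implementation:

```python
import boto3


def generate_rds_iam_token(endpoint: str, port: int, username: str, region: str) -> str:
    """Illustrative only: build a short-lived RDS IAM auth token with boto3.

    generate_db_auth_token returns a presigned token that is used as the
    database password. RDS IAM tokens expire after 15 minutes, so a real
    token manager would cache and refresh them rather than call this on
    every connection.
    """
    client = boto3.client("rds", region_name=region)
    return client.generate_db_auth_token(
        DBHostname=endpoint,
        Port=port,
        DBUsername=username,
    )
```

In the diff, the `do_connect` listener injects this token into `cparams["password"]` each time SQLAlchemy opens a connection, and forces `sslmode=require` unless a stricter mode is already set, since RDS IAM authentication requires SSL.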
datahub/ingestion/source/sql/sqlalchemy_uri.py

```diff
@@ -1,8 +1,45 @@
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Tuple
 
 from sqlalchemy.engine import URL
 
 
+def parse_host_port(
+    host_port: str, default_port: Optional[int] = None
+) -> Tuple[str, Optional[int]]:
+    """
+    Parse a host:port string into separate host and port components.
+
+    Args:
+        host_port: String in format "host:port" or just "host"
+        default_port: Optional default port to use if not specified in host_port
+
+    Returns:
+        Tuple of (hostname, port) where port may be None if not specified
+
+    Examples:
+        >>> parse_host_port("localhost:3306")
+        ('localhost', 3306)
+        >>> parse_host_port("localhost")
+        ('localhost', None)
+        >>> parse_host_port("localhost", 5432)
+        ('localhost', 5432)
+        >>> parse_host_port("db.example.com:invalid", 3306)
+        ('db.example.com', 3306)
+    """
+    try:
+        host, port_str = host_port.rsplit(":", 1)
+        port: Optional[int]
+        try:
+            port = int(port_str)
+        except ValueError:
+            # Port is not a valid integer
+            port = default_port
+        return host, port
+    except ValueError:
+        # No colon found, entire string is the hostname
+        return host_port, default_port
+
+
 def make_sqlalchemy_uri(
     scheme: str,
     username: Optional[str],
@@ -14,12 +51,7 @@ def make_sqlalchemy_uri(
     host: Optional[str] = None
     port: Optional[int] = None
     if at:
-        try:
-            host, port_str = at.rsplit(":", 1)
-            port = int(port_str)
-        except ValueError:
-            host = at
-            port = None
+        host, port = parse_host_port(at)
     if uri_opts:
         uri_opts = {k: v for k, v in uri_opts.items() if v is not None}
 
```
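One behavioral difference is worth noting: the removed inline parsing in `make_sqlalchemy_uri` treated a non-numeric port the same as a missing one and kept the entire string as the host, whereas `parse_host_port` still splits on the last colon and falls back to `default_port`. A small sketch of the two behaviors side by side; the `old_parse` helper below re-implements the removed rc4 logic purely for comparison:

```python
from datahub.ingestion.source.sql.sqlalchemy_uri import parse_host_port


def old_parse(at: str):
    """Re-implementation of the rc4 logic removed in this diff (comparison only)."""
    try:
        host, port_str = at.rsplit(":", 1)
        port = int(port_str)
    except ValueError:
        # Any failure (no colon OR non-numeric port) kept the whole string as host.
        host, port = at, None
    return host, port


print(old_parse("db.example.com:invalid"))              # ('db.example.com:invalid', None)
print(parse_host_port("db.example.com:invalid"))        # ('db.example.com', None)
print(parse_host_port("db.example.com:invalid", 3306))  # ('db.example.com', 3306)
```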