acryl-datahub 1.3.0.1rc2__py3-none-any.whl → 1.3.0.1rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/METADATA +2469 -2467
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/RECORD +50 -48
- datahub/_version.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +26 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/docker_check.py +2 -1
- datahub/cli/docker_cli.py +4 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/quickstart_versioning.py +2 -2
- datahub/cli/specific/dataproduct_cli.py +2 -4
- datahub/cli/specific/user_cli.py +172 -1
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/kafka.py +6 -4
- datahub/emitter/mce_builder.py +2 -4
- datahub/emitter/rest_emitter.py +15 -15
- datahub/entrypoints.py +2 -0
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/source.py +5 -0
- datahub/ingestion/graph/client.py +197 -0
- datahub/ingestion/graph/config.py +2 -2
- datahub/ingestion/sink/datahub_rest.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +20 -13
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -4
- datahub/ingestion/source/grafana/models.py +5 -0
- datahub/ingestion/source/iceberg/iceberg.py +39 -19
- datahub/ingestion/source/kafka_connect/source_connectors.py +4 -1
- datahub/ingestion/source/mode.py +13 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/snowflake/snowflake_schema.py +2 -2
- datahub/ingestion/source/sql/mssql/source.py +7 -1
- datahub/ingestion/source/sql/teradata.py +80 -65
- datahub/ingestion/source/unity/config.py +31 -0
- datahub/ingestion/source/unity/proxy.py +73 -0
- datahub/ingestion/source/unity/source.py +27 -70
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/metadata/_internal_schema_classes.py +544 -544
- datahub/metadata/_urns/urn_defs.py +1728 -1728
- datahub/metadata/schema.avsc +15157 -15157
- datahub/sql_parsing/sql_parsing_aggregator.py +14 -5
- datahub/sql_parsing/sqlglot_lineage.py +7 -0
- datahub/telemetry/telemetry.py +8 -3
- datahub/utilities/file_backed_collections.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/sample_data.py +5 -4
- datahub/emitter/sql_parsing_builder.py +0 -306
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/top_level.txt +0 -0
datahub/cli/quickstart_versioning.py CHANGED

@@ -1,6 +1,5 @@
 import json
 import logging
-import os
 import os.path
 import re
 from typing import Dict, Optional
@@ -13,10 +12,11 @@ from packaging.version import parse
 from pydantic import BaseModel
 
 from datahub._version import nice_version_name
+from datahub.configuration.env_vars import get_force_local_quickstart_mapping
 
 logger = logging.getLogger(__name__)
 
-LOCAL_QUICKSTART_MAPPING_FILE =
+LOCAL_QUICKSTART_MAPPING_FILE = get_force_local_quickstart_mapping()
 DEFAULT_LOCAL_CONFIG_PATH = "~/.datahub/quickstart/quickstart_version_mapping.yaml"
 DEFAULT_REMOTE_CONFIG_PATH = "https://raw.githubusercontent.com/datahub-project/datahub/master/docker/quickstart/quickstart_version_mapping.yaml"
 
datahub/cli/specific/dataproduct_cli.py CHANGED

@@ -1,7 +1,6 @@
 import difflib
 import json
 import logging
-import os
 import pathlib
 import sys
 from pathlib import Path
@@ -14,6 +13,7 @@ from click_default_group import DefaultGroup
 
 from datahub.api.entities.dataproduct.dataproduct import DataProduct
 from datahub.cli.specific.file_loader import load_file
+from datahub.configuration.env_vars import get_dataproduct_external_url
 from datahub.emitter.mce_builder import (
     make_group_urn,
     make_user_urn,
@@ -84,9 +84,7 @@ def mutate(file: Path, validate_assets: bool, external_url: str, upsert: bool) -
     with get_default_graph(ClientMode.CLI) as graph:
         data_product: DataProduct = DataProduct.from_yaml(file, graph)
         external_url_override = (
-            external_url
-            or os.getenv("DATAHUB_DATAPRODUCT_EXTERNAL_URL")
-            or data_product.external_url
+            external_url or get_dataproduct_external_url() or data_product.external_url
         )
         data_product.external_url = external_url_override
         if upsert and not graph.exists(data_product.urn):
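The override chain above resolves the data product's external URL by precedence: an explicit CLI value first, then the DATAHUB_DATAPRODUCT_EXTERNAL_URL environment variable (now read through get_dataproduct_external_url()), then the value from the YAML file. A minimal sketch of that precedence, using purely illustrative values:

import os

from datahub.configuration.env_vars import get_dataproduct_external_url

# Hypothetical values for illustration only.
cli_flag = None  # no external URL passed on the command line
os.environ["DATAHUB_DATAPRODUCT_EXTERNAL_URL"] = "https://wiki.example.com/dp"  # assumed env var
yaml_value = "https://docs.example.com/dp"  # value loaded from the data product YAML

# Same `or` chain as in mutate(): the first truthy value wins, so an unset flag
# or an empty-string env var falls through to the next source.
resolved = cli_flag or get_dataproduct_external_url() or yaml_value
assert resolved == "https://wiki.example.com/dp"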
datahub/cli/specific/user_cli.py CHANGED

@@ -1,13 +1,15 @@
 import logging
 import pathlib
 from pathlib import Path
+from typing import Optional
 
 import click
 from click_default_group import DefaultGroup
 
 from datahub.api.entities.corpuser.corpuser import CorpUser, CorpUserGenerationConfig
 from datahub.cli.specific.file_loader import load_file
-from datahub.
+from datahub.configuration.common import OperationalError
+from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
 from datahub.ingestion.graph.config import ClientMode
 from datahub.upgrade import upgrade
 
@@ -55,3 +57,172 @@ def upsert(file: Path, override_editable: bool) -> None:
                 f"Update failed for id {user_config.get('id')}. due to {e}",
                 fg="red",
             )
+
+
+def validate_user_id_options(
+    user_id: Optional[str], email_as_id: bool, email: str
+) -> str:
+    """
+    Validate user ID options and return the final user ID to use.
+
+    Args:
+        user_id: Optional explicit user ID
+        email_as_id: Whether to use email as the user ID
+        email: User's email address
+
+    Returns:
+        The final user ID to use for the URN
+
+    Raises:
+        ValueError: If validation fails (neither or both options provided)
+    """
+    if not user_id and not email_as_id:
+        raise ValueError("Must specify either --id or --email-as-id flag")
+
+    if user_id and email_as_id:
+        raise ValueError("Cannot specify both --id and --email-as-id flag")
+
+    if email_as_id:
+        return email
+
+    assert user_id is not None
+    return user_id
+
+
+def create_native_user_in_datahub(
+    graph: DataHubGraph,
+    user_id: str,
+    email: str,
+    display_name: str,
+    password: str,
+    role: Optional[str] = None,
+) -> str:
+    """
+    Create a native DataHub user.
+
+    Args:
+        graph: DataHubGraph client
+        user_id: User identifier (used in URN)
+        email: User's email address
+        display_name: User's full display name
+        password: User's password
+        role: Optional role to assign (Admin, Editor, or Reader)
+
+    Returns:
+        The created user's URN
+
+    Raises:
+        ValueError: If user already exists or role is invalid
+        OperationalError: If user creation fails due to API/network errors
+    """
+    user_urn = f"urn:li:corpuser:{user_id}"
+
+    if graph.exists(user_urn):
+        raise ValueError(f"User with ID {user_id} already exists (urn: {user_urn})")
+
+    created_user_urn = graph.create_native_user(
+        user_id=user_id,
+        email=email,
+        display_name=display_name,
+        password=password,
+        role=role,
+    )
+
+    return created_user_urn
+
+
+@user.command(name="add")
+@click.option("--id", "user_id", type=str, help="User identifier (used in URN)")
+@click.option("--email", required=True, type=str, help="User's email address")
+@click.option(
+    "--email-as-id",
+    is_flag=True,
+    default=False,
+    help="Use email address as user ID (alternative to --id)",
+)
+@click.option(
+    "--display-name", required=True, type=str, help="User's full display name"
+)
+@click.option(
+    "--password",
+    is_flag=True,
+    default=False,
+    help="Prompt for password (hidden input)",
+)
+@click.option(
+    "--role",
+    required=False,
+    type=click.Choice(
+        ["Admin", "Editor", "Reader", "admin", "editor", "reader"], case_sensitive=False
+    ),
+    help="Optional role to assign (Admin, Editor, or Reader)",
+)
+@upgrade.check_upgrade
+def add(
+    user_id: str,
+    email: str,
+    email_as_id: bool,
+    display_name: str,
+    password: bool,
+    role: str,
+) -> None:
+    """Create a native DataHub user with email/password authentication"""
+
+    try:
+        final_user_id = validate_user_id_options(user_id, email_as_id, email)
+    except ValueError as e:
+        click.secho(f"Error: {str(e)}", fg="red")
+        raise SystemExit(1) from e
+
+    if not password:
+        click.secho(
+            "Error: --password flag is required to prompt for password input",
+            fg="red",
+        )
+        raise SystemExit(1)
+
+    password_value = click.prompt(
+        "Enter password", hide_input=True, confirmation_prompt=True
+    )
+
+    with get_default_graph(ClientMode.CLI) as graph:
+        try:
+            created_user_urn = create_native_user_in_datahub(
+                graph, final_user_id, email, display_name, password_value, role
+            )
+
+            if role:
+                click.secho(
+                    f"Successfully created user {final_user_id} with role {role.capitalize()} (URN: {created_user_urn})",
+                    fg="green",
+                )
+            else:
+                click.secho(
+                    f"Successfully created user {final_user_id} (URN: {created_user_urn})",
+                    fg="green",
+                )
+        except ValueError as e:
+            click.secho(f"Error: {str(e)}", fg="red")
+            raise SystemExit(1) from e
+        except OperationalError as e:
+            error_msg = e.message if hasattr(e, "message") else str(e.args[0])
+            click.secho(f"Error: {error_msg}", fg="red")
+
+            if hasattr(e, "info") and e.info:
+                logger.debug(f"Error details: {e.info}")
+                if "status_code" in e.info:
+                    click.secho(f" HTTP Status: {e.info['status_code']}", fg="red")
+                if "response_text" in e.info:
+                    click.secho(
+                        f" Response: {e.info['response_text'][:200]}", fg="red"
+                    )
+
+            click.secho(
+                "\nTip: Run with DATAHUB_DEBUG=1 environment variable for detailed logs",
+                fg="yellow",
+            )
+            raise SystemExit(1) from e
+        except Exception as e:
+            click.secho(f"Unexpected error: {str(e)}", fg="red")
+            logger.exception("Unexpected error during user creation")
+            raise SystemExit(1) from e
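Because validate_user_id_options() is a pure helper, its behavior can be sanity-checked without a running DataHub instance. A small sketch of how the new `--id` / `--email-as-id` flags map onto it (the values below are illustrative only):

from datahub.cli.specific.user_cli import validate_user_id_options

# --email-as-id: the email doubles as the user ID portion of the URN.
assert validate_user_id_options(None, True, "jane@example.com") == "jane@example.com"

# --id: an explicit ID is used; the email is only stored on the profile.
assert validate_user_id_options("jdoe", False, "jane@example.com") == "jdoe"

# Passing neither (or both) raises ValueError, which the `add` command turns
# into a red error message and a non-zero exit code.
try:
    validate_user_id_options(None, False, "jane@example.com")
except ValueError as e:
    print(e)  # Must specify either --id or --email-as-id flag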
datahub/configuration/env_vars.py ADDED

@@ -0,0 +1,331 @@
+# ABOUTME: Central registry for all environment variables used in metadata-ingestion.
+# ABOUTME: All environment variable reads should go through this module for discoverability and maintainability.
+
+import os
+from typing import Optional
+
+# ============================================================================
+# Core DataHub Configuration
+# ============================================================================
+
+
+def get_gms_url() -> Optional[str]:
+    """Complete GMS URL (takes precedence over separate host/port)."""
+    return os.getenv("DATAHUB_GMS_URL")
+
+
+def get_gms_host() -> Optional[str]:
+    """GMS host (fallback for URL, deprecated)."""
+    return os.getenv("DATAHUB_GMS_HOST")
+
+
+def get_gms_port() -> Optional[str]:
+    """GMS port number."""
+    return os.getenv("DATAHUB_GMS_PORT")
+
+
+def get_gms_protocol() -> str:
+    """Protocol for GMS connection (http/https)."""
+    return os.getenv("DATAHUB_GMS_PROTOCOL", "http")
+
+
+def get_gms_token() -> Optional[str]:
+    """Authentication token for GMS."""
+    return os.getenv("DATAHUB_GMS_TOKEN")
+
+
+def get_system_client_id() -> Optional[str]:
+    """System client ID for OAuth/auth."""
+    return os.getenv("DATAHUB_SYSTEM_CLIENT_ID")
+
+
+def get_system_client_secret() -> Optional[str]:
+    """System client secret for OAuth/auth."""
+    return os.getenv("DATAHUB_SYSTEM_CLIENT_SECRET")
+
+
+def get_skip_config() -> bool:
+    """Skip loading config file (forces env variables)."""
+    return os.getenv("DATAHUB_SKIP_CONFIG", "").lower() == "true"
+
+
+def get_gms_base_path() -> str:
+    """Base path for GMS API endpoints."""
+    return os.getenv("DATAHUB_GMS_BASE_PATH", "")
+
+
+# ============================================================================
+# REST Emitter Configuration
+# ============================================================================
+
+
+def get_rest_emitter_default_retry_max_times() -> str:
+    """Max retry attempts for failed requests."""
+    return os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
+
+
+def get_rest_emitter_batch_max_payload_bytes() -> int:
+    """Maximum payload size in bytes for batch operations."""
+    return int(
+        os.getenv("DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_BYTES", str(15 * 1024 * 1024))
+    )
+
+
+def get_rest_emitter_batch_max_payload_length() -> int:
+    """Maximum number of MCPs per batch."""
+    return int(os.getenv("DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_LENGTH", "200"))
+
+
+def get_emit_mode() -> Optional[str]:
+    """Emission mode (SYNC_PRIMARY, SYNC_WAIT, ASYNC, ASYNC_WAIT)."""
+    return os.getenv("DATAHUB_EMIT_MODE")
+
+
+def get_rest_emitter_default_endpoint() -> Optional[str]:
+    """REST endpoint type (RESTLI or OPENAPI)."""
+    return os.getenv("DATAHUB_REST_EMITTER_DEFAULT_ENDPOINT")
+
+
+def get_emitter_trace() -> bool:
+    """Enable detailed emitter tracing."""
+    return os.getenv("DATAHUB_EMITTER_TRACE", "").lower() == "true"
+
+
+# ============================================================================
+# REST Sink Configuration
+# ============================================================================
+
+
+def get_rest_sink_default_max_threads() -> int:
+    """Max thread pool size for async operations."""
+    return int(os.getenv("DATAHUB_REST_SINK_DEFAULT_MAX_THREADS", "15"))
+
+
+def get_rest_sink_default_mode() -> Optional[str]:
+    """Sink mode (SYNC, ASYNC, ASYNC_BATCH)."""
+    return os.getenv("DATAHUB_REST_SINK_DEFAULT_MODE")
+
+
+# ============================================================================
+# Telemetry & Monitoring
+# ============================================================================
+
+
+def get_telemetry_timeout() -> str:
+    """Telemetry timeout in seconds."""
+    return os.getenv("DATAHUB_TELEMETRY_TIMEOUT", "10")
+
+
+def get_sentry_dsn() -> Optional[str]:
+    """Sentry error tracking DSN."""
+    return os.getenv("SENTRY_DSN")
+
+
+def get_sentry_environment() -> str:
+    """Sentry environment (dev/prod)."""
+    return os.getenv("SENTRY_ENVIRONMENT", "dev")
+
+
+# ============================================================================
+# Logging & Debug Configuration
+# ============================================================================
+
+
+def get_suppress_logging_manager() -> Optional[str]:
+    """Suppress DataHub logging manager initialization."""
+    return os.getenv("DATAHUB_SUPPRESS_LOGGING_MANAGER")
+
+
+def get_no_color() -> bool:
+    """Disable colored logging output."""
+    return os.getenv("NO_COLOR", "").lower() == "true"
+
+
+def get_test_mode() -> Optional[str]:
+    """Indicates running in test context."""
+    return os.getenv("DATAHUB_TEST_MODE")
+
+
+def get_debug() -> bool:
+    """Enable debug mode."""
+    return os.getenv("DATAHUB_DEBUG", "").lower() == "true"
+
+
+# ============================================================================
+# Data Processing Configuration
+# ============================================================================
+
+
+def get_sql_agg_query_log() -> str:
+    """SQL aggregator query logging level."""
+    return os.getenv("DATAHUB_SQL_AGG_QUERY_LOG", "DISABLED")
+
+
+def get_dataset_urn_to_lower() -> str:
+    """Convert dataset URNs to lowercase."""
+    return os.getenv("DATAHUB_DATASET_URN_TO_LOWER", "false")
+
+
+# ============================================================================
+# Integration-Specific Configuration
+# ============================================================================
+
+
+def get_kafka_schema_registry_url() -> Optional[str]:
+    """Kafka schema registry URL."""
+    return os.getenv("KAFKA_SCHEMAREGISTRY_URL")
+
+
+def get_spark_version() -> Optional[str]:
+    """Spark version (for S3 source)."""
+    return os.getenv("SPARK_VERSION")
+
+
+def get_bigquery_schema_parallelism() -> int:
+    """Parallelism level for BigQuery schema extraction."""
+    return int(os.getenv("DATAHUB_BIGQUERY_SCHEMA_PARALLELISM", "20"))
+
+
+def get_snowflake_schema_parallelism() -> int:
+    """Parallelism level for Snowflake schema extraction."""
+    return int(os.getenv("DATAHUB_SNOWFLAKE_SCHEMA_PARALLELISM", "20"))
+
+
+def get_powerbi_m_query_parse_timeout() -> int:
+    """Timeout for PowerBI M query parsing."""
+    return int(os.getenv("DATAHUB_POWERBI_M_QUERY_PARSE_TIMEOUT", "60"))
+
+
+def get_trace_powerbi_mquery_parser() -> bool:
+    """Enable PowerBI M query parser tracing."""
+    return os.getenv("DATAHUB_TRACE_POWERBI_MQUERY_PARSER", "").lower() == "true"
+
+
+def get_lookml_git_test_ssh_key() -> Optional[str]:
+    """SSH key for LookML Git tests."""
+    return os.getenv("DATAHUB_LOOKML_GIT_TEST_SSH_KEY")
+
+
+# ============================================================================
+# AWS/Cloud Configuration
+# ============================================================================
+
+
+def get_aws_lambda_function_name() -> Optional[str]:
+    """Indicates running in AWS Lambda."""
+    return os.getenv("AWS_LAMBDA_FUNCTION_NAME")
+
+
+def get_aws_execution_env() -> Optional[str]:
+    """AWS execution environment."""
+    return os.getenv("AWS_EXECUTION_ENV")
+
+
+def get_aws_web_identity_token_file() -> Optional[str]:
+    """OIDC token file path."""
+    return os.getenv("AWS_WEB_IDENTITY_TOKEN_FILE")
+
+
+def get_aws_role_arn() -> Optional[str]:
+    """AWS role ARN for OIDC."""
+    return os.getenv("AWS_ROLE_ARN")
+
+
+def get_aws_app_runner_service_id() -> Optional[str]:
+    """AWS App Runner service ID."""
+    return os.getenv("AWS_APP_RUNNER_SERVICE_ID")
+
+
+def get_ecs_container_metadata_uri_v4() -> Optional[str]:
+    """ECS metadata endpoint v4."""
+    return os.getenv("ECS_CONTAINER_METADATA_URI_V4")
+
+
+def get_ecs_container_metadata_uri() -> Optional[str]:
+    """ECS metadata endpoint v3."""
+    return os.getenv("ECS_CONTAINER_METADATA_URI")
+
+
+def get_elastic_beanstalk_environment_name() -> Optional[str]:
+    """Elastic Beanstalk environment."""
+    return os.getenv("ELASTIC_BEANSTALK_ENVIRONMENT_NAME")
+
+
+# ============================================================================
+# Docker & Local Development
+# ============================================================================
+
+
+def get_compose_project_name() -> str:
+    """Docker Compose project name."""
+    return os.getenv("DATAHUB_COMPOSE_PROJECT_NAME", "datahub")
+
+
+def get_docker_compose_base() -> Optional[str]:
+    """Base path for Docker Compose files."""
+    return os.getenv("DOCKER_COMPOSE_BASE")
+
+
+def get_datahub_version() -> Optional[str]:
+    """DataHub version (set during docker init)."""
+    return os.getenv("DATAHUB_VERSION")
+
+
+def get_mapped_mysql_port() -> Optional[str]:
+    """MySQL port mapping (set during docker init)."""
+    return os.getenv("DATAHUB_MAPPED_MYSQL_PORT")
+
+
+def get_mapped_kafka_broker_port() -> Optional[str]:
+    """Kafka broker port mapping (set during docker init)."""
+    return os.getenv("DATAHUB_MAPPED_KAFKA_BROKER_PORT")
+
+
+def get_mapped_elastic_port() -> Optional[str]:
+    """Elasticsearch port mapping (set during docker init)."""
+    return os.getenv("DATAHUB_MAPPED_ELASTIC_PORT")
+
+
+def get_metadata_service_auth_enabled() -> str:
+    """Enable/disable auth in Docker."""
+    return os.getenv("METADATA_SERVICE_AUTH_ENABLED", "false")
+
+
+def get_ui_ingestion_default_cli_version() -> Optional[str]:
+    """CLI version for UI ingestion (set during init)."""
+    return os.getenv("UI_INGESTION_DEFAULT_CLI_VERSION")
+
+
+# ============================================================================
+# Utility & Helper Configuration
+# ============================================================================
+
+
+def get_datahub_component() -> str:
+    """Component name for user agent tracking."""
+    return os.getenv("DATAHUB_COMPONENT", "datahub")
+
+
+def get_force_local_quickstart_mapping() -> str:
+    """Force local quickstart mapping file."""
+    return os.getenv("FORCE_LOCAL_QUICKSTART_MAPPING", "")
+
+
+def get_dataproduct_external_url() -> Optional[str]:
+    """External URL for data products."""
+    return os.getenv("DATAHUB_DATAPRODUCT_EXTERNAL_URL")
+
+
+def get_override_sqlite_version_req() -> str:
+    """Override SQLite version requirement."""
+    return os.getenv("OVERRIDE_SQLITE_VERSION_REQ", "")
+
+
+def get_update_entity_registry() -> str:
+    """Update entity registry during tests."""
+    return os.getenv("UPDATE_ENTITY_REGISTRY", "false")
+
+
+def get_ci() -> Optional[str]:
+    """Indicates running in CI environment."""
+    return os.getenv("CI")
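The new env_vars module centralizes environment variable reads behind small typed getters; the other diffs in this release (quickstart_versioning.py, dataproduct_cli.py, kafka.py, mce_builder.py) switch from direct os.getenv calls to these helpers. A brief sketch of the intended usage pattern, with an assumed environment purely for illustration:

import os

from datahub.configuration.env_vars import (
    get_debug,
    get_gms_url,
    get_rest_sink_default_max_threads,
)

# Assumed environment, for illustration only.
os.environ["DATAHUB_GMS_URL"] = "http://localhost:8080"
os.environ["DATAHUB_DEBUG"] = "true"

# Each getter reads its variable at call time and applies the documented
# default and type conversion, so callers avoid scattering raw os.getenv strings.
print(get_gms_url())                        # http://localhost:8080
print(get_debug())                          # True
print(get_rest_sink_default_max_threads())  # 15 (default when unset)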
datahub/configuration/kafka.py CHANGED

@@ -1,19 +1,21 @@
-import os
-
 from pydantic import Field, validator
 
 from datahub.configuration.common import ConfigModel, ConfigurationError
+from datahub.configuration.env_vars import (
+    get_gms_base_path,
+    get_kafka_schema_registry_url,
+)
 from datahub.configuration.kafka_consumer_config import CallableConsumerConfig
 from datahub.configuration.validate_host_port import validate_host_port
 
 
 def _get_schema_registry_url() -> str:
     """Get schema registry URL with proper base path handling."""
-    explicit_url =
+    explicit_url = get_kafka_schema_registry_url()
     if explicit_url:
         return explicit_url
 
-    base_path =
+    base_path = get_gms_base_path()
     if base_path in ("/", ""):
         base_path = ""
 
datahub/emitter/mce_builder.py CHANGED

@@ -3,7 +3,6 @@
 import hashlib
 import json
 import logging
-import os
 import re
 import time
 from datetime import datetime, timezone
@@ -26,6 +25,7 @@ import typing_inspect
 from avrogen.dict_wrapper import DictWrapper
 from typing_extensions import assert_never
 
+from datahub.configuration.env_vars import get_dataset_urn_to_lower
 from datahub.emitter.enum_helpers import get_enum_options
 from datahub.metadata.schema_classes import (
     AssertionKeyClass,
@@ -72,9 +72,7 @@ ALL_ENV_TYPES: Set[str] = set(get_enum_options(FabricTypeClass))
 
 DEFAULT_FLOW_CLUSTER = "prod"
 UNKNOWN_USER = "urn:li:corpuser:unknown"
-DATASET_URN_TO_LOWER: bool = (
-    os.getenv("DATAHUB_DATASET_URN_TO_LOWER", "false") == "true"
-)
+DATASET_URN_TO_LOWER: bool = get_dataset_urn_to_lower() == "true"
 
 if TYPE_CHECKING:
     from datahub.emitter.mcp_builder import DatahubKey