acryl-datahub 1.3.0.1rc2__py3-none-any.whl → 1.3.0.1rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/METADATA +2469 -2467
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/RECORD +50 -48
- datahub/_version.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +26 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/docker_check.py +2 -1
- datahub/cli/docker_cli.py +4 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/quickstart_versioning.py +2 -2
- datahub/cli/specific/dataproduct_cli.py +2 -4
- datahub/cli/specific/user_cli.py +172 -1
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/kafka.py +6 -4
- datahub/emitter/mce_builder.py +2 -4
- datahub/emitter/rest_emitter.py +15 -15
- datahub/entrypoints.py +2 -0
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/source.py +5 -0
- datahub/ingestion/graph/client.py +197 -0
- datahub/ingestion/graph/config.py +2 -2
- datahub/ingestion/sink/datahub_rest.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +20 -13
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -4
- datahub/ingestion/source/grafana/models.py +5 -0
- datahub/ingestion/source/iceberg/iceberg.py +39 -19
- datahub/ingestion/source/kafka_connect/source_connectors.py +4 -1
- datahub/ingestion/source/mode.py +13 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/snowflake/snowflake_schema.py +2 -2
- datahub/ingestion/source/sql/mssql/source.py +7 -1
- datahub/ingestion/source/sql/teradata.py +80 -65
- datahub/ingestion/source/unity/config.py +31 -0
- datahub/ingestion/source/unity/proxy.py +73 -0
- datahub/ingestion/source/unity/source.py +27 -70
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/metadata/_internal_schema_classes.py +544 -544
- datahub/metadata/_urns/urn_defs.py +1728 -1728
- datahub/metadata/schema.avsc +15157 -15157
- datahub/sql_parsing/sql_parsing_aggregator.py +14 -5
- datahub/sql_parsing/sqlglot_lineage.py +7 -0
- datahub/telemetry/telemetry.py +8 -3
- datahub/utilities/file_backed_collections.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/sample_data.py +5 -4
- datahub/emitter/sql_parsing_builder.py +0 -306
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/top_level.txt +0 -0
datahub/emitter/rest_emitter.py
CHANGED

@@ -3,7 +3,6 @@ from __future__ import annotations
 import functools
 import json
 import logging
-import os
 import re
 import time
 from collections import defaultdict
@@ -33,7 +32,6 @@ from typing_extensions import deprecated
 from datahub._version import nice_version_name
 from datahub.cli import config_utils
 from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url, get_or_else
-from datahub.cli.env_utils import get_boolean_env_variable
 from datahub.configuration.common import (
     ConfigEnum,
     ConfigModel,
@@ -42,6 +40,14 @@ from datahub.configuration.common import (
     TraceTimeoutError,
     TraceValidationError,
 )
+from datahub.configuration.env_vars import (
+    get_emit_mode,
+    get_emitter_trace,
+    get_rest_emitter_batch_max_payload_bytes,
+    get_rest_emitter_batch_max_payload_length,
+    get_rest_emitter_default_endpoint,
+    get_rest_emitter_default_retry_max_times,
+)
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.request_helper import OpenApiRequest, make_curl_command
@@ -82,11 +88,9 @@ _DEFAULT_RETRY_STATUS_CODES = [  # Additional status codes to retry on
     504,
 ]
 _DEFAULT_RETRY_METHODS = ["HEAD", "GET", "POST", "PUT", "DELETE", "OPTIONS", "TRACE"]
-_DEFAULT_RETRY_MAX_TIMES = int(
-    os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
-)
+_DEFAULT_RETRY_MAX_TIMES = int(get_rest_emitter_default_retry_max_times())

-_DATAHUB_EMITTER_TRACE =
+_DATAHUB_EMITTER_TRACE = get_emitter_trace()

 _DEFAULT_CLIENT_MODE: ClientMode = ClientMode.SDK

@@ -98,17 +102,13 @@ TRACE_BACKOFF_FACTOR = 2.0  # Double the wait time each attempt
 # The limit is 16,000,000 bytes. We will use a max of 15mb to have some space
 # for overhead like request headers.
 # This applies to pretty much all calls to GMS.
-INGEST_MAX_PAYLOAD_BYTES =
-    os.getenv("DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_BYTES", 15 * 1024 * 1024)
-)
+INGEST_MAX_PAYLOAD_BYTES = get_rest_emitter_batch_max_payload_bytes()

 # This limit is somewhat arbitrary. All GMS endpoints will timeout
 # and return a 500 if processing takes too long. To avoid sending
 # too much to the backend and hitting a timeout, we try to limit
 # the number of MCPs we send in a batch.
-BATCH_INGEST_MAX_PAYLOAD_LENGTH =
-    os.getenv("DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_LENGTH", 200)
-)
+BATCH_INGEST_MAX_PAYLOAD_LENGTH = get_rest_emitter_batch_max_payload_length()


 def preserve_unicode_escapes(obj: Any) -> Any:
@@ -147,7 +147,7 @@ class EmitMode(ConfigEnum):

 _DEFAULT_EMIT_MODE = pydantic.parse_obj_as(
     EmitMode,
-
+    get_emit_mode() or EmitMode.SYNC_PRIMARY,
 )


@@ -158,7 +158,7 @@ class RestSinkEndpoint(ConfigEnum):

 DEFAULT_REST_EMITTER_ENDPOINT = pydantic.parse_obj_as(
     RestSinkEndpoint,
-
+    get_rest_emitter_default_endpoint() or RestSinkEndpoint.RESTLI,
 )


@@ -478,7 +478,7 @@ class DataHubRestEmitter(Closeable, Emitter):
         if self._openapi_ingestion is None:
             # No constructor parameter
             if (
-                not
+                not get_rest_emitter_default_endpoint()
                 and self._session_config.client_mode == ClientMode.SDK
                 and self._server_config.supports_feature(ServiceFeature.OPEN_API_SDK)
             ):
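Note: the new datahub/configuration/env_vars.py module (+331 lines in the file list above) is not expanded in this diff. Judging from the imports and call sites above, it centralizes the previously inline os.getenv reads behind named accessor functions. A minimal sketch of what two of those accessors could look like — the function names come from the diff, but the bodies, env-var names, and defaults are assumptions inferred from the removed code:

import os


def get_rest_emitter_default_retry_max_times() -> str:
    # Assumed: same variable and default ("4") as the removed inline os.getenv call;
    # the call site above still wraps the result in int().
    return os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")


def get_rest_emitter_batch_max_payload_bytes() -> int:
    # Assumed: returns an int directly, since the new call site no longer wraps it in int().
    return int(os.getenv("DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_BYTES", 15 * 1024 * 1024))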
datahub/entrypoints.py
CHANGED

@@ -22,6 +22,7 @@ from datahub.cli.docker_cli import docker
 from datahub.cli.env_utils import get_boolean_env_variable
 from datahub.cli.exists_cli import exists
 from datahub.cli.get_cli import get
+from datahub.cli.graphql_cli import graphql
 from datahub.cli.ingest_cli import ingest
 from datahub.cli.migrate import migrate
 from datahub.cli.put_cli import put
@@ -169,6 +170,7 @@ datahub.add_command(ingest)
 datahub.add_command(delete)
 datahub.add_command(exists)
 datahub.add_command(get)
+datahub.add_command(graphql)
 datahub.add_command(put)
 datahub.add_command(state)
 datahub.add_command(telemetry_cli)
datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py
ADDED

@@ -0,0 +1,87 @@
+import logging
+from typing import TYPE_CHECKING, Iterable, List
+
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.metadata.schema_classes import InputFieldClass, InputFieldsClass
+
+if TYPE_CHECKING:
+    from datahub.ingestion.api.source import SourceReport
+
+logger = logging.getLogger(__name__)
+
+
+class ValidateInputFieldsProcessor:
+    def __init__(self, report: "SourceReport"):
+        self.report = report
+
+    def validate_input_fields(
+        self,
+        stream: Iterable[MetadataWorkUnit],
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        Validate input fields and filter out invalid ones.
+
+        Invalid input fields have empty or missing fieldPath values, which would cause
+        URN generation to fail when sent to the server. This processor filters them out
+        and reports them as warnings.
+        """
+        for wu in stream:
+            input_fields_aspect = wu.get_aspect_of_type(InputFieldsClass)
+            if input_fields_aspect and input_fields_aspect.fields:
+                valid_fields: List[InputFieldClass] = []
+                invalid_count = 0
+
+                for input_field in input_fields_aspect.fields:
+                    if (
+                        input_field.schemaField
+                        and input_field.schemaField.fieldPath
+                        and input_field.schemaField.fieldPath.strip()
+                    ):
+                        valid_fields.append(input_field)
+                    else:
+                        invalid_count += 1
+
+                if invalid_count > 0:
+                    logger.debug(
+                        f"Filtered {invalid_count} invalid input field(s) with empty fieldPath for {wu.get_urn()}"
+                    )
+                    self.report.num_input_fields_filtered += invalid_count
+                    self.report.warning(
+                        title="Invalid input fields filtered",
+                        message="Input fields with empty fieldPath values were filtered out to prevent ingestion errors",
+                        context=f"Filtered {invalid_count} invalid input field(s) for {wu.get_urn()}",
+                    )
+
+                # Update the aspect with only valid fields
+                if valid_fields:
+                    input_fields_aspect.fields = valid_fields
+                else:
+                    # If no valid fields remain, skip this workunit entirely
+                    logger.debug(
+                        f"All input fields were invalid for {wu.get_urn()}, skipping InputFieldsClass workunit"
+                    )
+                    # Don't yield this workunit
+                    continue
+
+            yield wu
+
+    def _remove_input_fields_aspect(self, wu: MetadataWorkUnit) -> MetadataWorkUnit:
+        """Remove InputFieldsClass aspect from a workunit."""
+        # For MCPs, we can simply not yield the aspect
+        # For MCEs, we need to remove it from the snapshot
+        if hasattr(wu.metadata, "aspect") and isinstance(
+            wu.metadata.aspect, InputFieldsClass
+        ):
+            # This is an MCP with InputFieldsClass, skip it
+            return wu
+
+        if hasattr(wu.metadata, "proposedSnapshot"):
+            snapshot = wu.metadata.proposedSnapshot
+            if hasattr(snapshot, "aspects"):
+                snapshot.aspects = [
+                    aspect
+                    for aspect in snapshot.aspects
+                    if not isinstance(aspect, InputFieldsClass)
+                ]
+
+        return wu
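For orientation, a minimal usage sketch of the new processor — the empty workunit list is a placeholder; in practice the stream comes from a source, and the wiring into the processor chain is shown in the datahub/ingestion/api/source.py diff below:

from datahub.ingestion.api.auto_work_units.auto_validate_input_fields import (
    ValidateInputFieldsProcessor,
)
from datahub.ingestion.api.source import SourceReport

report = SourceReport()
processor = ValidateInputFieldsProcessor(report)

workunits = []  # placeholder: normally an iterable of MetadataWorkUnit from a source
# Workunits whose InputFieldsClass aspect contains fields with empty fieldPath values
# are cleaned up (or dropped entirely if nothing valid remains); everything else
# passes through unchanged, and the filtered count lands on the report.
cleaned = list(processor.validate_input_fields(workunits))
print(report.num_input_fields_filtered)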
datahub/ingestion/api/source.py
CHANGED

@@ -31,6 +31,9 @@ from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import
 from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
     EnsureAspectSizeProcessor,
 )
+from datahub.ingestion.api.auto_work_units.auto_validate_input_fields import (
+    ValidateInputFieldsProcessor,
+)
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
 from datahub.ingestion.api.report import ExamplesReport, Report
@@ -215,6 +218,7 @@ class SourceReport(ExamplesReport, IngestionStageReport):
     event_not_produced_warn: bool = True
     events_produced: int = 0
     events_produced_per_sec: int = 0
+    num_input_fields_filtered: int = 0

     _structured_logs: StructuredLogs = field(default_factory=StructuredLogs)

@@ -543,6 +547,7 @@ class Source(Closeable, metaclass=ABCMeta):
             browse_path_processor,
             partial(auto_workunit_reporter, self.get_report()),
             auto_patch_last_modified,
+            ValidateInputFieldsProcessor(self.get_report()).validate_input_fields,
             EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
         ]

datahub/ingestion/graph/client.py
CHANGED

@@ -30,6 +30,7 @@ from typing_extensions import deprecated

 from datahub._codegen.aspect import _Aspect
 from datahub.cli import config_utils
+from datahub.cli.cli_utils import guess_frontend_url_from_gms_url
 from datahub.configuration.common import ConfigModel, GraphError, OperationalError
 from datahub.emitter.aspect import TIMESERIES_ASPECT_MAP
 from datahub.emitter.mce_builder import DEFAULT_ENV, Aspect
@@ -2071,6 +2072,202 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):

         return res["reportAssertionResult"]

+    def _get_invite_token(self) -> str:
+        """
+        Retrieve an invite token for user creation.
+
+        Returns:
+            Invite token string
+
+        Raises:
+            OperationalError: If invite token retrieval fails
+        """
+        get_invite_token_query = """
+            query getInviteToken($input: GetInviteTokenInput!) {
+                getInviteToken(input: $input) {
+                    inviteToken
+                }
+            }
+        """
+
+        try:
+            invite_token_response = self.execute_graphql(
+                query=get_invite_token_query,
+                variables={"input": {}},
+            )
+            invite_token = invite_token_response.get("getInviteToken", {}).get(
+                "inviteToken"
+            )
+            if not invite_token:
+                raise OperationalError(
+                    "Failed to retrieve invite token. Ensure you have admin permissions.",
+                    {},
+                )
+            return invite_token
+        except Exception as e:
+            raise OperationalError(
+                f"Failed to retrieve invite token: {str(e)}", {}
+            ) from e
+
+    def _create_user_with_token(
+        self,
+        user_urn: str,
+        email: str,
+        display_name: str,
+        password: str,
+        invite_token: str,
+    ) -> None:
+        """
+        Create a user using the signup endpoint.
+
+        Args:
+            user_urn: User URN (urn:li:corpuser:{user_id})
+            email: User's email address
+            display_name: Full display name for the user
+            password: User's password
+            invite_token: Invite token for user creation
+
+        Raises:
+            OperationalError: If user creation fails
+        """
+        frontend_url = guess_frontend_url_from_gms_url(self._gms_server)
+        signup_url = f"{frontend_url}/signUp"
+        signup_payload = {
+            "userUrn": user_urn,
+            "email": email,
+            "fullName": display_name,
+            "password": password,
+            "title": "Other",
+            "inviteToken": invite_token,
+        }
+
+        logger.debug(
+            f"Creating user with URN={user_urn}, email={email} at URL: {signup_url}"
+        )
+        logger.debug(
+            f"Signup payload: {json.dumps({**signup_payload, 'password': '***'})}"
+        )
+
+        try:
+            response = self._session.post(signup_url, json=signup_payload)
+            logger.debug(f"Response status code: {response.status_code}")
+            logger.debug(f"Response headers: {dict(response.headers)}")
+            logger.debug(f"Response content length: {len(response.text)}")
+
+            response.raise_for_status()
+
+            # The /signUp endpoint returns 200 with empty body on success
+            logger.debug("User created successfully")
+
+        except HTTPError as http_err:
+            error_details = {
+                "url": signup_url,
+                "status_code": response.status_code,
+                "response_text": response.text[:500],
+            }
+            try:
+                error_json = response.json()
+                error_details["error_response"] = error_json
+                error_msg = error_json.get("message", str(http_err))
+            except JSONDecodeError:
+                error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
+
+            raise OperationalError(
+                f"Failed to create user: {error_msg}",
+                error_details,
+            ) from http_err
+        except Exception as e:
+            raise OperationalError(
+                f"Failed to create user: {str(e)}",
+                {"url": signup_url, "error_type": type(e).__name__},
+            ) from e
+
+    def _assign_role_to_user(self, user_urn: str, role: str) -> None:
+        """
+        Assign a role to a user.
+
+        Args:
+            user_urn: User URN
+            role: Role to assign (Admin, Editor, or Reader)
+
+        Raises:
+            ValueError: If role is invalid
+        """
+        normalized_role = role.capitalize()
+        valid_roles = ["Admin", "Editor", "Reader"]
+        if normalized_role not in valid_roles:
+            raise ValueError(
+                f"Invalid role '{role}'. Must be one of: {', '.join(valid_roles)}"
+            )
+
+        role_urn = f"urn:li:dataHubRole:{normalized_role}"
+
+        batch_assign_role_mutation = """
+            mutation batchAssignRole($input: BatchAssignRoleInput!) {
+                batchAssignRole(input: $input)
+            }
+        """
+
+        try:
+            self.execute_graphql(
+                query=batch_assign_role_mutation,
+                variables={"input": {"roleUrn": role_urn, "actors": [user_urn]}},
+            )
+        except Exception as e:
+            logger.warning(f"Role assignment failed for user {user_urn}: {str(e)}")
+            raise
+
+    def create_native_user(
+        self,
+        user_id: str,
+        email: str,
+        display_name: str,
+        password: str,
+        role: Optional[str] = None,
+    ) -> str:
+        """
+        Create a native DataHub user with email/password authentication.
+
+        Args:
+            user_id: User identifier (will be used in the URN)
+            email: User's email address
+            display_name: Full display name for the user
+            password: User's password
+            role: Optional role to assign (Admin, Editor, or Reader)
+
+        Returns:
+            User URN of the created user (urn:li:corpuser:{user_id})
+
+        Raises:
+            OperationalError: If user creation fails
+            ValueError: If role is invalid
+        """
+        # Validate role before creating user
+        if role:
+            normalized_role = role.capitalize()
+            valid_roles = ["Admin", "Editor", "Reader"]
+            if normalized_role not in valid_roles:
+                raise ValueError(
+                    f"Invalid role '{role}'. Must be one of: {', '.join(valid_roles)}"
+                )
+
+        user_urn = f"urn:li:corpuser:{user_id}"
+
+        invite_token = self._get_invite_token()
+        self._create_user_with_token(
+            user_urn, email, display_name, password, invite_token
+        )
+
+        if role:
+            try:
+                self._assign_role_to_user(user_urn, role)
+            except Exception as e:
+                logger.warning(
+                    f"User {email} created successfully, but role assignment failed: {str(e)}"
+                )
+
+        return user_urn
+
     def close(self) -> None:
         self._make_schema_resolver.cache_clear()
         super().close()
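A hedged usage sketch for the new native-user creation API on DataHubGraph — the server address, token, and user details below are placeholders, and the caller needs permissions to fetch an invite token:

from datahub.ingestion.graph.client import DataHubGraph
from datahub.ingestion.graph.config import DatahubClientConfig

graph = DataHubGraph(
    DatahubClientConfig(server="http://localhost:8080", token="<personal-access-token>")
)

# Fetches an invite token, signs the user up via the frontend /signUp endpoint,
# then optionally assigns a role through the batchAssignRole GraphQL mutation.
user_urn = graph.create_native_user(
    user_id="jdoe",
    email="jdoe@example.com",
    display_name="Jane Doe",
    password="<initial-password>",
    role="Editor",
)
print(user_urn)  # urn:li:corpuser:jdoe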
datahub/ingestion/graph/config.py
CHANGED

@@ -1,8 +1,8 @@
-import os
 from enum import Enum, auto
 from typing import Dict, List, Optional

 from datahub.configuration.common import ConfigModel
+from datahub.configuration.env_vars import get_datahub_component


 class ClientMode(Enum):
@@ -11,7 +11,7 @@ class ClientMode(Enum):
     SDK = auto()


-DATAHUB_COMPONENT_ENV: str =
+DATAHUB_COMPONENT_ENV: str = get_datahub_component().lower()


 class DatahubClientConfig(ConfigModel):
datahub/ingestion/sink/datahub_rest.py
CHANGED

@@ -3,7 +3,6 @@ import contextlib
 import dataclasses
 import functools
 import logging
-import os
 import threading
 import uuid
 from enum import auto
@@ -16,6 +15,10 @@ from datahub.configuration.common import (
     ConfigurationError,
     OperationalError,
 )
+from datahub.configuration.env_vars import (
+    get_rest_sink_default_max_threads,
+    get_rest_sink_default_mode,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.emitter.rest_emitter import (
@@ -47,9 +50,7 @@ from datahub.utilities.server_config_util import set_gms_config

 logger = logging.getLogger(__name__)

-_DEFAULT_REST_SINK_MAX_THREADS =
-    os.getenv("DATAHUB_REST_SINK_DEFAULT_MAX_THREADS", 15)
-)
+_DEFAULT_REST_SINK_MAX_THREADS = get_rest_sink_default_max_threads()


 class RestSinkMode(ConfigEnum):
@@ -63,7 +64,7 @@ class RestSinkMode(ConfigEnum):


 _DEFAULT_REST_SINK_MODE = pydantic.parse_obj_as(
-    RestSinkMode,
+    RestSinkMode, get_rest_sink_default_mode() or RestSinkMode.ASYNC_BATCH
 )

datahub/ingestion/source/aws/aws_common.py
CHANGED

@@ -1,5 +1,4 @@
 import logging
-import os
 from datetime import datetime, timedelta, timezone
 from enum import Enum
 from http import HTTPStatus
@@ -17,6 +16,16 @@ from datahub.configuration.common import (
     ConfigModel,
     PermissiveConfigModel,
 )
+from datahub.configuration.env_vars import (
+    get_aws_app_runner_service_id,
+    get_aws_execution_env,
+    get_aws_lambda_function_name,
+    get_aws_role_arn,
+    get_aws_web_identity_token_file,
+    get_ecs_container_metadata_uri,
+    get_ecs_container_metadata_uri_v4,
+    get_elastic_beanstalk_environment_name,
+)
 from datahub.configuration.source_common import EnvConfigMixin

 logger = logging.getLogger(__name__)
@@ -100,27 +109,25 @@ def detect_aws_environment() -> AwsEnvironment:
     Order matters as some environments may have multiple indicators.
     """
     # Check Lambda first as it's most specific
-    if
-        if
+    if get_aws_lambda_function_name():
+        if (get_aws_execution_env() or "").startswith("CloudFormation"):
             return AwsEnvironment.CLOUD_FORMATION
         return AwsEnvironment.LAMBDA

     # Check EKS (IRSA)
-    if
+    if get_aws_web_identity_token_file() and get_aws_role_arn():
         return AwsEnvironment.EKS

     # Check App Runner
-    if
+    if get_aws_app_runner_service_id():
         return AwsEnvironment.APP_RUNNER

     # Check ECS
-    if
-        "ECS_CONTAINER_METADATA_URI"
-    ):
+    if get_ecs_container_metadata_uri_v4() or get_ecs_container_metadata_uri():
         return AwsEnvironment.ECS

     # Check Elastic Beanstalk
-    if
+    if get_elastic_beanstalk_environment_name():
         return AwsEnvironment.BEANSTALK

     if is_running_on_ec2():
@@ -155,7 +162,7 @@ def get_instance_role_arn() -> Optional[str]:
 def get_lambda_role_arn() -> Optional[str]:
     """Get the Lambda function's role ARN"""
     try:
-        function_name =
+        function_name = get_aws_lambda_function_name()
         if not function_name:
             return None

@@ -181,7 +188,7 @@ def get_current_identity() -> Tuple[Optional[str], Optional[str]]:
         return role_arn, AwsServicePrincipal.LAMBDA.value

     elif env == AwsEnvironment.EKS:
-        role_arn =
+        role_arn = get_aws_role_arn()
         return role_arn, AwsServicePrincipal.EKS.value

     elif env == AwsEnvironment.APP_RUNNER:
@@ -194,8 +201,8 @@ def get_current_identity() -> Tuple[Optional[str], Optional[str]]:

     elif env == AwsEnvironment.ECS:
         try:
-            metadata_uri =
-
+            metadata_uri = (
+                get_ecs_container_metadata_uri_v4() or get_ecs_container_metadata_uri()
             )
             if metadata_uri:
                 response = requests.get(f"{metadata_uri}/task", timeout=1)
datahub/ingestion/source/bigquery_v2/bigquery_config.py
CHANGED

@@ -1,5 +1,4 @@
 import logging
-import os
 import re
 from copy import deepcopy
 from datetime import timedelta
@@ -8,6 +7,7 @@ from typing import Dict, List, Optional, Union
 from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator

 from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
+from datahub.configuration.env_vars import get_bigquery_schema_parallelism
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     LowerCaseDatasetUrnConfigMixin,
@@ -31,9 +31,7 @@ from datahub.ingestion.source.usage.usage_common import BaseUsageConfig

 logger = logging.getLogger(__name__)

-DEFAULT_BQ_SCHEMA_PARALLELISM =
-    os.getenv("DATAHUB_BIGQUERY_SCHEMA_PARALLELISM", 20)
-)
+DEFAULT_BQ_SCHEMA_PARALLELISM = get_bigquery_schema_parallelism()

 # Regexp for sharded tables.
 # A sharded table is a table that has a suffix of the form _yyyymmdd or yyyymmdd, where yyyymmdd is a date.
datahub/ingestion/source/grafana/models.py
CHANGED

@@ -92,6 +92,7 @@ class Dashboard(_GrafanaBaseModel):
         """Custom parsing to handle nested panel extraction."""
         dashboard_data = data.get("dashboard", {})
         _panel_data = dashboard_data.get("panels", [])
+        panels = []
         try:
             panels = cls.extract_panels(_panel_data)
         except Exception as e:
@@ -108,6 +109,10 @@ class Dashboard(_GrafanaBaseModel):
         if "meta" in dashboard_dict:
             del dashboard_dict["meta"]

+        # Handle refresh field type mismatch - convert boolean to string
+        if "refresh" in dashboard_dict and isinstance(dashboard_dict["refresh"], bool):
+            dashboard_dict["refresh"] = str(dashboard_dict["refresh"])
+
         return super().parse_obj(dashboard_dict)
